<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/region_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install cityhash
import cityhash

from copy import deepcopy



In [2]:
from google.colab import drive

# Make the user's Google Drive visible to this runtime under a fixed mount
# point; later cells read the data through paths below this directory.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Directory on the mounted Drive that holds the Crunchbase bulk-export files.
export_dir = "/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/"

# Only regular files are loaded; a sub-directory in the listing would make
# pd.read_csv raise.
file_names = [x for x in os.listdir(export_dir)
              if os.path.isfile(os.path.join(export_dir, x))]
# Table name = file name without its extension, whatever the extension length.
df_names = [os.path.splitext(x)[0] for x in file_names]
print(df_names)

# One DataFrame per export file, keyed by table name.
dfs = [pd.read_csv(os.path.join(export_dir, x)) for x in file_names]
df_dict = dict(zip(df_names, dfs))
print(df_dict.keys())

['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events']
dict_keys(['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events'])


In [4]:
import collections

org_info = df_dict['organizations']

# Filter down the data purely to investees
org_info = org_info[org_info['primary_role'] == 'company']

# Select the columns used for features and drop rows missing any of them.
# The index is reset so that row labels are the contiguous positions 0..n-1.
comma_sep_list_vars = ['category_list', 'category_groups_list']
categorical_vars = ['region']
other_vars = ['uuid']
org_info_selected = org_info[other_vars + categorical_vars + comma_sep_list_vars]
org_info_selected = org_info_selected.dropna()
org_info_selected = org_info_selected.reset_index(drop=True)

# Count how often each unigram occurs across the comma-separated list columns.
vocabulary = collections.Counter()

test_stop = -1  # debugging knob: stop after this many rows; -1 never triggers
total_rows = np.shape(org_info_selected)[0]
print("Processing ", total_rows, " rows.")
# Walk the list columns in lockstep, row by row, so tokens enter the counter
# in the same order as a row-wise scan (this fixes the word -> index order).
list_columns = [org_info_selected[col] for col in comma_sep_list_vars]
for rows_done, cells in enumerate(zip(*list_columns)):
  if rows_done == test_stop:
    break
  if (rows_done + 1) % 100000 == 0:
    print(rows_done + 1, " of ", total_rows)

  for cell in cells:
    # Break categories into unigrams: split on commas, then on single spaces.
    for phrase in cell.strip().split(','):
      vocabulary.update(phrase.split(' '))

# Filter low count words
low_count_threshold = 15
keys_to_remove = [word for word, count in vocabulary.items()
                  if count < low_count_threshold]
for key in keys_to_remove:
  del vocabulary[key]

# Now rank the remaining words by total count and drop the most frequent
# fraction (fraction_to_drop, i.e. the top 15%) as extremely low information.
key_list = list(vocabulary.keys())
fraction_to_drop = .15
num_to_drop = int(fraction_to_drop * len(key_list))
# Slice from an explicit start index: a negative-zero slice ([-0:]) would
# select every key and wipe the vocabulary when num_to_drop is 0.
chop = sorted(key_list, key=lambda x: vocabulary[x])[len(key_list) - num_to_drop:]
for key in chop:
  del vocabulary[key]

# Create word to index mapping, numbering the surviving words in the order
# they were first seen.
word_to_index = {word: index for index, word in enumerate(vocabulary)}

print(word_to_index)
vocabulary_size = len(word_to_index)

Processing  808944  rows.
100000  of  808944
200000  of  808944
300000  of  808944
400000  of  808944
500000  of  808944
600000  of  808944
700000  of  808944
800000  of  808944
{'Computing': 0, 'Collaboration': 1, 'CRM': 2, 'Developer': 3, 'Tools': 4, 'Project': 5, 'Streaming': 6, 'File': 7, 'Sharing': 8, 'Hosting': 9, 'Photography': 10, 'Blogging': 11, 'SMS': 12, 'Personalization': 13, '3D': 14, 'EBooks': 15, 'Subscription': 16, 'Auctions': 17, 'Marketplace': 18, 'Search': 19, 'Engine': 20, 'Big': 21, 'Online': 22, 'Portals': 23, 'Location': 24, 'Based': 25, 'Time': 26, 'Navigation': 27, 'Mapping': 28, 'Cyber': 29, 'TV': 30, 'Mining': 31, 'Embedded': 32, 'Hedge': 33, 'Funds': 34, 'Precious': 35, 'Metals': 36, 'Broadcasting': 37, 'Contact': 38, 'Communications': 39, 'Infrastructure': 40, 'Language': 41, 'Processing': 42, 'Public': 43, 'Relations': 44, 'Knowledge': 45, 'Virtual': 46, 'Currency': 47, 'Delivery': 48, 'Film': 49, 'Brand': 50, 'Production': 51, 'Machine': 52, 'Personal': 5

In [5]:
from scipy.sparse import coo_matrix
import collections

# Build a sparse (row x vocabulary word) count matrix in coordinate form:
# one (row, column, count) triple per vocabulary word found in a row.
row_indices = []
col_indices = []
data_values = []

test_stop = -1  # debugging knob: stop after this many rows; -1 never triggers
total_rows = np.shape(org_info_selected)[0]
print("Processing ", total_rows, " rows.")
# Only these columns contribute entries; keep the frame's own column order.
feature_cols = [col for col in org_info_selected.columns
                if col in comma_sep_list_vars or col in categorical_vars]
for rows_done, (i, *cells) in enumerate(
    org_info_selected[feature_cols].itertuples(name=None)):
  if rows_done == test_stop:
    break
  if (rows_done + 1) % 100000 == 0:
    print(rows_done + 1, " of ", total_rows)

  for col, value in zip(feature_cols, cells):
    if col in comma_sep_list_vars:
      # Break categories into unigrams
      unigram_counts = collections.Counter()
      for phrase in value.strip().split(','):
        unigram_counts.update(phrase.split(' '))

      # A word present in both list columns produces two triples for the same
      # cell; scipy adds duplicate coordinates together on conversion.
      for unigram, count in unigram_counts.items():
        if unigram in vocabulary:
          row_indices.append(i)
          col_indices.append(word_to_index[unigram])
          data_values.append(count)
    elif col in categorical_vars:
      # NOTE(review): a categorical value is only encoded when it is itself a
      # key of `vocabulary` -- confirm that these values are ever added to it.
      if value in vocabulary:
        row_indices.append(i)
        col_indices.append(word_to_index[value])
        data_values.append(1)

# Give the shape explicitly: when inferred, it stops at the largest index seen,
# so trailing rows or words without entries would silently shrink the matrix.
unclustered = coo_matrix((data_values, (row_indices, col_indices)),
                         shape=(total_rows, len(word_to_index)))
# Read the shape from the sparse matrix; densifying it just to print the shape
# allocates rows x words cells.
print(unclustered.shape)

Processing  808944  rows.
100000  of  808944
200000  of  808944
300000  of  808944
400000  of  808944
500000  of  808944
600000  of  808944
700000  of  808944
800000  of  808944
(808944, 637)


In [6]:
import scipy.sparse

companies = org_info_selected['uuid']

lookup_index = []  # CityHash64 of every uuid that was kept
lookup_value = []  # the uuid strings, parallel to lookup_index

used_hashes = set()

# Coordinate-form triples for a single-column matrix holding each row's hash.
row_indices = []
col_indices = []
data_values = []
for (i, uuid) in companies.items():
  uuid_hash = cityhash.CityHash64(uuid)

  if uuid_hash in used_hashes:  # Hash collision!
    print(uuid_hash)
    continue          # which shouldn't really happen
  used_hashes.add(uuid_hash)

  lookup_value.append(uuid)
  lookup_index.append(uuid_hash)

  row_indices.append(i)
  col_indices.append(0)
  data_values.append(uuid_hash)

# Explicit shape keeps one row per company even if a trailing row was skipped,
# so the horizontal stack below always lines up with `unclustered`.
hashed_uuids = scipy.sparse.coo_matrix(
    (data_values, (row_indices, col_indices)), shape=(len(companies), 1))
print(np.shape(unclustered))
print(np.shape(hashed_uuids))
# NOTE(review): CityHash64 values are unsigned 64-bit; stacking them with an
# integer count matrix may upcast to float64 and lose low-order bits -- verify
# the dtype of `joined` before relying on column 0 as an exact key.
joined = scipy.sparse.hstack([hashed_uuids, unclustered])

## Generate company lookup
# Pair each uuid with its own hash (the two parallel lists built above).
lookup = pd.DataFrame.from_dict({'uuid': lookup_value, 'hash': lookup_index})

lookup.to_csv('/lookup.csv')

# Column 0 is the uuid hash; the remaining columns are the vocabulary counts.
category_features = pd.DataFrame.sparse.from_spmatrix(joined)
category_features.to_pickle('/category_features.pkl')



(808944, 637)
(808944, 1)
