<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/category_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install cityhash
import cityhash
import pandas

from copy import deepcopy



In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [16]:
import collections

org_info = pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/organizations.csv")

# Filter down the data purely to investees
org_info = org_info[org_info['primary_role'] == 'company']

# Select organization data
# Clean the df

comma_sep_list_vars = ['category_list', 'category_groups_list']
categorical_vars = ['region']
other_vars = ['uuid']
org_info_selected = org_info[other_vars + categorical_vars + comma_sep_list_vars]
org_info_selected = org_info_selected.dropna()
org_info_selected = org_info_selected.reset_index(drop=True)

vocabulary = collections.Counter()

iter = 0
test_stop = -1
total_rows = np.shape(org_info_selected)[0]
print("Processing ", total_rows, " rows.")
for (i, row) in org_info_selected.iterrows():
  if iter == test_stop:
    break
  iter += 1
  if iter % 100000 == 0:
    print(iter, " of ", total_rows)

  for col, _ in row.iteritems():
    if col in comma_sep_list_vars:
      # Break categories into unigrams
      token_lists = [x.split(' ') for x in row[col].strip().split(',')]
      tokens = []
      for token_list in token_lists:
        for token in token_list:
          tokens.append(token)

      for token in tokens:
        vocabulary[token] += 1

# Show the unfiltered vocabulary counts
print(vocabulary.most_common())

# Filter low count words
low_count_threshold = 15
keys_to_remove = []
for word, count in vocabulary.items():
  if count < low_count_threshold:
    keys_to_remove.append(word)

for key in keys_to_remove:
  del vocabulary[key]

# Now rank and filter by document frequency. The top half frequent words will be dropped
key_list = list(vocabulary.keys())
fraction_to_drop = .25
chop = sorted(key_list, key=lambda x: vocabulary[x])[-int(fraction_to_drop*len(key_list)):]
for key in chop:
  del vocabulary[key]

# Create index to word mapping
word_to_index = {}
word_index = 0
for word, count in vocabulary.items():
  word_to_index[word] = word_index
  word_index += 1

print(word_to_index)
vocabulary_size = word_index

Processing  808944  rows.
100000  of  808944
200000  of  808944
300000  of  808944
400000  of  808944
500000  of  808944
600000  of  808944
700000  of  808944
800000  of  808944
[('and', 663808), ('Software', 367507), ('Services', 319433), ('Information', 278916), ('Technology', 263815), ('Internet', 187091), ('Marketing', 167899), ('Manufacturing', 149479), ('Health', 146421), ('Media', 141939), ('Care', 140941), ('Advertising', 113301), ('Entertainment', 112999), ('Sales', 108883), ('Hardware', 108658), ('Financial', 106409), ('Design', 105993), ('Mobile', 100457), ('Shopping', 90106), ('Consumer', 86965), ('Real', 85025), ('Commerce', 84574), ('Estate', 83013), ('Engineering', 81794), ('Apps', 80045), ('Science', 71213), ('Electronics', 69568), ('Education', 67499), ('Data', 66787), ('Analytics', 64631), ('Food', 59242), ('E-Commerce', 57608), ('Transportation', 54731), ('Energy', 54196), ('Consulting', 53943), ('Professional', 51243), ('Web', 50645), ('Beverage', 50281), ('Manageme

In [17]:
# Show words from our filtered vocabulary.

print(vocabulary.most_common())

[('Small', 3614), ('Medium', 3614), ('Businesses', 3614), ('Outsourcing', 3568), ('Mechanical', 3529), ('Planning', 3495), ('Beauty', 3488), ('Venture', 3479), ('Capital', 3479), ('Life', 3455), ('Rental', 3361), ('Sharing', 3331), ('Therapeutics', 3319), ('Email', 3302), ('Location', 3229), ('Based', 3229), ('Local', 3100), ('Water', 3094), ('Architecture', 3011), ('Art', 2802), ('Cryptocurrency', 2735), ('Production', 2733), ('Distribution', 2664), ('Crowdfunding', 2646), ('Wine', 2570), ('And', 2570), ('Spirits', 2570), ('Advice', 2523), ('Semiconductor', 2513), ('Packaging', 2475), ('Collaboration', 2469), ('Electrical', 2419), ('Streaming', 2407), ('Furniture', 2406), ('Investment', 2402), ('Interior', 2383), ('Smart', 2338), ('Employment', 2318), ('Risk', 2293), ('Precious', 2292), ('Metals', 2292), ('Document', 2277), ('Database', 2267), ('Support', 2256), ('Augmented', 2167), ('Language', 2148), ('Resource', 2146), ('(ERP)', 2146), ('Storage', 2110), ('Textiles', 2086), ('Netwo

In [18]:
from scipy.sparse import coo_matrix
import collections

# Load our filtered vocabulary if we've saved it on disk.

row_indices = []
col_indices = []
data_values = []

iter = 0
test_stop = -1
total_rows = np.shape(org_info_selected)[0]
print("Processing ", total_rows, " rows.")
for (i, row) in org_info_selected.iterrows():
  if iter == test_stop:
    break
  iter += 1
  if iter % 100000 == 0:
    print(iter, " of ", total_rows)
    
  for col, _ in row.iteritems():
    if col in comma_sep_list_vars:
      # Break categories into unigrams
      unigram_counts = collections.Counter()
      token_lists = [x.split(' ') for x in row[col].strip().split(',')]
      for token_list in token_lists:
        for token in token_list:
          unigram_counts[token] += 1
      
      for unigram, count in unigram_counts.items():
        if unigram in vocabulary:
          row_indices.append(i)
          col_indices.append(word_to_index[unigram])
          data_values.append(count)
    elif col in categorical_vars:
      value = row[col]
      if value in vocabulary:
        row_indices.append(i)
        col_indices.append(word_to_index[value])
        data_values.append(1)

unclustered = coo_matrix((data_values, (row_indices, col_indices)))
print(unclustered.toarray().shape)

Processing  808944  rows.
100000  of  808944
200000  of  808944
300000  of  808944
400000  of  808944
500000  of  808944
600000  of  808944
700000  of  808944
800000  of  808944
(808944, 562)


In [19]:
import scipy.sparse

companies = org_info_selected['uuid']

lookup_index = []
lookup_value = []

used_hashes = set()

row_indices = []
col_indices = []
data_values = []
for (i, row) in companies.iteritems():
  uuid = row
  hash = cityhash.CityHash64(uuid)

  if hash in used_hashes:  # Hash collision!
    print(hash)
    continue          # which shouldn't really happen
  used_hashes.add(hash)

  lookup_value.append(uuid)
  lookup_index.append(hash)

  row_indices.append(i)
  col_indices.append(0)
  data_values.append(hash)

hashed_uuids = coo_matrix((data_values, (row_indices, col_indices)))
print(np.shape(unclustered))
print(np.shape(hashed_uuids))
joined = scipy.sparse.hstack([hashed_uuids, unclustered])
  
## Generate company lookup
lookup = pd.DataFrame.from_dict({'uuid':lookup_index, 'hash':data_values})

lookup.to_csv('/lookup.csv')

category_features = pd.DataFrame.sparse.from_spmatrix(joined)
category_features.to_pickle('/category_features.pkl')



(808944, 562)
(808944, 1)


In [20]:
print(category_features)

                 0    1    2    3    4    5    ...  557  558  559  560  561  562
0       1.368553e+19  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
1       7.640156e+17  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2       1.084655e+19  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
3       5.087507e+18  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
4       9.094535e+18  0.0  0.0  1.0  1.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
...              ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
808939  1.689286e+19  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
808940  7.229717e+18  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
808941  4.626564e+18  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
808942  2.978566e+18  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
808943  9.744251e+18  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0

[808944 rows x 563 columns]