<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regression_targets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install cityhash
import cityhash
from copy import deepcopy



In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
file_names = os.listdir("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/")
df_names = [x[:-4] for x in file_names]
print(df_names)

dfs = [pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/"+x) for x in file_names]
df_dict = dict(zip(df_names, dfs))
print(df_dict.keys())

['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events']
dict_keys(['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events'])


In [None]:
df_dict["organizations"][:200]

### Validation

In [71]:
print(np.shape(df_dict['organizations']))
print(np.shape(df_dict['funding_rounds']))

fr = df_dict['funding_rounds']['org_uuid'].apply(cityhash.CityHash64)
orgs = df_dict['organizations']['uuid'].apply(cityhash.CityHash64)
print(fr, orgs)

frogs = fr[fr.isin(orgs)]
print(frogs[0])
print(np.shape(frogs))

orgfrs = orgs[orgs.isin(fr)]
print(np.shape(orgfrs))

(1089733, 41)
(333176, 24)
0          5087506707876194815
1          5087506707876194815
2          5087506707876194815
3          1053843699762228743
4         17418110891307706341
                  ...         
333171     1693115704498827940
333172    11932824214984234785
333173    11932824214984234785
333174    11939097437904463371
333175     9204522222235410490
Name: org_uuid, Length: 333176, dtype: uint64 0          13685534557686295101
1            764015621929367586
2          10846552445983457719
3          10693046220981818130
4           5087506707876194815
                   ...         
1089728     7648380604111671063
1089729    15062640940509416191
1089730     9929788675484222851
1089731     2009917061565156091
1089732    12539503581336717831
Name: uuid, Length: 1089733, dtype: uint64
5087506707876194815
(333176,)
(177522,)


In [9]:
def maybe_add_funding_estimates(row):
  # print(row)
  # print("Row printed")
  # print(np.isnan(row['post_money_valuation_usd']))
  if np.isnan(row['post_money_valuation_usd']) and not np.isnan(row['raised_amount_usd']):
    if row['raised_amount_usd'] < 1:
      row['post_money_valuation_usd'] = 10000
    else:
      # print("Replaced: ", row['post_money_valuation_usd'], " ", row['raised_amount_usd']*3)
      row['post_money_valuation_usd'] = row['raised_amount_usd']*5
  return row['post_money_valuation_usd']

df = pd.DataFrame(list(zip([np.nan, np.nan], ['1000', '1000'])) ,columns =['post_money_valuation_usd', 'raised_amount_usd']).astype(np.float32)
df['post_money_valuation_usd'] = df.apply(maybe_add_funding_estimates, axis=1)
print(df)

   post_money_valuation_usd  raised_amount_usd
0                    5000.0             1000.0
1                    5000.0             1000.0


In [10]:
imputed_funding_rounds = deepcopy(df_dict["funding_rounds"])
# print(imputed_funding_rounds)
imputed_funding_rounds['post_money_valuation_usd'] = imputed_funding_rounds.apply(maybe_add_funding_estimates, axis=1)
# print(imputed_funding_rounds[['post_money_valuation_usd', 'raised_amount_usd']])

In [12]:
print(imputed_funding_rounds.head())

funding_data = imputed_funding_rounds[['org_uuid', 'announced_on', 'post_money_valuation_usd']].rename(
    columns={"announced_on": "date", "post_money_valuation_usd": "valuation"})

acquisitions_data = deepcopy(df_dict["acquisitions"][['acquiree_uuid', 'acquired_on', 'price_usd']]).rename(
    columns={"acquiree_uuid": "org_uuid", "acquired_on": "date", "price_usd": "valuation"})
ipos_data = deepcopy(df_dict["ipos"][['org_uuid', 'went_public_on', 'valuation_price_usd']]).rename(
    columns={"went_public_on": "date", "valuation_price_usd": "valuation"})

types = df_dict["acquisitions"][['type']].values
acquisition_types = df_dict["acquisitions"][['acquisition_type']].values
acquisitions_data['event'] = [types[x][0] if not isinstance(acquisition_types[x][0], str) 
                              else acquisition_types[x][0] for x in range(len(acquisitions_data))]
funding_data['event'] = pd.Series([x[0] for x in df_dict["funding_rounds"]['name'].str.split(" - ")])
ipos_data['event'] = df_dict['ipos']['stock_exchange_symbol']



                                   uuid  ...                                lead_investor_uuids
0  8a945939-18e0-cc9d-27b9-bf33817b2818  ...               3f47be49-2e32-8118-01a0-31685a4d0fd7
1  d950d7a5-79ff-fb93-ca87-13386b0e2feb  ...               b08efc27-da40-505a-6f9d-c9e14247bf36
2  6fae3958-a001-27c0-fb7e-666266aedd78  ...  e2006571-6b7a-e477-002a-f7014f48a7e3,8d5c7e48-...
3  bcd5a63d-ed99-6963-0dd2-e36f6582f846  ...                                                NaN
4  60e6afd9-1215-465a-dd17-0ed600d4e29b  ...               fb2f8884-ec07-895a-48d7-d9a9d4d7175c

[5 rows x 24 columns]


In [86]:
imputed_funding_rounds.head()

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,country_code,state_code,region,city,investment_type,announced_on,raised_amount_usd,raised_amount,raised_amount_currency_code,post_money_valuation_usd,post_money_valuation,post_money_valuation_currency_code,investor_count,org_uuid,org_name,lead_investor_uuids
0,8a945939-18e0-cc9d-27b9-bf33817b2818,Angel Round - Facebook,funding_round,facebook-angel--8a945939,https://www.crunchbase.com/funding_round/faceb...,143236.0,2007-05-27 06:08:18,2018-02-12 23:05:39,USA,CA,California,Menlo Park,angel,2004-09-01,500000.0,500000.0,USD,2500000.0,,USD,4.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,3f47be49-2e32-8118-01a0-31685a4d0fd7
1,d950d7a5-79ff-fb93-ca87-13386b0e2feb,Series A - Facebook,funding_round,facebook-series-a--d950d7a5,https://www.crunchbase.com/funding_round/faceb...,145939.0,2007-05-27 06:09:10,2018-02-12 23:52:16,USA,CA,California,Menlo Park,series_a,2005-05-01,12700000.0,12700000.0,USD,98000000.0,98000000.0,USD,4.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,b08efc27-da40-505a-6f9d-c9e14247bf36
2,6fae3958-a001-27c0-fb7e-666266aedd78,Series B - Facebook,funding_round,facebook-series-b--6fae3958,https://www.crunchbase.com/funding_round/faceb...,273250.0,2007-05-27 06:09:36,2018-02-12 23:30:46,USA,CA,California,Menlo Park,series_b,2006-04-01,27500000.0,27500000.0,USD,502500000.0,502500000.0,USD,5.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,"e2006571-6b7a-e477-002a-f7014f48a7e3,8d5c7e48-..."
3,bcd5a63d-ed99-6963-0dd2-e36f6582f846,Series B - Photobucket,funding_round,photobucket-series-b--bcd5a63d,https://www.crunchbase.com/funding_round/photo...,149479.0,2007-05-29 11:05:59,2018-02-12 23:27:36,USA,CO,Colorado,Denver,series_b,2006-05-01,10500000.0,10500000.0,USD,52500000.0,,USD,2.0,f53cb4de-236e-0b1b-dee8-7104a8b018f9,Photobucket,
4,60e6afd9-1215-465a-dd17-0ed600d4e29b,Series A - Geni,funding_round,geni-series-a--60e6afd9,https://www.crunchbase.com/funding_round/geni-...,313010.0,2007-05-31 20:19:28,2018-02-12 23:41:29,USA,CA,California,West Hollywood,series_a,2007-01-17,,,,10000000.0,10000000.0,USD,1.0,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,fb2f8884-ec07-895a-48d7-d9a9d4d7175c


In [19]:
timelines = pd.concat([funding_data, acquisitions_data, ipos_data])

# Normalize dates

timelines.sort_values(by=['org_uuid', 'date'], inplace=True)
timelines.reset_index(inplace=True, drop=True)

timelines.set_index(['org_uuid'])
grouped_timelines = timelines.groupby(['org_uuid'])

In [32]:
grouped_timelines.head()
print(grouped_timelines['org_uuid'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f41298c8d68>


AttributeError: ignored

In [21]:
inspect_id = 'ffffabce-6d4a-b3d1-13c0-4e90cedf5270'
funding_data[funding_data['org_uuid'] == inspect_id].head()
# organizations[organizations['uuid'] == inspect_id].head()

Unnamed: 0,org_uuid,date,valuation,event
132839,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,2015-05-01,400000.0,Angel Round
132840,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,2015-01-01,75000.0,Grant
132841,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,2015-09-15,410000.0,Product Crowdfunding
132842,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,2016-05-15,1050000.0,Seed Round
150066,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,2017-02-09,3500000.0,Seed Round


In [22]:
import datetime
company_timelines = {}

milli_per_day = 86400  # Used for converting Unix time
dropped = 0

for company_id in timelines['org_uuid']:
  timeline = grouped_timelines.get_group(company_id)
  timeline = timeline[timeline['valuation'].notnull()]               # We only care about times at which there is a valuation of the company.
  if timeline.empty:
    dropped += 1
    continue
  
  timeline = timeline.copy()
  timeline.reset_index(inplace=True, drop=True)

  day_zero = datetime.datetime.strptime(timeline['date'].iloc[0], '%Y-%m-%d').timestamp()
  parsed_dates = pd.Series([datetime.datetime.strptime(x, '%Y-%m-%d').timestamp() for x in timeline['date']])
  norm_dates = (parsed_dates - day_zero)/milli_per_day
  timeline['norm_dates'] = norm_dates
  company_timelines[company_id] = timeline

print("Retained: ", len(company_timelines))
print("Dropped: ", dropped)

Retained:  144569
Dropped:  152322


In [44]:
iter = 0
for x, y in company_timelines.items():
  print(x)
  iter += 1
  if iter > 5:
    break

00000aa4-ba42-9b68-a9c3-040c9f3bf9b9
00002470-bff7-6226-5800-0ca1b3787b6f
000095de-8e2b-82f1-32a7-c222ba3d5682
0000d497-c93a-eea3-eeb0-a943dfb4f71e
0001a8cc-0cdc-4a30-b4d3-da1b425069e1
0001eae7-077d-4d0b-a717-f67bcf2a09fa


### Validation

In [64]:
check_val = '00002470-bff7-6226-5800-0ca1b3787b6f'
print(company_timelines[check_val])
fr_data = df_dict['funding_rounds']
view = fr_data[fr_data['org_uuid'] == check_val]
print(np.shape(view))
hash_val = cityhash.CityHash64(check_val)
print("Hash Value: ", hash_val)
orgfrs_view = orgfrs[orgfrs == hash_val]
print(orgfrs_view)
print(orgfrs.eq(hash_val).any())

                               org_uuid  ... norm_valuations
0  00002470-bff7-6226-5800-0ca1b3787b6f  ...        0.000000
1  00002470-bff7-6226-5800-0ca1b3787b6f  ...        0.693147
2  00002470-bff7-6226-5800-0ca1b3787b6f  ...        0.980829

[3 rows x 7 columns]
(5, 24)
Hash Value:  13360469805707984821
130002    13360469805707984821
Name: uuid, dtype: uint64
True


In [26]:
for company_id, timeline in company_timelines.items():
  initial_valuation = timeline['valuation'].iloc[0]
  timeline['initial_val'] = initial_valuation.tolist()
  norm_valuations = pd.Series([np.log(value/initial_valuation) for value in timeline['valuation']])
  timeline['norm_valuations'] = norm_valuations

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [69]:
from scipy import interpolate
from google.colab import files

def get_valuation_interpolations(timelines, day):
  company_hashes = []
  regression_targets = []
  initial_valuations = []
  for company, timeline in timelines.items():
    x = timeline['norm_dates'].tolist()
    y = timeline['norm_valuations'].tolist()

    if len(x) < 2 or len(y) < 2:
      regression_targets.append(0)     # Single point timeline. Leave decision to drop for downstream.
      continue
    
    if day > x[-1]:
      regression_targets.append(y[-1])  # Don't extrapolate the data at all.
      continue

    interpolator = interpolate.interp1d(x, y)
    regression_target = interpolator(day)
    # print(company)
    hash = cityhash.CityHash64(company)
    # print(frogs)
    if not orgfrs.eq(hash_val).any():
      print("Bad Hash")
      break
    company_hashes.append(hash)
    regression_targets.append(regression_target)
    initial_valuations.append(timeline['initial_val'].tolist()[0])

  return pd.DataFrame(list(zip(company_hashes, initial_valuations, regression_targets)), columns=["hash", "initial_valuation", "log_valuation_factor"])


In [70]:
# for day in [200, 500, 1000, 2000]:
for day in [200]:
  frame = get_valuation_interpolations(company_timelines, day)
  print(frame)
  print(np.count_nonzero(frame['log_valuation_factor']))
  frame.to_pickle('/' + str(day) + '.pkl')


                       hash  initial_valuation  log_valuation_factor
0      13360469805707984821          3000000.0                     0
1       7551169957279540846         45000000.0    0.2665950694461328
2      17638643441008354186          2280520.0                     0
3      14753292511968607343          3000000.0                     0
4      18303053280205650499         20400150.0                     0
...                     ...                ...                   ...
49484   1328871151505778773           600000.0     0.183442214772801
49485  16862606919743425243           162360.0                     0
49486  14711304329054892014         65500000.0   0.34439256698059867
49487  17393885764651115266         20000000.0  -0.18929482403648876
49488  12723450708549610702            75000.0                     0

[49489 rows x 3 columns]
18291
