<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regression_targets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from copy import deepcopy

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
file_names = os.listdir("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/")
df_names = [x[:-4] for x in file_names]
print(df_names)

dfs = [pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/"+x) for x in file_names]
df_dict = dict(zip(df_names, dfs))
print(df_dict.keys())

['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events']
dict_keys(['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events'])


In [4]:
funding_data = deepcopy(
    df_dict["funding_rounds"][['org_uuid', 'announced_on', 'post_money_valuation_usd']]).rename(
    columns={"announced_on": "date", "post_money_valuation_usd": "valuation"})
acquisitions_data = deepcopy(df_dict["acquisitions"][['acquiree_uuid', 'acquired_on', 'price_usd']]).rename(
    columns={"acquiree_uuid": "org_uuid", "acquired_on": "date", "price_usd": "valuation"})
ipos_data = deepcopy(df_dict["ipos"][['org_uuid', 'went_public_on', 'valuation_price_usd']]).rename(
    columns={"went_public_on": "date", "valuation_price_usd": "valuation"})

types = df_dict["acquisitions"][['type']].values
acquisition_types = df_dict["acquisitions"][['acquisition_type']].values
acquisitions_data['event'] = [types[x][0] if not isinstance(acquisition_types[x][0], str) 
                              else acquisition_types[x][0] for x in range(len(acquisitions_data))]
funding_data['event'] = pd.Series([x[0] for x in df_dict["funding_rounds"]['name'].str.split(" - ")])
ipos_data['event'] = df_dict['ipos']['stock_exchange_symbol']

timelines = pd.concat([funding_data, acquisitions_data, ipos_data])

# Normalize dates

timelines.sort_values(by=['org_uuid', 'date'], inplace=True)
timelines.reset_index(inplace=True, drop=True)

timelines.set_index(['org_uuid'])
grouped_timelines = timelines.groupby(['org_uuid'])

In [None]:
import datetime
company_timelines = {}

milli_per_day = 86400  # Used for converting Unix time

for company_id in timelines['org_uuid']:
  timeline = grouped_timelines.get_group(company_id).copy()
  timeline = timeline[timeline['valuation'].notnull()]               # We only care about times at which there is a valuation of the company.
  timeline.reset_index(inplace=True, drop=True)
  if timeline.empty:
    continue

  # print(timeline['date'])
  day_zero = datetime.datetime.strptime(timeline['date'].iloc[0], '%Y-%m-%d').timestamp()
  parsed_dates = pd.Series([datetime.datetime.strptime(x, '%Y-%m-%d').timestamp() for x in timeline['date']])
  norm_dates = (parsed_dates - day_zero)/milli_per_day
  timeline['norm_dates'] = norm_dates
  company_timelines[company_id] = timeline

In [None]:

for company_id, timeline in company_timelines.items():
  initial_valuation = timeline['valuation'].iloc[0]
  timeline['initial_val'] = initial_valuation
  norm_valuations = pd.Series([np.log(value/initial_valuation) for value in timeline['valuation']])
  print(norm_valuations)
  print(timeline)
  break
  timeline['norm_valuations'] = norm_valuations

In [None]:
from scipy import interpolate
from google.colab import files

def get_valuation_interpolations(timelines, day):
  company_names = []
  regression_targets = []
  initial_valuations = []
  for company, timeline in timelines.items():
    company_names.append(company)
    x = timeline['norm_dates'].tolist()
    y = timeline['norm_valuations'].tolist()
    # print("X:", x)
    # print("Y:", y)
    if len(x) < 2 or len(y) < 2:
      regression_targets.append(0)
      continue
    
    if day > x[-1]:
      regression_targets.append(y[-1])  # Don't extrapolate the data at all.
      continue

    interpolator = interpolate.interp1d(x, y)
    regression_target = interpolator(day)
    regression_targets.append(regression_target)
    initial_valuations.append(timeline['initial_valuation'])

  return pd.DataFrame(list(zip(company_names, initial_valuations, regression_targets)), columns=["company", "initial_valuation", "log_valuation_factor"])


In [41]:
for day in [200, 500, 1000, 2000]:
  frame = get_valuation_interpolations(company_timelines, day)
  frame.to_csv(str(day) + '.csv')


AttributeError: ignored