<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash
import sklearn.decomposition
import pickle

print(pd.__version__)

from copy import deepcopy

1.0.5


In [2]:
# Mount Google Drive inside the Colab VM. Every data, feature and model path used
# further down is rooted at '/content/gdrive/My Drive/vc_modeling/'.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read Regression Targets

In [4]:
# Load one regression-target table per "mark". A mark is the integer in the CSV's
# file name (e.g. 200.csv -> 200); regression_marks maps mark -> DataFrame indexed
# by 'hash' with columns 'initial_valuation' and 'log_valuation_factor'.
target_folder = "/content/gdrive/My Drive/vc_modeling/regression_targets/"

# Only '<int>.csv' files are considered, so other artifacts in the folder cannot
# break the int() parse. The marks are sorted because os.listdir() returns entries
# in arbitrary order.
file_names = [name for name in os.listdir(target_folder) if name.endswith('.csv')]
marks = sorted(int(os.path.splitext(name)[0]) for name in file_names)
print(marks)

regression_marks = {}
for mark in marks:
  rm = pd.read_csv(os.path.join(target_folder, str(mark) + '.csv'))[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')
  # Drop every row that holds NaN or +/-inf in any column. `axis` is passed by
  # keyword because newer pandas releases reject it as a positional argument.
  rm = rm[~rm.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
  regression_marks[mark] = rm


[200, 500, 1000, 2000]


In [6]:
# Sanity check: look at the targets loaded for one mark.
INSPECTED_MARK = 200
mark_data = regression_marks[INSPECTED_MARK]
print(mark_data)

                      initial_valuation  log_valuation_factor
hash                                                         
2053339725337568679         413036820.0              0.000000
13360469805707984821          3000000.0              0.266595
12201126308526847683         47500000.0              0.000000
17482404514494389050          2157880.0              0.000000
16923506324318240851          7500000.0              0.000000
...                                 ...                   ...
4057326795460754576            552105.0              0.000000
17393885764651115266         20000000.0              1.011831
14785394360939257924        125000000.0              0.000000
15207057269115911424         31150000.0              0.000000
12723450708549610702            75000.0              1.688395

[144569 rows x 2 columns]


# Read/Join Features

## Sparse Features
These features are sparse and high-dimensional, so they go through dimensionality reduction (truncated SVD, below) before modelling.

In [7]:
# Root folder of the extracted feature files on Drive.
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

# Both feature sets are stored as scipy sparse matrices in .npz format.
category_npz_path = feature_folder + "/category_features/category_features_large.npz"
region_npz_path = feature_folder + "/region_features/region_features.npz"
sparse_category_features_array = scipy.sparse.load_npz(category_npz_path)
sparse_region_features_array = scipy.sparse.load_npz(region_npz_path)

In [8]:
# Re-express both matrices in COO format with uint64 entries.
# NOTE(review): column 0 appears to carry a 64-bit hash (see the value printed at
# the end of this cell), which would be why uint64 is used -- confirm.
category_features_array = scipy.sparse.coo_matrix(sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(sparse_region_features_array, dtype=np.uint64)

# Number of stored entries per matrix.
for coo_features in (category_features_array, region_features_array):
  print(coo_features.getnnz())

# Sparse-backed DataFrames, so the tables can be indexed and joined with pandas.
category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

for features_df in (category_features_df, region_features_df):
  print(features_df.shape)

# First row, column 0.
print("{}".format(category_features_df.iloc[0][0]))

2355537
1681588
(963967, 676)
(842699, 1032)
13685534557686295101


In [9]:
# Column 0 of each feature frame is the key the two tables are later joined on.
KEY_COLUMN = 0
region_uuid = region_features_df.iloc[:, KEY_COLUMN]
category_uuid = category_features_df.iloc[:, KEY_COLUMN]
print(region_uuid)

# How many category rows have a key that also occurs in the region table?
print(np.count_nonzero(category_uuid.isin(region_uuid)))

# Spot check: count the rows matching one specific key in each table.
check_value = 7551169957279540846
# check_value = cityhash.CityHash64('ffffabce-6d4a-b3d1-13c0-4e90cedf5270')
print(check_value)
for key_column in (region_uuid, category_uuid):
  print(np.size(key_column[key_column == check_value]))

0         13685534557686295101
1           764015621929367586
2         10846552445983457719
3          5087506707876194815
4          9094535307341385563
                  ...         
842694     4626564123199390189
842695     2978566027619600648
842696     1747954284855665241
842697     9744251496066876000
842698     7648380604111671063
Name: 0, Length: 842699, dtype: Sparse[uint64, 0]
808944
7551169957279540846
1
1


In [10]:
# Key both tables by column 0, then left-join the region features onto the category
# features. Overlapping column labels get the given suffixes.
join_base, join1 = (
    frame.set_index(0) for frame in (category_features_df, region_features_df)
)
# dropna() discards the category rows that found no region match (the left join
# fills their region columns with NaN).
sparse_join = join_base.join(
    join1, lsuffix='category_features', rsuffix='region_features'
).dropna()
print(sparse_join)

                      1category_features  2category_features  ...  1030  1031
0                                                             ...            
13685534557686295101                 0.0                 0.0  ...   0.0   0.0
764015621929367586                   1.0                 1.0  ...   0.0   0.0
10846552445983457719                 0.0                 0.0  ...   0.0   0.0
5087506707876194815                  0.0                 0.0  ...   0.0   0.0
9094535307341385563                  0.0                 0.0  ...   0.0   0.0
...                                  ...                 ...  ...   ...   ...
16892862651199510638                 0.0                 0.0  ...   0.0   0.0
7229716827056183679                  0.0                 0.0  ...   0.0   0.0
4626564123199390189                  0.0                 0.0  ...   0.0   0.0
2978566027619600648                  0.0                 0.0  ...   0.0   0.0
9744251496066876000                  0.0                 0.0  ..

## Dimensionality Reduction

In [11]:
# Fitting the SVD on the full table crashes the runtime, so it is fitted on a prefix
# of the rows and then applied to every row.
SVD_FIT_ROWS = 15000
# TruncatedSVD's default solver is randomized; a fixed seed keeps the components
# (and every feature derived from them) identical from run to run.
SVD_RANDOM_STATE = 0

svd = sklearn.decomposition.TruncatedSVD(n_components=100, n_iter=10, random_state=SVD_RANDOM_STATE)
# NOTE(review): the fit sample is simply the first SVD_FIT_ROWS rows, not a random
# draw. That is only OK if the samples are well distributed over the row order.
svd.fit(sparse_join[:SVD_FIT_ROWS])

print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

# Project all rows (not just the fit sample) onto the fitted components.
reduced_features = svd.transform(sparse_join)

# Alternative reduction that was tried (kept for reference):
# lda = sklearn.decomposition.LatentDirichletAllocation(n_components=100,random_state=0, learning_method='online', total_samples=2e5)
# lda.partial_fit(sparse_join)
# reduced_features = lda.transform(sparse_join)


[0.06201002 0.04136426 0.03052181 0.02412492 0.02133094 0.01809638
 0.01821206 0.01757454 0.01589228 0.01511112 0.01381074 0.01349057
 0.01274772 0.01224814 0.01171326 0.01124586 0.01098865 0.01016308
 0.00945056 0.00909412 0.00893294 0.00866497 0.0080689  0.0079854
 0.00778647 0.00739946 0.00723822 0.00673296 0.00643826 0.00631925
 0.0061844  0.00595846 0.00580656 0.00570427 0.00569912 0.00543318
 0.00519543 0.00515399 0.00512568 0.00492261 0.00489803 0.00476558
 0.00470362 0.00468774 0.00454391 0.00444036 0.0044279  0.00432053
 0.00428805 0.00414274 0.00409159 0.00406363 0.00397662 0.00383421
 0.00373743 0.00367794 0.00361784 0.00349269 0.00344724 0.00342722
 0.00338314 0.00330977 0.00321618 0.00313721 0.00310981 0.00311287
 0.00307049 0.0030493  0.00302579 0.00296635 0.00293532 0.00289815
 0.00284957 0.0028189  0.00278416 0.00274529 0.00269564 0.00269153
 0.00265261 0.00263682 0.00262926 0.00259163 0.00256322 0.00254514
 0.00252417 0.00249792 0.00248335 0.00246793 0.0024561  0.00242

In [12]:
# Eyeball the SVD-reduced matrix (one row per row of sparse_join).
print(reduced_features)

[[ 1.93442333e-02  1.01881367e-01  2.75295137e-01 ...  1.32277651e-03
  -5.34846346e-04 -1.85159042e-03]
 [ 1.22762868e+00 -5.75667147e-02  4.53293139e-02 ...  2.47123163e-01
  -2.34556558e-01  4.68148900e-02]
 [ 8.52595440e-02  1.09923636e-01  2.99163741e-01 ...  3.44003093e-03
   1.87952605e-02 -3.24884271e-03]
 ...
 [ 4.96430409e-04  1.41098458e-03  2.81446000e-03 ... -2.82644256e-03
  -1.03249768e-02  5.79463403e-03]
 [ 9.65193781e-01 -2.14868786e-01 -1.12641455e-01 ... -1.37730784e-02
   8.38709834e-02 -1.87295398e-02]
 [ 7.08646222e-03  2.48330429e-03  7.73408886e-03 ...  7.81372334e-02
   7.31232707e-02 -1.97713586e-01]]


## Dense Time-Invariant Features

In [13]:
founder_csv_path = '/content/gdrive/My Drive/vc_modeling/feature_extraction/founder_features/organization_founders_features.csv'
founder_raw = pd.read_csv(founder_csv_path)

# Re-key the table by CityHash64(org_uuid) and drop the raw uuid column.
founder_raw['hash'] = founder_raw['org_uuid'].apply(cityhash.CityHash64)
founder_features = founder_raw.set_index(['hash'], drop=True).drop(columns=['org_uuid'])

# Variant restricted to organisations with no missing founder feature.
founder_features_only = founder_features.dropna()


In [19]:
# Wrap the SVD output in a DataFrame that carries sparse_join's index, with the
# component number as the column label.
n_reduced_columns = reduced_features.shape[1]
sparse_data = pd.DataFrame(reduced_features, index=sparse_join.index, columns=range(n_reduced_columns))

# Left-join the founder features; rows without founder data get 0 in those columns.
time_invariant_features = (
    sparse_data
    .join(founder_features, lsuffix='sparse', rsuffix='dense')
    .fillna(0)
)
# The dropna() here changes nothing: fillna(0) has already removed every NaN.
print(time_invariant_features.dropna())

                             0  ...  founders_count
0                               ...                
13685534557686295101  0.019344  ...             2.0
764015621929367586    1.227629  ...             1.0
10846552445983457719  0.085260  ...             1.0
5087506707876194815   1.041919  ...             9.0
9094535307341385563   1.041080  ...             2.0
...                        ...  ...             ...
16892862651199510638  0.013808  ...             0.0
7229716827056183679   0.006609  ...             2.0
4626564123199390189   0.000496  ...             0.0
2978566027619600648   0.965194  ...             0.0
9744251496066876000   0.007086  ...             0.0

[808944 rows x 106 columns]


## Time-Variant Features

In [49]:
def _read_hashed_features(csv_path):
  """Read a CSV with an 'org_uuid' column and re-key it by CityHash64(org_uuid).

  Returns the table indexed by 'hash', with the 'org_uuid' column removed.
  """
  frame = pd.read_csv(csv_path)
  frame['hash'] = frame['org_uuid'].apply(cityhash.CityHash64)
  return frame.set_index('hash').drop(['org_uuid'], axis=1)


funding_features = _read_hashed_features('/content/gdrive/My Drive/vc_modeling/data/funding_features_for_regression.csv')

# The four lead-investor date / fund-age columns are left out of the investor features.
excluded_investor_columns = [
    'lead_investor_first_fund_announce_date_max',
    'lead_investor_fund_age_max',
    'lead_investor_fund_age_min',
    'lead_investor_first_fund_announce_date_min',
]
investor_features = _read_hashed_features('/content/gdrive/My Drive/vc_modeling/data/investor_features_for_regression.csv').drop(excluded_investor_columns, axis=1)
investor_features.head()

Unnamed: 0_level_0,pct_leads_invested_at_primary_type,lead_investor_total_fund_size_max,lead_investor_total_fund_size_min,lead_investor_total_fund_size_mean,n_unique_investors,n_unique_lead_investors,days_forward
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4186118337114422631,1.0,20123330.0,20123329.0,20123330.0,3,1,2000
12333937817470221909,1.0,276000000.0,276000000.0,276000000.0,2,2,2000
8833180824184546855,1.0,216472200.0,216472233.0,216472200.0,3,1,2000
13059672054285866807,0.75,7629000000.0,650000000.0,4139500000.0,4,4,200
13059672054285866807,0.75,7629000000.0,650000000.0,4139500000.0,4,4,500


In [50]:
def _rows_for_mark(frame, mark):
  """Rows of `frame` whose 'days_forward' equals `mark`, without that column."""
  return frame[frame['days_forward'] == mark].drop('days_forward', axis=1)


# mark -> full feature table: the time-invariant features left-joined with that
# mark's funding and investor features; missing values become 0.
feature_dict = {}
for mark in marks:
  funding_marked = _rows_for_mark(funding_features, mark)
  investor_marked = _rows_for_mark(investor_features, mark)
  feature_dict[mark] = time_invariant_features.join([funding_marked, investor_marked]).fillna(0)

## Feature Normalization

In [51]:
import sklearn.preprocessing

# Replace each mark's feature table by its normalised version. axis=0 with
# norm='max' rescales every column independently; the column labels are replaced
# by their positions. Keys are snapshotted so entries can be swapped while looping.
for mark in list(feature_dict):
  unnorm_features = feature_dict[mark]
  features_array = sklearn.preprocessing.normalize(unnorm_features, norm='max', axis=0, copy=False)
  feature_dict[mark] = pd.DataFrame(
      features_array,
      index=unnorm_features.index,
      columns=range(unnorm_features.shape[1]),
  )

# Train Model

In [75]:
# Train Models
import sklearn.metrics
import sklearn.model_selection

# Drive folder holding the pickled per-mark models, and the CSV file that
# train_analyze_save_models() writes its per-mark statistics to.
model_save_path = '/content/gdrive/My Drive/vc_modeling/models/' 
stats_save_path = "/content/gdrive/My Drive/vc_modeling/models/model_stats.csv"

def regression_analysis(model, train_data, train_values, test_data, test_values):
    """Report MSE and explained variance of a fitted model on both splits.

    `model` must expose `predict`. The first five test predictions and targets
    are printed, followed by the four scores (3 decimals).

    Returns:
        One-row DataFrame with columns 'Train MSE', 'Test MSE', 'Train EVS',
        'Test EVS', rounded to 3 decimals.
    """
    def score(actual, predicted):
        # (mean squared error, explained variance score) for one split.
        return (
            sklearn.metrics.mean_squared_error(actual, predicted),
            sklearn.metrics.explained_variance_score(actual, predicted),
        )

    train_predictions = model.predict(train_data)
    test_predictions = model.predict(test_data)

    print("Sample values: ", test_predictions[:5], test_values[:5])

    train_mse, train_evs = score(train_values, train_predictions)
    test_mse, test_evs = score(test_values, test_predictions)

    report_lines = (
        ("Train MSE: {:.3f}", train_mse),
        ("Train Explained Variance Score: {:.3f}", train_evs),
        ("Test MSE: {:.3f}", test_mse),
        ("Test Explained Variance Score: {:.3f}", test_evs),
    )
    for template, value in report_lines:
        print(template.format(value))

    row = [train_mse, test_mse, train_evs, test_evs]
    return pd.DataFrame(data=[row], columns=['Train MSE', 'Test MSE', 'Train EVS', 'Test EVS']).round(3)

def classification_analysis(model, train_data, train_values, test_data, test_values):
    """Score the regressor as a binary classifier and print the results.

    The threshold is the mean of the model's predictions on the training data.
    Predictions and true values above it count as positive (1), the rest as
    negative (0). Prints the test confusion matrix, then test and train accuracy.
    Returns nothing.
    """
    train_scores = model.predict(train_data)
    threshold = np.average(train_scores)

    def above_threshold(scores):
        # 1 where the score exceeds the threshold, 0 elsewhere.
        return (scores > threshold).astype(np.int32)

    train_prediction = above_threshold(train_scores)
    test_prediction = above_threshold(model.predict(test_data))
    train_labels = above_threshold(train_values)
    test_labels = above_threshold(test_values)

    # labels=[1, 0] puts the positive class in the first row / column.
    confusion_counts = sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0])
    confusion_matrix_large = pd.DataFrame(
        confusion_counts,
        columns=['positive', 'negative'],
        index=['Truth is +', 'Truth is -'],
    )
    print("Confusion:\n", confusion_matrix_large)

    test_acc = (test_labels == test_prediction).sum() / len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = (train_labels == train_prediction).sum() / len(train_labels)
    print("Train accuracy: ", train_acc)

def train_model_over_mark(model, input_data, mark, filter_unknown=True, hyperparams=None, random_state=None):
  """Fit `model` on the data available for one mark and report its scores.

  Args:
    model: estimator with fit/predict (and set_params when `hyperparams` is
      given). It is fitted IN PLACE, so the caller's object is the trained model.
    input_data: feature DataFrame whose index matches the index of
      regression_marks[mark].
    mark: key into the module-level `regression_marks` dict.
    filter_unknown: when True, rows whose 'log_valuation_factor' is exactly 0
      are excluded (presumably "no known change" -- confirm with the target
      extraction).
    hyperparams: optional parameter grid; when truthy, a GridSearchCV over it
      picks the parameters before the final fit.
    random_state: optional seed for the train/test split (None keeps the
      split random, as before).

  Returns:
    The one-row DataFrame from regression_analysis() plus a 'Mark' column
    holding str(mark).

  Raises:
    ValueError: if the selected feature rows and target rows do not line up.
  """
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  mark_data = regression_marks[mark].dropna()
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]

  # Keep only the keys present on both sides; sorting both by index is what pairs
  # feature row i with target i.
  data = input_data[input_data.index.isin(mark_data.index)].sort_index()
  values = mark_data[mark_data.index.isin(input_data.index)]['log_valuation_factor'].sort_index()

  # Duplicate keys on one side only would silently pair features with the wrong
  # target, so verify the pairing instead of assuming it.
  if len(data) != len(values) or not np.array_equal(data.index.to_numpy(), values.index.to_numpy()):
    raise ValueError(
        "Features and targets for mark {} do not align ({} feature rows vs {} target rows); "
        "check for duplicate index values.".format(mark, len(data), len(values)))

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(
      data, values, test_size=0.25, random_state=random_state)
  if hyperparams:
    print("Conducting Grid Search")
    # refit=False: the winner is fitted once, below, on the caller's own estimator.
    # Rebinding the local name to search.best_estimator_ would leave the caller's
    # `model` (the object that gets pickled) without the tuned parameters.
    search = sklearn.model_selection.GridSearchCV(model, hyperparams, refit=False)
    search.fit(train_data, train_values)
    model.set_params(**search.best_params_)

  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  model.fit(train_data, train_values)
  regression_stats = regression_analysis(model, train_data, train_values, test_data, test_values)
  classification_analysis(model, train_data, train_values, test_data, test_values)

  regression_stats['Mark'] = str(mark)
  return regression_stats

def train_analyze_save_models(model, features_for_mark, marks, filter_unknown=True, hyperparams=None, save=False):
  """Train and score `model` once per mark and write the collected stats to CSV.

  Args:
    model: estimator handed to train_model_over_mark() for every mark.
    features_for_mark: mapping mark -> feature DataFrame.
    marks: iterable of marks to process, in order.
    filter_unknown: forwarded to train_model_over_mark().
    hyperparams: optional parameter grid, forwarded to train_model_over_mark().
    save: when True, `model` is pickled to '<model_save_path><mark>.pkl' after
      each mark has been processed.

  Side effects:
    Overwrites the CSV at `stats_save_path` with one row per mark (index
    'Mark'; columns 'Train MSE', 'Test MSE', 'Train EVS', 'Test EVS').
  """
  stats_columns = ['Mark', 'Train MSE', 'Test MSE', 'Train EVS', 'Test EVS']
  per_mark_stats = []
  for mark in marks:
    input_data = features_for_mark[mark]
    # Pass the caller's filter_unknown through (it used to be pinned to True here).
    model_stats_for_mark = train_model_over_mark(model, input_data, mark, filter_unknown=filter_unknown, hyperparams=hyperparams)
    per_mark_stats.append(model_stats_for_mark)
    if save:
      # The context manager closes (and thereby flushes) the file even if pickling fails.
      with open(model_save_path + str(mark) + '.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
      print("Models Saved")

  # Concatenate once instead of growing a frame with DataFrame.append, which is
  # quadratic and no longer exists in pandas 2.x.
  if per_mark_stats:
    model_full_stats = pd.concat(per_mark_stats)
  else:
    model_full_stats = pd.DataFrame(columns=stats_columns)
  model_full_stats = model_full_stats.set_index('Mark')
  model_full_stats.to_csv(stats_save_path)

## Lasso

In [67]:
import sklearn.linear_model

# Lasso baseline with default LassoCV settings. No `hyperparams` grid is passed, and
# `save` keeps its default of False, so nothing is pickled for this model.
lasso_model = sklearn.linear_model.LassoCV()
train_analyze_save_models(lasso_model, feature_dict, marks, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.64126864 0.30666821 0.3911232  0.23407    0.41645579] hash
3291231844864306042    0.153570
9976458301325477657    0.398534
2303864897546522496    0.409922
6287798531766950800    1.111673
7136319215372392419   -0.190253
Name: log_valuation_factor, dtype: float64
Train MSE:  0.8493490842296043
Train Explained Variance Score:  0.02172603461283651
Test MSE:  0.8571101727978573
Test Explained Variance Score:  0.017573166037010313
Confusion:
             positive  negative
Truth is +      2930      2637
Truth is -      3037      4545
Test accuracy:  0.5684842953836794
Train accuracy:  0.5739890987450881

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.
Sample values:  [0.74763434 0.65918996 1.02088102 0.55147755 0.79248982] hash
4592894162732364899    -1.856298
807037566504748297      0.107883
4370454419282417132     0.979091
12113291689500107828    1.456063
12369472916305385412   -1.992430
Name: log_valuation_factor, dty

## Random Forest Regressor

In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# NOTE(review): `models` and `hyperdict` are assigned but not used in this cell.
# The grid is never passed as `hyperparams`, so no grid search runs. Confirm intent.
models = []
hyperdict = {'n_estimators': [20, 40, 60, 80, 100], 'min_samples_split':[10, 20, 30]}

# save=True: this call writes one '<mark>.pkl' per mark into the models folder.
tree_model = RandomForestRegressor(criterion='mse', max_depth=4)
train_analyze_save_models(tree_model, feature_dict, marks, filter_unknown=True, save=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.31925402 0.35966338 0.35638019 0.26478428 0.44441283] hash
12967725012194857396    0.619039
14620084754324474648    0.989508
7912719370858775577     1.009854
12223350019333453504    0.144823
15134400659880834136    0.729089
Name: log_valuation_factor, dtype: float64
Train MSE:  0.8418660242590001
Train Explained Variance Score:  0.02941030572040848
Test MSE:  0.8604332054692484
Test Explained Variance Score:  0.016585236683386628
Confusion:
             positive  negative
Truth is +      2360      3229
Truth is -      2206      5354
Test accuracy:  0.5866605825538064
Train accuracy:  0.5838002281658005
Models Saved

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.
Sample values:  [0.61405334 1.01941093 0.54330933 0.56196048 0.57201004] hash
4625501801419729884     1.310307
18240614743343886873    0.325675
17736543360186249312   -2.525729
6540539227947698538     0.210414
12959011461965358348    0.326911
Name: log_val

## SGD

In [76]:
import sklearn.linear_model

# Linear model trained by stochastic gradient descent, default settings. Not pickled
# (save defaults to False).
sgd = sklearn.linear_model.SGDRegressor()
train_analyze_save_models(sgd, feature_dict, marks, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.33698467 0.34874977 0.39386501 0.26269778 0.23263379] hash
14894358104566661802   -1.473885
15083015550980876671   -0.988859
12685724931713173461    0.129168
17722711653460329522   -0.606368
9909612658117869938     1.791759
Name: log_valuation_factor, dtype: float64
Train MSE: 0.844
Train Explained Variance Score: 0.017
Test MSE: 0.891
Test Explained Variance Score: 0.014
Confusion:
             positive  negative
Truth is +      2931      2557
Truth is -      3191      4470
Test accuracy:  0.5628564909879078
Train accuracy:  0.5627329192546584

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.
Sample values:  [0.77302175 0.68157137 0.2481682  0.89521302 0.60190574] hash
5009456926064808446     0.273395
6243086595766519728     2.559496
3420630711294121179    -0.852579
10117803114513585573    4.838412
7945781846637633809     1.133511
Name: log_valuation_factor, dtype: float64
Train MSE: 1.604
Train Explained Variance 

## SVR

In [65]:
import sklearn.svm
# Warning: this takes a really long time.

# Support vector regression with C=0.001. Not pickled (save defaults to False).
# The top-level name `svm` is distinct from the `sklearn.svm` submodule used on the
# right-hand side.
svm = sklearn.svm.SVR(C=0.001)
train_analyze_save_models(svm, feature_dict, marks, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.45241493 0.30676156 0.46187886 0.3016241  0.21871206] hash
1073426932052885759     1.814059
8236489033748084964    -1.253349
3978734686931138051     0.199607
771894773710356089     -0.963970
13472161592308974541    0.330907
Name: log_valuation_factor, dtype: float64
Train MSE:  0.8580275876984779
Train Explained Variance Score:  0.015545939610884685
Test MSE:  0.8535608402977765
Test Explained Variance Score:  0.012867489059910753
Confusion:
             positive  negative
Truth is +      2775      2630
Truth is -      3154      4590
Test accuracy:  0.5601186402007757
Train accuracy:  0.5640765623019394

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.
Sample values:  [0.46153436 0.44809743 0.833686   0.53306258 0.6566857 ] hash
16900522960309454065   -1.504252
10652106539247817863    1.049822
4811827137043678520     2.038283
3869186477924654134    -0.333147
13468394499944240952   -2.965796
Name: log_valuation_facto

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

## MultiLayer Perceptron

In [None]:
import sklearn.neural_network

# NOTE(review): `models` is reset here but not used in this cell.
models = []

# Three hidden layers (100, 50, 20 units), alpha=1e-2, at most 500 iterations.
# NOTE(review): the saved output below stops right after "Trained on 39445 rows.",
# so this cell does not appear to have been run to completion.
mlp = sklearn.neural_network.MLPRegressor(alpha=1e-2, hidden_layer_sizes=(100, 50, 20), max_iter=500)
train_analyze_save_models(mlp, feature_dict, marks, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.


# Export Predictions

In [31]:
# Load the models
import pickle
# Reload one pickled model per mark, in `marks` order, so that models[i] is the
# model for marks[i] (the export loop below zips the two lists).
# SECURITY: pickle.load can execute arbitrary code stored in the file. Only load
# pickles that this notebook itself wrote.
models = []
for mark in marks:
  # The context manager closes the file handle after loading.
  with open('/content/gdrive/My Drive/vc_modeling/models/' + str(int(mark)) + '.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
  print(model)
  models.append(model)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=4, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=4, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
RandomForest

In [32]:
# Organisations table from the Crunchbase bulk export. It supplies the uuid, name and
# created_at columns that the predictions are attached to.
org_info = pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/organizations.csv")

In [33]:
# Key the organisations by CityHash64(uuid), the same key the feature and target
# tables use, and keep only the descriptive columns.
org_info = (
    org_info
    .assign(hash=org_info['uuid'].apply(cityhash.CityHash64))
    .set_index('hash')
    [['uuid', 'name', 'created_at']]
)
org_info.head()

Unnamed: 0_level_0,uuid,name,created_at
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13685534557686295101,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,2007-05-25 13:51:27
764015621929367586,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2007-05-26 02:30:28
10846552445983457719,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,2007-05-26 03:03:23
10693046220981818130,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,2007-05-26 03:21:34
5087506707876194815,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,2007-05-26 04:22:15


In [34]:
# Build one wide table: org info plus, for every mark, the initial valuation, the
# true log valuation factor ('truth_<mark>') and the model's prediction
# ('prediction_<mark>').
org_info_join = org_info.copy()

print(marks)
for model, mark in zip(models, marks):
  truth = regression_marks[mark][['initial_valuation', 'log_valuation_factor']].copy()
  features = feature_dict[mark]

  # Predict for every row of the feature table; the left join below keeps only the
  # rows that have a target for this mark.
  prediction = pd.DataFrame(data=model.predict(features), index=features.index, columns=['prediction_' + str(mark)])
  pred_truth = truth.join(prediction).rename(columns={'log_valuation_factor': 'truth_' + str(mark)})

  # 'initial_valuation' recurs for every mark. From the second mark on, the name
  # clash is resolved by appending the mark (passed as a string suffix).
  org_info_join = org_info_join.join(pred_truth, rsuffix=str(mark))

# NOTE(review): the table stays indexed by hash. A bare
# `org_info_join.set_index('uuid', drop=True)` at this point had no effect, because
# set_index returns a new frame. Assign the result if a uuid index is wanted.
org_info_join.head()

[200, 500, 1000, 2000]


Unnamed: 0_level_0,uuid,name,created_at,initial_valuation,truth_200,prediction_200,initial_valuation500,truth_500,prediction_500,initial_valuation1000,truth_1000,prediction_1000,initial_valuation2000,truth_2000,prediction_2000
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13685534557686295101,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,2007-05-25 13:51:27,26250000.0,0.259546,0.56849,26250000.0,0.675616,1.017972,26250000.0,1.533324,1.38036,26250000.0,0.832727,1.579223
764015621929367586,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2007-05-26 02:30:28,,,,,,,,,,,,
10846552445983457719,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,2007-05-26 03:03:23,14000000.0,0.556615,0.266237,14000000.0,1.295816,0.493032,14000000.0,2.213487,0.614898,14000000.0,0.721874,0.87491
10693046220981818130,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,2007-05-26 03:21:34,,,,,,,,,,,,
5087506707876194815,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,2007-05-26 04:22:15,2500000.0,3.031964,0.727375,2500000.0,4.927584,1.646667,2500000.0,7.819236,2.819469,2500000.0,8.520084,4.063588


In [35]:
# Export the joined org-info / truth / prediction table (indexed by hash) to Drive.
org_info_join.to_csv('/content/gdrive/My Drive/vc_modeling/model_output/predictions.csv')

# Comparison to Random

In [36]:
import math

def compare_to_random(org_info_with_pred, mark, num_used, num_trials=1000, random_state=None):
  """Compare a "top-N by prediction" portfolio against randomly drawn portfolios.

  Args:
    org_info_with_pred: DataFrame holding 'truth_<mark>' and 'prediction_<mark>'
      columns. Rows with a NaN in ANY column are dropped first.
    mark: mark whose truth / prediction columns are evaluated.
    num_used: portfolio size N. Must be between 1 and the number of rows left
      after the NaN drop.
    num_trials: number of random portfolios to draw (default 1000).
    random_state: optional int seed or numpy RandomState for the random
      portfolios. None keeps the draws unseeded.

  Returns:
    One-row DataFrame with columns 'mark', 'Model Return Percentage',
    'Model Return Log Factor', 'Random Return Percentage',
    'Random Return Factor' and 'Random Return StdDev'. The same numbers are
    printed.

  Raises:
    ValueError: if num_used or num_trials is out of range.
  """
  if num_trials < 1:
    raise ValueError("num_trials must be at least 1, got {}".format(num_trials))

  # NOTE(review): this drops rows with a NaN in any column, including the columns
  # of other marks, so each mark is evaluated on the rows complete for all marks.
  # Confirm that this is intended.
  clean_org_info = org_info_with_pred.dropna()
  truth_col = 'truth_' + str(mark)
  pred_col = 'prediction_' + str(mark)

  truth_valuations = clean_org_info[truth_col]
  pred_valuations = clean_org_info[pred_col]

  # num_used == 0 would make the [-num_used:] slice below select every row, and a
  # portfolio larger than the population cannot be sampled without replacement.
  if not 1 <= num_used <= len(truth_valuations):
    raise ValueError("num_used must be between 1 and {} (rows without NaN), got {}".format(
        len(truth_valuations), num_used))

  # Positions of the num_used largest predictions (argsort is ascending).
  top_positions = np.argsort(pred_valuations.to_numpy())[-num_used:]

  # What those picks actually returned: the log factor, and the same as a
  # percentage gain via expm1.
  true_returns_for_selected = truth_valuations.iloc[top_positions]
  selected_return_percentages = np.expm1(true_returns_for_selected) * 100

  # Returns of a strategy that invests equally in each of the top N.
  mean_model_return_log_factor = np.mean(true_returns_for_selected)
  mean_model_return_percentage = np.mean(selected_return_percentages)

  # One generator shared by all trials: passing the same int seed to every
  # sample() call would draw the identical portfolio each time.
  if random_state is None or isinstance(random_state, np.random.RandomState):
    rng = random_state
  else:
    rng = np.random.RandomState(random_state)

  # Draw the random investor's portfolio num_trials times.
  random_return_log_factors = []
  random_return_percentages = []
  for _ in range(num_trials):
    sample = truth_valuations.sample(n=num_used, random_state=rng)
    sample_percentages = np.expm1(sample) * 100

    random_return_log_factors.append(np.mean(sample))
    random_return_percentages.append(np.mean(sample_percentages))

  random_return_std = np.std(np.asarray(random_return_log_factors))
  mean_random_return_percentage = np.mean(np.asarray(random_return_percentages))
  mean_random_return_log_factor = np.mean(np.asarray(random_return_log_factors))

  print("Actual Model Return Percentage: ", mean_model_return_percentage)
  print("Actual Model Return Log Factor: ", mean_model_return_log_factor)
  print("Mean Random Return Percentage: ", mean_random_return_percentage)
  print("Mean Random Return Log Factor: ", mean_random_return_log_factor)
  print("Random Return StDev: ", random_return_std)
  return pd.DataFrame(data=[[mark, mean_model_return_percentage, mean_model_return_log_factor,
                              mean_random_return_percentage, mean_random_return_log_factor,
                              random_return_std]],
                        columns=['mark', 'Model Return Percentage', 'Model Return Log Factor', 'Random Return Percentage', 'Random Return Factor', 'Random Return StdDev'])
    

In [37]:
# Number of organisations in the simulated portfolio (top-N by prediction vs. N at random).
portfolio_size = 1000

print("Using Portfolio of size ", portfolio_size)

compare_columns = ['mark', 'Model Return Percentage', 'Model Return Log Factor', 'Random Return Percentage', 'Random Return Factor', 'Random Return StdDev']

# Collect the one-row result of every mark and concatenate once. Growing a frame
# with DataFrame.append is quadratic and no longer exists in pandas 2.x.
per_mark_compare_stats = []
for mark in marks:
  print("Mark: ", mark)
  compare_stats = compare_to_random(org_info_join, mark, portfolio_size)
  per_mark_compare_stats.append(compare_stats)

if per_mark_compare_stats:
  full_compare_stats = pd.concat(per_mark_compare_stats)
else:
  full_compare_stats = pd.DataFrame(columns=compare_columns)

full_compare_stats = full_compare_stats.set_index('mark')
full_compare_stats.to_csv('/content/gdrive/My Drive/vc_modeling/models/compare_stats.csv')

Using Portfolio of size  1000
Mark:  200
Actual Model Return Percentage:  17766.476609826965
Actual Model Return Log Factor:  0.8175645613599648
Mean Random Return Percentage:  718.4434342791843
Mean Random Return Log Factor:  0.13725245495071212
Random Return StDev:  0.01803300611945034
Mark:  500
Actual Model Return Percentage:  2586.525177133392
Actual Model Return Log Factor:  1.1683953276907735
Mean Random Return Percentage:  601.94017271607
Mean Random Return Log Factor:  0.2509040807546842
Random Return StDev:  0.027910852196607886
Mark:  1000
Actual Model Return Percentage:  9333.070864765568
Actual Model Return Log Factor:  2.0643122389543698
Mean Random Return Percentage:  851.889843633793
Mean Random Return Log Factor:  0.33502180880268373
Random Return StDev:  0.03240975321119626
Mark:  2000
Actual Model Return Percentage:  46660.66283950317
Actual Model Return Log Factor:  2.7510684299274306
Mean Random Return Percentage:  1433.8157775444822
Mean Random Return Log Factor: 