<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash
import sklearn.decomposition

print(pd.__version__)

from copy import deepcopy

1.0.5


In [3]:
# Colab-only step: expose Google Drive inside the runtime. Every data path used
# below lives under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read/Join Features

## Sparse Features
These features are sparse and high-dimensional, so they go through dimensionality reduction before being joined with the dense features.

In [4]:
# Root folder (on the mounted Drive) that holds the extracted feature files.
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

# Each .npz file stores one scipy sparse matrix.
category_npz_path = feature_folder + "/category_features/category_features_large.npz"
region_npz_path = feature_folder + "/region_features/region_features.npz"

sparse_category_features_array = scipy.sparse.load_npz(category_npz_path)
sparse_region_features_array = scipy.sparse.load_npz(region_npz_path)

In [5]:
# NOTE: other feature sets belong here too; dense DataFrames must be made
# sparse before they are added.

# Re-wrap both matrices as COO with uint64 values. Presumably uint64 is needed
# so the 64-bit hash stored in column 0 survives unchanged -- TODO confirm.
category_features_array = scipy.sparse.coo_matrix(
    sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(
    sparse_region_features_array, dtype=np.uint64)

# Number of explicitly stored entries in each matrix.
for coo_features in (category_features_array, region_features_array):
  print(coo_features.getnnz())

# Sparse-backed DataFrames, one column per matrix column.
category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

for features_frame in (category_features_df, region_features_df):
  print(features_frame.shape)

# Top-left cell of the category frame.
first_category_cell = category_features_df.iloc[0][0]
print("{}".format(first_category_cell))

2355537
1681588
(963967, 676)
(842699, 1032)
13685534557686295101


In [6]:
# Column 0 of each feature frame is treated as the organisation id
# (presumably a CityHash64 of the uuid -- see the commented check below).
category_uuid = category_features_df.iloc[:, 0]
region_uuid = region_features_df.iloc[:, 0]
print(region_uuid)

# How many category ids also occur among the region ids?
category_has_region = category_uuid.isin(region_uuid)
print(np.count_nonzero(category_has_region))

# Spot check: count the occurrences of one specific id in each id column.
# check_value = cityhash.CityHash64('ffffabce-6d4a-b3d1-13c0-4e90cedf5270')
check_value = 7551169957279540846
print(check_value)
for uuid_column in (region_uuid, category_uuid):
  print(np.size(uuid_column[uuid_column == check_value]))

0         13685534557686295101
1           764015621929367586
2         10846552445983457719
3          5087506707876194815
4          9094535307341385563
                  ...         
842694     4626564123199390189
842695     2978566027619600648
842696     1747954284855665241
842697     9744251496066876000
842698     7648380604111671063
Name: 0, Length: 842699, dtype: Sparse[uint64, 0]
808944
7551169957279540846
1
1


In [7]:
# Index both frames by their id column (column 0) so they can be joined on it.
join_base = category_features_df.set_index(0)
join1 = region_features_df.set_index(0)

# Left-join the region columns onto the category columns, then drop every row
# that ends up containing a NaN (e.g. ids with no region row).
sparse_join = (
    join_base
    .join(join1, lsuffix='category_features', rsuffix='region_features')
    .dropna()
)
# features = category_features_df
print(sparse_join)

                      1category_features  2category_features  ...  1030  1031
0                                                             ...            
13685534557686295101                 0.0                 0.0  ...   0.0   0.0
764015621929367586                   1.0                 1.0  ...   0.0   0.0
10846552445983457719                 0.0                 0.0  ...   0.0   0.0
5087506707876194815                  0.0                 0.0  ...   0.0   0.0
9094535307341385563                  0.0                 0.0  ...   0.0   0.0
...                                  ...                 ...  ...   ...   ...
16892862651199510638                 0.0                 0.0  ...   0.0   0.0
7229716827056183679                  0.0                 0.0  ...   0.0   0.0
4626564123199390189                  0.0                 0.0  ...   0.0   0.0
2978566027619600648                  0.0                 0.0  ...   0.0   0.0
9744251496066876000                  0.0                 0.0  ..

## Dimensionality Reduction

In [8]:
# The projection is learned on a prefix of `sparse_join` only and then applied
# to every row. The original note here warned that fitting on much more than
# ~10k rows crashes the SVD; 15000 is the value actually in use. Fitting on a
# prefix is only representative if the leading rows look like the rest.
SVD_FIT_ROWS = 15000
SVD_COMPONENTS = 100
# Fixed seed: TruncatedSVD's default solver is randomized, so without it the
# components (and everything downstream) change from run to run.
SVD_RANDOM_STATE = 0

svd = sklearn.decomposition.TruncatedSVD(
    n_components=SVD_COMPONENTS, n_iter=10, random_state=SVD_RANDOM_STATE)
svd.fit(sparse_join[:SVD_FIT_ROWS])

# Fit diagnostics: per-component explained variance, its total, singular values.
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

# Project all rows (not just the fitted prefix) onto the learned components.
reduced_features = svd.transform(sparse_join)

# Alternative reduction that was tried; kept for reference.
# lda = sklearn.decomposition.LatentDirichletAllocation(n_components=100,random_state=0, learning_method='online', total_samples=2e5)
# lda.partial_fit(sparse_join)
# reduced_features = lda.transform(sparse_join)


[0.06201002 0.04136426 0.03052181 0.02412492 0.02133094 0.01809638
 0.01821206 0.01757454 0.01589228 0.01511112 0.01381074 0.01349057
 0.01274772 0.01224814 0.01171326 0.01124586 0.01098865 0.01016308
 0.00945056 0.00909412 0.00893294 0.00866497 0.0080689  0.0079854
 0.00778647 0.00739946 0.00723822 0.00673296 0.00643826 0.00631926
 0.0061844  0.00595846 0.00580656 0.00570427 0.00569912 0.00543318
 0.00519543 0.00515399 0.00512568 0.00492261 0.00489803 0.00476558
 0.00470362 0.00468774 0.00454391 0.00444037 0.00442791 0.00432053
 0.00428805 0.00414272 0.00409159 0.00406363 0.00397661 0.00383421
 0.00373743 0.00367794 0.00361783 0.00349267 0.00344719 0.00342721
 0.00338314 0.00330971 0.00321613 0.00313719 0.00310964 0.003113
 0.00307043 0.00304931 0.0030259  0.00296645 0.00293466 0.00289825
 0.00285019 0.00282051 0.00278403 0.00274516 0.00269483 0.00269146
 0.00265022 0.00263723 0.00262737 0.00258918 0.00256648 0.00254527
 0.00252416 0.00249816 0.00247924 0.00246801 0.00246343 0.0024215

In [9]:
# Peek at the SVD-projected feature matrix (NumPy abbreviates large arrays when printing).
print(reduced_features)

[[ 1.93442333e-02  1.01881367e-01  2.75295137e-01 ...  8.59447196e-04
  -4.05432491e-03  1.17735883e-03]
 [ 1.22762868e+00 -5.75667147e-02  4.53293139e-02 ... -4.58215680e-02
   2.28733265e-01 -1.39480246e-01]
 [ 8.52595440e-02  1.09923636e-01  2.99163741e-01 ... -4.33367911e-03
  -1.68230248e-02  1.63597215e-03]
 ...
 [ 4.96430409e-04  1.41098458e-03  2.81446000e-03 ... -6.41585508e-03
   1.39028755e-02  2.01998477e-02]
 [ 9.65193781e-01 -2.14868786e-01 -1.12641455e-01 ...  3.01775653e-02
  -4.95447618e-02 -4.53177044e-02]
 [ 7.08646222e-03  2.48330429e-03  7.73408886e-03 ... -1.15429708e-01
  -1.25753963e-01  3.36705470e-02]]


## Dense features

In [10]:
founder_csv_path = '/content/gdrive/My Drive/vc_modeling/feature_extraction/founder_features/organization_founders_features.csv'

# Key the founder rows by CityHash64(org_uuid) and drop the raw uuid column, so
# the index can be compared with the hashed ids indexing `sparse_join`.
founder_features = (
    pd.read_csv(founder_csv_path)
    .assign(hash=lambda frame: frame['org_uuid'].apply(cityhash.CityHash64))
    .set_index(['hash'], drop=True)
    .drop(['org_uuid'], axis=1)
)

# Only the rows where every founder column is present.
founder_features_only = founder_features.dropna()

print(founder_features.index)
# Number of founder rows whose id also appears in the joined sparse features.
founder_in_sparse = founder_features.index.isin(sparse_join.index)
print(np.count_nonzero(founder_in_sparse))


UInt64Index([ 2705467411384211821, 13360469805707984821,  5744847760615345245,
             13990853631299335829, 13073125021883633741, 17482404514494389050,
              5766560289832673038,  8860273864704446424, 13093060529635406942,
              3545623260843038609,
             ...
             15362439443771027468,  7042924928552646182,  4495539880758712748,
               800095033165514664, 17142127829573402143, 11045203277162125921,
             17622773241843276753,  9745467974249593237, 11854798340435595808,
             12723450708549610702],
            dtype='uint64', name='hash', length=198449)
165327


In [11]:
# Wrap the SVD output in a DataFrame that reuses the ids of `sparse_join` as
# its index; columns are labelled by position (0, 1, ...).
n_reduced_columns = reduced_features.shape[1]
sparse_data = pd.DataFrame(
    data=reduced_features,
    index=sparse_join.index,
    columns=range(n_reduced_columns),
)
print(sparse_data)

                            0         1   ...        98        99
0                                         ...                    
13685534557686295101  0.019344  0.101881  ... -0.004054  0.001177
764015621929367586    1.227629 -0.057567  ...  0.228733 -0.139480
10846552445983457719  0.085260  0.109924  ... -0.016823  0.001636
5087506707876194815   1.041919 -0.202393  ... -0.019382  0.009674
9094535307341385563   1.041080 -0.160387  ...  0.054871  0.070294
...                        ...       ...  ...       ...       ...
16892862651199510638  0.013808  0.030725  ... -0.215998  0.033216
7229716827056183679   0.006609  0.029821  ... -0.001496  0.005391
4626564123199390189   0.000496  0.001411  ...  0.013903  0.020200
2978566027619600648   0.965194 -0.214869  ... -0.049545 -0.045318
9744251496066876000   0.007086  0.002483  ... -0.125754  0.033671

[808944 rows x 100 columns]


In [12]:
# Left-join the founder columns onto the reduced sparse features by id.
all_features = sparse_data.join(founder_features, lsuffix='sparse', rsuffix='dense')
# Every remaining NaN becomes 0 -- both for ids without a founder row and for
# founder values that were missing in the source file.
all_features = all_features.fillna(0)
all_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,founders_top_rank,founders_top_college,founders_max_degree_type_ordinal,founders_max_degree_count,founders_max_founded_other_org,founders_count
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
13685534557686295101,0.019344,0.101881,0.275295,0.770317,-0.392017,-0.308171,-0.074005,-0.09828,-0.03007,-0.144071,-0.008071,0.014282,-0.037295,-0.020789,0.036092,-0.071545,-0.08607,-0.031744,0.064637,0.036955,-0.036448,0.007317,0.003496,-0.015807,0.031912,-0.018393,-0.005094,-0.011109,-0.011276,-0.008199,0.02324,0.009102,-0.018469,-0.016408,-0.002777,0.00147,-0.022306,0.001711,-0.005529,-0.001948,...,-0.006966,0.005197,0.001029,0.003348,-0.007175,-0.006484,0.003138,0.006962,-0.010271,0.003618,0.001228,0.001735,0.004345,-0.005511,-0.008217,-0.007242,-0.005369,0.006239,0.002315,-0.005934,-0.004468,-0.00059,0.001548,0.000188,-0.00261,0.001855,0.014508,-0.004643,-0.011534,-0.004141,0.002195,0.000859,-0.004054,0.001177,20738.0,1.0,1.0,2.0,1.0,2.0
764015621929367586,1.227629,-0.057567,0.045329,0.307684,-0.378097,1.342761,-0.47541,-0.082669,0.789706,0.067754,-0.074613,-0.046426,0.686327,-0.644056,0.845123,0.151952,0.006688,0.120635,-0.16638,0.068958,-0.07817,-0.011791,0.130785,0.080263,-0.067059,0.004475,0.004838,0.048233,-0.042637,-0.096127,-0.144642,0.276433,0.397258,-0.492175,0.95797,0.143199,0.104934,-0.237836,0.468006,0.037817,...,-0.06079,0.024611,-0.192208,0.032803,-0.115683,0.071774,0.012529,0.049272,0.017279,-0.026792,0.063296,-0.039074,0.065978,0.003437,0.000671,-0.051815,0.037982,0.140371,-0.049215,-0.023172,0.061289,-0.005625,0.028237,0.085048,-0.08488,-0.021828,0.128904,-0.070762,-0.00264,-0.247386,-0.087691,-0.045822,0.228733,-0.13948,122.0,1.0,3.0,2.0,1.0,1.0
10846552445983457719,0.08526,0.109924,0.299164,0.86129,-0.430507,-0.285675,-0.125108,-0.085212,-0.052841,0.301601,0.113324,0.230529,0.696884,-0.310273,0.093155,-0.124344,-0.026946,0.032483,-0.001762,0.030034,-0.029834,-0.043316,-0.029376,-0.015402,-0.018304,-0.009062,-0.062101,-0.009256,-0.007697,-0.00374,-0.002182,-0.015565,-0.001157,-0.017888,0.00307,-0.019593,-0.045027,-0.029899,-0.008037,-0.001272,...,0.014909,0.021263,-0.023598,0.009107,-0.033579,0.01384,0.00635,0.0436,-0.041662,0.012428,0.013264,-0.05569,-0.008123,-0.004781,0.008703,-0.024012,-0.029231,0.032106,0.003477,-0.028051,0.022581,0.005066,0.004184,0.006659,-0.023893,-0.012475,0.029539,-0.008385,-0.011695,-0.001828,0.031314,-0.004334,-0.016823,0.001636,1735.0,0.0,0.0,1.0,0.0,1.0
5087506707876194815,1.041919,-0.202393,-0.075857,0.109147,0.023643,-0.070532,-0.040474,-0.010898,-0.029603,0.387249,0.120527,0.245035,0.727058,-0.284426,0.063152,-0.060805,0.008412,0.042996,-0.034869,0.028239,-0.026684,-0.047513,-0.02773,-0.02448,-0.071931,-0.020039,-0.105923,-0.000254,0.024882,0.021468,-0.005955,-0.043904,0.056714,-0.095702,-0.036477,0.125357,0.43235,0.663625,-0.037359,-0.277945,...,-0.078399,0.097107,0.041287,0.014654,-0.063546,-0.011747,0.008767,0.047182,-0.041398,-0.000322,0.019262,-0.057947,-0.013459,-0.009087,0.016498,-0.01702,-0.0376,0.020123,0.019066,-0.009101,0.014595,0.005995,0.007386,0.011111,-0.037189,-0.020381,0.032332,-0.017438,-0.00902,-0.003751,0.043953,-0.012299,-0.019382,0.009674,59.0,1.0,2.0,2.0,2.0,9.0
9094535307341385563,1.04108,-0.160387,-0.084188,0.061716,-0.073297,0.128333,-0.087326,-0.078266,0.009992,0.070557,0.931831,-0.155165,-0.294538,-0.06599,-0.291644,-0.196866,0.069036,-0.085623,0.043731,-0.161204,0.021061,-0.309758,0.87417,0.387983,-0.360148,-0.023082,-0.078195,-0.031849,0.035606,0.024938,-0.125703,-0.039902,-0.044831,0.07649,0.027981,-0.290259,-0.022896,0.109365,-0.068195,-0.214506,...,0.036618,0.000151,-0.015262,0.048621,0.01139,0.009998,-0.006563,0.012637,-0.003535,-0.046648,-0.018343,0.003269,-0.028208,-0.055698,0.027523,-0.023774,0.016121,0.015818,-0.031063,0.011825,-0.006525,-0.00055,0.03503,0.006653,0.030208,-0.002718,-0.039509,0.036822,0.032012,-0.039576,0.057122,-0.030367,0.054871,0.070294,38172.0,0.0,2.0,2.0,1.0,2.0


In [13]:
import sklearn.preprocessing

# Max-normalise each column (axis=0) of the combined feature frame.
# NOTE(review): copy=False allows in-place work, so `all_features` itself may be
# modified by this call -- confirm before reusing it afterwards.
features_array = sklearn.preprocessing.normalize(
    all_features,
    norm='max',
    axis=0,
    copy=False,
)
# Same index as `all_features`; column labels are replaced by positions, so the
# founder column names are not carried over.
features = pd.DataFrame(
    data=features_array,
    index=all_features.index,
    columns=range(all_features.shape[1]),
)
print(features)

                           0         1    ...       104       105
0                                         ...                    
13685534557686295101  0.011911  0.043331  ...  0.012821  0.060606
764015621929367586    0.755909 -0.024484  ...  0.012821  0.030303
10846552445983457719  0.052498  0.046752  ...  0.000000  0.030303
5087506707876194815   0.641559 -0.086080  ...  0.025641  0.272727
9094535307341385563   0.641042 -0.068214  ...  0.012821  0.060606
...                        ...       ...  ...       ...       ...
16892862651199510638  0.008502  0.013068  ...  0.000000  0.000000
7229716827056183679   0.004069  0.012683  ...  0.000000  0.060606
4626564123199390189   0.000306  0.000600  ...  0.000000  0.000000
2978566027619600648   0.594315 -0.091386  ...  0.000000  0.000000
9744251496066876000   0.004363  0.001056  ...  0.000000  0.000000

[808944 rows x 106 columns]


# Read Regression Targets

In [16]:
# Folder with one CSV of regression targets per day mark, named "<mark>.csv".
# Defined here so the cell does not depend on a variable from an earlier session.
target_folder = "/content/gdrive/My Drive/vc_modeling/regression_targets/"

# Only CSV files define marks; anything else in the folder is ignored instead of
# crashing the int() conversion below.
file_names = [x for x in os.listdir(target_folder) if x.endswith('.csv')]
mark_files = {int(os.path.splitext(x)[0]): x for x in file_names}
# os.listdir order is arbitrary; sort so the marks (and the per-mark models built
# from them later) always come out in the same order.
marks = sorted(mark_files)
print(marks)

# mark -> DataFrame indexed by 'hash' with the two target columns.
regression_marks = {}
for mark in marks:
  rm = pd.read_csv(target_folder + mark_files[mark])[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')
  # Drop rows where either column is NaN or +/-inf.
  rm = rm[~rm.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
  regression_marks[mark] = rm


[200, 500, 1000, 2000]


In [14]:
# Inspect the 200-day targets next to the feature matrix.
# (Earlier source: pd.read_pickle("/content/gdrive/My Drive/vc_modeling/regression_targets/200.pkl"))
INSPECTED_MARK = 200
mark_data = regression_marks[INSPECTED_MARK]
# mark_data = mark_data[mark_data['log_valuation_factor'] > 0]
for inspected_frame in (mark_data, features):
  print(inspected_frame)

# Number of target rows whose id also has a feature row.
targets_with_features = mark_data.index.isin(features.index)
print(np.count_nonzero(targets_with_features))

                      initial_valuation  log_valuation_factor
hash                                                         
2053339725337568679         413036820.0              0.000000
13360469805707984821          3000000.0              0.266595
12201126308526847683         47500000.0              0.000000
17482404514494389050          2157880.0              0.000000
16923506324318240851          7500000.0              0.000000
...                                 ...                   ...
4057326795460754576            552105.0              0.000000
17393885764651115266         20000000.0              1.011831
14785394360939257924        125000000.0              0.000000
15207057269115911424         31150000.0              0.000000
12723450708549610702            75000.0              1.688395

[144569 rows x 2 columns]
                           0         1    ...       104       105
0                                         ...                    
13685534557686295101  0.011911  0.0

# Train Model

In [15]:
# Train Models
import sklearn.metrics
import sklearn.model_selection

def regression_analysis(model, train_data, train_values, test_data, test_values):
    """Print regression metrics for an already-fitted model.

    Predicts on both splits, prints the first five test predictions next to
    the first five true test values, then prints mean squared error and
    explained-variance score for the train and test splits.

    Returns:
        The same ``model`` object that was passed in.
    """
    train_predictions = model.predict(train_data)
    test_predictions = model.predict(test_data)

    print("Sample values: ", test_predictions[:5], test_values[:5])

    mse = sklearn.metrics.mean_squared_error
    explained_variance = sklearn.metrics.explained_variance_score

    # All four metrics are computed before anything is printed.
    train_mse = mse(train_values, train_predictions)
    test_mse = mse(test_values, test_predictions)
    train_explained_variance = explained_variance(train_values, train_predictions)
    test_explained_variance = explained_variance(test_values, test_predictions)

    report = (
        ("Train MSE: ", train_mse),
        ("Train Explained Variance Score: ", train_explained_variance),
        ("Test MSE: ", test_mse),
        ("Test Explained Variance Score: ", test_explained_variance),
    )
    for label, metric_value in report:
        print(label, metric_value)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    """Score a fitted regressor as if it were a binary classifier.

    The decision threshold is the mean of the model's predictions on
    ``train_data``. Predictions and true values alike are labelled 1 when they
    are strictly above that threshold and 0 otherwise. Prints the confusion
    matrix for the test split, then the test and train accuracy. Returns
    nothing.
    """
    train_scores = model.predict(train_data)
    threshold = np.average(train_scores)
    test_scores = model.predict(test_data)

    def above_threshold(values):
        # 1 where the value exceeds the threshold, 0 elsewhere.
        return (values > threshold).astype(np.int32)

    train_prediction = above_threshold(train_scores)
    test_prediction = above_threshold(test_scores)
    train_labels = above_threshold(train_values)
    test_labels = above_threshold(test_values)

    # Rows are the truth, columns the prediction; the positive class comes first.
    confusion = sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0])
    confusion_matrix_large = pd.DataFrame(
        confusion,
        columns=['positive', 'negative'],
        index=['Truth is +', 'Truth is -'],
    )
    print("Confusion:\n", confusion_matrix_large)

    splits = (
        ("Test accuracy: ", test_labels, test_prediction),
        ("Train accuracy: ", train_labels, train_prediction),
    )
    for label, truth, predicted in splits:
        print(label, sum(truth == predicted) / len(truth))

    # TODO: ROC and precision/recall plots were sketched here but never enabled;
    # they need per-sample scores (`test_probabilities`) that this function does
    # not compute.

def train_model_over_mark(model, input_data, mark, filter_unknown=True, hyperparams=None, random_state=None):
  """Fit `model` on the rows that have a target for `mark`, then print its metrics.

  Args:
    model: estimator with scikit-learn style `fit` / `predict`.
    input_data: feature DataFrame whose index holds the same ids as the index
      of the target frames in the module-level `regression_marks`.
    mark: key into `regression_marks` selecting which target frame to use.
    filter_unknown: when True, rows whose `log_valuation_factor` is exactly 0
      are left out.
    hyperparams: optional parameter grid. When given, a GridSearchCV over it is
      run on the training split and its best estimator is used.
    random_state: forwarded to `train_test_split`; the default None keeps the
      split different on every call, as before.

  Returns:
    The fitted estimator. With `hyperparams` this is the grid search's best
    estimator, which is a different object from the `model` passed in.

  Raises:
    ValueError: if the selected feature rows and target rows do not share the
      same index (for example because of duplicated ids).
  """
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  mark_data = regression_marks[mark].dropna()
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]

  # Keep only ids present on both sides; sorting both by index is what pairs
  # each feature row with its target.
  data = input_data[input_data.index.isin(mark_data.index)].sort_index()
  values = mark_data[mark_data.index.isin(input_data.index)]['log_valuation_factor'].sort_index()
  if not data.index.equals(values.index):
    raise ValueError(
        "Feature rows and target rows do not line up for mark {}; "
        "check for duplicated ids.".format(mark))

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(
      data, values, test_size=0.25, random_state=random_state)

  searched = bool(hyperparams)
  if searched:
    print("Conducting Grid Search")
    search = sklearn.model_selection.GridSearchCV(model, hyperparams)
    search.fit(train_data, train_values)
    # With GridSearchCV's default refit=True the best estimator has already been
    # refit on the whole training split, so it is not fitted a second time.
    model = search.best_estimator_

  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  if not searched:
    model.fit(train_data, train_values)

  regression_analysis(model, train_data, train_values, test_data, test_values)
  classification_analysis(model, train_data, train_values, test_data, test_values)
  # Hand the fitted estimator back so callers can keep it even when the grid
  # search replaced the object they passed in.
  return model

In [16]:
# First five rows of the founder feature frame after rows with missing values were dropped.
print(founder_features_only[:5])

                      founders_top_rank  ...  founders_count
hash                                     ...                
2705467411384211821            265041.0  ...               4
13360469805707984821             6551.0  ...               2
13990853631299335829           147007.0  ...               1
17482404514494389050           785716.0  ...               1
15337030219814864514             1672.0  ...               1

[5 rows x 6 columns]


## Lasso

In [17]:
import sklearn.linear_model

# One fresh cross-validated Lasso per day mark, trained on the founder-only
# features rather than on the full feature matrix.
for mark in marks:
  lasso_model = sklearn.linear_model.LassoCV()
  train_model_over_mark(
      model=lasso_model,
      input_data=founder_features_only,
      mark=mark,
      filter_unknown=True,
  )


RESULTS FOR 200 DAY MARK:

Trained on 15194 rows.
Sample values:  [0.49915806 0.31551135 0.49741146 0.50119549 0.47879544] hash
13060534818121295168    0.547134
7812462598091645679    -0.284776
16246232976548058226    0.414903
2058377462233030        0.872386
8263846538183928843     2.535350
Name: log_valuation_factor, dtype: float64
Train MSE:  0.9349399938240103
Train Explained Variance Score:  0.006140830689470689
Test MSE:  0.9186549200293753
Test Explained Variance Score:  0.006851877397404671
Confusion:
             positive  negative
Truth is +      1483       586
Truth is -      1913      1083
Test accuracy:  0.5066140177690029
Train accuracy:  0.503751480847703

RESULTS FOR 500 DAY MARK:

Trained on 15258 rows.
Sample values:  [0.87062421 0.58488363 0.87230511 0.85062968 0.92516098] hash
16089231626721301270    1.278165
12559834587190061872    0.983259
15068939385736794845    1.659286
18262917739268146752    1.294492
2916497670721281501     1.289241
Name: log_valuation_factor

## Random Forest Regressor

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Fitted forests, one per mark, in the same order as `marks`.
models = []
# Candidate grid for a hyper-parameter search. It is NOT used by the loop below;
# pass `hyperparams=hyperdict` to train_model_over_mark to enable the search.
hyperdict = {'n_estimators': [20, 40, 60, 80, 100], 'min_samples_split':[10, 20, 30]}

for mark in marks:
  # The split criterion is left at its default (mean squared error). Spelling it
  # out as criterion='mse' fails on newer scikit-learn, where that name was
  # renamed to 'squared_error' and later removed.
  tree_model = RandomForestRegressor(max_depth=4)
  train_model_over_mark(tree_model, features, mark, filter_unknown=True)
  models.append(tree_model)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.29468278 0.26968576 0.27967579 0.34657118 0.26414289] hash
17196882803872297573    0.793497
17437293347975446381    0.391368
12524796904998413897    0.603349
9915292093174236763     0.377074
6806748359671183443     1.061899
Name: log_valuation_factor, dtype: float64
Train MSE:  0.8365104172391014
Train Explained Variance Score:  0.0264739351958716
Test MSE:  0.886216702116822
Test Explained Variance Score:  0.014646187393217325
Confusion:
             positive  negative
Truth is +      2342      3275
Truth is -      2326      5206
Test accuracy:  0.5740360483686973
Train accuracy:  0.5833692483204462

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.
Sample values:  [0.54162186 0.4728713  0.47605769 0.50817659 1.24851547] hash
17986535554932535408    0.369003
13682655561331062899    0.034386
7220143968930273359     1.203973
1814416024560022572     0.405465
3907786077573467582     0.556100
Name: log_valuation_factor, 

## SGD

In [None]:
import sklearn.linear_model

# A single SGDRegressor instance is shared by all marks; every call below fits
# it again on that mark's data.
sgd = sklearn.linear_model.SGDRegressor()

for mark in marks:
  train_model_over_mark(
      model=sgd,
      input_data=features,
      mark=mark,
      filter_unknown=True,
  )

## SVR

In [None]:
import sklearn.svm
# Warning: this takes a really long time.

# A new support-vector regressor (C=0.001) is built and trained for every mark.
for mark in marks:
  svm = sklearn.svm.SVR(C=0.001)
  train_model_over_mark(
      model=svm,
      input_data=features,
      mark=mark,
      filter_unknown=True,
  )


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.28225426 0.2599009  0.24700179 0.28501875 0.24145089] hash
9922409125951720504    0.142623
3723860927443974661    2.646436
944842188742108698     1.171183
9874696254773956310    0.020181
2913120074975965592   -0.017817
Name: log_valuation_factor, dtype: float64
Train MSE:  0.867838754693294
Train Explained Variance Score:  0.00768389167151462
Test MSE:  0.8890430168748725
Test Explained Variance Score:  0.005207680788291258
Confusion:
             positive  negative
Truth is +      3604      2924
Truth is -      3014      3607
Test accuracy:  0.5484067229447106
Train accuracy:  0.5563696285967803

RESULTS FOR 500 DAY MARK:

Trained on 39573 rows.


## MultiLayer Perceptron

In [None]:
import sklearn.neural_network

# Collect one MLP per mark, in the order of `marks`; `models` is the list that
# the "Save Models" cell pickles.
models = []
for mark in marks:
  mlp = sklearn.neural_network.MLPRegressor(
      alpha=1e-2,
      hidden_layer_sizes=(100, 50, 20),
      max_iter=500,
  )
  train_model_over_mark(
      model=mlp,
      input_data=features,
      mark=mark,
      filter_unknown=True,
  )
  models.append(mlp)

## Save Models

In [None]:
# Save the models
import pickle
print(models)
# `models` is paired with `marks` by position, so it must hold one model per
# mark in the same order.
for mark, model in zip(marks, models):
  print(model)
  # Context manager so the file is flushed and closed even if pickling fails.
  with open('/content/gdrive/My Drive/vc_modeling/models/' + str(mark) + '.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Export Predictions

In [None]:
# Load the models
import pickle
# Rebuild `models` from disk, one pickle per mark, in the order of `marks`.
models = []
for mark in marks:
  # NOTE(review): pickle.load can execute arbitrary code from the file; only
  # load pickles that were written by this notebook.
  # Context manager so the file handle is closed after reading.
  with open('/content/gdrive/My Drive/vc_modeling/models/' + str(int(mark)) + '.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
  models.append(model)


In [None]:
# Organisation table read from the Crunchbase bulk export folder on Drive.
org_info = pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/organizations.csv")

In [None]:
# Index the organisations by CityHash64(uuid) and keep only the descriptive
# columns that go into the export.
org_info = (
    org_info
    .assign(hash=org_info['uuid'].apply(cityhash.CityHash64))
    .set_index('hash')
    [['uuid', 'name', 'created_at']]
)
org_info.head()

In [None]:
import math

# Start from the organisation table and attach, per mark, the true target and
# the model's prediction.
org_info_join = org_info.copy()

print(marks)
# `models` and `marks` are paired by position.
for model, mark in zip(models, marks):
  truth = regression_marks[mark][['initial_valuation', 'log_valuation_factor']].copy()
  # Predict for every row of the feature matrix, not only rows with a target.
  prediction_array = model.predict(features)
  prediction = pd.DataFrame(data=prediction_array, index=features.index, columns=['prediction_' + str(mark)])

  # Left join: only ids that have a target row for this mark are kept.
  pred_truth = truth.join(prediction)
  pred_truth = pred_truth.rename(columns={'log_valuation_factor':'truth_' + str(mark)})
  # 'initial_valuation' repeats for every mark; from the second mark on, the
  # incoming copy is suffixed with the mark to keep the column names unique.
  org_info_join = org_info_join.join(pred_truth, rsuffix=str(mark))

# Key the export by the original uuid instead of the hash.
org_info_join = org_info_join.set_index('uuid', drop=True)

org_info_join.to_csv('/content/gdrive/My Drive/vc_modeling/model_output/predictions.csv')

# Last expression of the cell, so the preview is actually displayed.
org_info_join.head()

# Comparison to Random

In [None]:

def compare_to_random(org_info_with_pred):
  clean_org_info = org_info_with_pred.dropna()

