<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash
import sklearn.decomposition

print(pd.__version__)

from copy import deepcopy

1.0.5


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read/Join Features

## Sparse Features
These need to go through dimensionality reduction

In [3]:
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

sparse_category_features_array = scipy.sparse.load_npz(feature_folder + "/category_features/category_features_large.npz")
sparse_region_features_array = scipy.sparse.load_npz(feature_folder + "/region_features/region_features.npz")

In [4]:
# print(sparse_category_features_array)
## Other features here!! Remember to sparsify the dataframes if they're dense!

# print(sparse_category_features_array)

category_features_array = scipy.sparse.coo_matrix(sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(sparse_region_features_array, dtype=np.uint64)

print(category_features_array.getnnz())
print(region_features_array.getnnz())

category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

print(np.shape(category_features_df))
print(np.shape(region_features_df))

print("{}".format(category_features_df.iloc[0][0]))

2355537
1681588
(963967, 676)
(842699, 1032)
13685534557686295101


In [5]:
region_uuid = region_features_df.iloc[:, 0]
category_uuid = category_features_df.iloc[:, 0]
print(region_uuid)

print(np.count_nonzero(category_uuid.isin(region_uuid)))
check_value = 7551169957279540846
# check_value = cityhash.CityHash64('ffffabce-6d4a-b3d1-13c0-4e90cedf5270')
print(check_value)
print(np.size(region_uuid[region_uuid == check_value]))
print(np.size(category_uuid[category_uuid == check_value]))

0         13685534557686295101
1           764015621929367586
2         10846552445983457719
3          5087506707876194815
4          9094535307341385563
                  ...         
842694     4626564123199390189
842695     2978566027619600648
842696     1747954284855665241
842697     9744251496066876000
842698     7648380604111671063
Name: 0, Length: 842699, dtype: Sparse[uint64, 0]
808944
7551169957279540846
1
1


In [6]:
join_base = category_features_df.set_index(0)
join1 = region_features_df.set_index(0)
sparse_join = join_base.join(join1, lsuffix='category_features', rsuffix='region_features')
sparse_join = sparse_join.dropna()
# features = category_features_df
print(sparse_join)

                      1category_features  2category_features  ...  1030  1031
0                                                             ...            
13685534557686295101                 0.0                 0.0  ...   0.0   0.0
764015621929367586                   1.0                 1.0  ...   0.0   0.0
10846552445983457719                 0.0                 0.0  ...   0.0   0.0
5087506707876194815                  0.0                 0.0  ...   0.0   0.0
9094535307341385563                  0.0                 0.0  ...   0.0   0.0
...                                  ...                 ...  ...   ...   ...
16892862651199510638                 0.0                 0.0  ...   0.0   0.0
7229716827056183679                  0.0                 0.0  ...   0.0   0.0
4626564123199390189                  0.0                 0.0  ...   0.0   0.0
2978566027619600648                  0.0                 0.0  ...   0.0   0.0
9744251496066876000                  0.0                 0.0  ..

## Dimensionality Reduction

In [7]:
svd = sklearn.decomposition.TruncatedSVD(n_components=100, n_iter=10)
# Can't fit more than 10k samples or SVD will crash.
# If the samples are well distributed, this might be OK.
svd.fit(sparse_join[:15000])

print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

reduced_features = svd.transform(sparse_join)

# lda = sklearn.decomposition.LatentDirichletAllocation(n_components=100,random_state=0, learning_method='online', total_samples=2e5)
# lda.partial_fit(sparse_join)
# reduced_features = lda.transform(sparse_join)


[0.06201002 0.04136426 0.03052181 0.02412492 0.02133094 0.01809638
 0.01821206 0.01757454 0.01589228 0.01511112 0.01381074 0.01349057
 0.01274772 0.01224814 0.01171326 0.01124586 0.01098865 0.01016308
 0.00945056 0.00909412 0.00893294 0.00866497 0.0080689  0.0079854
 0.00778647 0.00739946 0.00723822 0.00673296 0.00643826 0.00631925
 0.0061844  0.00595846 0.00580656 0.00570427 0.00569912 0.00543318
 0.00519543 0.00515399 0.00512568 0.00492261 0.00489803 0.00476558
 0.00470362 0.00468774 0.00454391 0.00444037 0.0044279  0.00432053
 0.00428805 0.00414271 0.00409159 0.00406363 0.0039766  0.0038342
 0.00373743 0.00367793 0.00361784 0.00349264 0.00344725 0.00342722
 0.00338313 0.00330974 0.00321618 0.00313729 0.00310979 0.00311294
 0.00307047 0.00304941 0.00302574 0.00296641 0.00293458 0.00289812
 0.00285028 0.00282038 0.00278354 0.00274544 0.00269379 0.0026913
 0.0026516  0.00263526 0.00261924 0.00258917 0.00256717 0.00254455
 0.00252492 0.00249671 0.00248229 0.00246316 0.00246282 0.0024156

In [8]:
print(reduced_features)

[[ 1.93442333e-02  1.01881367e-01  2.75295137e-01 ...  9.90355734e-04
   1.03442860e-04 -2.44477256e-03]
 [ 1.22762868e+00 -5.75667147e-02  4.53293139e-02 ...  3.58285199e-01
   1.35233061e-01  2.38092595e-01]
 [ 8.52595440e-02  1.09923636e-01  2.99163741e-01 ... -1.56577650e-02
   1.12609245e-02 -1.18120588e-02]
 ...
 [ 4.96430409e-04  1.41098458e-03  2.81446000e-03 ...  4.50013167e-03
   4.31491863e-03 -1.57369644e-02]
 [ 9.65193781e-01 -2.14868786e-01 -1.12641455e-01 ... -1.81714359e-02
  -4.06872360e-02 -6.60325441e-02]
 [ 7.08646222e-03  2.48330429e-03  7.73408886e-03 ... -4.28757966e-02
   6.22093873e-02 -7.39474761e-02]]


## Dense features

In [9]:
founder_features = pd.read_csv('/content/gdrive/My Drive/vc_modeling/feature_extraction/founder_features/organization_founders_features.csv')
founder_features['hash'] = founder_features['org_uuid'].apply(cityhash.CityHash64)
founder_features = founder_features.set_index(['hash'], drop=True).drop(['org_uuid'], axis=1)

founder_features_only = founder_features.dropna()

print(founder_features.index)
print(np.count_nonzero(founder_features.index.isin(sparse_join.index)))


UInt64Index([ 2705467411384211821, 13360469805707984821,  5744847760615345245,
             13990853631299335829, 13073125021883633741, 17482404514494389050,
              5766560289832673038,  8860273864704446424, 13093060529635406942,
              3545623260843038609,
             ...
             15362439443771027468,  7042924928552646182,  4495539880758712748,
               800095033165514664, 17142127829573402143, 11045203277162125921,
             17622773241843276753,  9745467974249593237, 11854798340435595808,
             12723450708549610702],
            dtype='uint64', name='hash', length=198449)
165327


In [10]:
sparse_data = pd.DataFrame(data=reduced_features, index=sparse_join.index, columns=range(np.shape(reduced_features)[1]))
print(sparse_data)

                            0         1   ...        98        99
0                                         ...                    
13685534557686295101  0.019344  0.101881  ...  0.000103 -0.002445
764015621929367586    1.227629 -0.057567  ...  0.135233  0.238093
10846552445983457719  0.085260  0.109924  ...  0.011261 -0.011812
5087506707876194815   1.041919 -0.202393  ...  0.017518 -0.016291
9094535307341385563   1.041080 -0.160387  ...  0.035044 -0.072997
...                        ...       ...  ...       ...       ...
16892862651199510638  0.013808  0.030725  ...  0.015662 -0.106075
7229716827056183679   0.006609  0.029821  ... -0.009665 -0.005766
4626564123199390189   0.000496  0.001411  ...  0.004315 -0.015737
2978566027619600648   0.965194 -0.214869  ... -0.040687 -0.066033
9744251496066876000   0.007086  0.002483  ...  0.062209 -0.073947

[808944 rows x 100 columns]


In [11]:
all_features = sparse_data.join(founder_features, lsuffix='sparse', rsuffix='dense').fillna(0)
all_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,founders_top_rank,founders_top_college,founders_max_degree_type_ordinal,founders_max_degree_count,founders_max_founded_other_org,founders_count
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
13685534557686295101,0.019344,0.101881,0.275295,0.770317,-0.392017,-0.308171,-0.074005,-0.09828,-0.03007,-0.144071,-0.008071,0.014282,-0.037295,-0.020789,0.036092,-0.071545,-0.08607,-0.031744,0.064637,0.036955,-0.036448,0.007317,0.003496,-0.015807,0.031912,-0.018393,-0.005094,-0.011109,-0.011276,-0.008199,0.02324,0.009101,-0.018469,-0.016408,-0.002777,0.001469,-0.022306,0.001711,-0.005529,-0.001948,...,-0.006982,0.005339,0.000958,0.003348,-0.007092,-0.006458,0.003187,0.006638,-0.010313,0.003567,0.001011,0.00209,0.004177,-0.005406,-0.009683,-0.007059,-0.005882,0.007048,0.000751,-0.006488,-0.002915,-0.001661,-0.000802,-0.003254,0.001892,0.011217,0.007619,-0.00049,0.006228,-0.0121,-0.002852,0.00099,0.000103,-0.002445,20738.0,1.0,1.0,2.0,1.0,2.0
764015621929367586,1.227629,-0.057567,0.045329,0.307684,-0.378097,1.342761,-0.47541,-0.082669,0.789706,0.067754,-0.074613,-0.046426,0.686327,-0.644056,0.845123,0.151952,0.006688,0.120635,-0.16638,0.068958,-0.07817,-0.011791,0.130785,0.080263,-0.067059,0.004475,0.004838,0.048233,-0.042637,-0.096127,-0.144642,0.276434,0.397257,-0.492176,0.957972,0.143198,0.104933,-0.237833,0.468003,0.037826,...,-0.058554,0.022259,-0.196136,0.032926,-0.113093,0.075936,0.016,0.046141,0.014271,-0.026534,0.07035,-0.035244,0.053826,0.006383,-0.00054,-0.034751,0.025831,0.133176,-0.033924,0.021923,0.074978,-0.005313,0.037225,0.03683,0.021764,0.065129,0.081417,-0.040097,-0.047375,0.04456,-0.000213,0.358285,0.135233,0.238093,122.0,1.0,3.0,2.0,1.0,1.0
10846552445983457719,0.08526,0.109924,0.299164,0.86129,-0.430507,-0.285675,-0.125108,-0.085212,-0.052841,0.301601,0.113324,0.230529,0.696884,-0.310273,0.093155,-0.124344,-0.026946,0.032483,-0.001762,0.030034,-0.029834,-0.043316,-0.029376,-0.015402,-0.018304,-0.009062,-0.062101,-0.009256,-0.007697,-0.00374,-0.002182,-0.015566,-0.001157,-0.017887,0.00307,-0.019593,-0.045027,-0.029899,-0.008036,-0.001273,...,0.014729,0.021237,-0.023485,0.009018,-0.034136,0.01375,0.006658,0.04348,-0.04283,0.012494,0.014439,-0.056944,-0.008811,-0.007815,0.002135,-0.022008,-0.027706,0.0327,-0.003397,-0.02452,0.02598,0.007757,0.006042,-0.004871,0.018151,0.006267,0.021333,-0.003879,0.022098,-0.016109,-0.034081,-0.015658,0.011261,-0.011812,1735.0,0.0,0.0,1.0,0.0,1.0
5087506707876194815,1.041919,-0.202393,-0.075857,0.109147,0.023643,-0.070532,-0.040474,-0.010898,-0.029603,0.387249,0.120527,0.245035,0.727058,-0.284426,0.063152,-0.060805,0.008412,0.042996,-0.034869,0.028239,-0.026684,-0.047513,-0.02773,-0.02448,-0.071931,-0.020039,-0.105923,-0.000254,0.024882,0.021468,-0.005955,-0.043904,0.056714,-0.095702,-0.036477,0.125357,0.432349,0.663624,-0.037358,-0.277945,...,-0.078504,0.097168,0.041029,0.014589,-0.06372,-0.011753,0.009311,0.046876,-0.042936,-0.00085,0.020588,-0.059308,-0.014789,-0.011849,0.009994,-0.012979,-0.035963,0.021746,0.014598,-0.006213,0.016699,0.004098,-0.000359,-0.000718,0.025184,0.003007,0.024879,-0.018749,0.031179,-0.013121,-0.049312,-0.015142,0.017518,-0.016291,59.0,1.0,2.0,2.0,2.0,9.0
9094535307341385563,1.04108,-0.160387,-0.084188,0.061716,-0.073297,0.128333,-0.087326,-0.078266,0.009992,0.070557,0.931831,-0.155165,-0.294538,-0.06599,-0.291644,-0.196866,0.069036,-0.085623,0.043731,-0.161204,0.021061,-0.309758,0.87417,0.387983,-0.360148,-0.023082,-0.078195,-0.031849,0.035605,0.024938,-0.125703,-0.039902,-0.044831,0.07649,0.027984,-0.290262,-0.022897,0.109367,-0.068194,-0.214504,...,0.036264,0.000696,-0.014525,0.048228,0.01194,0.007767,-0.007675,0.009413,-0.006231,-0.045147,-0.017146,0.007086,-0.035567,-0.063065,0.021831,-0.021657,0.017533,0.016628,-0.033597,0.015264,-0.011288,0.001769,-0.026708,0.011427,-0.0178,-0.024264,-0.007385,0.02539,-0.015832,-0.00434,-0.066482,0.082375,0.035044,-0.072997,38172.0,0.0,2.0,2.0,1.0,2.0


In [12]:
import sklearn.preprocessing

features_array = sklearn.preprocessing.normalize(all_features, norm='max', axis=0, copy=False)
features = pd.DataFrame(data=features_array, index=all_features.index, columns=range(np.shape(all_features)[1]))
print(features)

                           0         1    ...       104       105
0                                         ...                    
13685534557686295101  0.011911  0.043331  ...  0.012821  0.060606
764015621929367586    0.755909 -0.024484  ...  0.012821  0.030303
10846552445983457719  0.052498  0.046752  ...  0.000000  0.030303
5087506707876194815   0.641559 -0.086080  ...  0.025641  0.272727
9094535307341385563   0.641042 -0.068214  ...  0.012821  0.060606
...                        ...       ...  ...       ...       ...
16892862651199510638  0.008502  0.013068  ...  0.000000  0.000000
7229716827056183679   0.004069  0.012683  ...  0.000000  0.060606
4626564123199390189   0.000306  0.000600  ...  0.000000  0.000000
2978566027619600648   0.594315 -0.091386  ...  0.000000  0.000000
9744251496066876000   0.004363  0.001056  ...  0.000000  0.000000

[808944 rows x 106 columns]


# Read Regression Targets

In [13]:
target_folder = '/content/gdrive/My Drive/vc_modeling/regression_targets/'
marks = [200, 500, 1000, 2000]

regression_marks = {}
for mark in marks:
  regression_marks[mark] = pd.read_csv(target_folder + str(mark) + '.csv')[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')


In [14]:
mark_data = regression_marks[200] # pd.read_pickle("/content/gdrive/My Drive/vc_modeling/regression_targets/200.pkl")
# mark_data = mark_data[mark_data['log_valuation_factor'] > 0]
print(mark_data)
print(category_features_df)
print(np.count_nonzero(mark_data.index.isin(region_features_df.index)))

                      initial_valuation  log_valuation_factor
hash                                                         
13360469805707984821          3000000.0              0.000000
7551169957279540846          45000000.0              0.266595
17638643441008354186          2280520.0              0.000000
14753292511968607343          3000000.0              0.000000
18303053280205650499         20400150.0              0.000000
...                                 ...                   ...
1328871151505778773            600000.0              0.183442
16862606919743425243           162360.0              0.000000
14711304329054892014         65500000.0              0.344393
17393885764651115266         20000000.0             -0.189295
12723450708549610702            75000.0              0.000000

[49489 rows x 2 columns]
                         0    1    2    3    4    ...  671  672  673  674  675
0       13685534557686295101  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
1         

# Train Model

In [15]:
# Train Models
import sklearn.metrics
import sklearn.model_selection

def regression_analysis(model, train_data, train_values, test_data, test_values):
    predicted_train_values = model.predict(train_data)
    predicted_test_values = model.predict(test_data)

    print("Sample values: ", predicted_test_values[:5], test_values[:5])

    train_mse = sklearn.metrics.mean_squared_error(train_values, predicted_train_values)
    test_mse = sklearn.metrics.mean_squared_error(test_values, predicted_test_values)
    train_explained_variance = sklearn.metrics.explained_variance_score(train_values, predicted_train_values)
    test_explained_variance = sklearn.metrics.explained_variance_score(test_values, predicted_test_values)

    print("Train MSE: ", train_mse)
    print("Train Explained Variance Score: ", train_explained_variance)
    print("Test MSE: ", test_mse)
    print("Test Explained Variance Score: ", test_explained_variance)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    train_values_predicted = model.predict(train_data)
    threshold = np.average(train_values_predicted)
    train_prediction = train_values_predicted > threshold
    train_prediction = train_prediction.astype(np.int32)

    test_values_predicted = model.predict(test_data)
    test_prediction = test_values_predicted > threshold
    test_prediction = test_prediction.astype(np.int32)

    train_labels = train_values > threshold
    train_labels = train_labels.astype(np.int32)

    test_labels = test_values > threshold
    test_labels = test_labels.astype(np.int32)

    confusion_matrix_large = pd.DataFrame(sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0]),
                                    columns=['positive', 'negative'], index=['Truth is +', 'Truth is -'])
    print("Confusion:\n", confusion_matrix_large)
    test_acc = sum(test_labels==test_prediction)/len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = sum(train_labels==train_prediction)/len(train_labels)
    print("Train accuracy: ", train_acc)


    # # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    # fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_labels, test_probabilities)

    # # Get the area under the curve (AUC)
    # auc = np.mean(cross_val_score(model, test_data, test_labels, scoring="roc_auc", cv=5))
    # print("AUC = " , str(round(auc, 2)))

    # # Plot the ROC curve

    # plt.xlabel("False positive rate (fpr)")
    # plt.ylabel("True positive rate (tpr)")
    # plt.plot(fpr, tpr, label='model')
    # plt.plot([0, 1], [0, 1], color='k', label="random")
    # plt.legend(loc='best')

    # plt.figure()
    # plt.xlabel("Recall")
    # plt.ylabel("Precision")
    # precision, recall, _ = sklearn.metrics.precision_recall_curve(test_labels, test_probabilities)
    # plt.plot(recall, precision)

def train_model_over_mark(model, input_data, mark, filter_unknown=True, hyperparams=None):
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  # print(features)
  mark_data = regression_marks[mark]
  # print(mark_data)
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]
  # Select the data that we have regression targets for

  data = input_data[input_data.index.isin(mark_data.index)].sort_index()
  # print(data)

  # Select the column with log_valuation_factor.
  values = mark_data[mark_data.index.isin(input_data.index)]['log_valuation_factor'].sort_index()
  # print(values)

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(np.asarray(data), np.asarray(values), test_size=0.25)
  if hyperparams:
    print("Conducting Grid Search")
    search = sklearn.model_selection.GridSearchCV(model, hyperparams)
    search.fit(train_data, train_values)
    model = search.best_estimator_
  
  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  model.fit(train_data, train_values)
  regression_analysis(model, train_data, train_values, test_data, test_values)
  classification_analysis(model, train_data, train_values, test_data, test_values)

In [16]:
print(founder_features_only[:5])

                      founders_top_rank  ...  founders_count
hash                                     ...                
2705467411384211821            265041.0  ...               4
13360469805707984821             6551.0  ...               2
13990853631299335829           147007.0  ...               1
17482404514494389050           785716.0  ...               1
15337030219814864514             1672.0  ...               1

[5 rows x 6 columns]


## Lasso

In [17]:
import sklearn.linear_model

for mark in marks:
  lasso_model = sklearn.linear_model.LassoCV()
  train_model_over_mark(lasso_model, founder_features_only, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 5319 rows.
Sample values:  [0.35392981 0.35392981 0.35392981 0.35392981 0.35392981] hash
9978728151871015461     4.855537
12476570258238710491   -0.444032
17111326624788415720   -0.897083
7505899250196078072     0.112444
7227951020934067813    -0.834952
Name: log_valuation_factor, dtype: float64
Train MSE:  0.9018234851102647
Train Explained Variance Score:  0.0
Test MSE:  0.8765482329935472
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +         0       747
Truth is -         0      1027
Test accuracy:  0.5789177001127396
Train accuracy:  0.582816318856928

RESULTS FOR 500 DAY MARK:

Trained on 4303 rows.
Sample values:  [0.62807308 0.62807308 0.62807308 0.62807308 0.62807308] hash
8732360641011294275     0.198648
5605258309685920735     0.168095
290158562275609277     -0.035082
10849058716469799279    1.448655
18086272839273084385    0.406185
Name: log_valuation_factor, dtype: float64
Train MSE:  1.78887

## Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

models = []
hyperdict = {'n_estimators': [20, 40, 60, 80, 100], 'min_samples_split':[10, 20, 30]}

for mark in marks:
  tree_model = RandomForestRegressor(criterion='mae', max_depth=4)
  train_model_over_mark(tree_model, features, mark, filter_unknown=True)
  models.append(tree_model)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.


KeyboardInterrupt: ignored

## SGD

In [41]:
import sklearn.linear_model

sgd = sklearn.linear_model.SGDRegressor()

for mark in marks:
  train_model_over_mark(sgd, features, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [ 3.94586408e+17  9.64039891e+08  5.76578874e+18  2.71382169e+17
 -5.36096440e+09] hash
15306837969549259379    1.100389
5334555704732941581    -0.126597
11008182987929823240    0.515114
5665247046980276841     0.822849
18189146274041990537   -0.045707
Name: log_valuation_factor, dtype: float64
Train MSE:  4.1250781194626704e+36
Train Explained Variance Score:  -3.039325405446828e+36
Test MSE:  3.956668384704159e+36
Test Explained Variance Score:  -3.128693984176764e+36
Confusion:
             positive  negative
Truth is +      2758       452
Truth is -      1121       169
Test accuracy:  0.6504444444444445
Train accuracy:  0.6575789005778634

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [ 4.84214805e+17  3.18637375e+17 -3.17194588e+10  1.56430148e+18
  1.80446366e+15] hash
11458485165009085745    0.144849
1709515464774886424     0.557192
14532713142401210515    1.341234
3110753611647226347     0.41

# SVR

In [45]:
import sklearn.svm

for mark in marks:
  svm = sklearn.svm.SVR(C=0.001)
  train_model_over_mark(svm, features, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [0.25839902 0.25458862 0.25465205 0.28823923 0.25503005] hash
7105424532577421832     0.236775
17938072677234173160   -0.504827
8536989815064248433     2.225420
6703614601926715990     1.256371
6768047497154698784    -0.057384
Name: log_valuation_factor, dtype: float64
Train MSE:  0.9011099336782011
Train Explained Variance Score:  0.0002646850600577233
Test MSE:  0.9756831000569784
Test Explained Variance Score:  0.00036385642749237057
Confusion:
             positive  negative
Truth is +      3189         0
Truth is -      1311         0
Test accuracy:  0.7086666666666667
Train accuracy:  0.7164765150392651

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [0.570796   0.57246984 0.57298961 0.57298961 0.57298961] hash
1064142604451815243    -2.685577
705979575210094084      2.635994
3878315151291552270     1.401807
5469848291511864632    -0.234240
10046406058429193438    0.534614
Name: log_valuation_fa

## MultiLayer Perceptron

In [None]:
import sklearn.neural_network

models = []
for mark in marks:
  mlp = sklearn.neural_network.MLPRegressor(alpha=1e-2, hidden_layer_sizes=(100, 50, 20), max_iter=500)
  train_model_over_mark(mlp, features, mark, filter_unknown=True)
  models.append(mlp)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.


In [None]:
# Save the models
import pickle
for mark, model in zip(marks, models):
  pickle.dump(model, '/content/gdrive/My Drive/vc_modeling/models/' + str(mark) + '.pkl')

# Export Predictions

In [None]:
# Load the models
import pickle
models = []
for mark in zip(marks):
  model = pickle.load('/content/gdrive/My Drive/vc_modeling/models/' + str(mark) + '.pkl')
  models.append(model)


In [39]:
days = [200, 500, 1000, 2000]
org_info = pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/organizations.csv")

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,legal_name,roles,domain,homepage_url,country_code,state_code,region,city,address,postal_code,status,short_description,category_list,category_groups_list,num_funding_rounds,total_funding_usd,total_funding,total_funding_currency_code,founded_on,last_funding_on,closed_on,employee_count,email,phone,facebook_url,linkedin_url,twitter_url,logo_url,alias1,alias2,alias3,primary_role,num_exits,hash
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,123607.0,2007-05-25 13:51:27,2019-06-24 22:19:25,,company,wetpaint.com,http://www.wetpaint.com/,USA,NY,New York,New York,902 Broadway 11th Floor New,10010.0,acquired,Wetpaint offers an online social publishing pl...,"Publishing,Social Media,Social Media Management","Content and Publishing,Internet Services,Media...",3.0,39750000.0,39750000.0,USD,2005-06-01,2008-05-19,,51-100,info@wetpaint.com,206-859-6300,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/wetpaint,https://twitter.com/wetpainttv,https://crunchbase-production-res.cloudinary.c...,,,,company,,13685534557686295101
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,8777.0,2007-05-26 02:30:28,2018-10-27 00:29:49,,"investor,company",zoho.com,https://www.zoho.com/,USA,CA,California,Pleasanton,4141 Hacienda Drive,94588.0,operating,"Zoho offers a suite of business, collaboration...","Cloud Computing,Collaboration,CRM,Developer To...","Information Technology,Internet Services,Priva...",,,,,1996-09-15,,,1001-5000,info@zohocorp.com,,http://www.facebook.com/zoho,http://www.linkedin.com/company/zoho-corporati...,http://twitter.com/zoho,https://crunchbase-production-res.cloudinary.c...,,,,company,1.0,764015621929367586
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,11537.0,2007-05-26 03:03:23,2018-12-10 10:09:14,"Digg Holdings, LLC",company,digg.com,http://www.digg.com,USA,NY,New York,New York,,,acquired,Digg Inc. operates a website that enables its ...,"Internet,Social Media,Social Network","Internet Services,Media and Entertainment",6.0,49000000.0,49000000.0,USD,2004-10-11,2016-09-13,,51-100,feedback@digg.com,877-342-7222,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg,https://crunchbase-production-res.cloudinary.c...,,,,company,,10846552445983457719
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,6844.0,2007-05-26 03:21:34,2019-06-19 12:17:48,,investor,omidyar.com,http://www.omidyar.com,USA,CA,California,Redwood City,1991 Broadway Suite 200,94063.0,operating,Omidyar Network is an investment firm.,"Enterprise Software,Financial Services,Venture...","Financial Services,Lending and Investments,Sof...",,,,,2004-01-01,,,101-250,info@omidyar.com,650.482.2500,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://crunchbase-production-res.cloudinary.c...,,,,investor,33.0,10693046220981818130
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,facebook,https://www.crunchbase.com/organization/facebook,15.0,2007-05-26 04:22:15,2020-03-30 18:33:35,"Facebook, Inc.","investor,company",facebook.com,http://www.facebook.com,USA,CA,California,Menlo Park,1 Hacker Way,94025.0,ipo,Facebook is an online social networking servic...,"E-Commerce,Mobile Apps,Social,Social Media,Soc...","Apps,Commerce and Shopping,Content and Publish...",15.0,2335700000.0,2335700000.0,USD,2004-02-04,2013-06-30,,10000+,,,https://www.facebook.com/facebook/,http://www.linkedin.com/company/facebook,https://twitter.com/facebook,https://crunchbase-production-res.cloudinary.c...,,,,company,,5087506707876194815


In [42]:
org_info['hash'] = org_info['uuid'].apply(cityhash.CityHash64)
org_info = org_info.set_index('hash')
org_info = org_info[['uuid', 'name', 'created_at', ]]
org_info.head()

Unnamed: 0_level_0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,legal_name,roles,domain,homepage_url,country_code,state_code,region,city,address,postal_code,status,short_description,category_list,category_groups_list,num_funding_rounds,total_funding_usd,total_funding,total_funding_currency_code,founded_on,last_funding_on,closed_on,employee_count,email,phone,facebook_url,linkedin_url,twitter_url,logo_url,alias1,alias2,alias3,primary_role,num_exits
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
13685534557686295101,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,123607.0,2007-05-25 13:51:27,2019-06-24 22:19:25,,company,wetpaint.com,http://www.wetpaint.com/,USA,NY,New York,New York,902 Broadway 11th Floor New,10010.0,acquired,Wetpaint offers an online social publishing pl...,"Publishing,Social Media,Social Media Management","Content and Publishing,Internet Services,Media...",3.0,39750000.0,39750000.0,USD,2005-06-01,2008-05-19,,51-100,info@wetpaint.com,206-859-6300,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/wetpaint,https://twitter.com/wetpainttv,https://crunchbase-production-res.cloudinary.c...,,,,company,
764015621929367586,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,8777.0,2007-05-26 02:30:28,2018-10-27 00:29:49,,"investor,company",zoho.com,https://www.zoho.com/,USA,CA,California,Pleasanton,4141 Hacienda Drive,94588.0,operating,"Zoho offers a suite of business, collaboration...","Cloud Computing,Collaboration,CRM,Developer To...","Information Technology,Internet Services,Priva...",,,,,1996-09-15,,,1001-5000,info@zohocorp.com,,http://www.facebook.com/zoho,http://www.linkedin.com/company/zoho-corporati...,http://twitter.com/zoho,https://crunchbase-production-res.cloudinary.c...,,,,company,1.0
10846552445983457719,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,11537.0,2007-05-26 03:03:23,2018-12-10 10:09:14,"Digg Holdings, LLC",company,digg.com,http://www.digg.com,USA,NY,New York,New York,,,acquired,Digg Inc. operates a website that enables its ...,"Internet,Social Media,Social Network","Internet Services,Media and Entertainment",6.0,49000000.0,49000000.0,USD,2004-10-11,2016-09-13,,51-100,feedback@digg.com,877-342-7222,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg,https://crunchbase-production-res.cloudinary.c...,,,,company,
10693046220981818130,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,6844.0,2007-05-26 03:21:34,2019-06-19 12:17:48,,investor,omidyar.com,http://www.omidyar.com,USA,CA,California,Redwood City,1991 Broadway Suite 200,94063.0,operating,Omidyar Network is an investment firm.,"Enterprise Software,Financial Services,Venture...","Financial Services,Lending and Investments,Sof...",,,,,2004-01-01,,,101-250,info@omidyar.com,650.482.2500,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://crunchbase-production-res.cloudinary.c...,,,,investor,33.0
5087506707876194815,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,facebook,https://www.crunchbase.com/organization/facebook,15.0,2007-05-26 04:22:15,2020-03-30 18:33:35,"Facebook, Inc.","investor,company",facebook.com,http://www.facebook.com,USA,CA,California,Menlo Park,1 Hacker Way,94025.0,ipo,Facebook is an online social networking servic...,"E-Commerce,Mobile Apps,Social,Social Media,Soc...","Apps,Commerce and Shopping,Content and Publish...",15.0,2335700000.0,2335700000.0,USD,2004-02-04,2013-06-30,,10000+,,,https://www.facebook.com/facebook/,http://www.linkedin.com/company/facebook,https://twitter.com/facebook,https://crunchbase-production-res.cloudinary.c...,,,,company,


In [16]:
import math

print(marks)
for model, mark in zip(models, marks):
  truth = regression_marks[mark][['initial_valuation', 'log_valuation_factor']].copy()
  # print(truth)
  prediction_array = model.predict(features)

  prediction = pd.DataFrame(data=prediction_array, index=features.index, columns=['prediction_' + str(mark)])
  uuid_hash = features.index
  pred_truth = truth.join(prediction)
  pred_truth = pred_truth.rename(columns={'log_valuation_factor':'truth_' + str(mark)})
  org_info = org_info.merge(pred_truth)

print(org_info)

[200, 500, 1000, 2000]


NameError: ignored

# Comparison to Random