<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash
import sklearn.decomposition

print(pd.__version__)

from copy import deepcopy

1.0.5


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read/Join Features

## Sparse Features
These need to go through dimensionality reduction

In [3]:
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

sparse_category_features_array = scipy.sparse.load_npz(feature_folder + "/category_features/category_features_large.npz")
sparse_region_features_array = scipy.sparse.load_npz(feature_folder + "/region_features/region_features.npz")

In [4]:
# print(sparse_category_features_array)
## Other features here!! Remember to sparsify the dataframes if they're dense!

# print(sparse_category_features_array)

category_features_array = scipy.sparse.coo_matrix(sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(sparse_region_features_array, dtype=np.uint64)

print(category_features_array.getnnz())
print(region_features_array.getnnz())

category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

print(np.shape(category_features_df))
print(np.shape(region_features_df))

print("{}".format(category_features_df.iloc[0][0]))

2355537
1681588
(963967, 676)
(842699, 1032)
13685534557686295101


In [5]:
region_uuid = region_features_df.iloc[:, 0]
category_uuid = category_features_df.iloc[:, 0]
print(region_uuid)

print(np.count_nonzero(category_uuid.isin(region_uuid)))
check_value = 7551169957279540846
# check_value = cityhash.CityHash64('ffffabce-6d4a-b3d1-13c0-4e90cedf5270')
print(check_value)
print(np.size(region_uuid[region_uuid == check_value]))
print(np.size(category_uuid[category_uuid == check_value]))

0         13685534557686295101
1           764015621929367586
2         10846552445983457719
3          5087506707876194815
4          9094535307341385563
                  ...         
842694     4626564123199390189
842695     2978566027619600648
842696     1747954284855665241
842697     9744251496066876000
842698     7648380604111671063
Name: 0, Length: 842699, dtype: Sparse[uint64, 0]
808944
7551169957279540846
1
1


In [6]:
join_base = category_features_df.set_index(0)
join1 = region_features_df.set_index(0)
sparse_join = join_base.join(join1, lsuffix='category_features', rsuffix='region_features')
sparse_join = sparse_join.dropna()
# features = category_features_df
print(sparse_join)

                      1category_features  2category_features  ...  1030  1031
0                                                             ...            
13685534557686295101                 0.0                 0.0  ...   0.0   0.0
764015621929367586                   1.0                 1.0  ...   0.0   0.0
10846552445983457719                 0.0                 0.0  ...   0.0   0.0
5087506707876194815                  0.0                 0.0  ...   0.0   0.0
9094535307341385563                  0.0                 0.0  ...   0.0   0.0
...                                  ...                 ...  ...   ...   ...
16892862651199510638                 0.0                 0.0  ...   0.0   0.0
7229716827056183679                  0.0                 0.0  ...   0.0   0.0
4626564123199390189                  0.0                 0.0  ...   0.0   0.0
2978566027619600648                  0.0                 0.0  ...   0.0   0.0
9744251496066876000                  0.0                 0.0  ..

## Dimensionality Reduction

In [7]:
svd = sklearn.decomposition.TruncatedSVD(n_components=100, n_iter=10)
# Can't fit more than 10k samples or SVD will crash.
# If the samples are well distributed, this might be OK.
svd.fit(sparse_join[:15000])

print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

reduced_features = svd.transform(sparse_join)

# lda = sklearn.decomposition.LatentDirichletAllocation(n_components=100,random_state=0, learning_method='online', total_samples=2e5)
# lda.partial_fit(sparse_join)
# reduced_features = lda.transform(sparse_join)


[0.06201002 0.04136426 0.03052181 0.02412492 0.02133094 0.01809638
 0.01821206 0.01757454 0.01589228 0.01511112 0.01381074 0.01349057
 0.01274772 0.01224814 0.01171326 0.01124586 0.01098865 0.01016308
 0.00945056 0.00909412 0.00893294 0.00866497 0.0080689  0.0079854
 0.00778647 0.00739946 0.00723822 0.00673296 0.00643826 0.00631925
 0.0061844  0.00595845 0.00580656 0.00570427 0.00569912 0.00543318
 0.00519543 0.00515399 0.00512568 0.00492261 0.00489803 0.00476558
 0.00470362 0.00468775 0.00454391 0.00444036 0.0044279  0.00432054
 0.00428805 0.00414271 0.0040916  0.00406363 0.0039766  0.00383421
 0.00373742 0.00367794 0.00361784 0.0034927  0.00344725 0.00342721
 0.00338314 0.00330981 0.00321606 0.00313725 0.00310952 0.00311294
 0.00307007 0.00304925 0.00302563 0.00296626 0.00293493 0.00289766
 0.00285058 0.00282081 0.00278331 0.00274471 0.00269561 0.00268947
 0.00265179 0.00263574 0.00262881 0.00259017 0.00256825 0.00254525
 0.00252571 0.00249433 0.00248445 0.00246696 0.00246281 0.00242

In [8]:
print(reduced_features)

[[ 1.93442333e-02  1.01881367e-01  2.75295137e-01 ... -5.01619430e-04
   3.16123014e-03 -3.39205537e-03]
 [ 1.22762868e+00 -5.75667147e-02  4.53293139e-02 ... -2.48072988e-01
   2.74366063e-01  1.51284765e-01]
 [ 8.52595440e-02  1.09923636e-01  2.99163741e-01 ...  2.18662279e-02
   3.88661477e-04 -7.44998317e-03]
 ...
 [ 4.96430409e-04  1.41098458e-03  2.81446000e-03 ... -5.01053818e-03
  -8.24090503e-03 -1.08722485e-02]
 [ 9.65193781e-01 -2.14868786e-01 -1.12641455e-01 ...  1.01631988e-01
  -1.98974917e-02  6.11482808e-02]
 [ 7.08646222e-03  2.48330429e-03  7.73408886e-03 ...  6.30110464e-02
   1.69103963e-01 -1.27973087e-01]]


## Dense features

In [9]:
founder_features = pd.read_csv('/content/gdrive/My Drive/vc_modeling/feature_extraction/founder_features/organization_founders_features.csv')
founder_features['hash'] = founder_features['org_uuid'].apply(cityhash.CityHash64)
founder_features = founder_features.set_index(['hash'], drop=True).drop(['org_uuid'], axis=1)

founder_features_only = founder_features.dropna()

print(founder_features.index)
print(np.count_nonzero(founder_features.index.isin(sparse_join.index)))


UInt64Index([ 2705467411384211821, 13360469805707984821,  5744847760615345245,
             13990853631299335829, 13073125021883633741, 17482404514494389050,
              5766560289832673038,  8860273864704446424, 13093060529635406942,
              3545623260843038609,
             ...
             15362439443771027468,  7042924928552646182,  4495539880758712748,
               800095033165514664, 17142127829573402143, 11045203277162125921,
             17622773241843276753,  9745467974249593237, 11854798340435595808,
             12723450708549610702],
            dtype='uint64', name='hash', length=198449)
165327


In [10]:
sparse_data = pd.DataFrame(data=reduced_features, index=sparse_join.index, columns=range(np.shape(reduced_features)[1]))
print(sparse_data)

                            0         1   ...        98        99
0                                         ...                    
13685534557686295101  0.019344  0.101881  ...  0.003161 -0.003392
764015621929367586    1.227629 -0.057567  ...  0.274366  0.151285
10846552445983457719  0.085260  0.109924  ...  0.000389 -0.007450
5087506707876194815   1.041919 -0.202393  ... -0.000872 -0.016956
9094535307341385563   1.041080 -0.160387  ...  0.060246 -0.040377
...                        ...       ...  ...       ...       ...
16892862651199510638  0.013808  0.030725  ... -0.148291 -0.149886
7229716827056183679   0.006609  0.029821  ... -0.006852 -0.002768
4626564123199390189   0.000496  0.001411  ... -0.008241 -0.010872
2978566027619600648   0.965194 -0.214869  ... -0.019897  0.061148
9744251496066876000   0.007086  0.002483  ...  0.169104 -0.127973

[808944 rows x 100 columns]


In [11]:
all_features = sparse_data.join(founder_features, lsuffix='sparse', rsuffix='dense').fillna(0)
all_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,founders_top_rank,founders_top_college,founders_max_degree_type_ordinal,founders_max_degree_count,founders_max_founded_other_org,founders_count
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
13685534557686295101,0.019344,0.101881,0.275295,0.770317,-0.392017,-0.308171,-0.074005,-0.09828,-0.03007,-0.144071,-0.008071,0.014282,-0.037295,-0.020789,0.036092,-0.071545,-0.08607,-0.031744,0.064637,0.036955,-0.036448,0.007317,0.003496,-0.015807,0.031912,-0.018393,-0.005094,-0.011109,-0.011276,-0.008199,0.02324,0.009102,-0.018469,-0.016408,-0.002777,0.00147,-0.022306,0.001711,-0.005529,-0.001948,...,-0.006978,0.005297,0.00104,0.003455,-0.007144,-0.006633,0.003135,0.006671,-0.010396,0.003409,0.001292,0.00166,0.004507,-0.006544,-0.008661,-0.007365,-0.006274,0.006022,0.001709,-0.006762,-0.003619,-0.002418,0.000256,5.2e-05,-0.002625,0.012168,0.002082,-0.013992,-0.00368,-0.003141,0.003246,-0.000502,0.003161,-0.003392,20738.0,1.0,1.0,2.0,1.0,2.0
764015621929367586,1.227629,-0.057567,0.045329,0.307684,-0.378097,1.342761,-0.47541,-0.082669,0.789706,0.067754,-0.074613,-0.046426,0.686327,-0.644056,0.845123,0.151952,0.006688,0.120635,-0.16638,0.068958,-0.07817,-0.011791,0.130785,0.080263,-0.067059,0.004475,0.004838,0.048233,-0.042638,-0.096127,-0.144643,0.276434,0.397256,-0.492176,0.957969,0.143199,0.104937,-0.237832,0.468004,0.037818,...,-0.05953,0.02256,-0.194546,0.032373,-0.116263,0.070135,0.014322,0.04618,0.012347,-0.025068,0.061757,-0.040025,0.059608,0.012901,0.008808,-0.038358,0.013106,0.152483,-0.025216,0.019414,0.047884,0.00916,0.033771,0.078514,-0.043113,0.067004,0.090099,-0.073363,-0.079905,0.080013,-0.159157,-0.248073,0.274366,0.151285,122.0,1.0,3.0,2.0,1.0,1.0
10846552445983457719,0.08526,0.109924,0.299164,0.86129,-0.430507,-0.285675,-0.125108,-0.085212,-0.052841,0.301601,0.113324,0.230529,0.696884,-0.310273,0.093155,-0.124344,-0.026946,0.032483,-0.001762,0.030034,-0.029834,-0.043316,-0.029376,-0.015402,-0.018304,-0.009062,-0.062101,-0.009256,-0.007697,-0.00374,-0.002182,-0.015565,-0.001157,-0.017887,0.00307,-0.019592,-0.045027,-0.029897,-0.008037,-0.001275,...,0.014675,0.021212,-0.023287,0.009237,-0.034729,0.012161,0.006328,0.043441,-0.043361,0.01247,0.005556,-0.056102,-0.009241,-0.003833,0.006879,-0.025621,-0.031069,0.031094,0.00019,-0.018766,0.024116,0.004497,0.005693,0.005347,-0.012256,0.013282,0.00963,-0.034428,-0.021929,0.017642,0.008498,0.021866,0.000389,-0.00745,1735.0,0.0,0.0,1.0,0.0,1.0
5087506707876194815,1.041919,-0.202393,-0.075857,0.109147,0.023643,-0.070532,-0.040474,-0.010898,-0.029603,0.387249,0.120527,0.245035,0.727058,-0.284426,0.063152,-0.060805,0.008412,0.042996,-0.034869,0.028239,-0.026684,-0.047513,-0.02773,-0.02448,-0.071931,-0.020039,-0.105923,-0.000254,0.024882,0.021468,-0.005955,-0.043904,0.056714,-0.095702,-0.036477,0.125357,0.432349,0.663626,-0.037359,-0.277946,...,-0.079156,0.096766,0.041312,0.014758,-0.064243,-0.013374,0.008866,0.047162,-0.043204,0.000112,0.010583,-0.058543,-0.015238,-0.007289,0.016533,-0.015691,-0.039492,0.020611,0.016583,-0.001244,0.011114,0.003738,0.010277,0.010243,-0.018302,0.008427,0.02473,-0.044477,-0.030381,0.032736,0.001875,0.029077,-0.000872,-0.016956,59.0,1.0,2.0,2.0,2.0,9.0
9094535307341385563,1.04108,-0.160387,-0.084188,0.061716,-0.073297,0.128333,-0.087326,-0.078266,0.009992,0.070557,0.931831,-0.155165,-0.294538,-0.06599,-0.291644,-0.196866,0.069036,-0.085623,0.043731,-0.161204,0.021061,-0.309758,0.87417,0.387983,-0.360148,-0.023082,-0.078195,-0.031849,0.035605,0.024938,-0.125703,-0.039902,-0.044831,0.07649,0.027983,-0.290261,-0.022902,0.109369,-0.068197,-0.214508,...,0.036684,-0.00073,-0.01476,0.04704,0.012864,0.008369,-0.006737,0.009794,-0.004752,-0.047088,-0.014931,0.003079,-0.039029,-0.059524,0.028211,-0.031836,0.014193,0.012767,-0.034438,0.010045,-0.015692,-0.008552,0.016699,-0.006221,0.03059,-0.028374,-0.022767,-0.006314,-0.021006,0.007919,0.019242,-0.085243,0.060246,-0.040377,38172.0,0.0,2.0,2.0,1.0,2.0


In [12]:
import sklearn.preprocessing

features_array = sklearn.preprocessing.normalize(all_features, norm='max', axis=0, copy=False)
features = pd.DataFrame(data=features_array, index=all_features.index, columns=range(np.shape(all_features)[1]))
print(features)

                           0         1    ...       104       105
0                                         ...                    
13685534557686295101  0.011911  0.043331  ...  0.012821  0.060606
764015621929367586    0.755909 -0.024484  ...  0.012821  0.030303
10846552445983457719  0.052498  0.046752  ...  0.000000  0.030303
5087506707876194815   0.641559 -0.086080  ...  0.025641  0.272727
9094535307341385563   0.641042 -0.068214  ...  0.012821  0.060606
...                        ...       ...  ...       ...       ...
16892862651199510638  0.008502  0.013068  ...  0.000000  0.000000
7229716827056183679   0.004069  0.012683  ...  0.000000  0.060606
4626564123199390189   0.000306  0.000600  ...  0.000000  0.000000
2978566027619600648   0.594315 -0.091386  ...  0.000000  0.000000
9744251496066876000   0.004363  0.001056  ...  0.000000  0.000000

[808944 rows x 106 columns]


# Read Regression Targets

In [13]:
target_folder = '/content/gdrive/My Drive/vc_modeling/regression_targets/'
marks = [200, 500, 1000, 2000]

regression_marks = {}
for mark in marks:
  regression_marks[mark] = pd.read_csv(target_folder + str(mark) + '.csv')[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')


In [17]:
mark_data = regression_marks[200] # pd.read_pickle("/content/gdrive/My Drive/vc_modeling/regression_targets/200.pkl")
# mark_data = mark_data[mark_data['log_valuation_factor'] > 0]
print(mark_data)
print(features)
print(np.count_nonzero(mark_data.index.isin(features.index)))

                      initial_valuation  log_valuation_factor
hash                                                         
2053339725337568679         413036820.0              0.000000
13360469805707984821          3000000.0              0.266595
12201126308526847683         47500000.0              0.000000
17482404514494389050          2157880.0              0.000000
16923506324318240851          7500000.0              0.000000
...                                 ...                   ...
4057326795460754576            552105.0              0.000000
17393885764651115266         20000000.0              1.011831
14785394360939257924        125000000.0              0.000000
15207057269115911424         31150000.0              0.000000
12723450708549610702            75000.0              1.688395

[144569 rows x 2 columns]
                           0         1    ...       104       105
0                                         ...                    
13685534557686295101  0.011911  0.0

# Train Model

In [26]:
# Train Models
import sklearn.metrics
import sklearn.model_selection

def regression_analysis(model, train_data, train_values, test_data, test_values):
    predicted_train_values = model.predict(train_data)
    predicted_test_values = model.predict(test_data)

    print("Sample values: ", predicted_test_values[:5], test_values[:5])

    train_mse = sklearn.metrics.mean_squared_error(train_values, predicted_train_values)
    test_mse = sklearn.metrics.mean_squared_error(test_values, predicted_test_values)
    train_explained_variance = sklearn.metrics.explained_variance_score(train_values, predicted_train_values)
    test_explained_variance = sklearn.metrics.explained_variance_score(test_values, predicted_test_values)

    print("Train MSE: ", train_mse)
    print("Train Explained Variance Score: ", train_explained_variance)
    print("Test MSE: ", test_mse)
    print("Test Explained Variance Score: ", test_explained_variance)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    train_values_predicted = model.predict(train_data)
    threshold = np.average(train_values_predicted)
    train_prediction = train_values_predicted > threshold
    train_prediction = train_prediction.astype(np.int32)

    test_values_predicted = model.predict(test_data)
    test_prediction = test_values_predicted > threshold
    test_prediction = test_prediction.astype(np.int32)

    train_labels = train_values > threshold
    train_labels = train_labels.astype(np.int32)

    test_labels = test_values > threshold
    test_labels = test_labels.astype(np.int32)

    confusion_matrix_large = pd.DataFrame(sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0]),
                                    columns=['positive', 'negative'], index=['Truth is +', 'Truth is -'])
    print("Confusion:\n", confusion_matrix_large)
    test_acc = sum(test_labels==test_prediction)/len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = sum(train_labels==train_prediction)/len(train_labels)
    print("Train accuracy: ", train_acc)


    # # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    # fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_labels, test_probabilities)

    # # Get the area under the curve (AUC)
    # auc = np.mean(cross_val_score(model, test_data, test_labels, scoring="roc_auc", cv=5))
    # print("AUC = " , str(round(auc, 2)))

    # # Plot the ROC curve

    # plt.xlabel("False positive rate (fpr)")
    # plt.ylabel("True positive rate (tpr)")
    # plt.plot(fpr, tpr, label='model')
    # plt.plot([0, 1], [0, 1], color='k', label="random")
    # plt.legend(loc='best')

    # plt.figure()
    # plt.xlabel("Recall")
    # plt.ylabel("Precision")
    # precision, recall, _ = sklearn.metrics.precision_recall_curve(test_labels, test_probabilities)
    # plt.plot(recall, precision)

def train_model_over_mark(model, input_data, mark, filter_unknown=True, hyperparams=None):
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  # print(features)
  mark_data = regression_marks[mark].dropna()
  # print(mark_data)
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]
  # Select the data that we have regression targets for

  data = input_data[input_data.index.isin(mark_data.index)].sort_index()
  # print(data)


  # Select the column with log_valuation_factor.
  values = mark_data[mark_data.index.isin(input_data.index)]['log_valuation_factor'].sort_index()
  # print(values)

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(data, values, test_size=0.25)
  if hyperparams:
    print("Conducting Grid Search")
    search = sklearn.model_selection.GridSearchCV(model, hyperparams)
    search.fit(train_data, train_values)
    model = search.best_estimator_
  
  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  model.fit(train_data, train_values)
  regression_analysis(model, train_data, train_values, test_data, test_values)
  classification_analysis(model, train_data, train_values, test_data, test_values)

In [19]:
print(founder_features_only[:5])

                      founders_top_rank  ...  founders_count
hash                                     ...                
2705467411384211821            265041.0  ...               4
13360469805707984821             6551.0  ...               2
13990853631299335829           147007.0  ...               1
17482404514494389050           785716.0  ...               1
15337030219814864514             1672.0  ...               1

[5 rows x 6 columns]


## Lasso

In [28]:
import sklearn.linear_model

for mark in marks:
  lasso_model = sklearn.linear_model.LassoCV()
  train_model_over_mark(lasso_model, founder_features_only, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 15194 rows.
Sample values:  [0.47289684 0.48227043 0.45136202 0.4787192  0.47818043] hash
9622546503644133800     1.148421
13564795469384548957   -0.013797
8146852167827177215     0.268857
14221072989662828273    0.248886
9182469554284269992     0.334900
Name: log_valuation_factor, dtype: float64
Train MSE:  0.9234066704219968
Train Explained Variance Score:  0.005736404158916963
Test MSE:  0.9533087896836367
Test Explained Variance Score:  0.007994415095862184
Confusion:
             positive  negative
Truth is +      1476       599
Truth is -      1910      1080
Test accuracy:  0.504639684106614
Train accuracy:  0.5080294853231538

RESULTS FOR 500 DAY MARK:

Trained on 15258 rows.
Sample values:  [0.92139    0.79760271 0.48490741 0.89010718 0.80555982] hash
15952436086378540884    2.148845
10361184551274432178    3.018954
12377247905022160551    0.082780
6064338030755353548     1.213023
9567037658667064656     1.632980
Name: log_valuation_factor

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

models = []
hyperdict = {'n_estimators': [20, 40, 60, 80, 100], 'min_samples_split':[10, 20, 30]}

for mark in marks:
  tree_model = RandomForestRegressor(criterion='mse', max_depth=4)
  train_model_over_mark(tree_model, features, mark, filter_unknown=True)
  models.append(tree_model)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.


## SGD

In [27]:
import sklearn.linear_model

sgd = sklearn.linear_model.SGDRegressor()

for mark in marks:
  train_model_over_mark(sgd, features, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.
Sample values:  [0.22003474 0.33866962 0.3883994  0.30914981 0.27774328] hash
13764208312783600756    0.299257
7712191640504818461    -0.037245
520587253883061091      0.870304
7921057564085320398     1.447795
9676512273645742147     0.297620
Name: log_valuation_factor, dtype: float64
Train MSE:  0.862958986492888
Train Explained Variance Score:  0.015644671691800727
Test MSE:  0.8371483697020786
Test Explained Variance Score:  0.01302847815889685
Confusion:
             positive  negative
Truth is +      2769      2586
Truth is -      3220      4574
Test accuracy:  0.558445509164195
Train accuracy:  0.5648117632146026

RESULTS FOR 500 DAY MARK:

Trained on 39574 rows.


ValueError: ignored

# SVR

In [23]:
import sklearn.svm

for mark in marks:
  svm = sklearn.svm.SVR(C=0.001)
  train_model_over_mark(svm, features, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 39445 rows.


KeyboardInterrupt: ignored

## MultiLayer Perceptron

In [31]:
import sklearn.neural_network

models = []
for mark in marks:
  mlp = sklearn.neural_network.MLPRegressor(alpha=1e-2, hidden_layer_sizes=(100, 50, 20), max_iter=500)
  train_model_over_mark(mlp, features, mark, filter_unknown=True)
  models.append(mlp)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [ 1.01448316  0.13523465  2.98108355 -0.20288775  0.64604466] [ 1.41098697  8.26614023 -0.21043823  0.19912993  0.37140677]
Train MSE:  0.42701258332359915
Train Explained Variance Score:  0.5338952743648366
Test MSE:  1.625637056147282
Test Explained Variance Score:  -0.7780854593892346
Confusion:
             positive  negative
Truth is +       883       882
Truth is -      1414      1321
Test accuracy:  0.48977777777777776
Train accuracy:  0.7315157801155727

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [-1.34096084  0.61557298  0.33235761  3.82992509  0.38427185] [-2.67065643  1.17937934  3.40119738  0.81166578 -0.40932986]
Train MSE:  0.6380924964172782
Train Explained Variance Score:  0.638961100389262
Test MSE:  3.5991605344502013
Test Explained Variance Score:  -0.9857997549251816
Confusion:
             positive  negative
Truth is +       935       886
Truth is -       878       813
Test ac

In [32]:
# Save the models
import pickle
print(models)
for mark, model in zip(marks, models):
  print(model)
  pickle.dump(model, open('/content/gdrive/My Drive/vc_modeling/models/' + str(mark) + '.pkl', 'wb'))

[MLPRegressor(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 50, 20), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=500,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False), MLPRegressor(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 50, 20), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=500,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1

# Export Predictions

In [33]:
# Load the models
import pickle
models = []
for mark in marks:

  model = pickle.load(open('/content/gdrive/My Drive/vc_modeling/models/' + str(int(mark)) + '.pkl', 'rb'))
  models.append(model)


In [41]:
org_info = pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/organizations.csv")

In [42]:
org_info['hash'] = org_info['uuid'].apply(cityhash.CityHash64)
org_info = org_info.set_index('hash')
org_info = org_info[['uuid', 'name', 'created_at']]
org_info.head()

Unnamed: 0_level_0,uuid,name,created_at
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13685534557686295101,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,2007-05-25 13:51:27
764015621929367586,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2007-05-26 02:30:28
10846552445983457719,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,2007-05-26 03:03:23
10693046220981818130,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,2007-05-26 03:21:34
5087506707876194815,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,2007-05-26 04:22:15


In [44]:
import math

org_info_join = org_info.copy()

print(marks)
for model, mark in zip(models, marks):
  truth = regression_marks[mark][['initial_valuation', 'log_valuation_factor']].copy()
  # print(truth)
  prediction_array = model.predict(features)

  prediction = pd.DataFrame(data=prediction_array, index=features.index, columns=['prediction_' + str(mark)])
  uuid_hash = features.index
  pred_truth = truth.join(prediction)
  pred_truth = pred_truth.rename(columns={'log_valuation_factor':'truth_' + str(mark)})
  org_info_join = org_info_join.join(pred_truth, rsuffix=mark)

org_info_join org_info_join.set_index('uuid', drop=True)
org_info_join.head()

org_info_join.to_csv('/content/gdrive/My Drive/vc_modeling/model_output/predictions.csv')

[200, 500, 1000, 2000]


Unnamed: 0_level_0,uuid,name,created_at,initial_valuation,truth_200,prediction_200,initial_valuation500,truth_500,prediction_500,initial_valuation1000,truth_1000,prediction_1000,initial_valuation2000,truth_2000,prediction_2000
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13685534557686295101,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,2007-05-25 13:51:27,26250000.0,0.0,0.051553,26250000.0,0.339276,1.352991,26250000.0,0.0,1.194802,26250000.0,2.80565,2.923317
764015621929367586,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2007-05-26 02:30:28,,,,,,,,,,,,
10846552445983457719,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,2007-05-26 03:03:23,14000000.0,0.0,0.35335,14000000.0,0.0,0.6698,14000000.0,0.0,3.216546,14000000.0,0.0,1.861921
10693046220981818130,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,2007-05-26 03:21:34,,,,,,,,,,,,
5087506707876194815,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,2007-05-26 04:22:15,2500000.0,2.416558,2.876117,2500000.0,0.0,0.548189,2500000.0,0.0,1.083458,2500000.0,0.0,1.773893


# Comparison to Random