<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash

print(pd.__version__)

from copy import deepcopy

1.0.5


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
file_names = os.listdir("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/")
df_names = [x[:-4] for x in file_names]
print(df_names)

dfs = [pd.read_csv("/content/gdrive/My Drive/vc_modeling/data/crunchbase_bulk_export/"+x) for x in file_names]
df_dict = dict(zip(df_names, dfs))
print(df_dict.keys())

['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events']
dict_keys(['category_groups', 'funding_rounds', 'people', 'checksum', 'people_descriptions', 'investors', 'organization_descriptions', 'investment_partners', 'event_appearances', 'organizations', 'org_parents', 'jobs', 'acquisitions', 'funds', 'ipos', 'degrees', 'investments', 'events'])


# Read/Join Features

In [4]:
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

sparse_category_features_array = pd.read_pickle(feature_folder + "/category_features/category_features.pkl")

# print(sparse_category_features_array)
## Other features here!! Remember to sparsify the dataframes if they're dense!


features = scipy.sparse.coo_matrix(sparse_category_features_array)



# Read Regression Targets

In [5]:
target_folder = '/content/gdrive/My Drive/vc_modeling/regression_targets/'
marks = [200, 500, 1000, 2000]

regression_marks = {}
for mark in marks:
  regression_marks[mark] = pd.read_csv(target_folder + str(mark) + '.csv')


In [79]:

mark_data = regression_marks[200]
mark_data = mark_data[mark_data['log_valuation_factor'] > 0]

388


# Train Model

In [98]:
# Train Models
import sklearn.metrics

def regression_analysis(model, train_data, train_values, test_data, test_values):
    predicted_train_values = model.predict(train_data)
    predicted_test_values = model.predict(test_data)

    train_mse = sklearn.metrics.mean_squared_error(train_values, predicted_train_values)
    test_mse = sklearn.metrics.mean_squared_error(test_values, predicted_test_values)
    train_explained_variance = sklearn.metrics.explained_variance_score(train_values, predicted_train_values)
    test_explained_variance = sklearn.metrics.explained_variance_score(test_values, predicted_test_values)

    print("Train MSE: ", train_mse)
    print("Train Explained Variance Score: ", train_explained_variance)
    print("Test MSE: ", test_mse)
    print("Test Explained Variance Score: ", test_explained_variance)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    train_values_predicted = model.predict(train_data)
    train_prediction = train_values_predicted > 0.00001
    train_prediction = train_prediction.astype(np.int32)

    test_values_predicted = model.predict(test_data)
    test_prediction = test_values_predicted > 0.00001
    test_prediction = test_prediction.astype(np.int32)

    train_labels = train_values > 0.00001
    train_labels = train_labels.astype(np.int32)

    test_labels = test_values > 0.00001
    test_labels = test_labels.astype(np.int32)

    confusion_matrix_large = pd.DataFrame(sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0]),
                                    columns=['positive', 'negative'], index=['Truth is +', 'Truth is -'])
    print("Confusion:\n", confusion_matrix_large)
    test_acc = sum(test_labels==test_prediction)/len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = sum(train_labels==train_prediction)/len(train_labels)
    print("Train accuracy: ", train_acc)


    # # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    # fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_labels, test_probabilities)

    # # Get the area under the curve (AUC)
    # auc = np.mean(cross_val_score(model, test_data, test_labels, scoring="roc_auc", cv=5))
    # print("AUC = " , str(round(auc, 2)))

    # # Plot the ROC curve

    # plt.xlabel("False positive rate (fpr)")
    # plt.ylabel("True positive rate (tpr)")
    # plt.plot(fpr, tpr, label='model')
    # plt.plot([0, 1], [0, 1], color='k', label="random")
    # plt.legend(loc='best')

    # plt.figure()
    # plt.xlabel("Recall")
    # plt.ylabel("Precision")
    # precision, recall, _ = sklearn.metrics.precision_recall_curve(test_labels, test_probabilities)
    # plt.plot(recall, precision)

def train_model_over_mark(model, mark, filter_unknown=True):
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  # print(features)
  mark_data = regression_marks[mark]
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]
  sparse_data = pd.DataFrame.sparse.from_spmatrix(features)
  # Select the data that we have regression targets for
  data = sparse_data[sparse_data.index.isin(mark_data.index)].drop(columns=[0])

  # Select the column with log_valuation_factor.
  values = mark_data['log_valuation_factor']

  data = np.array(data)

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(data, values, test_size=0.25)
  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  
  model.fit(train_data, train_values)
  regression_analysis(lasso_model, train_data, train_values, test_data, test_values)
  classification_analysis(lasso_model, train_data, train_values, test_data, test_values)

## LASSO

In [99]:
import sklearn.linear_model

lasso_model = sklearn.linear_model.Lasso()
for mark in marks:
  train_model_over_mark(lasso_model, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 351 rows.
Train MSE:  0.6963439723467224
Train Explained Variance Score:  0.0
Test MSE:  0.3651160874001776
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        93         0
Truth is -        24         0
Test accuracy:  0.7948717948717948
Train accuracy:  0.8404558404558404

RESULTS FOR 500 DAY MARK:

Trained on 256 rows.
Train MSE:  1.3062890500983122
Train Explained Variance Score:  0.0
Test MSE:  0.7313452076024265
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        67         0
Truth is -        19         0
Test accuracy:  0.7790697674418605
Train accuracy:  0.83984375

RESULTS FOR 1000 DAY MARK:

Trained on 156 rows.
Train MSE:  0.9895253552810069
Train Explained Variance Score:  0.0
Test MSE:  1.1408861930683114
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        44         0
Truth is -         9         0
T

## Bayesian Regression

In [103]:
bayes_regressor = sklearn.linear_model.BayesianRidge()

for mark in marks:
  train_model_over_mark(bayes_regressor, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 351 rows.
Train MSE:  0.8057543310762841
Train Explained Variance Score:  0.0
Test MSE:  0.545760000174857
Test Explained Variance Score:  -2.220446049250313e-16
Confusion:
             positive  negative
Truth is +        96         0
Truth is -        21         0
Test accuracy:  0.8205128205128205
Train accuracy:  0.8319088319088319

RESULTS FOR 500 DAY MARK:

Trained on 256 rows.
Train MSE:  1.2663802355803324
Train Explained Variance Score:  -2.220446049250313e-16
Test MSE:  0.8580406469531269
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        70         0
Truth is -        16         0
Test accuracy:  0.813953488372093
Train accuracy:  0.828125

RESULTS FOR 1000 DAY MARK:

Trained on 156 rows.
Train MSE:  1.007007963764927
Train Explained Variance Score:  0.0
Test MSE:  1.1104277122707051
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        41         0

## Decision Tree Regressor

In [107]:
import sklearn.tree

tree_model = sklearn.tree.DecisionTreeRegressor()
for mark in marks:
  train_model_over_mark(bayes_regressor, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 351 rows.
Train MSE:  0.8250030159895296
Train Explained Variance Score:  0.0
Test MSE:  0.4880139454351201
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        92         0
Truth is -        25         0
Test accuracy:  0.7863247863247863
Train accuracy:  0.8433048433048433

RESULTS FOR 500 DAY MARK:

Trained on 256 rows.
Train MSE:  1.2766169297266554
Train Explained Variance Score:  0.0
Test MSE:  0.8275686271687239
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        70         0
Truth is -        16         0
Test accuracy:  0.813953488372093
Train accuracy:  0.828125

RESULTS FOR 1000 DAY MARK:

Trained on 156 rows.
Train MSE:  1.048135944584364
Train Explained Variance Score:  0.0
Test MSE:  0.9893717687267019
Test Explained Variance Score:  2.220446049250313e-16
Confusion:
             positive  negative
Truth is +        46         0
Truth is -        