<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash

print(pd.__version__)

from copy import deepcopy

1.0.5


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read/Join Features

In [5]:
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

sparse_category_features_array = scipy.sparse.load_npz(feature_folder + "/category_features/category_features.npz")
sparse_region_features_array = scipy.sparse.load_npz(feature_folder + "/region_features/region_features.npz")

In [93]:
# print(sparse_category_features_array)
## Other features here!! Remember to sparsify the dataframes if they're dense!

# print(sparse_category_features_array)

category_features_array = scipy.sparse.coo_matrix(sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(sparse_region_features_array, dtype=np.uint64)

category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

print(np.shape(category_features_df))
print(np.shape(region_features_df))

print("{}".format(category_features_df.iloc[0][0]))

(808944, 563)
(842699, 1032)
13685534557686294528


In [101]:
region_uuid = region_features_df.iloc[:, 0]
category_uuid = category_features_df.iloc[:, 0]

check_value = 14682255478983132503
print(np.size(region_uuid[region_uuid == 14682255478983132503]))
print(np.size(category_uuid[category_uuid == 14682255478983132503]))

0
0


In [95]:
category_features_df = category_features_df.set_index(0)
region_features_df = region_features_df.set_index(0)
# features = category_features_df.join(region_features_df, lsuffix='category_features', rsuffix='region_features')
features = category_features_df
print(features)

                      1    2    3    4    5    ...  558  559  560  561  562
0                                              ...                         
13685534557686294528  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
764015621929367552    1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
10846552445983457280  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
5087506707876194304   0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
9094535307341385728   0.0  0.0  1.0  1.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
...                   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
16892862651199510528  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
7229716827056183296   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
4626564123199389696   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
2978566027619600896   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
9744251496066875392   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0

[808944 row

# Read Regression Targets

In [96]:
target_folder = '/content/gdrive/My Drive/vc_modeling/regression_targets/'
marks = [200, 500, 1000, 2000]

regression_marks = {}
for mark in marks:
  regression_marks[mark] = pd.read_csv(target_folder + str(mark) + '.csv')[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')


In [97]:
mark_data = regression_marks[200]
mark_data = mark_data[mark_data['log_valuation_factor'] > 0]
print(mark_data)

                      initial_valuation  log_valuation_factor
hash                                                         
8743392861447547211            780000.0              0.614129
5641923866167708170           1300000.0              0.113837
5642299400592450831         429202970.0              0.041085
2557776506273453508           1272000.0              0.166351
17192933489252216187          6787424.0              0.846963
...                                 ...                   ...
1387591431561760765          32599399.0              0.351037
4922286768296865869         667000000.0              0.258937
4854494078884211674           2011524.0              1.198132
9154595940417423268          52000000.0              0.328775
1883867561978293910         300000000.0              0.065673

[388 rows x 2 columns]


# Train Model

In [98]:
# Train Models
import sklearn.metrics

def regression_analysis(model, train_data, train_values, test_data, test_values):
    predicted_train_values = model.predict(train_data)
    predicted_test_values = model.predict(test_data)

    train_mse = sklearn.metrics.mean_squared_error(train_values, predicted_train_values)
    test_mse = sklearn.metrics.mean_squared_error(test_values, predicted_test_values)
    train_explained_variance = sklearn.metrics.explained_variance_score(train_values, predicted_train_values)
    test_explained_variance = sklearn.metrics.explained_variance_score(test_values, predicted_test_values)

    print("Train MSE: ", train_mse)
    print("Train Explained Variance Score: ", train_explained_variance)
    print("Test MSE: ", test_mse)
    print("Test Explained Variance Score: ", test_explained_variance)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    train_values_predicted = model.predict(train_data)
    train_prediction = train_values_predicted > 0.00001
    train_prediction = train_prediction.astype(np.int32)

    test_values_predicted = model.predict(test_data)
    test_prediction = test_values_predicted > 0.00001
    test_prediction = test_prediction.astype(np.int32)

    train_labels = train_values > 0.00001
    train_labels = train_labels.astype(np.int32)

    test_labels = test_values > 0.00001
    test_labels = test_labels.astype(np.int32)

    confusion_matrix_large = pd.DataFrame(sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0]),
                                    columns=['positive', 'negative'], index=['Truth is +', 'Truth is -'])
    print("Confusion:\n", confusion_matrix_large)
    test_acc = sum(test_labels==test_prediction)/len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = sum(train_labels==train_prediction)/len(train_labels)
    print("Train accuracy: ", train_acc)


    # # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    # fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_labels, test_probabilities)

    # # Get the area under the curve (AUC)
    # auc = np.mean(cross_val_score(model, test_data, test_labels, scoring="roc_auc", cv=5))
    # print("AUC = " , str(round(auc, 2)))

    # # Plot the ROC curve

    # plt.xlabel("False positive rate (fpr)")
    # plt.ylabel("True positive rate (tpr)")
    # plt.plot(fpr, tpr, label='model')
    # plt.plot([0, 1], [0, 1], color='k', label="random")
    # plt.legend(loc='best')

    # plt.figure()
    # plt.xlabel("Recall")
    # plt.ylabel("Precision")
    # precision, recall, _ = sklearn.metrics.precision_recall_curve(test_labels, test_probabilities)
    # plt.plot(recall, precision)

def train_model_over_mark(model, mark, filter_unknown=True):
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  # print(features)
  mark_data = regression_marks[mark]
  print(mark_data)
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]
  sparse_data = features
  # Select the data that we have regression targets for
  data = sparse_data[sparse_data.index.isin(mark_data.index)]

  print(np.shape(data))
  # Select the column with log_valuation_factor.
  values = mark_data['log_valuation_factor']

  print(np.shape(values))
  data = scipy.sparse.coo_matrix(data.values)

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(data, values, test_size=0.25)
  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  
  model.fit(train_data, train_values)
  regression_analysis(lasso_model, train_data, train_values, test_data, test_values)
  classification_analysis(lasso_model, train_data, train_values, test_data, test_values)

## LASSO

In [102]:
import sklearn.linear_model

lasso_model = sklearn.linear_model.Lasso()
for mark in marks:
  train_model_over_mark(lasso_model, mark, filter_unknown=False)


RESULTS FOR 200 DAY MARK:

                      initial_valuation  log_valuation_factor
hash                                                         
1866024166243989391        7.000000e+09              0.000000
4186118337114422631        2.458696e+06              0.000000
14682255478983132503       1.270000e+08              0.000000
14016798683532967688       3.317074e+07              0.000000
11319552642808472166       9.040640e+05              0.000000
...                                 ...                   ...
12793274349240043824       9.000000e+06              0.000000
1883867561978293910        3.000000e+08              0.065673
12378300792955007516       2.000000e+08              0.000000
11714718557605488698       2.900000e+08              0.000000
12980928961936013303       8.600000e+08              0.000000

[3464 rows x 2 columns]
(12, 562)
(3464,)


ValueError: ignored

## Bayesian Regression

In [9]:
bayes_regressor = sklearn.linear_model.BayesianRidge()

for mark in marks:
  train_model_over_mark(bayes_regressor, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 351 rows.
Train MSE:  0.8898744902346791
Train Explained Variance Score:  0.0
Test MSE:  0.6531589837434656
Test Explained Variance Score:  1.1102230246251565e-16
Confusion:
             positive  negative
Truth is +       100         0
Truth is -        17         0
Test accuracy:  0.8547008547008547
Train accuracy:  0.8205128205128205

RESULTS FOR 500 DAY MARK:

Trained on 256 rows.
Train MSE:  1.3139963636506837
Train Explained Variance Score:  2.220446049250313e-16
Test MSE:  0.8272872420156947
Test Explained Variance Score:  1.1102230246251565e-16
Confusion:
             positive  negative
Truth is +        70         0
Truth is -        16         0
Test accuracy:  0.813953488372093
Train accuracy:  0.828125

RESULTS FOR 1000 DAY MARK:

Trained on 156 rows.
Train MSE:  1.1254649385050608
Train Explained Variance Score:  0.0
Test MSE:  0.7394396724878717
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +

## Decision Tree Regressor

In [10]:
import sklearn.tree

tree_model = sklearn.tree.DecisionTreeRegressor()
for mark in marks:
  train_model_over_mark(bayes_regressor, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 351 rows.
Train MSE:  0.8618496284597087
Train Explained Variance Score:  2.220446049250313e-16
Test MSE:  0.7372335690683764
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +       101         0
Truth is -        16         0
Test accuracy:  0.8632478632478633
Train accuracy:  0.8176638176638177

RESULTS FOR 500 DAY MARK:

Trained on 256 rows.
Train MSE:  1.3552654602220633
Train Explained Variance Score:  0.0
Test MSE:  0.7044396987334486
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +        71         0
Truth is -        15         0
Test accuracy:  0.8255813953488372
Train accuracy:  0.82421875

RESULTS FOR 1000 DAY MARK:

Trained on 156 rows.
Train MSE:  1.1553940992414502
Train Explained Variance Score:  0.0
Test MSE:  0.651346293716612
Test Explained Variance Score:  2.220446049250313e-16
Confusion:
             positive  negative
Truth is +        49        