<a href="https://colab.research.google.com/github/siddtheshah/vc_modeling/blob/master/regressor_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
!pip install cityhash
import cityhash
import sklearn.decomposition

print(pd.__version__)

from copy import deepcopy

1.0.5


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read/Join Features

In [3]:
feature_folder = '/content/gdrive/My Drive/vc_modeling/feature_extraction'

sparse_category_features_array = scipy.sparse.load_npz(feature_folder + "/category_features/category_features_large.npz")
sparse_region_features_array = scipy.sparse.load_npz(feature_folder + "/region_features/region_features.npz")

In [4]:
# print(sparse_category_features_array)
## Other features here!! Remember to sparsify the dataframes if they're dense!

# print(sparse_category_features_array)

category_features_array = scipy.sparse.coo_matrix(sparse_category_features_array, dtype=np.uint64)
region_features_array = scipy.sparse.coo_matrix(sparse_region_features_array, dtype=np.uint64)

print(category_features_array.getnnz())
print(region_features_array.getnnz())

category_features_df = pd.DataFrame.sparse.from_spmatrix(category_features_array)
region_features_df = pd.DataFrame.sparse.from_spmatrix(region_features_array)

print(np.shape(category_features_df))
print(np.shape(region_features_df))

print("{}".format(category_features_df.iloc[0][0]))

2355537
1681588
(963967, 676)
(842699, 1032)
13685534557686295101


In [5]:
region_uuid = region_features_df.iloc[:, 0]
category_uuid = category_features_df.iloc[:, 0]
print(region_uuid)

print(np.count_nonzero(category_uuid.isin(region_uuid)))
check_value = 7551169957279540846
# check_value = cityhash.CityHash64('ffffabce-6d4a-b3d1-13c0-4e90cedf5270')
print(check_value)
print(np.size(region_uuid[region_uuid == check_value]))
print(np.size(category_uuid[category_uuid == check_value]))

0         13685534557686295101
1           764015621929367586
2         10846552445983457719
3          5087506707876194815
4          9094535307341385563
                  ...         
842694     4626564123199390189
842695     2978566027619600648
842696     1747954284855665241
842697     9744251496066876000
842698     7648380604111671063
Name: 0, Length: 842699, dtype: Sparse[uint64, 0]
808944
7551169957279540846
1
1


In [6]:
join_base = category_features_df.set_index(0)
join1 = region_features_df.set_index(0)
full_feature_join = join_base.join(join1, lsuffix='category_features', rsuffix='region_features')
full_feature_join = full_feature_join.dropna()
# features = category_features_df
print(full_feature_join)

                      1category_features  2category_features  ...  1030  1031
0                                                             ...            
13685534557686295101                 0.0                 0.0  ...   0.0   0.0
764015621929367586                   1.0                 1.0  ...   0.0   0.0
10846552445983457719                 0.0                 0.0  ...   0.0   0.0
5087506707876194815                  0.0                 0.0  ...   0.0   0.0
9094535307341385563                  0.0                 0.0  ...   0.0   0.0
...                                  ...                 ...  ...   ...   ...
16892862651199510638                 0.0                 0.0  ...   0.0   0.0
7229716827056183679                  0.0                 0.0  ...   0.0   0.0
4626564123199390189                  0.0                 0.0  ...   0.0   0.0
2978566027619600648                  0.0                 0.0  ...   0.0   0.0
9744251496066876000                  0.0                 0.0  ..

## Dimensionality Reduction

In [7]:
svd = sklearn.decomposition.TruncatedSVD(n_components=100, n_iter=10)
svd.fit(full_feature_join[:10000])
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

reduced_features = svd.transform(full_feature_join)

[0.06401376 0.04622242 0.03231303 0.02621636 0.02257431 0.01852799
 0.0186303  0.01779371 0.01739593 0.01666311 0.0150197  0.0143001
 0.01278896 0.01234873 0.0116726  0.01147952 0.01105916 0.00990296
 0.00961921 0.00929172 0.0087435  0.00855063 0.00827113 0.00810978
 0.00806891 0.00790611 0.00728912 0.00715424 0.00696381 0.00653401
 0.00640232 0.00630214 0.00614893 0.00613491 0.00598965 0.00591875
 0.00553692 0.00539549 0.00524816 0.00511368 0.00500888 0.00478776
 0.00469004 0.00467389 0.00455159 0.00444006 0.00443729 0.0043122
 0.00424332 0.00412112 0.00400106 0.00395953 0.00391999 0.00388471
 0.00378563 0.0036996  0.00358337 0.00350389 0.00342093 0.00339636
 0.00332248 0.00331422 0.00320684 0.0031823  0.00314515 0.00306975
 0.00302897 0.00298353 0.00294386 0.00289755 0.00286698 0.00282751
 0.00278782 0.0027293  0.00270976 0.00271416 0.00267104 0.00264891
 0.00263675 0.00259499 0.00255405 0.00254637 0.00251334 0.00248847
 0.00248535 0.0024536  0.00241769 0.00238761 0.00237204 0.002325

In [48]:
print(reduced_features)

sparse_data = pd.DataFrame(data=reduced_features, index=full_feature_join.index, columns=range(np.shape(reduced_features)[1])) * 100
print(sparse_data*100)

[[ 2.13267225e-02  9.55330856e-02  2.59727273e-01 ... -2.36151409e-04
   4.54597035e-03 -4.69870046e-03]
 [ 1.23233228e+00 -8.14231652e-02  3.90605807e-03 ...  1.98890213e-02
  -2.38637775e-02  3.95002210e-02]
 [ 9.28904313e-02  9.56332376e-02  2.86148240e-01 ... -1.07869175e-02
   6.39112267e-03  1.83483506e-02]
 ...
 [ 4.97493302e-04  1.59515213e-03  1.81023944e-03 ... -9.59748019e-03
   3.92670295e-03 -1.38205766e-03]
 [ 9.58401906e-01 -2.30705297e-01 -1.30523489e-01 ...  9.49070769e-02
  -7.56302242e-02 -3.66692276e-02]
 [ 6.43466070e-03  2.86405002e-03  5.11493682e-03 ...  3.28887361e-01
   2.94411529e-01 -1.22057421e-01]]
                                0            1   ...           98           99
0                                                ...                          
13685534557686295101    213.267225   955.330856  ...    45.459704   -46.987005
764015621929367586    12323.322800  -814.231652  ...  -238.637775   395.002210
10846552445983457719    928.904313   956.332376 

# Read Regression Targets

In [19]:
target_folder = '/content/gdrive/My Drive/vc_modeling/regression_targets/'
marks = [200, 500, 1000, 2000]

regression_marks = {}
for mark in marks:
  regression_marks[mark] = pd.read_csv(target_folder + str(mark) + '.csv')[['hash', 'initial_valuation', 'log_valuation_factor']].set_index('hash')


In [16]:
mark_data = regression_marks[200] # pd.read_pickle("/content/gdrive/My Drive/vc_modeling/regression_targets/200.pkl")
# mark_data = mark_data[mark_data['log_valuation_factor'] > 0]
print(mark_data)
print(category_features_df)
print(np.count_nonzero(mark_data.index.isin(region_features_df.index)))

                      initial_valuation  log_valuation_factor
hash                                                         
13360469805707984821          3000000.0              0.000000
7551169957279540846          45000000.0              0.266595
17638643441008354186          2280520.0              0.000000
14753292511968607343          3000000.0              0.000000
18303053280205650499         20400150.0              0.000000
...                                 ...                   ...
1328871151505778773            600000.0              0.183442
16862606919743425243           162360.0              0.000000
14711304329054892014         65500000.0              0.344393
17393885764651115266         20000000.0             -0.189295
12723450708549610702            75000.0              0.000000

[49489 rows x 2 columns]
                         0    1    2    3    4    ...  671  672  673  674  675
0       13685534557686295101  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0
1         

# Train Model

In [37]:
# Train Models
import sklearn.metrics

def regression_analysis(model, train_data, train_values, test_data, test_values):
    predicted_train_values = model.predict(train_data)
    predicted_test_values = model.predict(test_data)

    print("Sample values: ", predicted_test_values[:5], test_values[:5])

    train_mse = sklearn.metrics.mean_squared_error(train_values, predicted_train_values)
    test_mse = sklearn.metrics.mean_squared_error(test_values, predicted_test_values)
    train_explained_variance = sklearn.metrics.explained_variance_score(train_values, predicted_train_values)
    test_explained_variance = sklearn.metrics.explained_variance_score(test_values, predicted_test_values)

    print("Train MSE: ", train_mse)
    print("Train Explained Variance Score: ", train_explained_variance)
    print("Test MSE: ", test_mse)
    print("Test Explained Variance Score: ", test_explained_variance)

    return model

def classification_analysis(model, train_data, train_values, test_data, test_values):
    train_values_predicted = model.predict(train_data)
    train_prediction = train_values_predicted > 0.00001
    train_prediction = train_prediction.astype(np.int32)

    test_values_predicted = model.predict(test_data)
    test_prediction = test_values_predicted > 0.00001
    test_prediction = test_prediction.astype(np.int32)

    train_labels = train_values > 0.00001
    train_labels = train_labels.astype(np.int32)

    test_labels = test_values > 0.00001
    test_labels = test_labels.astype(np.int32)

    confusion_matrix_large = pd.DataFrame(sklearn.metrics.confusion_matrix(test_labels, test_prediction, labels=[1, 0]),
                                    columns=['positive', 'negative'], index=['Truth is +', 'Truth is -'])
    print("Confusion:\n", confusion_matrix_large)
    test_acc = sum(test_labels==test_prediction)/len(test_labels)
    print("Test accuracy: ", test_acc)
    train_acc = sum(train_labels==train_prediction)/len(train_labels)
    print("Train accuracy: ", train_acc)


    # # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    # fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_labels, test_probabilities)

    # # Get the area under the curve (AUC)
    # auc = np.mean(cross_val_score(model, test_data, test_labels, scoring="roc_auc", cv=5))
    # print("AUC = " , str(round(auc, 2)))

    # # Plot the ROC curve

    # plt.xlabel("False positive rate (fpr)")
    # plt.ylabel("True positive rate (tpr)")
    # plt.plot(fpr, tpr, label='model')
    # plt.plot([0, 1], [0, 1], color='k', label="random")
    # plt.legend(loc='best')

    # plt.figure()
    # plt.xlabel("Recall")
    # plt.ylabel("Precision")
    # precision, recall, _ = sklearn.metrics.precision_recall_curve(test_labels, test_probabilities)
    # plt.plot(recall, precision)

def train_model_over_mark(model, input_data, mark, filter_unknown=True):
  print("\nRESULTS FOR", str(mark), "DAY MARK:\n")
  # print(features)
  mark_data = regression_marks[mark]
  # print(mark_data)
  if filter_unknown:
    mark_data = mark_data[mark_data['log_valuation_factor'] != 0]
  # Select the data that we have regression targets for

  data = input_data[input_data.index.isin(mark_data.index)]

  # Select the column with log_valuation_factor.
  values = mark_data[mark_data.index.isin(input_data.index)]['log_valuation_factor']

  data = scipy.sparse.coo_matrix(data.values)

  train_data, test_data, train_values, test_values = sklearn.model_selection.train_test_split(data, values, test_size=0.25)
  print("Trained on", str(np.shape(train_data)[0]), "rows.")
  
  model.fit(train_data, train_values)
  regression_analysis(lasso_model, train_data, train_values, test_data, test_values)
  classification_analysis(lasso_model, train_data, train_values, test_data, test_values)

## LASSO

In [None]:
import sklearn.linear_model

lasso_model = sklearn.linear_model.LassoCV()
for mark in marks:
  train_model_over_mark(lasso_model, sparse_data, mark, filter_unknown=False)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.


## Decision Tree Regressor

In [39]:
import sklearn.tree

tree_model = sklearn.tree.DecisionTreeRegressor()
for mark in marks:
  train_model_over_mark(tree_model, sparse_data, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
5229763523100805031    0.515780
3094613846486599977    0.050225
3937258757194268108    0.247288
5988842577345096839   -0.631386
337764115501710783    -0.480854
Name: log_valuation_factor, dtype: float64
Train MSE:  1.3996318523092568
Train Explained Variance Score:  0.0
Test MSE:  1.412777818750263
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +      3192         0
Truth is -      1308         0
Test accuracy:  0.7093333333333334
Train accuracy:  0.7162542598903541

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
2944595664942598160    -0.738131
1045848997689399098     0.954062
9470584322935010952     1.568535
11092637014621205076    1.414427
8172313266675299117     1.417220
Name: log_valuation_factor, dtype: float64
Train MSE:  1.85448981

## SGD

In [42]:
import sklearn.linear_model

sgd = sklearn.linear_model.SGDRegressor()

for mark in marks:
  train_model_over_mark(sgd, sparse_data, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
17278859060768167080    0.810735
11968636648591427098   -0.154518
11488947952176063535    0.152253
4039903851674016217     0.158555
15135450892273648273    0.176724
Name: log_valuation_factor, dtype: float64
Train MSE:  1.430981398475113
Train Explained Variance Score:  1.1102230246251565e-16
Test MSE:  1.3187431133843237
Test Explained Variance Score:  0.0
Confusion:
             positive  negative
Truth is +      3221         0
Truth is -      1279         0
Test accuracy:  0.7157777777777777
Train accuracy:  0.7141057934508817

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
8411078950694427371     0.849025
13142405533271192037    0.778114
12542550360124116734    0.587007
1039234271495430370     0.210602
11243690867264535754   -1.007667
Name: log_valuation_factor, dtype: float6

# SVR

In [44]:
import sklearn.svm

for mark in marks:
  svm = sklearn.svm.SVR()
  train_model_over_mark(svm, sparse_data, mark, filter_unknown=True)


RESULTS FOR 200 DAY MARK:

Trained on 13498 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
7605692520027625648     1.791759
2220049177037197525    -0.304924
18279224463993360850    0.124412
10247127945768244428    0.341429
3242767193354031088     0.777820
Name: log_valuation_factor, dtype: float64
Train MSE:  1.4096404167477183
Train Explained Variance Score:  0.0
Test MSE:  1.3827565736857397
Test Explained Variance Score:  2.220446049250313e-16
Confusion:
             positive  negative
Truth is +      3183         0
Truth is -      1317         0
Test accuracy:  0.7073333333333334
Train accuracy:  0.716921025337087

RESULTS FOR 500 DAY MARK:

Trained on 10535 rows.
Sample values:  [1.05813049 1.05813049 1.05813049 1.05813049 1.05813049] hash
13765974736818171632    0.916291
16458525995278093477    1.832581
11970094808217328200   -0.415780
8306724763510158031     0.415741
7138478480837883586     1.059293
Name: log_valuation_factor, dtype: float64