<a href="https://colab.research.google.com/github/amitsp21/vc_modeling/blob/master/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import the libraries we will be using
import numpy as np
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')
os.listdir('/content/gdrive/My Drive/vc_modeling')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


['data']

In [11]:
path_prefix = "/content/gdrive/My Drive/vc_modeling/data/"
feature_csv_filenames = ["organization_founders_features.csv", "timeline.csv"]
dfs = [pd.read_csv(path_prefix + x) for x in feature_csv_filenames]
df_names = [x[:-4] for x in feature_csv_filenames]
df_dict = dict(zip(df_names, dfs))
df_dict.keys()

dict_keys(['organization_founders_features', 'timeline'])

In [12]:
founders_df = df_dict["organization_founders_features"]
founders_df.head()

Unnamed: 0,org_uuid,founders_max_rank,founders_top_college,founders_max_degree_type_ordinal,founders_max_degree_count,founders_count
0,000014da-0c46-b9cb-0941-3a93c027b119,438498.0,0,2.0,1.0,4
1,00002470-bff7-6226-5800-0ca1b3787b6f,399048.0,1,2.0,2.0,2
2,00007c5c-9260-0dfb-c160-89a416f1a7cc,525715.0,0,,,2
3,0000b5c1-07dd-aeb6-2b17-eb3d64e652a6,147007.0,0,2.0,1.0,1
4,0000c0e1-eb00-9281-9a7f-63dfa277e8df,289919.0,0,,,1


In [57]:
timeline_df = df_dict["timeline"]
timeline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470732 entries, 0 to 470731
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   org_uuid       470732 non-null  object 
 1   date           470732 non-null  object 
 2   valuation      36215 non-null   float64
 3   event          470713 non-null  object 
 4   is_succeeding  470732 non-null  bool   
 5   has_succeeded  470732 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 18.4+ MB


In [109]:
timeline_df["has_succeeded"] = timeline_df["has_succeeded"].astype(int)
timeline_success_df = timeline_df[timeline_df["has_succeeded"] == 1].drop_duplicates(["org_uuid"])[["org_uuid", "has_succeeded"]]
print("num companies succeeded: " + str(len(timeline_success_df)))

founders_and_success_df = founders_df.join(timeline_success_df.set_index("org_uuid"), on=["org_uuid"])
founders_and_success_df["has_succeeded"] = founders_and_success_df["has_succeeded"].fillna(0)

founders_and_success_df["founders_max_degree_type_ordinal"] = founders_and_success_df["founders_max_degree_type_ordinal"].replace("-1", np.nan)
founders_and_success_df["founders_max_degree_count"] = founders_and_success_df["founders_max_degree_count"].replace("-1", np.nan)
founders_and_success_df["founders_count"] = founders_and_success_df["founders_count"].replace("-1", np.nan)

founders_and_success_df.head()

num companies succeeded: 700


Unnamed: 0,org_uuid,founders_max_rank,founders_top_college,founders_max_degree_type_ordinal,founders_max_degree_count,founders_count,has_succeeded
0,000014da-0c46-b9cb-0941-3a93c027b119,438498.0,0,2.0,1.0,4,0.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,399048.0,1,2.0,2.0,2,0.0
2,00007c5c-9260-0dfb-c160-89a416f1a7cc,525715.0,0,,,2,0.0
3,0000b5c1-07dd-aeb6-2b17-eb3d64e652a6,147007.0,0,2.0,1.0,1,0.0
4,0000c0e1-eb00-9281-9a7f-63dfa277e8df,289919.0,0,,,1,0.0


In [68]:
len(founders_and_success_df[founders_and_success_df["has_succeeded"]==0])

229408

In [125]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

modelConfigs = [
    {'label': 'Logistic Regression', 'model': LogisticRegression(C=1., solver='liblinear')},
    {'label': 'CART', 'model': DecisionTreeClassifier(criterion="entropy", min_samples_leaf=300, random_state=42)}
]

data = founders_and_success_df
Y = data['has_succeeded']
X = data.drop(['org_uuid', 'has_succeeded'], axis=1)

imp = imp.fit(X)
X = pd.DataFrame(imp.transform(X))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.50, random_state=422)

for config in modelConfigs:
    print("Testing " + config['label'] + " model...")
    model = config['model']
    X_train_imputed = imp.transform(X_train)
    model.fit(X_train_imputed, Y_train)

    # Get the probability of Y_test records being = 1
    X_test_imputed = imp.transform(X_test)
    Y_test_probability_1 = model.predict_proba(X_test_imputed)[:, 1]
    
    prediction = Y_test_probability_1 > 0.50

    # Build and print a confusion matrix
    confusion_matrix = metrics.confusion_matrix(Y_test, prediction, labels=[1, 0])
    confusion_matrix_large = pd.DataFrame(confusion_matrix.T, columns=['p', 'n'], index=['Y', 'N'])
    
    # Compute tp, fp, tn and fn to compute error
    tp = confusion_matrix[0][0]
    fp = confusion_matrix[0][1]
    tn = confusion_matrix[1][1]
    fn = confusion_matrix[1][0]
    error = (fp + fn) / (tp + fp + tn + fn)
    print(confusion_matrix_large)
    print("error: " + str(error))

    # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_test_probability_1)
    
    # Get the area under the curve (AUC)
    auc = np.mean(cross_val_score(model, X, Y, scoring="roc_auc", cv=5))

    # Calculate precision-recall
    precision, recall, _ = precision_recall_curve(Y_test, Y_test_probability_1)
    
    print("")

Testing Logistic Regression model...
     p      n
Y    0      0
N  143  99082
error: 0.0014411690602166793

Testing CART model...
     p      n
Y    0      0
N  143  99082
error: 0.0014411690602166793

