# Model Building

In [1]:
# Data handling stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn stuff
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Gradient boosting
from xgboost import XGBClassifier

# Mute warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# First import the data
dataset =  pd.read_csv('finished_data/finished_data.csv').drop(columns=['Unnamed: 0'])
dataset.dtypes

success                                int64
num_degs_finished_by_curr_ceo        float64
max_number_founded_by_one_founder    float64
avg_num_degs_finished_by_founders      int64
at_least_one_veteran_founder           int64
years_between_degree_founding          int64
years_between_first_curr_founding      int64
Advertising                            int64
Apps                                   int64
Commerce and Shopping                  int64
Community and Lifestyle                int64
Consumer Electronics                   int64
Content and Publishing                 int64
Data and Analytics                     int64
Design                                 int64
Education                              int64
Financial Services                     int64
Hardware                               int64
Health Care                            int64
Information Technology                 int64
Internet Services                      int64
Manufacturing                          int64
Media and 

## Final data preprocessing

In [3]:
success = dataset['success'].sum()
failure = dataset['success'].count() - success

print(f'We have a dataset of {success + failure} observations')
print(f'We have a dataset of {len(dataset.columns)} predictors')
print(f'Out of that number, we have {success} successful companies which accounts for a {success/(success + failure)*100:.2f}% of all observations')
print(f'Out of that number, we have {failure} failed companies which accounts for a {failure/(success + failure)*100:.2f}% of all observations')

We have a dataset of 67334 observations
We have a dataset of 31 predictors
Out of that number, we have 8443 successful companies which accounts for a 12.54% of all observations
Out of that number, we have 58891 failed companies which accounts for a 87.46% of all observations


In [4]:
X = dataset.drop(columns=['success'])
y = dataset['success']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=109,stratify=y)

In [5]:
print(f'The size of X_train dataset is {X_train.shape}')
print(f'The size of y_train dataset is {y_train.shape}')
print(f'The size of X_test dataset is {X_test.shape}')
print(f'The size of y_test dataset is {y_test.shape}')

The size of X_train dataset is (53867, 30)
The size of y_train dataset is (53867,)
The size of X_test dataset is (13467, 30)
The size of y_test dataset is (13467,)


In [6]:
# Relevant scores
scores = ['accuracy','f1','precision','recall']

## Majority classifier

In [7]:
# Definde majority classifier model
class majority_classifier():
    
    def train(self,X,y):
        self.pred = np.bincount(y).argmax()
    
    def predict(self,X):
        n = len(X)
        return np.array(n * [self.pred])

In [8]:
# Train model
majority = majority_classifier()
majority.train(X_train,y_train)

# Get predictions
y_pred = majority.predict(X_test)

In [9]:
# Compute accuracy, precision, recall, F1
print(f'Accuracy score = {accuracy_score(y_test,y_pred):.4f}')
print(f'F1 score = {f1_score(y_test,y_pred):.4f}')
print(f'Precision score = {precision_score(y_test,y_pred):.4f}')
print(f'Recall score = {recall_score(y_test,y_pred):.4f}')

Accuracy score = 0.8746
F1 score = 0.0000
Precision score = 0.0000
Recall score = 0.0000


## Logistic regression

In [10]:
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Initial results
logit = LogisticRegression(penalty='none',max_iter=100*10000,solver='saga')

# Cross validation
results = cross_validate(logit,X=X_train_scaled,y=y_train,scoring=scores,cv=10)

d_logit_initial = {}

for i in range(4):
    score = scores[i]
    d_logit_initial[score] = np.mean(results['test_' + score])
    print(f'(Initial 10-fold cross-validation) {score} = {d_logit_initial[score]:.4f}')

(Initial 10-fold cross-validation) accuracy = 0.8738
(Initial 10-fold cross-validation) f1 = 0.0153
(Initial 10-fold cross-validation) precision = 0.3554
(Initial 10-fold cross-validation) recall = 0.0078


In [35]:
# Hyperparameter tunning

# Get hyperparameter options
parameters = {
    'penalty':['l1', 'l2'], 
    'C':[0.01, 0.1, 0.5, 10, 50, 100]
}

# Initialize model
logit = LogisticRegression(max_iter=100*10000,solver='saga')

# Use gridsearch to determine the best model
model_GSCV_logit = GridSearchCV(logit,parameters,scoring=scores,cv=5,refit='f1')

# fit
model_GSCV_logit.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=1000000, solver='saga'),
             param_grid={'C': [0.01, 0.1, 0.5, 10, 50, 100],
                         'penalty': ['l1', 'l2']},
             refit='f1', scoring=['accuracy', 'f1', 'precision', 'recall'])

In [36]:
# Print best parameters
best_params = model_GSCV_logit.best_params_

for key, value in best_params.items():
    print(f'{key} =  {value}')

C =  10
penalty =  l1


In [37]:
# Get best model
best_logit = model_GSCV_logit.best_estimator_

# Train model on full train set
best_logit.fit(X_train,y_train)

# Predict on test set
y_pred = best_logit.predict(X_test)

# Compute accuracy, precision, recall, F1
print(f'(GridSearchCV-tuned model) Accuracy score = {accuracy_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) F1 score = {f1_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) Precision score = {precision_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) Recall score = {recall_score(y_test,y_pred):.4f}')

(GridSearchCV-tuned model) Accuracy score = 0.8736
(GridSearchCV-tuned model) F1 score = 0.0218
(GridSearchCV-tuned model) Precision score = 0.3725
(GridSearchCV-tuned model) Recall score = 0.0112


## XGradientBoost

In [38]:
# Initial results
XGB = XGBClassifier(eval_metric='error')

# Cross validation
results = cross_validate(XGB,X=X_train,y=y_train,scoring=scores,cv=10)

d_XGB_initial = {}

for i in range(4):
    score = scores[i]
    d_XGB_initial[score] = np.mean(results['test_' +score])
    print(f'(Initial 10-fold cross-validation) {score} = {d_XGB_initial[score]:.4f}')

(Initial 10-fold cross-validation) accuracy = 0.8722
(Initial 10-fold cross-validation) f1 = 0.0512
(Initial 10-fold cross-validation) precision = 0.3721
(Initial 10-fold cross-validation) recall = 0.0275


In [None]:
# Hyperparameter tunning

# Get hyperparameter options
parameters = {
    'n_estimators' : [100, 250, 500, 750, 1000], 
    'max_depth' : [3, 5, 7, 10, 12, 15, 17, 20, 25],
    'learning_rate' : [0.05, 0.1, 0.15, 0.2, 0.25, 0.30],
    'gamma' : [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight' : [1, 3, 5, 7],
    'colsample_bytree' : [0.3, 0.4, 0.5, 0.7],
}

# Initialize model
XGB = XGBClassifier(eval_metric='error')

# Use gridsearch to determine the best model
model_GSCV_XGboost = GridSearchCV(XGB,parameters,scoring=scores,cv=5,refit='f1')

# fit
model_GSCV_XGboost.fit(X_train,y_train)

In [None]:
# Print best parameters
best_params = model_GSCV_XGboost.best_params_

for key, value in best_params.items():
    print(f'{key} =  {value}')

In [None]:
# Get best model
best_XGboost = model_GSCV_XGboost.best_estimator_

# Train model on full train set
best_XGboost.fit(X_train,y_train)

# Predict on test set
y_pred = best_XGboost.predict(X_test)

# Compute accuracy, precision, recall, F1
print(f'(GridSearchCV-tuned model) Accuracy score = {accuracy_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) F1 score = {f1_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) Precision score = {precision_score(y_test,y_pred):.4f}')
print(f'(GridSearchCV-tuned model) Recall score = {recall_score(y_test,y_pred):.4f}')