# Model Building

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# SK learn stuff
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [63]:
# Load the finished dataset from disk, then discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export -- TODO confirm).
dataset = pd.read_csv('finished_data/finished_data.csv')
dataset = dataset.drop('Unnamed: 0', axis=1)

# Show the column types as the cell output
dataset.dtypes

success                                int64
num_degs_finished_by_curr_ceo        float64
max_number_founded_by_one_founder    float64
avg_num_degs_finished_by_founders      int64
at_least_one_veteran_founder           int64
years_between_degree_founding          int64
years_between_first_curr_founding      int64
Advertising                            int64
Apps                                   int64
Commerce and Shopping                  int64
Community and Lifestyle                int64
Consumer Electronics                   int64
Content and Publishing                 int64
Data and Analytics                     int64
Design                                 int64
Education                              int64
Financial Services                     int64
Hardware                               int64
Health Care                            int64
Information Technology                 int64
Internet Services                      int64
Manufacturing                          int64
Media and 

## Final data preprocessing

In [64]:
# Class balance of the target column.
# `success` is the column sum (the number of 1s if the target is 0/1 coded);
# `failure` is every remaining non-null observation.
target = dataset['success']
n_observations = target.count()
success = target.sum()
failure = n_observations - success

# Share of each class, in percent
success_pct = success / n_observations * 100
failure_pct = failure / n_observations * 100

print(f'We have a dataset of {success + failure} observations')
print(f'Out of that number, we have {success} successful companies which accounts for a {success_pct:.2f}% of all observations')
print(f'Out of that number, we have {failure} failed companies which accounts for a {failure_pct:.2f}% of all observations')

We have a dataset of 67334 observations
Out of that number, we have 8443 successful companies which accounts for a 12.54% of all observations
Out of that number, we have 58891 failed companies which accounts for a 87.46% of all observations


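As the counts above show, there is a massive class imbalance: only about 12.5% of the companies are successful.

**TODO (open question):** how should we deal with the imbalance?

- Bootstrapping (resampling the training data)?
- Not bootstrapping?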

In [65]:
# Separate the target from the predictors
y = dataset['success']
X = dataset.drop('success', axis=1)

# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=109,
    stratify=y,
)

In [66]:
# Report the shape of every piece of the train/test split
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'The size of {label} dataset is {part.shape}')

The size of X_train dataset is (53867, 30)
The size of y_train dataset is (53867,)
The size of X_test dataset is (13467, 30)
The size of y_test dataset is (13467,)


In [None]:
# Metrics reported for every model; the names are passed as `scoring=` to
# scikit-learn's cross-validation helpers further down.
scores = [
    'accuracy',
    'f1',
    'precision',
    'recall',
]

## Majority classifier

In [67]:
# Train model

In [68]:
# Compute accuracy, precision, recall, F1

## Logistic regression

In [69]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [79]:
# Initial results: unpenalised logistic regression as a baseline.
# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so re-running the cell reproduces the same scores.
# NOTE(review): penalty='none' is the spelling older scikit-learn releases
# expect; newer releases use penalty=None -- confirm against the installed version.
logit = LogisticRegression(
    penalty='none',
    max_iter=100*10000,
    solver='saga',
    random_state=109,
)

# 10-fold cross validation on the standardised training data, one result
# array per metric listed in `scores`
results = cross_validate(logit, X=X_train_scaled, y=y_train, scoring=scores, cv=10)

# Mean of each metric over the folds. Iterating over `scores` itself (rather
# than a hard-coded range) keeps this loop correct if the metric list changes.
d_logit_initial = {}

for score in scores:
    d_logit_initial[score] = np.mean(results['test_' + score])
    print(f'(Initial 10-fold cross-validation) {score} = {d_logit_initial[score]:.4f}')

(Initial 10-fold cross-validation) accuracy = 0.8738
(Initial 10-fold cross-validation) f1 = 0.0153
(Initial 10-fold cross-validation) precision = 0.3554
(Initial 10-fold cross-validation) recall = 0.0078


In [None]:
# Hyperparameter tuning

# Get hyperparameter options.
# Two sub-grids are used because C has no effect when there is no penalty:
# crossing 'none' with every C value would fit the same model nine times.
# NOTE(review): penalty='none' is the spelling older scikit-learn releases
# expect; newer releases use penalty=None -- confirm against the installed version.
parameters = [
    {'penalty': ['none']},
    {
        'penalty': ['l1', 'l2'],
        'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 10, 50, 100, 1000],
    },
]

# Initialize model; random_state pins the shuffling done by the 'saga' solver
logit = LogisticRegression(max_iter=100*10000, solver='saga', random_state=109)

# Use gridsearch to determine the best model.
# refit=False: several metrics are scored and none is singled out for
# refitting, so results are read from model_GSCV.cv_results_.
model_GSCV = GridSearchCV(logit, parameters, scoring=scores, cv=10, refit=False)

# Fit on the standardised features, consistent with the baseline
# cross-validation of the unpenalised model.
# NOTE(review): the scaler was fitted on the whole training split, so each
# fold's validation rows contributed to the scaling statistics; wrap the
# scaler and the model in a Pipeline if that leakage matters.
model_GSCV.fit(X_train_scaled, y_train)

## Feed-forward neural network

## XGBoost (extreme gradient boosting)