Step 4: Modeling

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
#ignore warning messages to ensure clean outputs
import warnings
warnings.filterwarnings('ignore')

Applying the Machine Learning models:
This is a classification problem, in supervised learning. Here we have used the following classification models:
   
   - Random Forest
   - XGBoost
   - Logistic Regression
  
    
    
    

In [3]:
df = pd.read_pickle(r'C:\Users\User\Car_Accidents_preprocessed.pkl')
#df.head()

Here we split the data between Train and Test (note we do not need to standardize our data since we are using strictly categorical data in this scenario)

In [4]:
#print(df.columns[10:])
#print(df.columns[:10])

In [5]:
from sklearn.model_selection import train_test_split

X= df.drop(columns= df.columns[:10])
y = df['DEATH OCCURED']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Random Forest:

    Pros:
        - Works well with categorical data (transformation is not needed)
        - Works well with a high number of features since they are randomly chosen from
        - We will be able to see the feature importance
    Cons:
        - Has "black box" effect where much of what goes on in the model cannot be controlled
        
Potential Hyperperameters to tune:
    n_estimators
    max_features
    max_depth
    min_samples_split
    bootstrap

In [6]:
from sklearn.ensemble import RandomForestClassifier

# we bootstrap, use entropy and use 50 n_estimators just as starting points for this model

rf = RandomForestClassifier(bootstrap=True,n_estimators=50,criterion='entropy', random_state =1)
rf.fit(X_train, y_train)

#Predict using the model:

y_predict_rf = rf.predict(X_test)



In [7]:
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)

[[537030    130]
 [   674      8]]
0.9985051371964258


In [None]:
from sklearn.model_selection import cross_val_score

cv_scores_test= cross_val_score(rf,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=5,scoring='roc_auc')
print(cv_scores_test)
cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

XGBoost:

    Pros:
    - Handles missing values well
    - handles large data sets well
    - fast to interpret and good execution speed
    Cons:
    - overfitting is possible if hyperparameters are not tuned correctly 
    - many hyperparameters can complicate things
    
Potential hyperparameters to tune: n_estimators, max_depth, learning_rate, n_jobs, min_child_weight, eval metric


In [8]:
xgbModel = XGBClassifier(n_estimators=2, objective= 'binary:logistic', eval_metric= 'error', random_state=1)
xgbModel.fit(X_train, y_train)
y_predict_xgb = xgbModel.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_predict_xgb)
print(cnf_matrix)
Accuracy_xgb=xgbModel.score(X_test,y_test)
print(Accuracy_xgb)

[[537160      0]
 [   682      0]]
0.9987319696118935


Logistic Regression

    Pros:
    -Easy to implement
    -efficient in train time
    Cons:
    - Prone to overfitting with high dimensionality 
    - has difficulty capturing complex relationships
    - Does not work well with many features
Potential Hyperparameters to tune: C

In [9]:
Logreg = LogisticRegression(penalty = 'l2', C = .1,random_state = 40)
Logreg.fit(X_train,y_train)
y_pred_lr = Logreg.predict(X_test)
    

In [10]:
from sklearn.metrics import confusion_matrix

cnf_matrix= confusion_matrix(y_test,y_pred_lr)
print(cnf_matrix)
Accuracy_lr=Logreg.score(X_test,y_test)

print(Accuracy_lr)

[[537160      0]
 [   682      0]]
0.9987319696118935


Comparison of Training Models
We applied 3 different ML models and will evaluate their performance in terms of ROC AUC score. 

In [11]:
myLabels = ['Random Forest','XGBoost','Logistic Regression']
Accuracy_score = [Accuracy_rf, Accuracy_xgb, Accuracy_lr]

score_tab = pd.DataFrame(list(zip(myLabels, Accuracy_score)), 
               columns =['Algorithm', 'Model accuracy score']) 


Applying RandomSearchCV for hyperparameter tuning 

Random Forest hyperparameter Tuning 

In [None]:
rf = RandomForestClassifier(max_features='auto',random_state=1, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2,8,16], "n_estimators": [10,50,100,500]}

rs = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

rs = rs.fit(X_train, y_train)

print(rs.best_estimator_) 
print(rs.best_score_)

Fiiting Random Forest with optimal Hyperparameters

In [None]:
rf = # the hyperparameters from the test
rf.fit(X_train, y_train)
y_predictions_rf = rf.predict(X_test)
acc = accuracy_score(y_test, y_predictions)
cv = cross_val_score(rf, X_test, y_test,scoring='roc_auc').mean()
print(acc)
print(cv)


Parameter Tuning for XBgoost

In [None]:
param_grid = {'max_depth': [3,4,5,6,7], 'learning_rate':[.01,.1,.5], 'early_stopping_rounds':[2,4,6,8]}
xgbModel = XGBClassifier( objective= 'binary:logistic', eval_metric= 'error', random_state=1)
rs2 = RandomizedSearchCV(estimator = xbgModel, scoring='accuracy', cv=3, n_jobs =-1)

Fitting XGBoost Model with Optimal Hyperparameter

In [None]:
xbg = # the hyperparameters from the test
xgb.fit(X_train, y_train)
y_predictions_xgb = xgb.predict(X_test)
acc = accuracy_score(y_test, y_predictions)
cv = cross_val_score(xgb, X_test, y_test,scoring='roc_auc').mean()
print(acc)
print(cv)


Parameter Tuning For Logistic Regression

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
logreg = LogisticRegression(penalty = 'l2',random_state = 40)
rs3 = RandomizedSearchCV(estimator= logreg,scoring= 'acccuracy', cv=3, n_jobs =-1 )

rs3 = rs.fit(X_train, y_train)
print(rs3.best_estimator_)
print(rs3.best_score_)

Fitting Logistic Regression with Optimal Hyperparameter

In [None]:
logreg = # the hyperparameters from the test
logreg.fit(X_train, y_train)
y_predictions_log = log.predict(X_test)
acc = accuracy_score(y_test, y_predictions)
cv = cross_val_score(logreg, X_test, y_test,scoring='roc_auc').mean()
print(acc)
print(cv)




Ranking performance 

In [None]:
#plot roc-auc scores

Feature Importances

In [None]:
features = X
importances = list(rf.feature_importances_)
imp=np.sort(importances)
tab=pd.DataFrame(list(zip(X,imp)),columns =['Features', 'Importance scores']) 
print(tab)


plt.figure(figsize=(10,5))
#indices = np.argsort(importances)
index = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
_=plt.barh(index,imp, align='center',color='b')
plt.xlabel('Relative Importance',fontsize=15)
plt.ylabel('Features',fontsize=15)
plt.yticks(index, features)
plt.title('Feature Importances for Random Forest classifier model',fontsize=15)
plt.savefig("28.png")
plt.show()

In [None]:
features = X
importances = list(xgb.feature_importances_)
imp=np.sort(importances)
tab=pd.DataFrame(list(zip(X,imp)),columns =['Features', 'Importance scores']) 
print(tab)

plt.figure(figsize=(10,5))
#indices = np.argsort(importances)
index = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
_=plt.barh(index,imp, align='center',color='b')
plt.xlabel('Relative Importance',fontsize=15)
plt.ylabel('Features',fontsize=15)
plt.yticks(index, features)
plt.title('Feature Importances for XGBoost model',fontsize=15)
plt.savefig("28.png")
plt.show()

Conclusion and Further Progress