In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings 
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns',None)

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score,precision_score,recall_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [117]:
def model_metrics(model,x_test,y_test):
    y_pred=model.predict(x_test)
#     y_proba=model.predict_proba(x_test)
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred))
    tn, fp, fn, tp=confusion_matrix(y_test,y_pred).ravel()
    print('tn: ',tn,'\t','fp: ',fp)
    print('fn: ',fn,'\t','tp: ',tp)
    print('\nAccuracy: ',accuracy_score(y_test,y_pred).round(3))
    print('f1_score: ', f1_score(y_test,y_pred,average=None).round(3))
    print('recall: ', recall_score(y_test,y_pred,average=None).round(3))
    print('precision: ', precision_score(y_test,y_pred,average=None).round(3))
#     print('auc_score: ', roc_auc_score(y_test,y_proba[:,1]).round(3))
    print('\n')
    print(classification_report(y_test,y_pred,labels=[0,1]))

In [14]:
def model_CVmetrics(model,x_train,y_train):
    scv=StratifiedKFold(n_splits=5,random_state=100)
    kcv=KFold(n_splits=5, random_state=100)
    metrics=['accuracy','f1','recall','precision','roc_auc']
    dfMet=pd.DataFrame()
    dfMet['Stratified_10Fold']=[cross_val_score(model,X=x_train,y=y_train,cv=scv,scoring=met).mean().round(3) for met in metrics]
    dfMet['10Fold']=[cross_val_score(model,X=x_train,y=y_train,cv=kcv,scoring=met).mean().round(3) for met in metrics]
    dfMet.index=metrics
    print(dfMet)

In [102]:
dct=DecisionTreeClassifier(class_weight='balanced',random_state=100)
rf=RandomForestClassifier(class_weight='balanced',random_state=100)
lr=LogisticRegression(class_weight='balanced',random_state=100)
svm=SVC(class_weight='balanced',random_state=100)

starcv=StratifiedKFold(n_splits=5)
kfoldcv=KFold(n_splits=5)

In [105]:
dct_params={'max_depth':np.arange(3,15),'criterion':['gini','entropy']}
rf_params={'n_estimators': [100,300,500],'max_features': ['auto', 'sqrt', 'log2'],
                    'max_depth' : [3,4,5,6,7,8],'criterion' :['gini', 'entropy']}
svm_params={'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001]}
lr_params={'penalty':['l1','l2','elastic'],'C':np.logspace(-4,4,20)}

In [None]:
df_ap0=pd.read_csv('Horses_Approach0_2.csv')
df_ap0.dropna(axis=0,inplace=True)
df_ap1=pd.read_csv('Horses_Aggregate_Approach1_2.csv')
df_ap2=pd.read_csv('Horses_Approach2_2.csv')

## Approach 2
Remove all features containing missing values >50%, remove obs with atleast one missing value

In [104]:
x_train2,x_test2,y_train2,y_test2=train_test_split(df_ap2.drop(['Won'],axis=1),df_ap2.Won,
                                               test_size=0.25,stratify=df_ap2.Won,random_state=100)
#scaling between mean and 1 std deviation
std_scaler=StandardScaler()
x_train2=std_scaler.fit_transform(x_train2)
x_test2=std_scaler.fit_transform(x_test2)

print('*'*70,'\n','dimmensions of train data:',x_train2.shape,'\n','*'*70)
print('class %:','\n',y_train2.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_train0.value_counts())

print('\n','*'*70,'\n','dimmensions of test data:',x_test2.shape,'\n','*'*70)
print('class %:','\n',y_test2.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_test2.value_counts())

********************************************************************** 
 dimmensions of train data: (22053, 64) 
 **********************************************************************
class %: 
 0    90.699678
1     9.300322
Name: Won, dtype: float64 

class counts: 
 0    25419
1     2180
Name: Won, dtype: int64

 ********************************************************************** 
 dimmensions of test data: (7351, 64) 
 **********************************************************************
class %: 
 0    90.708747
1     9.291253
Name: Won, dtype: float64 

class counts: 
 0    6668
1     683
Name: Won, dtype: int64


In [112]:
dct_grid=GridSearchCV(dct,dct_params,scoring='f1',cv=starcv).fit(x_train2,y_train2)
rf_grid=GridSearchCV(rf,rf_params,scoring='f1',cv=starcv).fit(x_train2,y_train2)
svm_grid=GridSearchCV(svm,svm_params,scoring='f1',cv=starcv).fit(x_train2,y_train2)
#lr_grid=GridSearchCV(lr,lr_params,scoring='f1',cv=starcv).fit(x_train2,y_train2)

In [108]:
lr_grid=GridSearchCV(lr,lr_params,scoring='f1',cv=starcv).fit(x_train2,y_train2)

In [113]:
model_metrics(dct_grid.best_estimator_,x_test2,y_test2)

confusion matrix:
 [[4490 2178]
 [ 212  471]]
tn:  4490 	 fp:  2178
fn:  212 	 tp:  471

Accuracy:  0.675
f1_score:  [0.79  0.283]
recall:  [0.673 0.69 ]
precision:  [0.955 0.178]
auc_score:  0.719


              precision    recall  f1-score   support

           0       0.95      0.67      0.79      6668
           1       0.18      0.69      0.28       683

    accuracy                           0.67      7351
   macro avg       0.57      0.68      0.54      7351
weighted avg       0.88      0.67      0.74      7351



In [115]:
model_metrics(rf_grid.best_estimator_,x_test2,y_test2)

confusion matrix:
 [[4620 2048]
 [ 245  438]]
tn:  4620 	 fp:  2048
fn:  245 	 tp:  438

Accuracy:  0.688
f1_score:  [0.801 0.276]
recall:  [0.693 0.641]
precision:  [0.95  0.176]
auc_score:  0.723


              precision    recall  f1-score   support

           0       0.95      0.69      0.80      6668
           1       0.18      0.64      0.28       683

    accuracy                           0.69      7351
   macro avg       0.56      0.67      0.54      7351
weighted avg       0.88      0.69      0.75      7351



In [118]:
model_metrics(svm_grid.best_estimator_,x_test2,y_test2)

confusion matrix:
 [[4130 2538]
 [ 192  491]]
tn:  4130 	 fp:  2538
fn:  192 	 tp:  491

Accuracy:  0.629
f1_score:  [0.752 0.265]
recall:  [0.619 0.719]
precision:  [0.956 0.162]


              precision    recall  f1-score   support

           0       0.96      0.62      0.75      6668
           1       0.16      0.72      0.26       683

    accuracy                           0.63      7351
   macro avg       0.56      0.67      0.51      7351
weighted avg       0.88      0.63      0.71      7351

