<a href="https://colab.research.google.com/github/sriharsha-ramaraju/Winning_Horse_Prediction/blob/master/Horse_PredictiveModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings 
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns',None)

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score,precision_score,recall_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
def model_metrics(model,x_test,y_test):
    y_pred=model.predict(x_test)
    y_proba=model.predict_proba(x_test)
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred))
    tn, fp, fn, tp=confusion_matrix(y_test,y_pred).ravel()
    print('tn: ',tn,'\t','fp: ',fp)
    print('fn: ',fn,'\t','tp: ',tp)
    print('\nAccuracy: ',accuracy_score(y_test,y_pred).round(3))
    print('f1_score: ', f1_score(y_test,y_pred,average=None).round(3))
    print('Avg_unweighted f1_score: ', f1_score(y_test,y_pred,average='macro').round(3))
    print('Avg_weighted f1_score: ', f1_score(y_test,y_pred,average='weighted').round(3))
    print('recall: ', recall_score(y_test,y_pred,average=None).round(3))
    print('precision: ', precision_score(y_test,y_pred,average=None).round(3))
    print('auc_score: ', roc_auc_score(y_test,y_proba[:,1]).round(3))
    print('\n')
    print(classification_report(y_test,y_pred,labels=[0,1]))

In [0]:
def model_CVmetrics(model,x_train,y_train):
    scv=StratifiedKFold(n_splits=5,random_state=100)
    kcv=KFold(n_splits=5, random_state=100)
    metrics=['accuracy','f1','f1_macro','f1_weighted','recall','precision','roc_auc']
    dfMet=pd.DataFrame()
    dfMet['Stratified_10Fold']=[cross_val_score(model,X=x_train,y=y_train,cv=scv,scoring=met).mean().round(3) for met in metrics]
    dfMet['10Fold']=[cross_val_score(model,X=x_train,y=y_train,cv=kcv,scoring=met).mean().round(3) for met in metrics]
    dfMet.index=metrics
    print(dfMet)

In [0]:
dct=DecisionTreeClassifier(class_weight='balanced',random_state=100)
rf=RandomForestClassifier(class_weight='balanced',random_state=100)
lr=LogisticRegression(class_weight='balanced',random_state=100)
svm=SVC(class_weight='balanced',random_state=100,probability=True)

starcv=StratifiedKFold(n_splits=5)
kfoldcv=KFold(n_splits=5)

In [11]:
from google.colab import drive
drive

<module 'google.colab.drive' from '/usr/local/lib/python3.6/dist-packages/google/colab/drive.py'>

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
df_ap0=pd.read_csv("/content/drive/My Drive/Colab_Data/Horses_Approach0_2.csv")
df_ap0.dropna(axis=0,inplace=True)
df_ap1=pd.read_csv("/content/drive/My Drive/Colab_Data/Horses_Aggregate_Approach1_2.csv")
df_ap2=pd.read_csv("/content/drive/My Drive/Colab_Data/Horses_Approach2_2.csv")

### APPROACH 0-Classic Approach
#### Fill in Missing values, Taking top 5 levels in categorical variables, convert to dummy variables

In [13]:
x_train0,x_test0,y_train0,y_test0=train_test_split(df_ap0.drop(['Won'],axis=1),df_ap0.Won,
                                               test_size=0.25,stratify=df_ap0.Won,random_state=100)
#scaling between mean and 1 std deviation
std_scaler=StandardScaler()
x_train0=std_scaler.fit_transform(x_train0)
x_test0=std_scaler.fit_transform(x_test0)

print('*'*70,'\n','dimmensions of train data:',x_train0.shape,'\n','*'*70)
print('class %:','\n',y_train0.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_train0.value_counts())

print('\n','*'*70,'\n','dimmensions of test data:',x_test0.shape,'\n','*'*70)
print('class %:','\n',y_test0.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_test0.value_counts())

********************************************************************** 
 dimmensions of train data: (27599, 94) 
 **********************************************************************
class %: 
 0    92.101163
1     7.898837
Name: Won, dtype: float64 

class counts: 
 0    25419
1     2180
Name: Won, dtype: int64

 ********************************************************************** 
 dimmensions of test data: (9200, 94) 
 **********************************************************************
class %: 
 0    92.097826
1     7.902174
Name: Won, dtype: float64 

class counts: 
 0    8473
1     727
Name: Won, dtype: int64


In [0]:
dct_params={'max_depth':np.arange(3,15),'criterion':['gini','entropy']}
rf_params={'n_estimators': [100,300,500],'max_features': ['auto', 'sqrt', 'log2'],
                    'max_depth' : [3,4,5,6,7,8],'criterion' :['gini', 'entropy']}
svm_params={'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001]}
lr_params={'penalty':['l1','l2'],'C':np.logspace(-4,4,20)}

In [0]:
dct_grid=GridSearchCV(dct,dct_params,scoring='f1',cv=starcv).fit(x_train0,y_train0)

In [0]:
rf_grid=GridSearchCV(rf,rf_params,scoring='f1',cv=starcv).fit(x_train0,y_train0)

In [0]:
svm_grid=GridSearchCV(svm,svm_params,scoring='f1',cv=starcv).fit(x_train0,y_train0)

In [0]:
lr_grid=GridSearchCV(lr,lr_params,scoring='f1',cv=starcv).fit(x_train0,y_train0)

In [29]:
model_metrics(dct_grid.best_estimator_,x_test0,y_test0)

confusion matrix:
 [[4996 3477]
 [ 144  583]]
tn:  4996 	 fp:  3477
fn:  144 	 tp:  583

Accuracy:  0.606
f1_score:  [0.734 0.244]
Avg_unweighted f1_score:  0.489
Avg_weighted f1_score:  0.695
recall:  [0.59  0.802]
precision:  [0.972 0.144]
auc_score:  0.749


              precision    recall  f1-score   support

           0       0.97      0.59      0.73      8473
           1       0.14      0.80      0.24       727

    accuracy                           0.61      9200
   macro avg       0.56      0.70      0.49      9200
weighted avg       0.91      0.61      0.70      9200



In [32]:
model_CVmetrics(dct_grid.best_estimator_,x_train0,y_train0)

             Stratified_10Fold  10Fold
accuracy                 0.680   0.676
f1                       0.267   0.263
f1_macro                 0.531   0.527
f1_weighted              0.753   0.750
recall                   0.737   0.729
precision                0.163   0.160
roc_auc                  0.763   0.762


In [33]:
model_metrics(rf_grid.best_estimator_,x_test0,y_test0)

confusion matrix:
 [[5752 2721]
 [ 232  495]]
tn:  5752 	 fp:  2721
fn:  232 	 tp:  495

Accuracy:  0.679
f1_score:  [0.796 0.251]
Avg_unweighted f1_score:  0.523
Avg_weighted f1_score:  0.753
recall:  [0.679 0.681]
precision:  [0.961 0.154]
auc_score:  0.746


              precision    recall  f1-score   support

           0       0.96      0.68      0.80      8473
           1       0.15      0.68      0.25       727

    accuracy                           0.68      9200
   macro avg       0.56      0.68      0.52      9200
weighted avg       0.90      0.68      0.75      9200



In [34]:
model_CVmetrics(rf_grid.best_estimator_,x_train0,y_train0)

             Stratified_10Fold  10Fold
accuracy                 0.702   0.701
f1                       0.270   0.269
f1_macro                 0.541   0.540
f1_weighted              0.770   0.769
recall                   0.697   0.694
precision                0.167   0.166
roc_auc                  0.759   0.759


In [44]:
model_metrics(svm_grid.best_estimator_,x_test0,y_test0)

confusion matrix:
 [[5663 2810]
 [ 234  493]]
tn:  5663 	 fp:  2810
fn:  234 	 tp:  493

Accuracy:  0.669
f1_score:  [0.788 0.245]
Avg_unweighted f1_score:  0.516
Avg_weighted f1_score:  0.745
recall:  [0.668 0.678]
precision:  [0.96  0.149]


              precision    recall  f1-score   support

           0       0.96      0.67      0.79      8473
           1       0.15      0.68      0.24       727

    accuracy                           0.67      9200
   macro avg       0.55      0.67      0.52      9200
weighted avg       0.90      0.67      0.75      9200



In [45]:
model_CVmetrics(svm_grid.best_estimator_,x_train0,y_train0)

KeyboardInterrupt: ignored

In [38]:
model_metrics(lr_grid.best_estimator_,x_test0,y_test0)

confusion matrix:
 [[5808 2665]
 [ 243  484]]
tn:  5808 	 fp:  2665
fn:  243 	 tp:  484

Accuracy:  0.684
f1_score:  [0.8  0.25]
Avg_unweighted f1_score:  0.525
Avg_weighted f1_score:  0.756
recall:  [0.685 0.666]
precision:  [0.96  0.154]
auc_score:  0.731


              precision    recall  f1-score   support

           0       0.96      0.69      0.80      8473
           1       0.15      0.67      0.25       727

    accuracy                           0.68      9200
   macro avg       0.56      0.68      0.52      9200
weighted avg       0.90      0.68      0.76      9200



In [42]:
model_CVmetrics(lr_grid.best_estimator_,x_train0,y_train0)

             Stratified_10Fold  10Fold
accuracy                 0.689   0.689
f1                       0.261   0.260
f1_macro                 0.532   0.532
f1_weighted              0.761   0.760
recall                   0.695   0.694
precision                0.161   0.160
roc_auc                  0.746   0.746


### APPROACH 1-Aggregate Approach
#### Aggregate on HorseID, convert categorical to dummy variables

In [17]:
x_train1,x_test1,y_train1,y_test1=train_test_split(df_ap1.drop(['Won'],axis=1),df_ap1.Won,
                                               test_size=0.25,stratify=df_ap1.Won,random_state=100)
#scaling between mean and 1 std deviation
std_scaler=StandardScaler()
x_train1=std_scaler.fit_transform(x_train1)
x_test1=std_scaler.fit_transform(x_test1)

print('*'*70,'\n','dimmensions of train data:',x_train1.shape,'\n','*'*70)
print('class %:','\n',y_train1.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_train1.value_counts())

print('\n','*'*70,'\n','dimmensions of test data:',x_test1.shape,'\n','*'*70)
print('class %:','\n',y_test1.value_counts(normalize=True)*100,'\n')
print('class counts:','\n',y_test1.value_counts())

********************************************************************** 
 dimmensions of train data: (4548, 59) 
 **********************************************************************
class %: 
 0    97.273527
1     2.726473
Name: Won, dtype: float64 

class counts: 
 0    4424
1     124
Name: Won, dtype: int64

 ********************************************************************** 
 dimmensions of test data: (1516, 59) 
 **********************************************************************
class %: 
 0    97.295515
1     2.704485
Name: Won, dtype: float64 

class counts: 
 0    1475
1      41
Name: Won, dtype: int64


In [0]:
dct_grid=GridSearchCV(dct,dct_params,scoring='f1',cv=starcv,n_jobs=-1).fit(x_train1,y_train1)
rf_grid=GridSearchCV(rf,rf_params,scoring='f1',cv=starcv,n_jobs=-1).fit(x_train1,y_train1)
svm_grid=GridSearchCV(svm,svm_params,scoring='f1',cv=starcv,n_jobs=-1).fit(x_train1,y_train1)
lr_grid=GridSearchCV(lr,lr_params,scoring='f1',cv=starcv,n_jobs=-1).fit(x_train1,y_train1)

In [22]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [24]:
model_metrics(dct_grid.best_estimator_,x_test1,y_test1)

confusion matrix:
 [[682 793]
 [ 15  26]]
tn:  682 	 fp:  793
fn:  15 	 tp:  26

Accuracy:  0.467
f1_score:  [0.628 0.06 ]
Avg_unweighted f1_score:  0.344
Avg_weighted f1_score:  0.613
recall:  [0.462 0.634]
precision:  [0.978 0.032]
auc_score:  0.551


              precision    recall  f1-score   support

           0       0.98      0.46      0.63      1475
           1       0.03      0.63      0.06        41

    accuracy                           0.47      1516
   macro avg       0.51      0.55      0.34      1516
weighted avg       0.95      0.47      0.61      1516



In [25]:
model_CVmetrics(dct_grid.best_estimator_,x_train1,y_train1)

             Stratified_10Fold  10Fold
accuracy                 0.676   0.714
f1                       0.077   0.089
f1_macro                 0.439   0.458
f1_weighted              0.780   0.807
recall                   0.491   0.478
precision                0.042   0.050
roc_auc                  0.599   0.616


In [26]:
model_metrics(rf_grid.best_estimator_,x_test1,y_test1)

confusion matrix:
 [[1230  245]
 [  29   12]]
tn:  1230 	 fp:  245
fn:  29 	 tp:  12

Accuracy:  0.819
f1_score:  [0.9   0.081]
Avg_unweighted f1_score:  0.49
Avg_weighted f1_score:  0.878
recall:  [0.834 0.293]
precision:  [0.977 0.047]
auc_score:  0.614


              precision    recall  f1-score   support

           0       0.98      0.83      0.90      1475
           1       0.05      0.29      0.08        41

    accuracy                           0.82      1516
   macro avg       0.51      0.56      0.49      1516
weighted avg       0.95      0.82      0.88      1516



In [27]:
model_CVmetrics(rf_grid.best_estimator_,x_train1,y_train1)

             Stratified_10Fold  10Fold
accuracy                 0.845   0.845
f1                       0.118   0.105
f1_macro                 0.517   0.510
f1_weighted              0.893   0.893
recall                   0.380   0.325
precision                0.070   0.064
roc_auc                  0.666   0.650


In [28]:
model_metrics(svm_grid.best_estimator_,x_test1,y_test1)

confusion matrix:
 [[1029  446]
 [  27   14]]
tn:  1029 	 fp:  446
fn:  27 	 tp:  14

Accuracy:  0.688
f1_score:  [0.813 0.056]
Avg_unweighted f1_score:  0.435
Avg_weighted f1_score:  0.793
recall:  [0.698 0.341]
precision:  [0.974 0.03 ]
auc_score:  0.541


              precision    recall  f1-score   support

           0       0.97      0.70      0.81      1475
           1       0.03      0.34      0.06        41

    accuracy                           0.69      1516
   macro avg       0.50      0.52      0.43      1516
weighted avg       0.95      0.69      0.79      1516



In [29]:
model_CVmetrics(svm_grid.best_estimator_,x_train1,y_train1)

             Stratified_10Fold  10Fold
accuracy                 0.692   0.692
f1                       0.085   0.078
f1_macro                 0.450   0.447
f1_weighted              0.794   0.795
recall                   0.524   0.465
precision                0.046   0.043
roc_auc                  0.627   0.617


In [30]:
model_metrics(lr_grid.best_estimator_,x_test1,y_test1)

confusion matrix:
 [[990 485]
 [ 23  18]]
tn:  990 	 fp:  485
fn:  23 	 tp:  18

Accuracy:  0.665
f1_score:  [0.796 0.066]
Avg_unweighted f1_score:  0.431
Avg_weighted f1_score:  0.776
recall:  [0.671 0.439]
precision:  [0.977 0.036]
auc_score:  0.589


              precision    recall  f1-score   support

           0       0.98      0.67      0.80      1475
           1       0.04      0.44      0.07        41

    accuracy                           0.66      1516
   macro avg       0.51      0.56      0.43      1516
weighted avg       0.95      0.66      0.78      1516



In [31]:
model_CVmetrics(lr_grid.best_estimator_,x_train1,y_train1)

             Stratified_10Fold  10Fold
accuracy                 0.673   0.670
f1                       0.083   0.081
f1_macro                 0.442   0.440
f1_weighted              0.781   0.779
recall                   0.540   0.527
precision                0.045   0.044
roc_auc                  0.640   0.627
