## CAPSTONE PROJECT MODELING COMPOSITIONS
## AUTHOR: Simon Lee
## DATE CREATED: JULY 27th 2022
## LAST DATE ADJUSTED: AUG 15th 2022

In [667]:
## Importing all packages that may be needed

from __future__ import print_function
import subprocess
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, LabelEncoder
from sklearn import decomposition
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm


In [48]:
#comps = pd.read_csv(r"/Users/simonlee/Downloads/MatchComposition.csv")
comps = pd.read_csv(r"/Users/simonlee/Desktop/Capstone/Team Composition Final.csv")
city = pd.read_excel(r"/Users/simonlee/Desktop/Capstone/city_and_Category.xlsx")
comps = comps.reset_index(drop=True)
comps.head(2)

Unnamed: 0,H Left hand Bat,A Left hand Bat,H NoBat,A NoBat,H Right hand Bat,A Right hand Bat,City,H Left Arm Wrist Spinner,A Left Arm Wrist Spinner,H Right Arm Spinner,...,A Batting Allrounder,H Batter,A Batter,H Middle order Batter,A Middle order Batter,H Opening Batter,A Opening Batter,H Bowler,A Bowler,Result
0,5,2,0,0,6,9,Perth,1,0,1,...,0,1,1,0,2,1,1,2,4,0
1,6,2,0,0,5,9,Perth,1,0,2,...,0,1,1,1,2,2,0,4,5,1


In [49]:
comps = comps.merge(city, how = 'left', left_on = "City", right_on = "City")
comps = comps.drop(["City"], axis = 1)

#Removing all matches played in Non-Traditional Cricketing teritories to remove substandard matches as best as possible (ex. matches played in Spain)
comps = comps[comps["Country"] != "Other"]

#Getting Countires to a numeric categorical variable
le = LabelEncoder()
comps["Country"] = le.fit_transform(comps["Country"])

#Only want wins and losses as results
comps = comps[(comps["Result"] == 0) | (comps["Result"] == 1)]

In [665]:
le.classes_

array(['Australia', 'England', 'NZ', 'SA', 'Sub-Cont', 'UK', 'WI', 'Zimb'],
      dtype=object)

In [40]:
list(comps.columns[:18]) + ["Country"]
#list(comps.columns[:18]) 

['H Top order Batter',
 'A Top order Batter',
 'H Bowling Allrounder',
 'A Bowling Allrounder',
 'H Wicketkeeper',
 'A Wicketkeeper',
 'H Allrounder',
 'A Allrounder',
 'H Batting Allrounder',
 'A Batting Allrounder',
 'H Batter',
 'A Batter',
 'H Middle order Batter',
 'A Middle order Batter',
 'H Opening Batter',
 'A Opening Batter',
 'H Bowler',
 'A Bowler',
 'Country']

## Splitting Data 80:20

In [None]:
feats = list(comps.columns[:38]) + ["Country"]
#y = comps["win"]
y = comps["Result"]
x = comps[feats]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)


### ONCE MODEL WAS TRAINED ALL THE SPLIT DATA WAS WRITTEN INTO HARD CSV FILES FOR FUTURE REFERENCE (WHETHER GRADING OR STUDENT USE)

* HERE I AM RELOADING IT

In [666]:
x_train = pd.read_csv(r"/Users/simonlee/Desktop/Capstone/Model Data/x_train.csv")
y_train = pd.read_csv(r"/Users/simonlee/Desktop/Capstone/Model Data/y_train.csv")
x_test = pd.read_csv(r"/Users/simonlee/Desktop/Capstone/Model Data/x_test.csv")
y_test = pd.read_csv(r"/Users/simonlee/Desktop/Capstone/Model Data/y_test.csv")

## USING GRID SEARCH TO GET BEST PARAMETERS IN MODEL

In [None]:

rf = RandomForestClassifier()
sc = StandardScaler()
pca = decomposition.PCA()
dt = tree.DecisionTreeClassifier()
et = ExtraTreesClassifier()
power = PowerTransformer()
quant = QuantileTransformer()

pipe = Pipeline(steps=[ ('power', power), ('dec_tree', et)])


#Grid Parameters
param_dist = {
    "dec_tree__criterion": ["gini", "entropy"],
    "dec_tree__ccp_alpha": [0.0, 0.001, 0.002, 0.5, 0.1], 
    "dec_tree__n_estimators": [50,70,90,110,150, 200, 180, 220],
    "dec_tree__bootstrap": [True, False]
    #"max_features" : [1, 2, 3, 4, 5, 6]

}


grid = GridSearchCV(pipe, param_grid = param_dist, n_jobs = 20)

grid.fit(x_train, y_train) 
x = sm.accuracy_score(y_test, grid.best_estimator_.predict(x_test))
print(x)

In [188]:
grid.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('power', PowerTransformer()),
                ('dec_tree',
                 ExtraTreesClassifier(ccp_alpha=0.001, criterion='entropy',
                                      n_estimators=70))])>

In [660]:
power = PowerTransformer()
et = ExtraTreesClassifier(bootstrap = True, criterion = 'entropy', n_estimators = 70, ccp_alpha = 0.001)  

pipe = Pipeline(steps=[ ('power', power), ('dec_tree', et)])
mod = pipe.fit(x_train, y_train) 

x = sm.accuracy_score(y_test, mod.predict(x_test))
print(x)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.6094594594594595


## TAKING A LOOK AT FEATURE IMPORTANCE

* Notice that importance is pretty evenly spread about
* H NoBat and A NoBat both have very low importance however (possibly becase most times these columns are 0)

In [366]:
n_feats = []
for i in range (0, len(feats)):
    if pipe['dec_tree'].feature_importances_[i] > 0:
        n_feats.append([feats[i], pipe['dec_tree'].feature_importances_[i]])

n_feats

[['H Left hand Bat', 0.025958944176738243],
 ['A Left hand Bat', 0.027457550979079346],
 ['H NoBat', 0.002509848054779309],
 ['A NoBat', 0.0019546962782270427],
 ['H Right hand Bat', 0.02768334810012711],
 ['A Right hand Bat', 0.02706909424980162],
 ['H Left Arm Wrist Spinner', 0.0212644197969926],
 ['A Left Arm Wrist Spinner', 0.019604090295017085],
 ['H Right Arm Spinner', 0.02773589124458488],
 ['A Right Arm Spinner', 0.029326546687726086],
 ['H Left Arm Fast', 0.027211796020558867],
 ['A Left Arm Fast', 0.02646431980156704],
 ['H Right Arm Fast', 0.02848655268776771],
 ['A Right Arm Fast', 0.029160271699280343],
 ['H NoBowl', 0.030451775685782893],
 ['A NoBowl', 0.02747984848102697],
 ['H Left Arm Spinner', 0.02872455357538807],
 ['A Left Arm Spinner', 0.027475291208392746],
 ['H Right Arm Wrist Spinner', 0.028329357848901816],
 ['A Right Arm Wrist Spinner', 0.02596632726928667],
 ['H Top order Batter', 0.02933471321833043],
 ['A Top order Batter', 0.027893287452589715],
 ['H Bowli

In [676]:
sm.confusion_matrix(y_test, mod.predict(x_test))[0][0]/(sm.confusion_matrix(y_test, mod.predict(x_test))[0][1]+ sm.confusion_matrix(y_test, mod.predict(x_test))[0][0])

sm.confusion_matrix(y_test, mod.predict(x_test))[1][1]/(sm.confusion_matrix(y_test, mod.predict(x_test))[1][1]+ sm.confusion_matrix(y_test, mod.predict(x_test))[1][0])

sm.f1_score(y_test, mod.predict(x_test))

0.635561160151324

### Saving Model

In [661]:
import pickle

with open("model.pickle", 'wb') as f:
    pickle.dump(mod,f)