In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

In [2]:
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket ignore also hides library deprecation notices;
# consider narrowing it to specific categories once the notebook is stable.
warnings.simplefilter('ignore')

In [3]:
# Read the cached, pickled DataFrame and preview its first row.
cached_frame_path = '../t20_DataClean/T20WinProb_Data_cached_df'
df = pd.read_pickle(cached_frame_path)
df.iloc[:1]

Unnamed: 0,team_batting,team_bowling,city,runs_left,ball_left,wicket_left,target,crr,rrr,Result
125,Bangladesh,New Zealand,Mount Maunganui,193,119,10,194,6.0,9.73,0


## # Result: 0 means the bowling team wins and 1 means the batting team wins

In [4]:
# Features are every column except the last; the last column
# ('Result' in the preview above) is the label to predict.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# Stage 1: hold out 20% of all rows as the final test set.
# NOTE(review): rows are shuffled individually; if several rows come from the
# same match, train and test may share matches and inflate the scores -- confirm.
x1, x_test, y1, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=1,
)

# Stage 2: split the remaining 80% again 80/20 into training and validation
# sets (roughly 64% / 16% of the full data).
x_train, x_val, y_train, y_val = train_test_split(
    x1, y1,
    test_size=0.2,
    random_state=2,
)

In [6]:
# Report the shape of each split: the three feature frames first, then the
# three matching label series.
for split_part in (x_train, x_val, x_test, y_train, y_val, y_test):
    print(split_part.shape)

(86444, 9)
(21612, 9)
(27015, 9)
(86444,)
(21612,)
(27015,)


In [7]:
# One-hot encode the three categorical columns (dropping the first level of
# each) and pass the remaining columns through unchanged.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the current spelling first and fall
# back to the legacy one on older installations.
try:
    categorical_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    categorical_encoder = OneHotEncoder(sparse=False, drop='first')

trf = ColumnTransformer([
    ('trf', categorical_encoder, ['team_batting', 'team_bowling', 'city'])
], remainder='passthrough')

In [8]:
# Three candidate classifiers built on the same preprocessing recipe.
#
# Each pipeline receives its own clone of `trf`: Pipeline.fit fits its steps in
# place, so sharing a single ColumnTransformer instance would let a refit of
# one pipeline silently change the encoder used by the other two.

# Logistic regression on one-hot + standardised features.
lr_pipe = Pipeline(steps=[
    ('step1', clone(trf)),
    ('scl', StandardScaler()),
    ('step2', LogisticRegression(solver='liblinear'))
])

# Random forest; random_state pins its sampling so reruns produce the same model.
rf_pipe = Pipeline(steps=[
    ('step1', clone(trf)),
    ('rf', RandomForestClassifier(random_state=10))
])

# Gradient-boosted trees.
# NOTE(review): the scaler is kept to preserve existing behaviour; it is
# probably unnecessary for a tree model -- confirm before removing.
xgb_pipe = Pipeline(steps=[
    ('step1', clone(trf)),
    ('scl', StandardScaler()),
    ('xgb', XGBClassifier(n_estimators=1000, max_depth=12, learning_rate=0.2, random_state=10))
])

In [9]:
# Train every candidate pipeline on the training split, in the same order as before.
for candidate in (lr_pipe, rf_pipe, xgb_pipe):
    candidate.fit(x_train, y_train)

# fit() returns the estimator itself, so ending on the last-fitted pipeline
# keeps the cell's displayed value the same.
xgb_pipe

In [10]:
# Score the logistic-regression pipeline on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
lry_pred_test = lr_pipe.predict(x_test)
print(f"Linear Model: {accuracy_score(y_test, lry_pred_test)}")

Linear Model: 0.8507866000370164


In [11]:
# Score the random-forest pipeline on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
rfy_pred_test = rf_pipe.predict(x_test)
print(f"Random Forest Model: {accuracy_score(y_test, rfy_pred_test)}")

Random Forest Model: 0.9993707199703868


In [12]:
# Score the XGBoost classifier pipeline on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
xgby_pred_test = xgb_pipe.predict(x_test)
print(f"XGB Reg Model: {accuracy_score(y_test, xgby_pred_test)}")

XGB Reg Model: 0.9998889505830094


In [13]:
# 5-fold cross-validation of each pipeline, run on the validation split.
# NOTE(review): cross_val_score clones and refits the estimator on folds of the
# data it is given, so these numbers describe models retrained on the
# validation rows, not the pipelines fitted earlier -- confirm this is intended.
# NOTE(review): the printed labels contain typos ("M0del") and call the XGBoost
# classifier a "Regressor"; they are left as-is to match the recorded output.
lrscore = cross_val_score(estimator=lr_pipe, X=x_val, y=y_val, cv=5)
print(f"Linear M0del: {lrscore}")

rfscore = cross_val_score(estimator=rf_pipe, X=x_val, y=y_val, cv=5)
print(f"Random Forest M0del: {rfscore}")

xgbscore = cross_val_score(estimator=xgb_pipe, X=x_val, y=y_val, cv=5)
print(f"XGB Regressor M0del: {xgbscore}")

Linear M0del: [0.85519315 0.85264862 0.85053216 0.86048126 0.85886164]
Random Forest M0del: [0.98658339 0.98427018 0.9828783  0.98218417 0.98310967]
XGB Regressor M0del: [0.99236641 0.99352302 0.98704304 0.99213327 0.99236465]


In [14]:
# Keep the first row of the test split as a one-row frame for spot-checking
# the individual models; note this rebinds the earlier `x1` name.
x1 = x_test.iloc[:1]
x1

Unnamed: 0,team_batting,team_bowling,city,runs_left,ball_left,wicket_left,target,crr,rrr
80257,Pakistan,South Africa,Johannesburg,124,87,9,188,11.64,8.55


## # Result: 0 means the bowling team wins and 1 means the batting team wins

In [15]:
# True label for the first test row (the same row held in x1).
y_test.iloc[:1]

80257    0
Name: Result, dtype: int64

In [16]:
# XGBoost pipeline on the sample row: predicted class, then class probabilities.
xgb_class = xgb_pipe.predict(x1)
xgb_proba = xgb_pipe.predict_proba(x1)
print(xgb_class)
print(xgb_proba)

[0]
[[9.996997e-01 3.002666e-04]]


In [17]:
# Random-forest pipeline on the sample row: predicted class, then class probabilities.
rf_class = rf_pipe.predict(x1)
rf_proba = rf_pipe.predict_proba(x1)
print(rf_class)
print(rf_proba)

[0]
[[0.99 0.01]]


In [18]:
# Logistic-regression pipeline on the sample row: predicted class, then class probabilities.
lr_class = lr_pipe.predict(x1)
lr_proba = lr_pipe.predict_proba(x1)
print(lr_class)
print(lr_proba)

[1]
[[0.22348051 0.77651949]]


In [5]:
import pickle

In [20]:
# Persist the fitted XGBoost pipeline (preprocessing steps + classifier) to disk.
model_path = "../Models/T20_WinProb_ModelPred.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(xgb_pipe, model_file)

In [6]:
# Reload the saved pipeline. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only load
# pickles that this project wrote itself.
with open('../Models/T20_WinProb_ModelPred.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [22]:
# Sanity check: the reloaded pipeline's class and probabilities for the sample
# row, for comparison with the in-memory XGBoost pipeline's output.
reloaded_class = model.predict(x1)
reloaded_proba = model.predict_proba(x1)
print(reloaded_class)
print(reloaded_proba)

[0]
[[9.996997e-01 3.002666e-04]]
