In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
import warnings
from xgboost import XGBRegressor

In [25]:
warnings.filterwarnings('ignore')

In [26]:
df = pd.read_pickle('../ipl_DataClean/IPL_1stInnPredData_cached_df')
df.head(1)

Unnamed: 0,team_batting,team_bowling,current_score,wicket_left,crr,city,balls_left,Rlast_fiveO,total_score
49376,Chennai Super Kings,Punjab Kings,97,7,8.82,Chandigarh,54,44.0,240


In [4]:
x = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [5]:
x1,x_test, y1,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
x_train, x_val, y_train, y_val =  train_test_split(x1,y1,test_size=0.2,random_state=2)

In [6]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(46449, 8)
(11613, 8)
(14516, 8)


In [7]:
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(46449,)
(11613,)
(14516,)


In [8]:
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse=False, drop='first'),['team_batting','team_bowling','city'])
], remainder='passthrough')

In [9]:
lr_pipe = Pipeline(steps=[
    ('step1', trf),
    ('scl', StandardScaler()),
    ('step2', LinearRegression())
])

rf_pipe = Pipeline(steps=[
    ('step1', trf), 
    ('rf', RandomForestRegressor())
])

xgb_pipe = Pipeline(steps=[
    ('step1', trf),
    ('scl', StandardScaler()),
    ('xgb', XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=10))
])

In [10]:
lr_pipe.fit(x_train, y_train)
rf_pipe.fit(x_train, y_train)
xgb_pipe.fit(x_train, y_train)

In [11]:
lry_pred_test = lr_pipe.predict(x_test)
print(f"Linear Model: {r2_score(lry_pred_test, y_test)}")
print(f"Linear Model: {mean_absolute_error(lry_pred_test, y_test)}")

Linear Model: 0.41713722004571807
Linear Model: 13.414145131646082


In [12]:
rfy_pred_test = rf_pipe.predict(x_test)
print(f"Random Forest Model: {r2_score(rfy_pred_test, y_test)}")
print(f"Random Forest Model: {mean_absolute_error(rfy_pred_test, y_test)}")

Random Forest Model: 0.9251215357190785
Random Forest Model: 4.006658952682754


In [13]:
xgby_pred_test = xgb_pipe.predict(x_test)
print(f"XGB Reg Model: {r2_score(xgby_pred_test, y_test)}")
print(f"XGB Reg Model: {mean_absolute_error(xgby_pred_test, y_test)}")

XGB Reg Model: 0.9555186682907132
XGB Reg Model: 3.0275803876337055


In [14]:
lrscore = cross_val_score(lr_pipe, x_val,y_val,cv=5)
print(f"Linear M0del: {lrscore}")

rfscore = cross_val_score(rf_pipe, x_val,y_val,cv=5)
print(f"Random Forest M0del: {rfscore}")

xgbscore = cross_val_score(xgb_pipe, x_val,y_val,cv=5)
print(f"XGB Regressor M0del: {xgbscore}")

Linear M0del: [0.60041479 0.6302669  0.61998887 0.64777606 0.61962467]
Random Forest M0del: [0.76316551 0.79247808 0.79257915 0.80171645 0.79027381]
XGB Regressor M0del: [0.8405599  0.86349183 0.84941379 0.85645635 0.85503406]


In [15]:
y_test.head(2)

67867    165
47041    162
Name: total_score, dtype: int64

In [16]:
x1 = x_test.head(2)
x1

Unnamed: 0,team_batting,team_bowling,current_score,wicket_left,crr,city,balls_left,Rlast_fiveO
67867,Chennai Super Kings,Mumbai Indians,107,8,8.03,Chennai,40,49.0
47041,Delhi Capitals,Gujarat Titans,150,3,7.56,Delhi,1,38.0


In [17]:
lr_pipe.predict(x1)

array([175.01776093, 151.36927537])

In [18]:
rf_pipe.predict(x1)

array([168.27, 160.52])

In [19]:
xgb_pipe.predict(x1)

array([165.61   , 160.08928], dtype=float32)

In [20]:
import pickle

In [21]:
with open("../Models/IPL_1st_Inn_ModelPred.pkl", "wb") as f:
    pickle.dump(xgb_pipe,f)

In [22]:
model = pickle.load(open('../Models/IPL_1st_Inn_ModelPred.pkl', 'rb'))

In [23]:
model.predict(x1)

array([168.27, 160.52])

In [28]:
df.columns

Index(['team_batting', 'team_bowling', 'current_score', 'wicket_left', 'crr',
       'city', 'balls_left', 'Rlast_fiveO', 'total_score'],
      dtype='object')

In [31]:
gh = pd.DataFrame(columns=['team_batting', 'team_bowling', 'current_score', 'wicket_left', 'crr',
       'city', 'balls_left', 'Rlast_fiveO'], 
                  data=[['Chennai Super Kings','Mumbai Indians',107,8,8.03,'Chennai',40,49.0]])
gh

Unnamed: 0,team_batting,team_bowling,current_score,wicket_left,crr,city,balls_left,Rlast_fiveO
0,Chennai Super Kings,Mumbai Indians,107,8,8.03,Chennai,40,49.0


In [35]:
res = model.predict(gh)[0]
res = int(res)
res

168