## Import Necessary Libraries

In [1]:
import sys
import warnings
import numpy as np
import pandas as pd
import sklearn
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
sys.path.append(str(Path.cwd().parent / 'src'))
import dataTransformer as fx

## Display Setting

In [2]:
pd.set_option('display.max_columns',None)
sklearn.set_config(transform_output='pandas')
warnings.filterwarnings(action='ignore')

## Load Data


In [3]:
bolls = pd.read_csv(Path('../data/deliveries.csv'))
matches = pd.read_csv(Path('../data/matches.csv'))

In [4]:
matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


## Basic Overview

In [5]:
print(f'matches shape: {matches.shape}')
print(matches.isnull().sum().loc[lambda x: x>0])

matches shape: (1095, 20)
city                 51
player_of_match       5
winner                5
result_margin        19
target_runs           3
target_overs          3
method             1074
dtype: int64


In [6]:
matches.team1.unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [7]:
matches.team2.unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Royal Challengers Bangalore', 'Deccan Chargers',
       'Kings XI Punjab', 'Delhi Daredevils', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [8]:
np.sort(matches.city.unique().tolist())

array(['Abu Dhabi', 'Ahmedabad', 'Bangalore', 'Bengaluru', 'Bloemfontein',
       'Cape Town', 'Centurion', 'Chandigarh', 'Chennai', 'Cuttack',
       'Delhi', 'Dharamsala', 'Dubai', 'Durban', 'East London',
       'Guwahati', 'Hyderabad', 'Indore', 'Jaipur', 'Johannesburg',
       'Kanpur', 'Kimberley', 'Kochi', 'Kolkata', 'Lucknow', 'Mohali',
       'Mumbai', 'Nagpur', 'Navi Mumbai', 'Port Elizabeth', 'Pune',
       'Raipur', 'Rajkot', 'Ranchi', 'Sharjah', 'Visakhapatnam', 'nan'],
      dtype='<U32')

In [9]:
np.sort(matches.venue.unique()).shape

(58,)

In [10]:
bolls.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


## Data Cleaning & Transformation

In [11]:
clean_matches = fx.clean_matches(matches)
clean_matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,umpire1,umpire2,team1_short,team2_short,winner_short,toss_winner_short
0,335982,2008,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,Asad Rauf,RE Koertzen,RCB,KKR,KKR,RCB
1,335983,2008,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association Stadium,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,MR Benson,SL Shastri,KXIP,CSK,CSK,CSK
2,335984,2008,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,bat,Delhi Capitals,wickets,9.0,130.0,20.0,N,Aleem Dar,GA Pratapkumar,DC,RR,DC,RR
3,335985,2008,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,SJ Davis,DJ Harper,MI,RCB,RCB,MI
4,335986,2008,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,BF Bowden,K Hariharan,KKR,SH,KKR,SH


In [12]:
clean_bolls =fx.deliveries_transform(fx.clean_deliveries(bolls))
clean_bolls.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder,team_batting,team_bowling,score,wickets_left,current_sore,legal_ball,balls,ball_left,run_rate,chase_target,runs_left,req_rate
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,,KKR,RCB,222,10,1,1,1,119,1.0,222,221,1.857143
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,KKR,RCB,222,10,1,1,2,118,0.5,222,221,1.872881
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,,KKR,RCB,222,10,2,0,2,118,1.0,222,220,1.864407
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,KKR,RCB,222,10,2,1,3,117,0.666667,222,220,1.880342
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,KKR,RCB,222,10,2,1,4,116,0.5,222,220,1.896552


In [13]:
final_DF = fx.features_sectection(clean_bolls,clean_matches)
final_DF.head(2)

Unnamed: 0,batting_team,bowling_team,city,chase_target,current_sore,run_rate,ball_left,wickets_left,runs_left,result
0,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,222,1,1.0,119,10,221,0
1,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,222,2,2.0,119,10,220,0


In [14]:
current_teams = ['Kolkata Knight Riders', 
         'Chennai Super Kings',
         'Kings XI Punjab', 
         'Rajasthan Royals',
         'Mumbai Indians',
         'Delhi Capitals',
         'Royal Challengers Bangalore',
         'Sunrisers Hyderabad',
         'Lucknow Super Giants',
         'Gujarat Titans'
         ]
len(current_teams)

10

In [15]:
final_DF = final_DF.loc[lambda x: x.batting_team.isin(current_teams) & x.bowling_team.isin(current_teams) ]

In [16]:
final_DF.dropna(inplace=True)

In [17]:
final_DF = final_DF.sample(final_DF.shape[0])

In [18]:
final_DF.shape

(115732, 10)

In [19]:
final_DF.head()

Unnamed: 0,batting_team,bowling_team,city,chase_target,current_sore,run_rate,ball_left,wickets_left,runs_left,result
101397,Delhi Capitals,Gujarat Titans,Pune,171,144,1.358491,14,1,27,0
29292,Delhi Capitals,Chennai Super Kings,Delhi,110,12,2.4,115,10,98,1
35483,Mumbai Indians,Kolkata Knight Riders,Mumbai,140,77,0.916667,36,7,63,0
78129,Sunrisers Hyderabad,Royal Challengers Bangalore,Bangalore,218,106,1.452055,47,8,112,0
92853,Delhi Capitals,Mumbai Indians,Dubai,200,66,1.081967,59,5,134,0


## Data Preparation

In [20]:
X = final_DF.drop('result', axis=1)
y = final_DF['result']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X
                                                    ,y
                                                    ,test_size=0.33
                                                    ,random_state=42)

In [22]:
trf = ColumnTransformer([
                        ('trf',OneHotEncoder(sparse_output=False,drop='first')
                              ,['batting_team','bowling_team','city'])
], remainder='passthrough')

## Model Traning

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pipe = Pipeline([
    ('step1',trf),
    ('step2',RandomForestClassifier())
])

pipe.fit(X_train,y_train)

y_pred = pipe.predict(X_test)

accuracy_score(y_test,y_pred)

0.9977744030163385

In [24]:
import pickle
pickle.dump(pipe,open('.\model\model.pkl','wb'))