In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ipl-data-set/matches.csv
/kaggle/input/ipl-data-set/teamwise_home_and_away.csv
/kaggle/input/ipl-data-set/deliveries.csv
/kaggle/input/ipl-data-set/most_runs_average_strikerate.csv
/kaggle/input/ipl-data-set/teams.csv
/kaggle/input/ipl-data-set/Players.xlsx


# IPL Winner Prediction Ball by Ball


Name : Rahul Krishna\
Cohort code : GN22DTDS001\
Employee Id : 2142034


**SME Evaluation**



***Explanation***

The winning probability of a match depends on variables like venue, Toss, Teams playing, Current Run Rate, Required Run Rate, Wickets left, etc.\
In most of the cases, we can predict winning probability in second innings. Because in second innings only we can have Target Score required to chase. Using the target we can calculate Required Run Rate which is most important attribute to calculate winning probability.

So in this model we are going to predict winning probability for each and every ball in the second innings.

**Table of Content**
1. **Import libraries**
2. **Data Preprocession**
    * importing data
    * Merging match and deliveries data
    * Dealing with Null Values
3. **Feature Engineering**
    * Processing Dismissal column
    * Creating Features
        * Current Run Rate
        * Required Run Rate
        * Wickets Left
        * Runs Left
        * Balls Left
4. **Model Building**
    * Spliting Data
    * Column Transformation using OHE
    * Building Pipeling
    * Predict Accuracy
5. **Analysis**

**1. Importing Libraries**

In [2]:
# Lets import the libraries which are required in this project
from sklearn.compose import ColumnTransformer # for Column Transformation
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split # for spliting the data
from sklearn.linear_model import LogisticRegression # Model for prediction 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

**2. Data Preprocessing**

In [3]:
# Lets Import the deliveries data which is having ball by ball information
deliveries = pd.read_csv('/kaggle/input/ipl-data-set/deliveries.csv')
deliveries.head() # Lets print the data

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,2,2,,,


In [4]:
# columns present in the data
deliveries.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')

In [5]:
# lets import the match data which is having overall match information
# It will have information like which team win the match, with how many wickets or runs, etc.
matches = pd.read_csv('/kaggle/input/ipl-data-set/matches.csv')
matches.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [6]:
matches.columns

Index(['id', 'Season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2',
       'umpire3'],
      dtype='object')

In [7]:
# We have different attributes in matches data which are not useful like match date, umpires, dl_applied, etc.
# Lets take some attributes which are requried
match_tmp = matches[['id','venue','team1','team2','toss_winner','toss_decision','winner']]

In [8]:
# Now Lets merge both deliveries and matches data using match_id
df = deliveries.merge(match_tmp, left_on='match_id', right_on='id')
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,player_dismissed,dismissal_kind,fielder,id,venue,team1,team2,toss_winner,toss_decision,winner
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,,,,1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,,,,1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,,,,1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,,,,1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,,,,1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad


In [9]:
df.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder', 'id', 'venue', 'team1', 'team2',
       'toss_winner', 'toss_decision', 'winner'],
      dtype='object')

In [10]:
# Now lets drop the columns which are not having special impact on model
df = df.drop(['is_super_over','wide_runs','bye_runs','legbye_runs','noball_runs','penalty_runs','extra_runs','batsman_runs','fielder','id','dismissal_kind','team1','team2'],axis=1)
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,total_runs,player_dismissed,venue,toss_winner,toss_decision,winner
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,4,,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,2,,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad


In [11]:
df.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'total_runs', 'player_dismissed',
       'venue', 'toss_winner', 'toss_decision', 'winner'],
      dtype='object')

In [12]:
#check the null values in each column
df.isna().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
total_runs               0
player_dismissed    170244
venue                    0
toss_winner              0
toss_decision            0
winner                 372
dtype: int64

In [13]:
#Replace null values in player_dismissed with 0
df['player_dismissed'].fillna(0, inplace=True)

df.isna().sum()

match_id              0
inning                0
batting_team          0
bowling_team          0
over                  0
ball                  0
batsman               0
non_striker           0
bowler                0
total_runs            0
player_dismissed      0
venue                 0
toss_winner           0
toss_decision         0
winner              372
dtype: int64

In [14]:
#Remove the rows which are having null values
df = df.dropna()
df.shape

(178706, 15)

In [15]:
df.isna().sum()

match_id            0
inning              0
batting_team        0
bowling_team        0
over                0
ball                0
batsman             0
non_striker         0
bowler              0
total_runs          0
player_dismissed    0
venue               0
toss_winner         0
toss_decision       0
winner              0
dtype: int64

**3. Feature Engineering**

In [16]:
#convert the dismissals to integer
#Replace the name of the player dismissed with 1
df['player_dismissed'] = df['player_dismissed'].apply(lambda x:x if x == 0 else 1)
df['player_dismissed'] = df['player_dismissed'].astype('int')
df['player_dismissed'].unique()

array([0, 1])

In [17]:
# lets calculate runs scored in each innings
# So We can have target value for second innings
total_runs = df.groupby(['match_id','inning']).sum()['total_runs'].reset_index()
total_runs.head()
# display the runs scored in first innings
total_runs = total_runs[total_runs['inning']==1]
total_runs.head()

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
2,2,1,184
4,3,1,183
6,4,1,163
8,5,1,157


In [18]:
#Now merge the runs scored in each innings with df
df = df.merge(total_runs, on='match_id')
df.head()

Unnamed: 0,match_id,inning_x,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,total_runs_x,player_dismissed,venue,toss_winner,toss_decision,winner,inning_y,total_runs_y
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,4,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,2,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207


In [19]:
#df = df.drop('inning_y',axis=1)
df.rename(columns = {'inning_x':'inning'}, inplace = True)
df.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'total_runs_x', 'player_dismissed',
       'venue', 'toss_winner', 'toss_decision', 'winner', 'inning_y',
       'total_runs_y'],
      dtype='object')

In [20]:
# As we know Delhi Daredevils and Delhi capitals are same. But the name changed. So rename it
# similarly replace Deccan Chargers with Sunrisers Hyderabad
df['batting_team'] = df['batting_team'].str.replace('Delhi Daredevils','Delhi Capitals')
df['bowling_team'] = df['bowling_team'].str.replace('Delhi Daredevils','Delhi Capitals')

df['batting_team'] = df['batting_team'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
df['bowling_team'] = df['bowling_team'].str.replace('Deccan Chargers','Sunrisers Hyderabad')

In [21]:
teams={'Sunrisers Hyderabad','Mumbai Indians','Royal Challengers Bangalore',
       'Chennai Super Kings','Delhi Capitals','Kings XI Punjab','Rajasthan Royals','Kolkata Knight Riders'}

df=df[df['batting_team'].isin(teams)]
df=df[df['bowling_team'].isin(teams)]

In [22]:
# As we are calculating winning probability for second innings lets take only the second innings data only
df = df[df['inning']==2]
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,total_runs_x,player_dismissed,venue,toss_winner,toss_decision,winner,inning_y,total_runs_y
125,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,Mandeep Singh,A Nehra,1,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
126,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,CH Gayle,A Nehra,0,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
127,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,CH Gayle,A Nehra,0,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
128,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,CH Gayle,A Nehra,2,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207
129,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,CH Gayle,A Nehra,4,0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,field,Sunrisers Hyderabad,1,207


In [23]:
# Now let us calcuate current score for each and every ball, Which can be calculated by using sum of runs scored by team at that ball
df['current_score'] = df.groupby('match_id').cumsum()['total_runs_x']
#df[['match_id','over','ball','current_runs']].head()
#df.head()

In [24]:
# Now lets calculate runs left to win
df['runs_left'] = df['total_runs_y'] - df['current_score'] + 1
#df.head()

In [25]:
# Calculate Balls left
df['balls_left'] = 120 -((df['over']-1)*6+df['ball'])
#df.head()

In [26]:
# Calculate Wickets left
wickets = df.groupby('match_id').cumsum()['player_dismissed'].values
df['wickets_left'] = 10 - wickets
#df.head()

In [27]:
# Now let us calculate Current run rate as crr and Required run rate as rrr
df['crr'] = (df['current_score']*6)/(120-df['balls_left'])
df['rrr'] = (df['runs_left']*6)/df['balls_left']
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,total_runs_x,...,toss_decision,winner,inning_y,total_runs_y,current_score,runs_left,balls_left,wickets_left,crr,rrr
125,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,Mandeep Singh,A Nehra,1,...,field,Sunrisers Hyderabad,1,207,1,207,119,10,6.0,10.436975
126,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,CH Gayle,A Nehra,0,...,field,Sunrisers Hyderabad,1,207,1,207,118,10,3.0,10.525424
127,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,CH Gayle,A Nehra,0,...,field,Sunrisers Hyderabad,1,207,1,207,117,10,2.0,10.615385
128,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,CH Gayle,A Nehra,2,...,field,Sunrisers Hyderabad,1,207,3,205,116,10,4.5,10.603448
129,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,CH Gayle,A Nehra,4,...,field,Sunrisers Hyderabad,1,207,7,201,115,10,8.4,10.486957


In [28]:
# Replace winner of the match with 1 if batting team win the match else 0 if bowling team wins
def result(row):
    return 1 if row['batting_team'] == row['winner'] else 0
# Replace winner of the match with 1 if batting team win the toss else 0 if bowling team wins
def toss(row):
    return 1 if row['batting_team'] == row['toss_winner'] else 0

df['result'] = df.apply(result,axis=1)
df['toss_winner'] = df.apply(toss, axis=1)

In [29]:
df = df.drop(['batsman','non_striker','bowler','toss_decision'],axis=1)

In [30]:
df = df.drop('winner',axis=1)
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,total_runs_x,player_dismissed,venue,toss_winner,inning_y,total_runs_y,current_score,runs_left,balls_left,wickets_left,crr,rrr,result
125,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,1,0,"Rajiv Gandhi International Stadium, Uppal",1,1,207,1,207,119,10,6.0,10.436975,0
126,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,0,0,"Rajiv Gandhi International Stadium, Uppal",1,1,207,1,207,118,10,3.0,10.525424,0
127,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,0,0,"Rajiv Gandhi International Stadium, Uppal",1,1,207,1,207,117,10,2.0,10.615385,0
128,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,2,0,"Rajiv Gandhi International Stadium, Uppal",1,1,207,3,205,116,10,4.5,10.603448,0
129,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,4,0,"Rajiv Gandhi International Stadium, Uppal",1,1,207,7,201,115,10,8.4,10.486957,0


In [31]:
df = df.sample(df.shape[0])

In [32]:
df.dropna(inplace=True)

In [33]:
df = df.loc[df['balls_left'] != 0]
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,total_runs_x,player_dismissed,venue,toss_winner,inning_y,total_runs_y,current_score,runs_left,balls_left,wickets_left,crr,rrr,result
111619,471,2,Rajasthan Royals,Royal Challengers Bangalore,9,6,0,0,Sheikh Zayed Stadium,1,1,70,46,25,66,7,5.111111,2.272727,1
158391,7928,2,Chennai Super Kings,Royal Challengers Bangalore,13,2,0,1,Maharashtra Cricket Association Stadium,1,1,131,84,48,46,6,6.810811,6.26087,1
112272,474,2,Chennai Super Kings,Sunrisers Hyderabad,6,1,4,0,Sharjah Cricket Stadium,0,1,145,40,106,89,10,7.741935,7.146067,1
40405,172,2,Sunrisers Hyderabad,Delhi Capitals,7,5,1,0,SuperSport Park,0,1,153,86,68,79,9,12.585366,5.164557,0
15639,67,2,Mumbai Indians,Chennai Super Kings,5,1,4,0,"MA Chidambaram Stadium, Chepauk",1,1,208,35,174,95,8,8.4,10.989474,0


In [34]:
df.dtypes

match_id              int64
inning                int64
batting_team         object
bowling_team         object
over                  int64
ball                  int64
total_runs_x          int64
player_dismissed      int64
venue                object
toss_winner           int64
inning_y              int64
total_runs_y          int64
current_score         int64
runs_left             int64
balls_left            int64
wickets_left          int64
crr                 float64
rrr                 float64
result                int64
dtype: object

In [35]:
X = df.drop('result',axis=1)
y = df['result']

**4. Model Building**

In [36]:
from sklearn.model_selection import train_test_split
#split the data into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Apply column Transformation using one hot encoding
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','venue'])
]
,remainder='passthrough')

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Create pipeline for column Transformation and logistic regression
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
])

#fit the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('step1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('trf',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['batting_team',
                                                   'bowling_team',
                                                   'venue'])])),
                ('step2', LogisticRegression(solver='liblinear'))])

In [39]:
y_pred = pipe.predict(X_test) # predict the test data

In [40]:
from sklearn.metrics import accuracy_score
print('Accuracy :',accuracy_score(y_test,y_pred)) # calculating accuracy

Accuracy : 0.8324205914567361


**5. Analysis**

Now let us calculate winning probability for each and every ball for a match.


In [41]:

def predict_winner(id, df):
    sample = df[df['match_id'] == id]
    sample = sample.drop('result',axis=1)
    sample = sample.sort_values(['over','ball'])
    sample = sample[sample['balls_left'] != 0]
    res = pipe.predict_proba(sample)
    sample['loss_prob'] = np.round(res.T[0]*100,1)
    sample['win_prob'] = np.round(res.T[1]*100,1)
    tmp = sample[['over','ball','current_score','runs_left','balls_left','wickets_left','crr','rrr', 'loss_prob','win_prob']]
    return tmp

predict_winner(1,df)


Unnamed: 0,over,ball,current_score,runs_left,balls_left,wickets_left,crr,rrr,loss_prob,win_prob
125,1,1,1,207,119,10,6.000000,10.436975,86.2,13.8
126,1,2,1,207,118,10,3.000000,10.525424,86.3,13.7
127,1,3,1,207,117,10,2.000000,10.615385,86.6,13.4
128,1,4,3,205,116,10,4.500000,10.603448,86.0,14.0
129,1,5,7,201,115,10,8.400000,10.486957,84.2,15.8
...,...,...,...,...,...,...,...,...,...,...
243,19,6,165,43,6,1,8.684211,43.000000,99.0,1.0
244,20,1,165,43,5,1,8.608696,51.600000,99.2,0.8
245,20,2,166,42,4,1,8.586207,63.000000,99.5,0.5
246,20,3,172,36,3,1,8.820513,72.000000,99.4,0.6
