In [1]:
import pandas as pd
import numpy as np

# Data load

In [2]:
delivery = pd.read_csv("Datasets/match_data.csv", low_memory=False)
matches = pd.read_csv("Datasets/match_info_data.csv", low_memory=False)

In [3]:
matches.shape

(1024, 18)

In [4]:
delivery.shape

(243817, 23)

In [5]:
delivery.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'cricsheet_id'],
      dtype='object')

# Preprocessing The Data

### Calculate Total runs

In [6]:
runs_col = ['runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty']
delivery['total_runs'] = delivery[runs_col].sum(axis=1).astype(int)

In [7]:
total_score_df = delivery.groupby(['match_id', 'innings']).sum()['total_runs'].reset_index()

In [8]:
total_score_df['total_runs'] = total_score_df['total_runs'] + 1

In [9]:
total_score_df = total_score_df[total_score_df['innings'] == 1]

In [10]:
total_score_df

Unnamed: 0,match_id,innings,total_runs
0,335982,1,240
2,335983,1,247
4,335984,1,137
6,335985,1,177
8,335986,1,121
...,...,...,...
2065,1359544,1,205
2067,1370350,1,178
2069,1370351,1,199
2071,1370352,1,244


### Total runs data merge into match data

In [11]:
match_df = matches.merge(total_score_df[['match_id', 'total_runs']], left_on='id', right_on='match_id')

In [12]:
match_df['team1'].unique()

array(['Gujarat Titans', 'Mumbai Indians', 'Chennai Super Kings',
       'Sunrisers Hyderabad', 'Royal Challengers Bangalore',
       'Lucknow Super Giants', 'Punjab Kings', 'Delhi Capitals',
       'Kolkata Knight Riders', 'Rajasthan Royals', 'Kings XI Punjab',
       'Delhi Daredevils', 'Rising Pune Supergiant', 'Gujarat Lions',
       'Rising Pune Supergiants', 'Pune Warriors', 'Deccan Chargers',
       'Kochi Tuskers Kerala'], dtype=object)

### Replace similar type of teams

In [13]:
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Chennai Super Kings',
    'Royal Challengers Bangalore',
    'Delhi Capitals',
    'Kolkata Knight Riders',
    'Rajasthan Royals',
    'Punjab Kings',
    'Gujarat Titans',
    'Lucknow Super Giants'
]

In [14]:
match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils', 'Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils', 'Delhi Capitals')

match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')

match_df['team1'] = match_df['team1'].str.replace('Gujarat Lions', 'Gujarat Titans')
match_df['team2'] = match_df['team2'].str.replace('Gujarat Lions', 'Gujarat Titans')

match_df['team1'] = match_df['team1'].str.replace('Kings XI Punjab', 'Punjab Kings')
match_df['team2'] = match_df['team2'].str.replace('Kings XI Punjab', 'Punjab Kings')

In [15]:
match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]

### get data without DLS from match data

In [16]:
match_df = match_df[match_df['dl_applied'] == 0]

### Retrive useful columns from match data

In [17]:
match_df = match_df[['match_id', 'city', 'winner', 'total_runs']]

### Merge data with delivery data

In [18]:
delivery_df = match_df.merge(delivery, on='match_id')

### Take only second inning data

In [19]:
delivery_df = delivery_df[delivery_df['innings'] == 2]

In [20]:
delivery_df['total_runs_x'] = delivery_df['total_runs_x'].astype('int64')
delivery_df['total_runs_y'] = delivery_df['total_runs_y'].astype('int64')

### Calculate  cumulative sum on total runs

In [21]:
delivery_df['current_score'] = delivery_df.groupby('match_id')['total_runs_y'].cumsum()

### Calculate runs left (target - current score)

In [22]:
delivery_df['runs_left'] = delivery_df['total_runs_x'] - delivery_df['current_score']

### Get overs and balls data from ball column

In [23]:
delivery_df['over'] = delivery_df['ball'].astype(int)

In [24]:
delivery_df['ball'] = (delivery_df['ball'] * 10).astype(int) % 10

In [25]:
delivery_df['over'] = delivery_df['over'] + 1

### Calculate balls left 

In [27]:
delivery_df['balls_left'] = 126 - (delivery_df['over'] * 6 + delivery_df['ball'])

### Fill NAN values

In [28]:
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].fillna("0")
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].apply(lambda x: x if x == "0" else "1")
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].astype('int')

### Calculate wickets in particular match

In [29]:
wickets = delivery_df.groupby('match_id')['player_dismissed'].cumsum().values

### Calculate wicket left

In [30]:
delivery_df['wickets_left'] = 10 - wickets

### Calculate current runrate

In [31]:
delivery_df['current_rr'] = ((delivery_df['current_score'] * 6) / (120 - delivery_df['balls_left'])).round(2)

### Calculate required runrate

In [32]:
delivery_df['required_rr'] = ((delivery_df['runs_left'] * 6) / delivery_df['balls_left']).round(2)

### this function for calculate result of the math

In [33]:
def result(row):
    return 1 if row['batting_team'] == row['winner'] else 0

In [34]:
delivery_df['result'] = delivery_df.apply(result, axis=1)

In [35]:
delivery_df.columns

Index(['match_id', 'city', 'winner', 'total_runs_x', 'season', 'start_date',
       'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker',
       'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs',
       'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed',
       'other_wicket_type', 'other_player_dismissed', 'cricsheet_id',
       'total_runs_y', 'current_score', 'runs_left', 'over', 'balls_left',
       'wickets_left', 'current_rr', 'required_rr', 'result'],
      dtype='object')

### Take usefull column for project

In [36]:
final_df = delivery_df[['batting_team', 'bowling_team', 'city', 'runs_left', 'balls_left', 'wickets_left', 'total_runs_x', 'current_rr', 'required_rr', 'result']]

In [37]:
final_df = final_df.sample(final_df.shape[0])

In [38]:
final_df['batting_team'] = final_df['batting_team'].str.replace('Delhi Daredevils', 'Delhi Capitals')
final_df['bowling_team'] = final_df['bowling_team'].str.replace('Delhi Daredevils', 'Delhi Capitals')

final_df['batting_team'] = final_df['batting_team'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
final_df['bowling_team'] = final_df['bowling_team'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')

final_df['batting_team'] = final_df['batting_team'].str.replace('Gujarat Lions', 'Gujarat Titans')
final_df['bowling_team'] = final_df['bowling_team'].str.replace('Gujarat Lions', 'Gujarat Titans')

final_df['batting_team'] = final_df['batting_team'].str.replace('Kings XI Punjab', 'Punjab Kings')
final_df['bowling_team'] = final_df['bowling_team'].str.replace('Kings XI Punjab', 'Punjab Kings')

In [39]:
final_df = final_df[final_df['batting_team'].isin(teams)]
final_df = final_df[final_df['bowling_team'].isin(teams)]

In [40]:
final_df.sample()

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets_left,total_runs_x,current_rr,required_rr,result
197615,Mumbai Indians,Rajasthan Royals,Durban,67,41,5,153,6.53,9.8,0


In [41]:
final_df.shape

(106438, 10)

### Drop nan values

In [42]:
final_df.dropna(inplace=True)

In [43]:
final_df.isnull().sum()

batting_team    0
bowling_team    0
city            0
runs_left       0
balls_left      0
wickets_left    0
total_runs_x    0
current_rr      0
required_rr     0
result          0
dtype: int64

In [44]:
final_df = final_df[final_df['balls_left'] != 0]

### Split data into dependent variable and independent variable for model training

In [45]:
X = final_df.iloc[:,:-1]
y = final_df.iloc[:, -1]

### Split data into train and test data

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [47]:
X_train

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets_left,total_runs_x,current_rr,required_rr
207765,Rajasthan Royals,Punjab Kings,Chandigarh,43,1,3,227,9.28,258.00
61826,Delhi Capitals,Sunrisers Hyderabad,Abu Dhabi,38,16,5,170,7.62,14.25
177061,Royal Challengers Bangalore,Chennai Super Kings,Chennai,71,30,5,188,7.80,14.20
173642,Punjab Kings,Mumbai Indians,Mumbai,138,83,9,165,4.38,9.98
193220,Delhi Capitals,Punjab Kings,Chandigarh,26,15,6,156,7.43,10.40
...,...,...,...,...,...,...,...,...,...
138715,Delhi Capitals,Chennai Super Kings,Abu Dhabi,176,108,10,184,4.00,9.78
84584,Chennai Super Kings,Royal Challengers Bangalore,Pune,46,42,6,132,6.62,6.57
209700,Punjab Kings,Sunrisers Hyderabad,Chandigarh,21,14,6,181,9.06,9.00
168655,Mumbai Indians,Chennai Super Kings,Chennai,29,43,7,125,7.48,4.05


In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])
],
remainder='passthrough')

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

### Create pipe line for data transform and train model using logistic regression

In [50]:
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', LogisticRegression(solver='liblinear'))
])

In [51]:
pipe.fit(X_train, y_train)

### predict the model on test data

In [52]:
y_pred = pipe.predict(X_test)

### check accuracy score of the model

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8043065547561951

### find the probability of the result

In [54]:
pipe.predict_proba(X_test)[5]

array([0.00941881, 0.99058119])

In [55]:
final_df['city'].unique()

array(['Kolkata', 'Mumbai', 'Jaipur', 'Centurion', 'Bangalore', 'Rajkot',
       'Nagpur', 'Delhi', 'Johannesburg', 'Chandigarh', 'Ahmedabad',
       'Bengaluru', 'Chennai', 'Indore', 'Guwahati', 'Hyderabad',
       'Lucknow', 'Durban', 'Raipur', 'Navi Mumbai', 'Abu Dhabi',
       'Sharjah', 'Port Elizabeth', 'Pune', 'Dubai', 'Visakhapatnam',
       'Kimberley', 'Cuttack', 'Dharamsala', 'East London', 'Cape Town',
       'Kanpur', 'Ranchi', 'Bloemfontein'], dtype=object)

In [59]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100080 entries, 112967 to 99327
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   batting_team  100080 non-null  object 
 1   bowling_team  100080 non-null  object 
 2   city          100080 non-null  object 
 3   runs_left     100080 non-null  int64  
 4   balls_left    100080 non-null  int32  
 5   wickets_left  100080 non-null  int32  
 6   total_runs_x  100080 non-null  int64  
 7   current_rr    100080 non-null  float64
 8   required_rr   100080 non-null  float64
 9   result        100080 non-null  int64  
dtypes: float64(2), int32(2), int64(3), object(3)
memory usage: 7.6+ MB


In [56]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

In [58]:
pickle.dump(final_df, open('final_df.pkl', 'wb'))