In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the two CSV inputs from the working directory: the per-delivery
# records (df_del) and the per-match records (df_mat).
csv_names = ('deliveries.csv', 'matches.csv')
df_del, df_mat = (pd.read_csv(name) for name in csv_names)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line after each report.
df_del.info()
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150460 entries, 0 to 150459
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          150460 non-null  int64 
 1   inning            150460 non-null  int64 
 2   batting_team      150460 non-null  object
 3   bowling_team      150460 non-null  object
 4   over              150460 non-null  int64 
 5   ball              150460 non-null  int64 
 6   batsman           150460 non-null  object
 7   non_striker       150460 non-null  object
 8   bowler            150460 non-null  object
 9   is_super_over     150460 non-null  int64 
 10  wide_runs         150460 non-null  int64 
 11  bye_runs          150460 non-null  int64 
 12  legbye_runs       150460 non-null  int64 
 13  noball_runs       150460 non-null  int64 
 14  penalty_runs      150460 non-null  int64 
 15  batsman_runs      150460 non-null  int64 
 16  extra_runs        150460 non-null  int

In [4]:
# Attach the match-level columns to every delivery row by joining on the
# shared match identifier (default inner join).
df = df_del.merge(df_mat, on="match_id")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150460 entries, 0 to 150459
Data columns (total 38 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   match_id          150460 non-null  int64  
 1   inning            150460 non-null  int64  
 2   batting_team      150460 non-null  object 
 3   bowling_team      150460 non-null  object 
 4   over              150460 non-null  int64  
 5   ball              150460 non-null  int64  
 6   batsman           150460 non-null  object 
 7   non_striker       150460 non-null  object 
 8   bowler            150460 non-null  object 
 9   is_super_over     150460 non-null  int64  
 10  wide_runs         150460 non-null  int64  
 11  bye_runs          150460 non-null  int64  
 12  legbye_runs       150460 non-null  int64  
 13  noball_runs       150460 non-null  int64  
 14  penalty_runs      150460 non-null  int64  
 15  batsman_runs      150460 non-null  int64  
 16  extra_runs        15

In [5]:
# Number of delivery rows per batting side, most frequent first.
batting_team_counts = df['batting_team'].value_counts()
batting_team_counts

Mumbai Indians                 18943
Royal Challengers Bangalore    17678
Kings XI Punjab                17594
Kolkata Knight Riders          17229
Delhi Daredevils               17185
Chennai Super Kings            15754
Rajasthan Royals               13914
Sunrisers Hyderabad             9058
Deccan Chargers                 9034
Pune Warriors                   5443
Gujarat Lions                   3566
Rising Pune Supergiant          1900
Kochi Tuskers Kerala            1582
Rising Pune Supergiants         1580
Name: batting_team, dtype: int64

In [6]:
# sort_values() returns a new frame (it is not in-place by default), so the
# result has to be assigned back for the ordering to take effect.
df = df.sort_values(by=['match_id', 'inning', 'over', 'ball'])

# Keep only matches played between the teams listed below; any match that
# involves another side (e.g. the Pune and Gujarat teams) is removed.
teams_for_now = ['Sunrisers Hyderabad', 'Royal Challengers Bangalore',
        'Mumbai Indians','Kolkata Knight Riders', 'Kings XI Punjab',
        'Delhi Daredevils','Chennai Super Kings', 'Rajasthan Royals']

# A row survives only if every team-valued column holds a listed team.
team_columns = ['batting_team', 'bowling_team', 'team1', 'team2']
df = df[df[team_columns].isin(teams_for_now).all(axis=1)]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106567 entries, 0 to 150459
Data columns (total 38 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   match_id          106567 non-null  int64  
 1   inning            106567 non-null  int64  
 2   batting_team      106567 non-null  object 
 3   bowling_team      106567 non-null  object 
 4   over              106567 non-null  int64  
 5   ball              106567 non-null  int64  
 6   batsman           106567 non-null  object 
 7   non_striker       106567 non-null  object 
 8   bowler            106567 non-null  object 
 9   is_super_over     106567 non-null  int64  
 10  wide_runs         106567 non-null  int64  
 11  bye_runs          106567 non-null  int64  
 12  legbye_runs       106567 non-null  int64  
 13  noball_runs       106567 non-null  int64  
 14  penalty_runs      106567 non-null  int64  
 15  batsman_runs      106567 non-null  int64  
 16  extra_runs        10

In [7]:
# Discard rows with no recorded winner: they cannot serve as training labels.
df = df.dropna(subset=['winner'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106309 entries, 0 to 150459
Data columns (total 38 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   match_id          106309 non-null  int64  
 1   inning            106309 non-null  int64  
 2   batting_team      106309 non-null  object 
 3   bowling_team      106309 non-null  object 
 4   over              106309 non-null  int64  
 5   ball              106309 non-null  int64  
 6   batsman           106309 non-null  object 
 7   non_striker       106309 non-null  object 
 8   bowler            106309 non-null  object 
 9   is_super_over     106309 non-null  int64  
 10  wide_runs         106309 non-null  int64  
 11  bye_runs          106309 non-null  int64  
 12  legbye_runs       106309 non-null  int64  
 13  noball_runs       106309 non-null  int64  
 14  penalty_runs      106309 non-null  int64  
 15  batsman_runs      106309 non-null  int64  
 16  extra_runs        10

In [8]:
# Count of missing values in each remaining column.
df.isna().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
is_super_over            0
wide_runs                0
bye_runs                 0
legbye_runs              0
noball_runs              0
penalty_runs             0
batsman_runs             0
extra_runs               0
total_runs               0
player_dismissed    101075
dismissal_kind      101075
fielder             102533
season                   0
city                  1700
date                     0
team1                    0
team2                    0
toss_winner              0
toss_decision            0
result                   0
dl_applied               0
winner                   0
win_by_runs              0
win_by_wickets           0
player_of_match          0
venue                    0
umpire1                248
umpire2                248
u

In [9]:
# Columns that came from the deliveries table and are not used by the model.
delivery_cols = [
    'match_id', 'over', 'ball', 'wide_runs', 'batsman', 'non_striker', 'bowler',
    'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
    'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
    'dismissal_kind', 'fielder',
]
# Columns that came from the matches table and are not used by the model.
match_cols = [
    'city', 'date', 'team1', 'team2', 'result',
    'win_by_runs', 'win_by_wickets', 'player_of_match',
    'dl_applied', 'umpire1', 'umpire2', 'umpire3', 'season',
]

# What remains: inning, batting_team, bowling_team, is_super_over,
# toss_winner, toss_decision, venue and the target column winner.
df = df.drop(columns=delivery_cols + match_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106309 entries, 0 to 150459
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   inning         106309 non-null  int64 
 1   batting_team   106309 non-null  object
 2   bowling_team   106309 non-null  object
 3   is_super_over  106309 non-null  int64 
 4   toss_winner    106309 non-null  object
 5   toss_decision  106309 non-null  object
 6   winner         106309 non-null  object
 7   venue          106309 non-null  object
dtypes: int64(2), object(6)
memory usage: 7.3+ MB


In [10]:
ftr_list = ['batting_team', 'bowling_team', 'toss_winner', 'toss_decision', 'winner', 'venue']

# Columns whose values are team names. Refitting one LabelEncoder per column
# derives each mapping from that column's own set of values, so the same team
# is not guaranteed the same integer in every column. Fitting a single encoder
# on the union of the team columns gives them one shared code.
team_ftrs = ['batting_team', 'bowling_team', 'toss_winner', 'winner']
team_encoder = LabelEncoder().fit(pd.unique(df[team_ftrs].to_numpy().ravel()))

# Keep every fitted encoder so the integer codes can be mapped back to the
# original strings later with encoders[column].inverse_transform(...).
encoders = {}
for ftr in ftr_list:
    encoder = team_encoder if ftr in team_ftrs else LabelEncoder().fit(df[ftr])
    df[ftr] = encoder.transform(df[ftr])
    encoders[ftr] = encoder

In [11]:
# Separate the model inputs from the prediction target (the encoded winner).
target_col = 'winner'
features = df.drop(columns=[target_col])
labels = df[target_col]

In [12]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
# NOTE(review): each row is a single delivery, but the remaining feature
# columns hold the same values for every delivery of a given match innings.
# A row-level random split therefore places near-identical rows of the same
# match in both partitions, which is likely to inflate the test score.
# Consider splitting by match instead (this needs match_id to still be available).
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

In [13]:
# Fix the forest's seed so that re-running the notebook reproduces the same
# model and therefore the same score (the split above is already seeded).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(train_features, train_labels)

# Score on the held-out rows. scikit-learn metrics take (y_true, y_pred);
# accuracy is symmetric, but the documented order keeps this call consistent
# with metrics where the order matters.
pred_test = clf.predict(test_features)
print(accuracy_score(test_labels, pred_test))

0.8684507572194525
