# T20 Match Result Analysis & Prediction using ML

In [257]:
##importing the required libraries
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [258]:
##loading the csv files into dataframe
match_df = pd.read_csv("Data/matchest20-1.csv")
match_df.sample(10)

Unnamed: 0,season,Match_date,city,team1,team2,winner,toss_winner,toss_decision,win_by_runs,win_by_wickets
773,2019,2019/10/23,Abu Dhabi,Canada,Ireland,Canada,Canada,bat,10.0,
181,2011,2011/01/12,Adelaide,Australia,England,England,Australia,bat,,1.0
702,2019,2019/05/20,Kampala,Ghana,Namibia,Namibia,Namibia,field,,9.0
211,2012,2012/02/24,Mombasa,Kenya,Ireland,Ireland,Ireland,bat,2.0,
499,2016,2016/03/12,Nagpur,Afghanistan,Zimbabwe,Afghanistan,Afghanistan,bat,59.0,
233,2012,2012/07/18,Belfast,Ireland,Bangladesh,Bangladesh,Ireland,field,71.0,
358,2014,2014/03/23,Dhaka,India,West Indies,India,India,field,,7.0
528,2016,2016/06/22,Harare,Zimbabwe,India,India,Zimbabwe,field,3.0,
124,2010,2010/02/10,Dubai (DSC),Canada,Kenya,Kenya,Kenya,field,,9.0
479,2016,2016/02/26,Dhaka,Bangladesh,U.A.E.,Bangladesh,U.A.E.,field,51.0,


## Exploratory Data Analysis

* Looking at all the teams in the teams and winners' column
* Checking for missing values for the no result matches
* Encoding Team Names
* Visualizing the toss wins and match wins by each team


### Looking at all the teams in the teams and winners' column

In [259]:
match_df['winner'].value_counts()

Pakistan          91
India             87
South Africa      73
New Zealand       64
Sri Lanka         63
England           60
Australia         58
West Indies       55
Afghanistan       50
Netherlands       43
Ireland           38
Scotland          34
Bangladesh        29
Zimbabwe          21
Kenya             14
Namibia           11
Hong Kong          8
P.N.G.             7
U.A.E.             7
Canada             6
Singapore          4
Nepal              3
Oman               2
Cayman Islands     2
World-XI           1
Bermuda            1
U.S.A.             1
Name: winner, dtype: int64

In [260]:
match_df['team1'].value_counts()

Australia       81
New Zealand     80
South Africa    69
India           68
Afghanistan     66
England         63
Bangladesh      57
Ireland         48
Sri Lanka       47
West Indies     41
Pakistan        41
Netherlands     36
Zimbabwe        27
Kenya           21
Canada          14
U.A.E.          14
Bermuda         12
Namibia         10
Hong Kong       10
Scotland        10
Oman             7
Nepal            3
Ghana            2
Uganda           2
Singapore        2
Botswana         1
P.N.G.           1
Name: team1, dtype: int64

In [261]:
match_df[match_df['winner'].isnull() == True]


Unnamed: 0,season,Match_date,city,team1,team2,winner,toss_winner,toss_decision,win_by_runs,win_by_wickets


In [262]:
##replacing null results with draw
match_df['winner'].fillna('Draw', inplace=True)

In [263]:
match_df[match_df['winner'].isnull() == True]

Unnamed: 0,season,Match_date,city,team1,team2,winner,toss_winner,toss_decision,win_by_runs,win_by_wickets


### Encoding Team Names 

In [264]:

team_encodings = {
    'India' : 1,
    'New Zealand' :2,
    'Sri Lanka' :3,
    'Pakistan' :4,
    'England':5,
    'Australia' :6,
    'Bangladesh' :7,
    'South Africa' :8,
    'West Indies' :9,
    'Zimbabwe' :10,
    'Ireland' :11,
    'Afghanistan' :12,
    'Kenya' :13,
    'Scotland' :14,
    'Netherlands' :15,
    'Ireland' :16,
    'Bermuda' :17,
    'Namibia' :18,
    'Canada' :19,
    'U.A.E.' :20,
    'Hong Kong' :21,
    'Nepal' :22,
    'P.N.G.' :23,
    'Oman' :24,
    'World-XI' :25,
    'Namibia' :26,
    'Nigeria' :27,
    'U.S.A.' :28,
    'Botswana' :29,
    'Cayman Islands' :30,
    'Singapore' :31,
    'Jersey' :32,
    'Ghana':33,
    'Uganda':34

    
}

team_encode_dict = {'team1': team_encodings,
                    'team2': team_encodings,
                    'toss_winner': team_encodings,
                    'winner': team_encodings
                   }

match_df.replace(team_encode_dict, inplace=True)
match_df.head()

Unnamed: 0,season,Match_date,city,team1,team2,winner,toss_winner,toss_decision,win_by_runs,win_by_wickets
0,2005,2005/02/17,Auckland,2,6,6,6,bat,44.0,
1,2005,2005/06/13,Southampton,5,6,5,5,bat,100.0,
2,2005,2005/10/21,Johannesburg,8,2,2,2,bat,,5.0
3,2006,2006/01/09,Brisbane,6,8,6,6,bat,95.0,
4,2006,2006/02/24,Johannesburg,8,6,8,8,bat,2.0,


### Exploring City Column

In [265]:
match_df['city'].value_counts()

Dubai (DSC)      59
Dhaka            43
Colombo (RPS)    36
Johannesburg     29
Abu Dhabi        28
                 ..
Kimberley         1
Geelong           1
Ahmedabad         1
Dublin            1
St George's       1
Name: city, Length: 109, dtype: int64

### Dropping all the redundant columns

In [266]:
match_df = match_df[[ 'team1','team2','city','toss_decision','toss_winner','winner']]
match_df.head()

Unnamed: 0,team1,team2,city,toss_decision,toss_winner,winner
0,2,6,Auckland,bat,6,6
1,5,6,Southampton,bat,5,5
2,8,2,Johannesburg,bat,2,2
3,6,8,Brisbane,bat,6,6
4,8,6,Johannesburg,bat,8,8


In [267]:
match_df.describe()

Unnamed: 0,team1,team2,toss_winner,winner
count,833.0,833.0,833.0,833.0
mean,8.570228,8.757503,8.809124,7.990396
std,6.198261,6.597914,6.405625,6.067521
min,1.0,1.0,1.0,1.0
25%,4.0,4.0,4.0,3.0
50%,7.0,7.0,7.0,6.0
75%,12.0,13.0,13.0,12.0
max,34.0,32.0,33.0,31.0


### Toss Wins and Match Wins by each Team

In [268]:
##looking at number of toss wins and match wins
toss_wins = match_df['winner'].value_counts(sort=True)
match_wins = match_df['winner'].value_counts(sort=True)

for idx, val in toss_wins.iteritems():
    print(f"{list(team_encode_dict['winner'].keys())[idx-1]} -> {toss_wins[idx]}")


Pakistan -> 91
India -> 87
South Africa -> 73
New Zealand -> 64
Sri Lanka -> 63
England -> 60
Australia -> 58
West Indies -> 55
Afghanistan -> 50
Netherlands -> 43
Bermuda -> 38
Scotland -> 34
Bangladesh -> 29
Zimbabwe -> 21
Kenya -> 14
U.S.A. -> 11
Nepal -> 8
Oman -> 7
Hong Kong -> 7
U.A.E. -> 6
Ghana -> 4
P.N.G. -> 3
World-XI -> 2
Jersey -> 2
Nigeria -> 1
Namibia -> 1
Cayman Islands -> 1


## Data Preparation

**Encoding all the remaining features**

* city
* toss_decision
* venue

In [269]:
##using the label encoder
from sklearn.preprocessing import LabelEncoder

ftr_list = ['city', 'toss_decision']
encoder = LabelEncoder()
for ftr in ftr_list:
    match_df[ftr] = encoder.fit_transform(match_df[ftr])
    print(encoder.classes_)

match_df

['Aberdeen' 'Abu Dhabi' 'Adelaide' 'Ahmedabad' 'Al Amerat' 'Amstelveen'
 'Auckland' 'Basseterre' 'Belfast' 'Bengaluru' 'Birmingham' 'Bloemfontein'
 'Bready' 'Bridgetown' 'Brisbane' 'Bristol' 'Bulawayo' 'Canberra'
 'Cape Town' 'Cardiff' 'Carrara' 'Centurion' 'Chattogram' 'Chennai'
 'Chester-le-Street' 'Christchurch' 'Colombo (PSS)' 'Colombo (RPS)'
 'Colombo (SSC)' 'Cuttack' 'Dehradun' 'Delhi' 'Deventer' 'Dhaka'
 'Dharamsala' 'Dubai (DSC)' 'Dublin' 'Dublin (Malahide)' 'Durban'
 'East London' 'Edinburgh' 'Fatullah' 'Geelong' 'Greater Noida'
 'Gros Islet' 'Guwahati' 'Hambantota' 'Hamilton' 'Harare' 'Hobart'
 'Hyderabad (Deccan)' 'ICCA 2 Dubai' 'ICCA Dubai' 'Indore' 'Johannesburg'
 'Kampala' 'Kanpur' 'Karachi' 'Khulna' 'Kimberley' 'King City (NW)'
 'Kingston' 'Kingstown' 'Kolkata' 'Lahore' 'Lauderhill' "Lord's" 'Lucknow'
 'Manchester' 'Melbourne' 'Mohali' 'Mombasa' 'Mong Kok' 'Mount Maunganui'
 'Mumbai' 'Mumbai (BS)' 'Nagpur' 'Nairobi (Gym)' 'Napier' 'Nelson'
 'North Sound' 'Nottingham' 'Pa

Unnamed: 0,team1,team2,city,toss_decision,toss_winner,winner
0,2,6,6,0,6,6
1,5,6,96,0,5,5
2,8,2,54,0,2,2
3,6,8,14,0,6,6
4,8,6,54,0,8,8
...,...,...,...,...,...,...
828,12,16,43,0,16,16
829,3,9,82,1,9,9
830,12,16,43,0,12,12
831,7,10,33,0,10,10


## Machine Learning

In [270]:
##splitting the data for training and testing

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(match_df, test_size=0.2, random_state=42)
print(train_df.shape)
print(test_df.shape)

(666, 6)
(167, 6)


In [271]:
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


def print_model_scores(model, data, predictors, target):
    '''
    A generic function to generate the performance report of the
    model in question on the data passed to it using cross-validation
    
    Args:
        model: ML Model to be checked
        data: data on which the model needs to pe trained
        predictors: independent feature variable
        target: target variable
    '''
    model.fit(data[predictors], data[target])
    predictions = model.predict(data[predictors])
    accuracy = accuracy_score(predictions,data[target])
    print('Accuracy : %s' % '{0:.2%}'.format(accuracy))
    scores = cross_val_score(model, data[predictors], data[target], scoring="neg_mean_squared_error", cv=5)
    print('Cross-Validation Score :{}'.format(np.sqrt(-scores)))
    print(f"Average RMSE: {np.sqrt(-scores).mean()}")

### Logistic Regression

In [272]:
##PAVITHRANI:WE WILL BE USING THE RANDOM FORREST ONE COZ OF HIGHER ACCURACY
target_var=['winner']
predictor_var = ['team1','team2', 'toss_winner', 'city', 'toss_decision']
model = LogisticRegression()
print_model_scores(model, train_df, predictor_var, target_var)

Accuracy : 24.17%
Cross-Validation Score :[6.04522755 5.18021104 5.54679342 5.2041048  5.03372088]
Average RMSE: 5.40201153889536


### Random Forest Classifier

In [273]:
model1 = RandomForestClassifier(n_estimators=100)
target_var = ['winner']
predictor_var = ['team1', 'team2', 'toss_winner','city','toss_decision']
print_model_scores(model1, match_df,predictor_var, target_var)

Accuracy : 94.60%
Cross-Validation Score :[3.54441451 3.53172109 3.26017523 3.68405879 3.41682584]
Average RMSE: 3.4874390923705065


In [274]:
team1='Australia'
team2='India'
toss_winner='India'
inp = [team_encode_dict['team1'][team1],team_encode_dict['team2'][team2],team_encode_dict['toss_winner'][toss_winner],'5','2']
inp = np.array(inp).reshape((1, -1))
print(inp)
output=model1.predict(inp)
print(output)
print(f"The winner would be: {list(team_encodings.keys())[list(team_encode_dict['team1'].values()).index(output)]}")

[['6' '1' '1' '5' '2']]
[1]
The winner would be: India


In [275]:
import pickle 
pickle.dump(model1,open('model.pkl','wb'))