# **Setup**

## Setting up Kaggle & downloading datasets

In [None]:
from google.colab import files
print('Upload your Kaggle API token\n')
files.upload()

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json


Upload your Kaggle API token



Saving kaggle.json to kaggle.json
- path is now set to: {/content}


In [None]:
!kaggle datasets download -d patrickb1912/ipl-complete-dataset-20082020 -p /content #Downloading dataset

!unzip \*.zip  && rm *.zip #Unzipping the zip files and deleting the zip files

Downloading ipl-complete-dataset-20082020.zip to /content
  0% 0.00/1.37M [00:00<?, ?B/s]
100% 1.37M/1.37M [00:00<00:00, 46.0MB/s]
Archive:  ipl-complete-dataset-20082020.zip
  inflating: IPL Ball-by-Ball 2008-2020.csv  
  inflating: IPL Matches 2008-2020.csv  


## Updating & importing libraries

In [None]:
!pip install seaborn==0.11.0



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from math import sqrt

  import pandas.util.testing as tm


In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# **Functions**

## Number of records with missing entries

In [None]:
def n_rec_missing(df):
  """Count the records (rows) of ``df`` that have at least one missing entry.

  Parameters
  ----------
  df : pandas.DataFrame
    Table to inspect.

  Returns
  -------
  int
    Number of rows containing one or more null values (0 for an empty frame).
  """
  # Vectorised row-wise null test; avoids building one Series per row with
  # df.iloc[i], which is very slow on the ~200k-row ball-by-ball table.
  return int(df.isnull().any(axis=1).sum())

## Entropy Computation

In [None]:
def entropy(ds):
  """Return the Shannon entropy, in bits, of the empirical distribution of ``ds``.

  Each distinct value of ``ds`` is treated as an outcome whose probability is
  its relative frequency in ``ds``.
  """
  total = len(ds)
  # Only the counts matter; the distinct values themselves are discarded.
  _, counts = np.unique(ds, return_counts=True)
  return stats.entropy(counts/total, base=2)

## Diversity of data

In [None]:
def diversity(ds):
  """Print a qualitative diversity rating for the values of ``ds``.

  The rating is the entropy of ``ds`` divided by the largest entropy possible
  for its number of distinct values (log2 of ``ds.nunique()``). Nothing is
  returned; the rating is printed.
  """
  observed = entropy(ds)
  ceiling = np.log2(ds.nunique())

  # A single distinct value gives a ceiling of zero: nothing to normalise by.
  if ceiling == 0:
    print('\nThere is no diversity in the data')
    return

  ratio = observed/ceiling
  if ratio < 0.3:
    message = '\nThere is less diversity in the data'
  elif ratio < 0.7:
    message = '\nThere is some diversity in the data'
  elif ratio < 0.98:
    message = '\nThere is high diversity in the data'
  else:
    message = '\nThere is very high diversity in the data'
  print(message)

## Displaying Mode, Entropy, and Diversity

In [None]:
def disp_modeEnt(df,var):
  """Print the mode, the entropy and the diversity rating of column ``var`` of ``df``."""
  print("Mode of the variable '"+var+"':")
  column = df[var]
  print(column.mode())

  entropy_label = "\nEntropy of the variable '"+var+"':"
  print(entropy_label, entropy(column))

  diversity(column)

## Computing important parameters of distributions of continuous random variables

In [None]:
def cont_par_calc(ds):
  """Print summary statistics of ``ds``, one per line.

  Printed in order: mean, variance, skew, min, max, median, 25th and 75th
  percentiles, and the difference between those two percentiles.
  """
  # Each entry is evaluated lazily so a statistic is computed right before
  # its own line is printed.
  summaries = (
    ("Mean:", lambda: ds.mean()),
    ("Variance:", lambda: ds.var()),
    ("Skew:", lambda: ds.skew()),
    ("Min:", lambda: ds.min()),
    ("Max:", lambda: ds.max()),
    ("Median:", lambda: ds.median()),
    ("25th percentile:", lambda: ds.quantile(0.25)),
    ("75th percentile:", lambda: ds.quantile(0.75)),
    ("Inter-quantile range:", lambda: (ds.quantile(0.75)-ds.quantile(0.25))),
  )
  for label, compute in summaries:
    print(label, compute())

## Checking if variable's distribution is log-normal

In [None]:
def var_logNormal(ds):
  """Show a normal Q-Q plot of ``log(ds)`` to judge whether ``ds`` looks log-normal.

  The logarithm is only defined for strictly positive values, so the plot is
  drawn only when every non-missing value of ``ds`` is greater than zero;
  otherwise a message is printed and nothing is plotted.

  Parameters
  ----------
  ds : pandas.Series or array-like of numbers
    Sample to inspect.
  """
  # The previous `ds.all()` test only rejected zeros, so negative values
  # reached np.log and produced NaNs. Missing values are ignored by the check,
  # as they were before.
  observed = pd.Series(ds).dropna()

  if (observed > 0).all():
    sm.qqplot(np.log(ds))
    plt.show()
  else:
    print("Variable's distribution is not log normal")

# **IPL Data**

## Preliminaries

### Loading the file

In [None]:
# Load the two CSV files extracted from the Kaggle archive (paths are relative
# to the current working directory).
df_bbb = pd.read_csv('IPL Ball-by-Ball 2008-2020.csv')
df_matches = pd.read_csv('IPL Matches 2008-2020.csv')

### Display the dataset

#### Ball-by-ball

In [None]:
display(df_bbb)

Unnamed: 0,id,inning,over,ball,batsman,non_striker,bowler,batsman_runs,extra_runs,total_runs,non_boundary,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,batting_team,bowling_team
0,335982,1,6,5,RT Ponting,BB McCullum,AA Noffke,1,0,1,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
1,335982,1,6,6,BB McCullum,RT Ponting,AA Noffke,1,0,1,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
2,335982,1,7,1,BB McCullum,RT Ponting,Z Khan,0,0,0,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
3,335982,1,7,2,BB McCullum,RT Ponting,Z Khan,1,0,1,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
4,335982,1,7,3,RT Ponting,BB McCullum,Z Khan,1,0,1,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193463,1237181,1,12,5,RR Pant,SS Iyer,NM Coulter-Nile,0,0,0,0,0,,,,,Delhi Capitals,Mumbai Indians
193464,1237181,1,12,6,RR Pant,SS Iyer,NM Coulter-Nile,1,0,1,0,0,,,,,Delhi Capitals,Mumbai Indians
193465,1237181,1,13,1,RR Pant,SS Iyer,KH Pandya,0,1,1,0,0,,,,wides,Delhi Capitals,Mumbai Indians
193466,1237181,1,13,2,RR Pant,SS Iyer,KH Pandya,1,0,1,0,0,,,,,Delhi Capitals,Mumbai Indians


#### Matches

In [None]:
display(df_matches)

Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,,Asad Rauf,RE Koertzen
1,335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,,MR Benson,SL Shastri
2,335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,N,,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,,SJ Davis,DJ Harper
4,335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,N,,BF Bowden,K Hariharan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,1216547,Dubai,2020-09-28,AB de Villiers,Dubai International Cricket Stadium,0,Royal Challengers Bangalore,Mumbai Indians,Mumbai Indians,field,Royal Challengers Bangalore,tie,,Y,,Nitin Menon,PR Reiffel
812,1237177,Dubai,2020-11-05,JJ Bumrah,Dubai International Cricket Stadium,0,Mumbai Indians,Delhi Capitals,Delhi Capitals,field,Mumbai Indians,runs,57.0,N,,CB Gaffaney,Nitin Menon
813,1237178,Abu Dhabi,2020-11-06,KS Williamson,Sheikh Zayed Stadium,0,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Sunrisers Hyderabad,wickets,6.0,N,,PR Reiffel,S Ravi
814,1237180,Abu Dhabi,2020-11-08,MP Stoinis,Sheikh Zayed Stadium,0,Delhi Capitals,Sunrisers Hyderabad,Delhi Capitals,bat,Delhi Capitals,runs,17.0,N,,PR Reiffel,S Ravi


### Number of records & variables

#### Ball-by-ball

In [None]:
# shape is (rows, columns): rows are records, columns are variables.
print(f'Number of records = {df_bbb.shape[0]}')
print(f'Number of variables = {df_bbb.shape[1]}')

Number of records = 193468
Number of variables = 18


#### Matches

In [None]:
# shape is (rows, columns): rows are records, columns are variables.
print(f'Number of records = {df_matches.shape[0]}')
print(f'Number of variables = {df_matches.shape[1]}')

Number of records = 816
Number of variables = 17


### Datatypes, unique values, and number of missing entries

#### Ball-by-ball

In [None]:
# Per-column summary: dtype, number of distinct values (a missing value counts
# as one distinct value here) and number of null entries.
for var in df_bbb.columns.values:
  uni_vals = pd.unique(df_bbb[var])
  null_count = np.sum(pd.isnull(df_bbb[var]))
  print(f"{var} is of type {df_bbb[var].dtypes}, has {len(uni_vals)} unique values, and {null_count} null entries", end='\n\n\n')

complete_records = df_bbb.shape[0]-n_rec_missing(df_bbb)
print('Number of records with no missing entries:',complete_records)

id is of type int64, has 816 unique values, and 0 null entries


inning is of type int64, has 2 unique values, and 0 null entries


over is of type int64, has 20 unique values, and 0 null entries


ball is of type int64, has 9 unique values, and 0 null entries


batsman is of type object, has 537 unique values, and 0 null entries


non_striker is of type object, has 530 unique values, and 0 null entries


bowler is of type object, has 420 unique values, and 0 null entries


batsman_runs is of type int64, has 7 unique values, and 0 null entries


extra_runs is of type int64, has 7 unique values, and 0 null entries


total_runs is of type int64, has 8 unique values, and 0 null entries


non_boundary is of type int64, has 2 unique values, and 0 null entries


is_wicket is of type int64, has 2 unique values, and 0 null entries


dismissal_kind is of type object, has 10 unique values, and 183973 null entries


player_dismissed is of type object, has 507 unique values, and 183973 null entrie

#### Matches

In [None]:
# Per-column summary: dtype, number of distinct values (a missing value counts
# as one distinct value here) and number of null entries.
for var in df_matches.columns.values:
  uni_vals = pd.unique(df_matches[var])
  null_count = np.sum(pd.isnull(df_matches[var]))
  print(f"{var} is of type {df_matches[var].dtypes}, has {len(uni_vals)} unique values, and {null_count} null entries", end='\n\n\n')

complete_records = df_matches.shape[0]-n_rec_missing(df_matches)
print('Number of records with no missing entries:',complete_records)

id is of type int64, has 816 unique values, and 0 null entries


city is of type object, has 33 unique values, and 13 null entries


date is of type object, has 596 unique values, and 0 null entries


player_of_match is of type object, has 234 unique values, and 4 null entries


venue is of type object, has 36 unique values, and 0 null entries


neutral_venue is of type int64, has 2 unique values, and 0 null entries


team1 is of type object, has 15 unique values, and 0 null entries


team2 is of type object, has 15 unique values, and 0 null entries


toss_winner is of type object, has 15 unique values, and 0 null entries


toss_decision is of type object, has 2 unique values, and 0 null entries


winner is of type object, has 16 unique values, and 4 null entries


result is of type object, has 4 unique values, and 4 null entries


result_margin is of type float64, has 92 unique values, and 17 null entries


eliminator is of type object, has 3 unique values, and 4 null entries


method

### Identifying the type of variables



####Ball-by-ball

In [None]:
# Manual classification of the ball-by-ball columns by measurement type.
# NOTE(review): 'id' is a match identifier; listing it as continuous looks
# questionable — confirm the intended classification.
nominal = ['batsman','non_striker','bowler','non_boundary','is_wicket','dismissal_kind','player_dismissed','fielder','extras_type','batting_team','bowling_team']
ordinal = ['inning']
temporal = []
integer = ['id','inning','over','ball','batsman_runs','extra_runs','total_runs','non_boundary','is_wicket']
discrete = ['inning','batsman','non_striker','bowler','non_boundary','is_wicket','dismissal_kind','player_dismissed','fielder','extras_type','batting_team','bowling_team']
continuous = ['id','over','ball','batsman_runs','extra_runs','total_runs']

print('Nominal Variables:')
display(df_bbb[nominal])
print('\nOrdinal Variables:')
display(df_bbb[ordinal])
print('\nTemporal Variables:')
display(df_bbb[temporal])
print('\nInteger Variables:')
display(df_bbb[integer])
# The discrete group was defined above but never shown; display it like the others.
print('\nDiscrete Variables:')
display(df_bbb[discrete])
print('\nContinuous Variables:')
display(df_bbb[continuous])

Nominal Variables:


Unnamed: 0,batsman,non_striker,bowler,non_boundary,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,batting_team,bowling_team
0,RT Ponting,BB McCullum,AA Noffke,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
1,BB McCullum,RT Ponting,AA Noffke,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
2,BB McCullum,RT Ponting,Z Khan,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
3,BB McCullum,RT Ponting,Z Khan,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
4,RT Ponting,BB McCullum,Z Khan,0,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore
...,...,...,...,...,...,...,...,...,...,...,...
193463,RR Pant,SS Iyer,NM Coulter-Nile,0,0,,,,,Delhi Capitals,Mumbai Indians
193464,RR Pant,SS Iyer,NM Coulter-Nile,0,0,,,,,Delhi Capitals,Mumbai Indians
193465,RR Pant,SS Iyer,KH Pandya,0,0,,,,wides,Delhi Capitals,Mumbai Indians
193466,RR Pant,SS Iyer,KH Pandya,0,0,,,,,Delhi Capitals,Mumbai Indians



Ordinal Variables:


Unnamed: 0,inning
0,1
1,1
2,1
3,1
4,1
...,...
193463,1
193464,1
193465,1
193466,1



Temporal Variables:


0
1
2
3
4
...
193463
193464
193465
193466
193467



Integer Variables:


Unnamed: 0,id,inning,over,ball,batsman_runs,extra_runs,total_runs,non_boundary,is_wicket
0,335982,1,6,5,1,0,1,0,0
1,335982,1,6,6,1,0,1,0,0
2,335982,1,7,1,0,0,0,0,0
3,335982,1,7,2,1,0,1,0,0
4,335982,1,7,3,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
193463,1237181,1,12,5,0,0,0,0,0
193464,1237181,1,12,6,1,0,1,0,0
193465,1237181,1,13,1,0,1,1,0,0
193466,1237181,1,13,2,1,0,1,0,0



Continuous Variables:


Unnamed: 0,id,over,ball,batsman_runs,extra_runs,total_runs
0,335982,6,5,1,0,1
1,335982,6,6,1,0,1
2,335982,7,1,0,0,0
3,335982,7,2,1,0,1
4,335982,7,3,1,0,1
...,...,...,...,...,...,...
193463,1237181,12,5,0,0,0
193464,1237181,12,6,1,0,1
193465,1237181,13,1,0,1,1
193466,1237181,13,2,1,0,1


#### Matches

In [None]:
# Manual classification of the matches columns by measurement type.
# NOTE(review): 'id' is a match identifier; listing it as continuous looks
# questionable — confirm the intended classification.
nominal = ['city','player_of_match','venue','neutral_venue','team1','team2','toss_winner','toss_decision','winner','result','eliminator','method','umpire1','umpire2']
ordinal = []
temporal = ['date']
integer = ['id','neutral_venue','result_margin']
discrete = ['city','player_of_match','venue','neutral_venue','team1','team2','toss_winner','toss_decision','winner','result','eliminator','method','umpire1','umpire2']
continuous = ['id','result_margin']

print('Nominal Variables:')
display(df_matches[nominal])
print('\nOrdinal Variables:')
display(df_matches[ordinal])
print('\nTemporal Variables:')
display(df_matches[temporal])
print('\nInteger Variables:')
display(df_matches[integer])
# The discrete group was defined above but never shown; display it like the others.
print('\nDiscrete Variables:')
display(df_matches[discrete])
print('\nContinuous Variables:')
display(df_matches[continuous])

Nominal Variables:


Unnamed: 0,city,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,eliminator,method,umpire1,umpire2
0,Bangalore,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,N,,Asad Rauf,RE Koertzen
1,Chandigarh,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,N,,MR Benson,SL Shastri
2,Delhi,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,N,,Aleem Dar,GA Pratapkumar
3,Mumbai,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,N,,SJ Davis,DJ Harper
4,Kolkata,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,N,,BF Bowden,K Hariharan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,Dubai,AB de Villiers,Dubai International Cricket Stadium,0,Royal Challengers Bangalore,Mumbai Indians,Mumbai Indians,field,Royal Challengers Bangalore,tie,Y,,Nitin Menon,PR Reiffel
812,Dubai,JJ Bumrah,Dubai International Cricket Stadium,0,Mumbai Indians,Delhi Capitals,Delhi Capitals,field,Mumbai Indians,runs,N,,CB Gaffaney,Nitin Menon
813,Abu Dhabi,KS Williamson,Sheikh Zayed Stadium,0,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Sunrisers Hyderabad,wickets,N,,PR Reiffel,S Ravi
814,Abu Dhabi,MP Stoinis,Sheikh Zayed Stadium,0,Delhi Capitals,Sunrisers Hyderabad,Delhi Capitals,bat,Delhi Capitals,runs,N,,PR Reiffel,S Ravi



Ordinal Variables:


0
1
2
3
4
...
811
812
813
814
815



Temporal Variables:


Unnamed: 0,date
0,2008-04-18
1,2008-04-19
2,2008-04-19
3,2008-04-20
4,2008-04-20
...,...
811,2020-09-28
812,2020-11-05
813,2020-11-06
814,2020-11-08



Integer Variables:


Unnamed: 0,id,neutral_venue,result_margin
0,335982,0,140.0
1,335983,0,33.0
2,335984,0,9.0
3,335985,0,5.0
4,335986,0,5.0
...,...,...,...
811,1216547,0,
812,1237177,0,57.0
813,1237178,0,6.0
814,1237180,0,17.0



Continuous Variables:


Unnamed: 0,id,result_margin
0,335982,140.0
1,335983,33.0
2,335984,9.0
3,335985,5.0
4,335986,5.0
...,...,...
811,1216547,
812,1237177,57.0
813,1237178,6.0
814,1237180,17.0


## Given the score after the first 10 overs, predict the over in which the team is likely to score most runs

Since this would depend on whether the team is batting first or chasing, we would have 2 cases. For simplicity, we will consider the case of a team batting first as this problem is less complex.

We will be considering the 2020 IPL season. Since there are only 3 venues in this season, much more data is available per venue and a stronger model can be made.

### Creating the datasets to be used

In [None]:
# Keep only the rows whose match id is at least 1216492; the matches frame is
# re-indexed by match id so it can be aligned with per-match aggregates.
# NOTE(review): presumably 1216492 is the first match id of the 2020 season and
# ids grow with time, so this isolates the 2020 season — confirm, or filter on `date`.
df_bbb20 = df_bbb[df_bbb['id']>=1216492]
df_matches20 = df_matches[df_matches['id']>=1216492].set_index('id')

In [None]:
# Deliveries of the first innings only (the team batting first).
df_bat20 = df_bbb20[df_bbb20['inning']==1]

####Finding score after 10 overs

In [None]:
# Per match: total runs and wickets over the deliveries with `over` <= 10,
# keeping the batting and bowling team as ordinary columns (index = match id).
# NOTE(review): `over <= 10` is exactly ten overs only if `over` is numbered
# 1-20. The over_maxruns values printed later in this notebook never exceed 19,
# which suggests 0-based numbering (0-19); in that case this filter covers
# eleven overs and should be `< 10` — confirm against the data.
df_10overs20 = df_bat20[df_bat20['over']<=10]
df_10overs20 = df_10overs20[['id','batting_team','bowling_team','total_runs','is_wicket']].set_index(['id','batting_team','bowling_team'])
df_10o_score20 = df_10overs20.groupby(['id','batting_team','bowling_team']).sum().reset_index(level=['batting_team','bowling_team'])
df_10o_score20 = df_10o_score20.rename(columns={"total_runs": "runs_10o", "is_wicket": "wickets_10o"})

In [None]:
display(df_10o_score20)

Unnamed: 0_level_0,batting_team,bowling_team,runs_10o,wickets_10o
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1216492,Mumbai Indians,Chennai Super Kings,92,3
1216493,Delhi Capitals,Kings XI Punjab,56,3
1216494,Kolkata Knight Riders,Royal Challengers Bangalore,39,5
1216495,Mumbai Indians,Sunrisers Hyderabad,81,2
1216496,Rajasthan Royals,Chennai Super Kings,129,1
1216497,Kolkata Knight Riders,Delhi Capitals,86,3
1216498,Kings XI Punjab,Sunrisers Hyderabad,69,3
1216499,Royal Challengers Bangalore,Mumbai Indians,93,1
1216500,Delhi Capitals,Rajasthan Royals,92,4
1216501,Kolkata Knight Riders,Chennai Super Kings,98,3


####Finding over in range [11,20] in which max runs were made

In [None]:
# Runs scored in each over with `over` > 10, one row per (match id, over).
# NOTE(review): if `over` is 0-based (the displayed over_maxruns values top out
# at 19), this selects the 12th-20th overs and the stored over labels are one
# less than the conventional over number — confirm against the data.
df_1120overs20 = df_bat20[df_bat20['over']>10]
df_1120overs20 = df_1120overs20[['id','over','total_runs']].set_index(['id','over'])
df_1120o_runs20 = df_1120overs20.groupby(['id','over']).sum().reset_index(level=['id','over'])

# For every match, remember the row label of its highest-scoring over.
# idxmax returns the first maximum, so ties go to the earliest over.
loc_maxruns = []
for i in pd.unique(df_1120o_runs20['id']):
  loc_id = pd.DataFrame(df_1120o_runs20.loc[df_1120o_runs20['id']==i])
  loc_maxruns.append(loc_id['total_runs'].idxmax())

# One row per match: the over with the most runs and the runs scored in it.
df_1120o_maxruns20 = df_1120o_runs20.loc[loc_maxruns,:].set_index('id')

df_1120o_maxruns20 = df_1120o_maxruns20.rename(columns={"over": "over_maxruns","total_runs": "maxruns_1120"})

In [None]:
display(df_1120o_maxruns20)

Unnamed: 0_level_0,over_maxruns,maxruns_1120
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1216492,11,13
1216493,19,30
1216494,19,10
1216495,18,20
1216496,19,30
1216497,11,17
1216498,18,9
1216499,14,16
1216500,18,22
1216501,14,14


#### Creating the final dataset

In [None]:
# Work on a copy so that adding columns to `df` does not silently modify
# df_10o_score20 (plain assignment would make both names refer to one frame).
df = df_10o_score20.copy()
# Both frames are indexed by match id, so the venue aligns per match.
df['venue'] = df_matches20['venue']

In [None]:
# Attach both target columns; assignment aligns on the match-id index.
for target_col in ('over_maxruns', 'maxruns_1120'):
  df[target_col] = df_1120o_maxruns20[target_col]

In [None]:
display(df)

Unnamed: 0_level_0,batting_team,bowling_team,runs_10o,wickets_10o,venue,over_maxruns,maxruns_1120
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1216492,Mumbai Indians,Chennai Super Kings,92,3,Sheikh Zayed Stadium,11,13
1216493,Delhi Capitals,Kings XI Punjab,56,3,Dubai International Cricket Stadium,19,30
1216494,Kolkata Knight Riders,Royal Challengers Bangalore,39,5,Sheikh Zayed Stadium,19,10
1216495,Mumbai Indians,Sunrisers Hyderabad,81,2,Sharjah Cricket Stadium,18,20
1216496,Rajasthan Royals,Chennai Super Kings,129,1,Sharjah Cricket Stadium,19,30
1216497,Kolkata Knight Riders,Delhi Capitals,86,3,Sheikh Zayed Stadium,11,17
1216498,Kings XI Punjab,Sunrisers Hyderabad,69,3,Dubai International Cricket Stadium,18,9
1216499,Royal Challengers Bangalore,Mumbai Indians,93,1,Sheikh Zayed Stadium,14,16
1216500,Delhi Capitals,Rajasthan Royals,92,4,Sharjah Cricket Stadium,18,22
1216501,Kolkata Knight Riders,Chennai Super Kings,98,3,Sheikh Zayed Stadium,14,14


### Pre-processing

Supervised Learning Problem

Target Variables to be predicted: over_maxruns [ Integer (continuous) ] & maxruns_1120 [ Integer (continuous) ]

Measure of Performance: MSE

Relevant variables for prediction: All except id

There aren't many variables and they are mostly independent so PCA is not required

ML Frameworks considered: LASSO Regression, RF 

LASSO is expected to perform better because RF will probably overfit


In [None]:
# Turn the match-id index back into a column and discard it: the id is not a feature.
df_ml = df.reset_index().drop(columns=['id'])

#### Functions

####Teams

In [None]:
def team_toInt(team):
  """Map a team name to its integer code (0-7).

  Any value that is not one of the eight listed team names is returned unchanged.
  """
  # The position in this tuple is the integer code of the team.
  franchises = (
    'Mumbai Indians',
    'Delhi Capitals',
    'Sunrisers Hyderabad',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
  )

  for code, franchise in enumerate(franchises):
    if team == franchise:
      return code

  return team

####Venue

In [None]:
def venue_toInt(venue):
  """Map a venue name to its integer code (0-2).

  Any value that is not one of the three listed venue names is returned unchanged.
  """
  # The position in this tuple is the integer code of the venue.
  stadiums = (
    'Sheikh Zayed Stadium',
    'Dubai International Cricket Stadium',
    'Sharjah Cricket Stadium',
  )

  for code, stadium in enumerate(stadiums):
    if venue == stadium:
      return code

  return venue

####Converting categorical variables

In [None]:
# Replace team names by their integer codes.
df_ml['batting_team'] = df_ml['batting_team'].apply(team_toInt)
df_ml['bowling_team'] = df_ml['bowling_team'].apply(team_toInt)

In [None]:
# Replace venue names by their integer codes.
df_ml['venue'] = df_ml['venue'].apply(venue_toInt)

In [None]:
display(df_ml)

Unnamed: 0,batting_team,bowling_team,runs_10o,wickets_10o,venue,over_maxruns,maxruns_1120
0,0,6,92,3,0,11,13
1,1,5,56,3,1,19,30
2,4,3,39,5,0,19,10
3,0,2,81,2,2,18,20
4,7,6,129,1,2,19,30
5,4,1,86,3,0,11,17
6,5,2,69,3,1,18,9
7,3,0,93,1,0,14,16
8,1,7,92,4,2,18,22
9,4,6,98,3,0,14,14


#### Renaming the columns

In [None]:
# Short names: X1-X5 are the features (batting team, bowling team, runs and
# wickets from the first-overs aggregate, venue); Y1 and Y2 are the targets
# (over_maxruns and maxruns_1120).
df_ml = df_ml.rename(columns={"batting_team": "X1", "bowling_team": "X2", "runs_10o": "X3", "wickets_10o": "X4", "venue": "X5", "over_maxruns": "Y1", "maxruns_1120": "Y2"})

#### Splitting the dataset into features and target variables

In [None]:
# Take an explicit copy: X is modified later (standardisation), and writing
# into a plain column selection of df_ml triggers pandas' SettingWithCopyWarning.
X = df_ml[['X1', 'X2', 'X3', 'X4', 'X5']].copy()
y1 = df_ml['Y1'] #over_maxruns
y2 = df_ml['Y2'] #maxruns_1120

In [None]:
display(X)

Unnamed: 0,X1,X2,X3,X4,X5
0,0,6,92,3,0
1,1,5,56,3,1
2,4,3,39,5,0
3,0,2,81,2,2
4,7,6,129,1,2
5,4,1,86,3,0
6,5,2,69,3,1
7,3,0,93,1,0
8,1,7,92,4,2
9,4,6,98,3,0


In [None]:
display(y1)

0     11
1     19
2     19
3     18
4     19
5     11
6     18
7     14
8     18
9     14
10    12
11    19
12    12
13    17
14    17
15    18
16    16
17    18
18    18
19    17
20    19
21    14
22    19
23    18
24    16
25    17
26    17
27    16
28    17
29    19
30    17
31    16
32    11
33    17
34    19
35    11
36    19
37    15
38    18
39    19
40    18
41    13
42    18
43    18
44    15
45    15
46    19
47    19
48    16
49    17
50    13
51    14
52    15
53    13
54    16
55    19
56    19
57    19
58    17
59    11
Name: Y1, dtype: int64

In [None]:
display(y2)

0     13
1     30
2     10
3     20
4     30
5     17
6      9
7     16
8     22
9     14
10     7
11    25
12    16
13    17
14    16
15    19
16    19
17    16
18    26
19    19
20    16
21    16
22    15
23    20
24    22
25    22
26    16
27    18
28    11
29    13
30    19
31    18
32    15
33    24
34    21
35    18
36    15
37    16
38    24
39    24
40    13
41    14
42    14
43    11
44    19
45    14
46    21
47    14
48    19
49    27
50    16
51    17
52    13
53    11
54    13
55    20
56    20
57    13
58    18
59    13
Name: Y2, dtype: int64

####Standardizing the continuous variables

In [None]:
# Standardise the two numeric features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, before the train/validation/test
# split done inside the model helpers, so held-out rows influence the scaling —
# consider fitting on the training portion only.
scaler = StandardScaler()
# Work on an independent copy so the assignment below does not raise
# SettingWithCopyWarning (X was selected from df_ml).
X = X.copy()
X[['X3','X4']] = scaler.fit_transform(X[['X3','X4']])
display(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,X1,X2,X3,X4,X5
0,0,6,0.460364,0.520306,0
1,1,5,-1.641931,0.520306,1
2,4,3,-2.634681,2.006894,0
3,0,2,-0.182004,-0.222988,2
4,7,6,2.621055,-0.966282,2
5,4,1,0.109981,0.520306,0
6,5,2,-0.882769,0.520306,1
7,3,0,0.518761,-0.966282,0
8,1,7,0.460364,1.2636,2
9,4,6,0.810746,0.520306,0


### ML Functions

####Lasso Regression

In [None]:
def lassoTune(X_train, y_train, X_vald, y_vald):
  """Pick the Lasso regularisation strength with the lowest validation MSE.

  A Lasso model is fitted on the training data for each candidate alpha and
  scored on the validation data. The first candidate reaching the lowest
  error is returned.
  """
  alphas = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]

  def validation_mse(alpha):
    # Fit on the training split, score on the validation split.
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    return mean_squared_error(y_vald, model.predict(X_vald))

  errors = [validation_mse(alpha) for alpha in alphas]

  # list.index returns the first position of the minimum, so ties keep the
  # earliest (smallest) alpha.
  return alphas[errors.index(min(errors))]

In [None]:
def lassoReg(X, y):
  """Tune, fit and evaluate a Lasso regression; prints test MSE and R^2.

  The rows are split in their given order (no shuffling) into 70% training,
  15% validation and 15% test. Alpha is chosen on the validation part and the
  final model, fitted on the training part, is scored on the test part.
  """
  # First cut: 70% train / 30% held out; second cut halves the held-out rows.
  X_train, X_held, y_train, y_held = train_test_split(X, y, test_size = 0.3, shuffle = False)
  X_vald, X_test, y_vald, y_test = train_test_split(X_held, y_held, test_size = 0.5, shuffle = False)

  model = Lasso(alpha=lassoTune(X_train, y_train, X_vald, y_vald))
  model.fit(X_train, y_train)

  test_predictions = model.predict(X_test)

  print('\nMSE:', mean_squared_error(y_test, test_predictions))
  print('\nR^2:', model.score(X_test, y_test))

####RF Regression

In [None]:
def RFTune(X_train, y_train, X_vald, y_vald, random_state=None):
  """Tune a random-forest regressor on a validation set.

  The number of trees is tuned first; then, with that number fixed, the
  maximum tree depth is tuned. Both searches minimise the validation MSE and
  keep the first candidate that reaches the lowest error.

  Parameters
  ----------
  X_train, y_train : training features and targets.
  X_vald, y_vald : validation features and targets.
  random_state : optional seed forwarded to every RandomForestRegressor; the
    default ``None`` keeps the previous unseeded behaviour.

  Returns
  -------
  (n_estimator_best, max_depth_best) : tuple of int
  """
  n_estimators_list = [1, 5, 10, 15, 27, 60, 100, 150, 200]

  def validation_mse(**params):
    # Fit on the training split, score on the validation split.
    model = RandomForestRegressor(random_state=random_state, **params)
    model.fit(X_train, y_train)
    return mean_squared_error(y_vald, model.predict(X_vald))

  # Stage 1: number of trees.
  n_estimator_best = n_estimators_list[0]
  mse_best = validation_mse(n_estimators=n_estimator_best)
  for n_estimators in n_estimators_list[1:]:
    mse = validation_mse(n_estimators=n_estimators)
    if mse < mse_best:
      n_estimator_best = n_estimators
      mse_best = mse

  # Stage 2: maximum depth, with the number of trees fixed.
  max_depth_best = 1
  mse_best = validation_mse(n_estimators=n_estimator_best, max_depth=max_depth_best)
  for depth in range(2,40):
    mse = validation_mse(n_estimators=n_estimator_best, max_depth=depth)
    if mse < mse_best:
      # Record the depth. Previously this overwrote n_estimator_best, so the
      # returned depth was always 1 and the tree count was replaced by a depth.
      max_depth_best = depth
      mse_best = mse

  return n_estimator_best, max_depth_best

In [None]:
def RFReg(X, y):
  """Tune, fit and evaluate a random-forest regressor; prints test MSE and R^2.

  The rows are split in their given order (no shuffling) into 70% training,
  15% validation and 15% test. Hyper-parameters are chosen by RFTune on the
  validation part and the final model, fitted on the training part, is scored
  on the test part.
  """
  # First cut: 70% train / 30% held out; second cut halves the held-out rows.
  X_train, X_held, y_train, y_held = train_test_split(X, y, test_size = 0.3, shuffle = False)
  X_vald, X_test, y_vald, y_test = train_test_split(X_held, y_held, test_size = 0.5, shuffle = False)

  tuned_trees, tuned_depth = RFTune(X_train, y_train, X_vald, y_vald)

  forest = RandomForestRegressor(n_estimators=tuned_trees, max_depth=tuned_depth)
  forest.fit(X_train, y_train)

  test_predictions = forest.predict(X_test)

  print('\nMSE:', mean_squared_error(y_test, test_predictions))
  print('\nR^2:', forest.score(X_test, y_test))

#### Classification Error

In [None]:
def classErr(y_test, y_pred):
  """Return the fraction of positions where ``y_pred`` differs from ``y_test``.

  Parameters
  ----------
  y_test : one-dimensional sequence of true labels (list, NumPy array or
    pandas Series; a Series is compared by position, whatever its index).
  y_pred : one-dimensional sequence of predicted labels, same length.

  Returns
  -------
  float
    Misclassification rate in [0, 1].

  Raises
  ------
  ValueError
    If the two inputs do not have the same length.
  ZeroDivisionError
    If the inputs are empty.
  """
  # Converting to arrays makes the comparison positional; indexing a Series
  # with y[i] is label-based and fails for a non-default index.
  actual = np.asarray(y_test)
  predicted = np.asarray(y_pred)

  if len(actual) != len(predicted):
    raise ValueError('y_test and y_pred must have the same length')

  mismatches = int(np.count_nonzero(actual != predicted))

  return float(mismatches)/len(actual)

####Logistic Regression

In [None]:
def logReg(X, y, max_iter=1000):
  """Fit a logistic-regression classifier and print its test classification error.

  The rows are split in their given order (no shuffling) into 70% training,
  15% validation and 15% test. No hyper-parameter is tuned, so the validation
  part is not used; it is still split off so that the test rows match those
  of the other model helpers.

  Parameters
  ----------
  X : feature matrix.
  y : class labels.
  max_iter : maximum number of solver iterations. The library default stopped
    before convergence on this data ("TOTAL NO. of ITERATIONS REACHED LIMIT"),
    hence the larger default here.
  """
  X_train, X_mask, y_train, y_mask = train_test_split(X, y, test_size = 0.3, shuffle = False)
  # Only the test half of the held-out rows is needed below.
  _, X_test, _, y_test = train_test_split(X_mask, y_mask, test_size = 0.5, shuffle = False)

  reg_m = LogisticRegression(max_iter=max_iter)
  reg_m.fit(X_train, y_train)

  y_pred = reg_m.predict(X_test)

  print('\nClassification Error:', classErr(y_test, y_pred))

####RF Classification

In [None]:
def RFCTune(X_train, y_train, X_vald, y_vald, random_state=None):
  """Tune a random-forest classifier on a validation set.

  The number of trees is tuned first; then, with that number fixed, the
  maximum tree depth is tuned. Both searches minimise the validation
  classification error and keep the first candidate that reaches the lowest
  error.

  Parameters
  ----------
  X_train, y_train : training features and labels.
  X_vald, y_vald : validation features and labels.
  random_state : optional seed forwarded to every RandomForestClassifier; the
    default ``None`` keeps the previous unseeded behaviour.

  Returns
  -------
  (n_estimator_best, max_depth_best) : tuple of int
  """
  n_estimators_list = [1, 5, 10, 15, 27, 60, 100, 150, 200]

  def validation_error(**params):
    # Fit on the training split, score on the validation split.
    model = RandomForestClassifier(random_state=random_state, **params)
    model.fit(X_train, y_train)
    return classErr(y_vald, model.predict(X_vald))

  # Stage 1: number of trees.
  n_estimator_best = n_estimators_list[0]
  cerr_best = validation_error(n_estimators=n_estimator_best)
  for n_estimators in n_estimators_list[1:]:
    cerr = validation_error(n_estimators=n_estimators)
    if cerr < cerr_best:
      n_estimator_best = n_estimators
      cerr_best = cerr

  # Stage 2: maximum depth, with the number of trees fixed.
  max_depth_best = 1
  cerr_best = validation_error(n_estimators=n_estimator_best, max_depth=max_depth_best)
  for depth in range(2,40):
    cerr = validation_error(n_estimators=n_estimator_best, max_depth=depth)
    if cerr < cerr_best:
      # Record the depth. Previously this overwrote n_estimator_best, so the
      # returned depth was always 1 and the tree count was replaced by a depth.
      max_depth_best = depth
      cerr_best = cerr

  return n_estimator_best, max_depth_best

In [None]:
def RFClf(X, y):
  """Tune, fit and evaluate a random-forest classifier; prints the test classification error.

  The rows are split in their given order (no shuffling) into 70% training,
  15% validation and 15% test. Hyper-parameters are chosen by RFCTune on the
  validation part and the final model, fitted on the training part, is scored
  on the test part.
  """
  # First cut: 70% train / 30% held out; second cut halves the held-out rows.
  X_train, X_held, y_train, y_held = train_test_split(X, y, test_size = 0.3, shuffle = False)
  X_vald, X_test, y_vald, y_test = train_test_split(X_held, y_held, test_size = 0.5, shuffle = False)

  tuned_trees, tuned_depth = RFCTune(X_train, y_train, X_vald, y_vald)

  forest = RandomForestClassifier(n_estimators=tuned_trees, max_depth=tuned_depth)
  forest.fit(X_train, y_train)

  test_predictions = forest.predict(X_test)

  print('\nClassification Error:', classErr(y_test, test_predictions))

###Processing


#### over_maxruns (Y1)

##### LASSO Regression

In [None]:
lassoReg(X.to_numpy(), y1.to_numpy())


MSE: 8.834529945661505

R^2: -0.18869921195777728


##### RF

In [None]:
RFReg(X.to_numpy(), y1.to_numpy())


MSE: 8.533542650919474

R^2: -0.14820092146923147


##### Logistic Regression

In [None]:
logReg(X.to_numpy(), y1.to_numpy())


Classification Error: 0.8888888888888888


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### RF Classification

In [None]:
RFClf(X.to_numpy(), y1.to_numpy())


Classification Error: 1.0


#### maxruns_1120 (Y2)

##### LASSO Regression

In [None]:
lassoReg(X.to_numpy(), y2.to_numpy())


MSE: 20.584593963214758

R^2: -0.9708653794567325


##### RF

In [None]:
RFReg(X.to_numpy(), y2.to_numpy())


MSE: 18.363029952186178

R^2: -0.7581624422305917


##### Logistic Regression

In [None]:
logReg(X.to_numpy(), y2.to_numpy())


Classification Error: 0.6666666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### RF Classification

In [None]:
RFClf(X.to_numpy(), y2.to_numpy())


Classification Error: 1.0
