In [5]:
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


In [6]:
import os
os.chdir("/content/drive/My Drive/MLGroupProject/Dataset/")

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [8]:
delivery=pd.read_csv('./kaggle_data/deliveries.csv')
matches=pd.read_csv('./kaggle_data/matches.csv')  

In [9]:
over_score = delivery.groupby(['match_id','inning','over'])['total_runs'].sum().reset_index()

In [10]:
df_innings = delivery.groupby(['match_id','inning'])

In [48]:
def calculate_balls_runs(df, nb_balls, df_new):
    for curr_ball in range(0,len(df), nb_balls):
        if (curr_ball+nb_balls) < len(df):

            ## Sumation for the number of balls 
            runs_in_balls = df.iloc[curr_ball: curr_ball+nb_balls]['total_runs'].sum() 

            ## Create a new row for the df
            new_element = [df.iloc[curr_ball]['match_id'], df.iloc[curr_ball]['inning'], df.iloc[curr_ball]['batting_team'], df.iloc[curr_ball]['bowling_team'], runs_in_balls]
            # print(new_element)

            ## Add the new row 
            df_new.loc[len(df_new.index)] = new_element 
            
            # print(runs_in_balls)

We take two ball intervals:
1. nb_balls = 12
2. nb_balls = 30


In [49]:
def create_new_df(nb_balls=30):

    ## Columns for the new df 
    df_per_balls_columns =  ['match_id', 'inning', 'batting_team', 'bowling_team', 'total_runs']
    
    ## Initialize the new df
    df_per_balls = pd.DataFrame(columns=df_per_balls_columns)

    ## Build the new df 
    df_innings.apply(calculate_balls_runs, nb_balls=10, df_new=df_per_balls)

    return df_per_balls

In [None]:
#new dataframe for the specified interval
new_df = create_new_df(nb_balls=30)
new_df[:10]

In [14]:
new_df.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'total_runs'], dtype='object')

In [15]:
new_df.shape

(17197, 5)

In [16]:
matches = matches.rename(columns={"id": "match_id"})

In [17]:
merged_df = pd.merge(new_df, matches, on='match_id')

In [18]:
merged_df.shape

(17197, 22)

In [19]:
merged_df.sample(5)

Unnamed: 0,match_id,inning,batting_team,bowling_team,total_runs,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
10860,477,2,Mumbai Indians,Sunrisers Hyderabad,13,2014,,2014-04-30,Sunrisers Hyderabad,Mumbai Indians,Mumbai Indians,field,normal,0,Sunrisers Hyderabad,15,0,B Kumar,Dubai International Cricket Stadium,HDPK Dharmasena,M Erasmus,
1963,87,2,Chennai Super Kings,Delhi Daredevils,28,2008,Delhi,2008-05-08,Delhi Daredevils,Chennai Super Kings,Chennai Super Kings,field,normal,0,Chennai Super Kings,0,4,MS Dhoni,Feroz Shah Kotla,Aleem Dar,RB Tiffin,
15569,7943,1,Mumbai Indians,Kings XI Punjab,23,2018,Mumbai,16/05/18,Mumbai Indians,Kings XI Punjab,Kings XI Punjab,field,normal,0,Mumbai Indians,3,0,JJ Bumrah,Wankhede Stadium,Marais Erasmus,Nitin Menon,Yeshwant Barde
6623,292,1,Deccan Chargers,Mumbai Indians,13,2011,Mumbai,2011-05-14,Deccan Chargers,Mumbai Indians,Deccan Chargers,bat,normal,0,Deccan Chargers,10,0,A Mishra,Wankhede Stadium,S Ravi,SK Tarapore,
14248,628,1,Sunrisers Hyderabad,Delhi Daredevils,18,2016,Raipur,2016-05-20,Sunrisers Hyderabad,Delhi Daredevils,Delhi Daredevils,field,normal,0,Delhi Daredevils,0,6,KK Nair,Shaheed Veer Narayan Singh International Stadium,A Nand Kishore,BNJ Oxenford,


We run the following Regression Models for both Label Encoded and One-Hot Encoded data:
1. Random Forest Regressor
2. Bagging Regressor
3. SGD Regressor
4. MLP Regressor

# Label Encoding

To convert each feature to a number

In [20]:
team_dict = {'Mumbai Indians':'0','Kolkata Knight Riders': '1','Royal Challengers Bangalore': '2','Deccan Chargers': '3','Chennai Super Kings': '4',
                 'Rajasthan Royals': '5','Delhi Daredevils': '6','Gujarat Lions': '7','Kings XI Punjab': '8',
                 'Sunrisers Hyderabad': '9','Rising Pune Supergiants':'10','Kochi Tuskers Kerala': '11','Pune Warriors': '12','Rising Pune Supergiant': '10', 'Delhi Capitals': '6'}

In [21]:
#to label encode data
def convert_string_to_label(df):

    ## batting_team
    df['batting_team'] = df['batting_team'].map(team_dict)
    df['batting_team'] = pd.to_numeric(df['batting_team'])

    ## bowling_team
    df['bowling_team'] = df['bowling_team'].map(team_dict)
    df['bowling_team'] = pd.to_numeric(df['bowling_team'])

    for col in df.columns:
        df[col] = pd.to_numeric(df[col])
    
    df['season'] = df['season'] - 2008

    df = df.dropna()

    return df 

In [22]:
reqd_df = merged_df[['inning','season', 'batting_team', 'bowling_team', 'total_runs']] ## Pick the required features 
use_df = reqd_df.copy() ## Change the copy of the df 

#label encoded data
final_df = convert_string_to_label(use_df)

In [23]:
final_df.shape

(17197, 5)

In [24]:
final_df.dtypes

inning          int64
season          int64
batting_team    int64
bowling_team    int64
total_runs      int64
dtype: object

In [25]:
X = final_df[['inning','season', 'batting_team', 'bowling_team']].to_numpy()
y = final_df['total_runs'].to_numpy()

In [26]:
#load necesssary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Random Forest Regressor
Finding the best hyperparameters for RandomForestRegressor()

In [28]:
#finding the best hyperparameters
clf =  RandomForestRegressor()
grid = GridSearchCV(clf, {'max_depth': [1,2,3,4,5,6,7,8,9,10]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 


Best RF Parameters: {'max_depth': 6}


In [29]:
#using the best hyperparameters
clf = RandomForestRegressor(max_depth = 6)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.53190229811064
Test: 4.613311699535201


### Bagging Regressor

Finding the best hyperparameters for BaggingRegressor()

In [30]:
#finding the best hyperparameters
clf =  BaggingRegressor()
grid = GridSearchCV(clf, {'n_estimators': [5,10,20,25]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 


Best RF Parameters: {'n_estimators': 25}


In [31]:
#Using the best hyperparameters
clf = BaggingRegressor(n_estimators=25)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.3051135807779595
Test: 4.731226962785506


### SGD Regressor
Finding the best hyperparameters for SGDRegressor()


In [32]:
#finding the best hyperparameters
clf = SGDRegressor()
grid = GridSearchCV(clf, {'alpha': [0.1, 0.01, 0.05, 0.001]})
grid.fit(X_train, y_train)
print('\nBest SGD Parameters:')
print(grid.best_params_) 


Best SGD Parameters:
{'alpha': 0.1}


In [33]:
#using the best hyperparameters
clf = SGDRegressor(alpha=0.1)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.693442077299199
Test: 4.723620234034666


### MLP Regressor
Finding the best hyperparameters for MLPRegressor()

In [34]:
#finding the best hyperparameters
clf =  MLPRegressor()
grid = GridSearchCV(clf, {'hidden_layer_sizes': [(15,), (12,), (10,), (8,), (6,), (4,)],'activation': ['logistic', 'tanh', 'relu'], 'max_iter': [2000]})
grid.fit(X_train, y_train)
print('\nBest MLP Parameters:')
print(grid.best_params_) 
grid_predictions = grid.predict(X_test) 



Best MLP Parameters:
{'activation': 'logistic', 'hidden_layer_sizes': (12,), 'max_iter': 2000}


In [35]:
#using the best hyperparameters
clf = MLPRegressor(hidden_layer_sizes = (12,), max_iter = 2000, activation = 'logistic')
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.58129736568199
Test: 4.61669706433701


# One-hot Encoding

To convert categorical data into a useful form

In [36]:
reqd_df_oh = merged_df[['inning','season', 'batting_team', 'bowling_team', 'total_runs']] ## Pick the required features 
use_df_oh = reqd_df_oh.copy() ## Change the copy of the df 

temp = pd.get_dummies(use_df_oh[['batting_team', 'bowling_team']])

final_df_oh = pd.concat([use_df_oh, temp], axis=1)

final_df_oh = final_df_oh.drop(['batting_team', 'bowling_team'], axis=1)

In [37]:
final_df_oh.shape

(17197, 33)

In [38]:
y_oh = final_df_oh['total_runs'].to_numpy()
X_oh = final_df_oh.drop('total_runs', axis=1).to_numpy()

In [39]:
X_oh.shape, y_oh.shape

((17197, 32), (17197,))

### Random Forest Regressor
Finding the best hyperparameters for RandomForestRegressor()

In [40]:
#finding the best hyperparameters
clf =  RandomForestRegressor()
grid = GridSearchCV(clf, {'max_depth': [1,2,3,4,5,6,8,10]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best RF Parameters: {'max_depth': 6}


In [41]:
#using the best hyperparameters
clf = RandomForestRegressor(max_depth = 6)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.530507399891316
Test: 4.611761468599762


### Bagging Regressor
Finding the best hyperparameters for BaggingRegressor()

In [42]:
#finding the best hyperparameters
clf =  BaggingRegressor()
grid = GridSearchCV(clf, {'n_estimators': [5,10,20,25]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best RF Parameters: {'n_estimators': 25}


In [43]:
#using the best hyperparameters
clf = BaggingRegressor(n_estimators=25)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.305120659442412
Test: 4.731329525254957


### SGD Regressor

Finding the best hyperparameters for SGDRegressor()

In [44]:
#finding the best hyperparameters
clf = SGDRegressor()
grid = GridSearchCV(clf, {'alpha': [0.1, 0.01, 0.05, 0.001]})
grid.fit(X_train, y_train)
print('\nBest SGD Parameters:')
print(grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best SGD Parameters:
{'alpha': 0.05}


In [45]:
#using the best hyperparameters
clf = SGDRegressor(alpha=0.05)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.611795962357005
Test: 4.639086467812422


### MLP Regressor
Finding the best hyperparameters for MLPRegressor()

In [46]:
#finding the best hyperparameters
clf =  MLPRegressor()
grid = GridSearchCV(clf, {'hidden_layer_sizes': [(12,10,), (12,10,8,), (10,8,4), (8,4), (6,), (4,)],'activation': ['logistic', 'tanh', 'relu'], 'max_iter': [2000]})
grid.fit(X_train, y_train)
print('\nBest MLP Parameters:', grid.best_params_) 
grid_predictions = grid.predict(X_test) 



Best MLP Parameters: {'activation': 'relu', 'hidden_layer_sizes': (8, 4), 'max_iter': 2000}


In [47]:
#using the best hyperparameters
clf = MLPRegressor(hidden_layer_sizes = (8, 4,), max_iter = 2000, activation = 'relu', random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.61972578087601
Test: 4.64940553007213
