In [2]:
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


In [4]:
import os
os.chdir("/content/drive/My Drive/MLGroupProject/Dataset/")

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [6]:
delivery=pd.read_csv('./kaggle_data/deliveries.csv')
matches=pd.read_csv('./kaggle_data/matches.csv')  

In [7]:
over_score = delivery.groupby(['match_id','inning','over'])['total_runs'].sum().reset_index()

In [8]:
df_innings = delivery.groupby(['match_id','inning'])

In [9]:
def calculate_balls_runs(df, nb_balls, df_new):
    for curr_ball in range(0,len(df), nb_balls):
        if (curr_ball+nb_balls) < len(df):

            ## Sumation for the number of balls 
            runs_in_balls = df.iloc[curr_ball: curr_ball+nb_balls]['total_runs'].sum() 

            ## Create a new row for the df
            new_element = [df.iloc[curr_ball]['match_id'], df.iloc[curr_ball]['inning'], df.iloc[curr_ball]['batting_team'], df.iloc[curr_ball]['bowling_team'], runs_in_balls]
            print(new_element)

            ## Add the new row 
            df_new.loc[len(df_new.index)] = new_element 
            
            print(runs_in_balls)

We take two ball intervals:
1. nb_balls = 12
2. nb_balls = 30


In [10]:
def create_new_df(nb_balls=30):

    ## Columns for the new df 
    df_per_balls_columns =  ['match_id', 'inning', 'batting_team', 'bowling_team', 'total_runs']
    
    ## Initialize the new df
    df_per_balls = pd.DataFrame(columns=df_per_balls_columns)

    ## Build the new df 
    df_innings.apply(calculate_balls_runs, nb_balls=10, df_new=df_per_balls)

    return df_per_balls

In [None]:
#new dataframe for the specified interval
new_df = create_new_df(nb_balls=12)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
18
[558, 2, 'Rajasthan Royals', 'Sunrisers Hyderabad', 24]
24
[558, 2, 'Rajasthan Royals', 'Sunrisers Hyderabad', 16]
16
[558, 2, 'Rajasthan Royals', 'Sunrisers Hyderabad', 18]
18
[558, 2, 'Rajasthan Royals', 'Sunrisers Hyderabad', 25]
25
[558, 2, 'Rajasthan Royals', 'Sunrisers Hyderabad', 25]
25
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 8]
8
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 16]
16
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 20]
20
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 3]
3
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 13]
13
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 7]
7
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 10]
10
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 9]
9
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 16]
16
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 8]
8
[559, 1, 'Chennai Super Kings', 'Mumbai Indians', 23]
23
[559, 1, 'Chennai Su

In [None]:
new_df.columns

In [None]:
new_df.shape

In [None]:
matches = matches.rename(columns={"id": "match_id"})

In [None]:
merged_df = pd.merge(new_df, matches, on='match_id')

In [None]:
merged_df.shape

In [None]:
merged_df.sample(5)

We run the following Regression Models for both Label Encoded and One-Hot Encoded data:
1. Random Forest Regressor
2. Bagging Regressor
3. SGD Regressor
4. MLP Regressor

# Label Encoding

To convert each feature to a number

In [None]:
team_dict = {'Mumbai Indians':'0','Kolkata Knight Riders': '1','Royal Challengers Bangalore': '2','Deccan Chargers': '3','Chennai Super Kings': '4',
                 'Rajasthan Royals': '5','Delhi Daredevils': '6','Gujarat Lions': '7','Kings XI Punjab': '8',
                 'Sunrisers Hyderabad': '9','Rising Pune Supergiants':'10','Kochi Tuskers Kerala': '11','Pune Warriors': '12','Rising Pune Supergiant': '10', 'Delhi Capitals': '6'}

In [None]:
#to label encode data
def convert_string_to_label(df):

    ## batting_team
    df['batting_team'] = df['batting_team'].map(team_dict)
    df['batting_team'] = pd.to_numeric(df['batting_team'])

    ## bowling_team
    df['bowling_team'] = df['bowling_team'].map(team_dict)
    df['bowling_team'] = pd.to_numeric(df['bowling_team'])

    for col in df.columns:
        df[col] = pd.to_numeric(df[col])
    
    df['season'] = df['season'] - 2008

    df = df.dropna()

    return df 

In [None]:
reqd_df = merged_df[['inning','season', 'batting_team', 'bowling_team', 'total_runs']] ## Pick the required features 
use_df = reqd_df.copy() ## Change the copy of the df 

#label encoded data
final_df = convert_string_to_label(use_df)

In [None]:
final_df.shape

In [None]:
final_df.dtypes

In [None]:
X = final_df[['inning','season', 'batting_team', 'bowling_team']].to_numpy()
y = final_df['total_runs'].to_numpy()

In [None]:
#load necesssary libraries
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#### Random Forest Regressor
Finding the best hyperparameters for RandomForestRegressor()

In [None]:
#finding the best hyperparameters
clf =  RandomForestRegressor()
grid = GridSearchCV(clf, {'max_depth': [1,2,3,4,5,6,7,8,9,10]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 


Best RF Parameters: {'max_depth': 6}


In [None]:
#using the best hyperparameters
clf = RandomForestRegressor(max_depth = 6)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.569030003321753
Test: 4.536917640293559


#### Bagging Regressor

Finding the best hyperparameters for BaggingRegressor()

In [None]:
#finding the best hyperparameters
clf =  BaggingRegressor()
grid = GridSearchCV(clf, {'n_estimators': [5,10,20,25]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 


Best RF Parameters: {'n_estimators': 25}


In [None]:
#Using the best hyperparameters
clf = BaggingRegressor(n_estimators=25)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.349441966595712
Test: 4.660650422235396


#### SGD Regressor
#### Bagging Regressor
Finding the best hyperparameters for SGDRegressor()


In [None]:
#finding the best hyperparameters
clf = SGDRegressor()
grid = GridSearchCV(clf, {'alpha': [0.1, 0.01, 0.05, 0.001]})
grid.fit(X_train, y_train)
print('\nBest SGD Parameters:')
print(grid.best_params_) 


Best SGD Parameters:
{'alpha': 0.05}


In [None]:
#using the best hyperparameters
clf = SGDRegressor(alpha=0.05)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.720490994050692
Test: 4.646808151293332


#### MLP Regressor
Finding the best hyperparameters for MLPRegressor()

In [None]:
#finding the best hyperparameters
clf =  MLPRegressor()
grid = GridSearchCV(clf, {'hidden_layer_sizes': [(15,), (12,), (10,), (8,), (6,), (4,)],'activation': ['logistic', 'tanh', 'relu'], 'max_iter': [2000]})
grid.fit(X_train, y_train)
print('\nBest MLP Parameters:')
print(grid.best_params_) 
grid_predictions = grid.predict(X_test) 



Best MLP Parameters:
{'activation': 'logistic', 'hidden_layer_sizes': (12,), 'max_iter': 2000}


In [None]:
#using the best hyperparametersclf = MLPRegressor(hidden_layer_sizes = (12,), max_iter = 2000, activation = 'logistic')
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.639061999506147
Test: 4.562362052551329


In [None]:
lin = LinearRegression()
lin.fit(X_train,y_train)
y_pred = lin.predict(X_test)
y_pred_train = lin.predict(X_train)
print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

# One-hot Encoding

To convert categorical data into a useful form

In [35]:
reqd_df_oh = merged_df[['inning','season', 'batting_team', 'bowling_team', 'total_runs']] ## Pick the required features 
use_df_oh = reqd_df_oh.copy() ## Change the copy of the df 

temp = pd.get_dummies(use_df_oh[['batting_team', 'bowling_team']])

final_df_oh = pd.concat([use_df_oh, temp], axis=1)

final_df_oh = final_df_oh.drop(['batting_team', 'bowling_team'], axis=1)

In [36]:
final_df_oh.shape

(17197, 33)

In [37]:
y_oh = final_df_oh['total_runs'].to_numpy()
X_oh = final_df_oh.drop('total_runs', axis=1).to_numpy()

In [38]:
X_oh.shape, y_oh.shape

((17197, 32), (17197,))

In [40]:
X = X_oh
y = y_oh
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#### Random Forest Regressor
Finding the best hyperparameters for RandomForestRegressor()

In [None]:
#finding the best hyperparameters
clf =  RandomForestRegressor()
grid = GridSearchCV(clf, {'max_depth': [1,2,3,4,5,6,8,10]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best RF Parameters: {'max_depth': 6}


In [None]:
#using the best hyperparameters
clf = RandomForestRegressor(max_depth = 6)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.572062928852723
Test: 4.540367078639878


#### Bagging Regressor
Finding the best hyperparameters for BaggingRegressor()

In [None]:
#finding the best hyperparameters
clf =  BaggingRegressor()
grid = GridSearchCV(clf, {'n_estimators': [5,10,20,25]})
grid.fit(X_train, y_train)
print('\nBest RF Parameters:', grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best RF Parameters: {'n_estimators': 25}


In [None]:
#using the best hyperparameters
clf = BaggingRegressor(n_estimators=25)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.3472850635304665
Test: 4.659092437175107


#### SGD Regressor

Finding the best hyperparameters for SGDRegressor()

In [None]:
#finding the best hyperparameters
clf = SGDRegressor()
grid = GridSearchCV(clf, {'alpha': [0.1, 0.01, 0.05, 0.001]})
grid.fit(X_train, y_train)
print('\nBest SGD Parameters:')
print(grid.best_params_) 
# grid_predictions = grid.predict(X_test)


Best SGD Parameters:
{'alpha': 0.05}


In [None]:
#using the best hyperparameters
clf = SGDRegressor(alpha=0.1)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.739517893274761
Test: 4.663964291464389


#### MLP Regressor
Finding the best hyperparameters for MLPRegressor()

In [None]:
#finding the best hyperparameters
clf =  MLPRegressor()
grid = GridSearchCV(clf, {'hidden_layer_sizes': [(12,10,), (12,10,8,), (10,8,4), (8,4), (6,), (4,)],'activation': ['logistic', 'tanh', 'relu'], 'max_iter': [2000]})
grid.fit(X_train, y_train)
print('\nBest MLP Parameters:', grid.best_params_) 
grid_predictions = grid.predict(X_test) 



Best MLP Parameters: {'activation': 'tanh', 'hidden_layer_sizes': (12, 10), 'max_iter': 2000}


In [None]:
#using the best hyperparameters
clf = MLPRegressor(hidden_layer_sizes = (12, 10,), max_iter = 2000, activation = 'logistic', random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train) 

y_pred = clf.predict(X_test)

print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.655006090141778
Test: 4.573262928226576


In [41]:
lin = LinearRegression()
lin.fit(X_train,y_train)
y_pred = lin.predict(X_test)
y_pred_train = lin.predict(X_train)
print("Train:", mean_absolute_error(y_pred_train, y_train))
print("Test:", mean_absolute_error(y_pred, y_test))

Train: 4.609493299032492
Test: 4.587614384732505
