In [29]:
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
import os
os.chdir("/content/drive/My Drive/MLGroupProject/Dataset/")

In [31]:
import numpy as np 
import pandas as pd 

Read ball by ball data with vectorized player names 

In [32]:
final_df = pd.read_csv("Ball_by_ball.csv")

In [33]:
y = final_df['total_runs'].to_numpy()

X = final_df.drop(['total_runs'], axis=1)
print(X.columns)

X = X.to_numpy()

Index(['Unnamed: 0', 'inning', 'over', 'ball', 'season', 'batting_team',
       'bowling_team', 'batsman_name_0', 'batsman_name_1', 'batsman_name_2',
       'batsman_name_3', 'batsman_name_4', 'batsman_name_5', 'batsman_name_6',
       'batsman_name_7', 'non_striker_name_0', 'non_striker_name_1',
       'non_striker_name_2', 'non_striker_name_3', 'non_striker_name_4',
       'non_striker_name_5', 'non_striker_name_6', 'non_striker_name_7',
       'bowler_name_0', 'bowler_name_1', 'bowler_name_2', 'bowler_name_3',
       'bowler_name_4', 'bowler_name_5', 'bowler_name_6', 'bowler_name_7'],
      dtype='object')


In [34]:
X.shape, y.shape

((176235, 31), (176235,))

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

Initialize different Regression models 

In [36]:
classifier_dict = {"LinearRegression": LinearRegression(),
                   "BaggingRegressor": BaggingRegressor(),
                   "RandomForestRegressor": RandomForestRegressor(),  
                   "SGDRegressor": SGDRegressor(), 
                   "MLPRegressor": MLPRegressor((31, 24, 20, 18, 14, 10, 4), early_stopping=True, max_iter=200)}

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [39]:
for clf_name, classifier in classifier_dict.items():
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=7)
    kf.get_n_splits(X)
    mae_test = []
    mae_train = [] 
    print("Number of folds: {}, for Model: {}".format(n_splits, clf_name))
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf =  classifier ## Initialize model 
        clf.fit(X_train, y_train)
        y_pred_train = clf.predict(X_train) 

        y_pred = clf.predict(X_test) ## predict on the test set 

        ## Calculate MAE error 
        mae_train.append(mean_absolute_error(y_pred_train, y_train))
        mae_test.append(mean_absolute_error(y_pred, y_test))
        print(mean_absolute_error(y_pred, y_test))

    print("AVG MAE train: ",clf_name, sum(mae_train) / len(mae_train))
    print("AVG MAE test",clf_name, sum(mae_test) / len(mae_test))

Number of folds: 5, for Model: BaggingRegressor
1.3599426901580276
1.3437966351746249
1.3492155360739921
1.3550685164694867
1.3586149175816382
AVG MAE train:  BaggingRegressor 0.520095469117939
AVG MAE test BaggingRegressor 1.353327659091554
Number of folds: 5, for Model: RandomForestRegressor
1.3162513121684116
1.3051959031974354
1.3052818679603937
1.315931284932051
1.3175365279314553
AVG MAE train:  RandomForestRegressor 0.4857882656680001
AVG MAE test RandomForestRegressor 1.3120393792379494
Number of folds: 5, for Model: SGDRegressor
1.1717421163147203e+18
2.2558309736821322e+17
4.270959496628644e+17
9.21262856960347e+17
4.374386124724628e+17
AVG MAE train:  SGDRegressor 6.375046511937967e+17
AVG MAE test SGDRegressor 6.366245265557215e+17
Number of folds: 5, for Model: MLPRegressor
1.2080828644385686
1.1965462159183768
1.1800820482111118
1.1815684986186707
1.193577019204681
AVG MAE train:  MLPRegressor 1.1914437791752088
AVG MAE test MLPRegressor 1.1919713292782819


Create dictionary for diferent hyperparameters for Grid Search 

In [None]:
classifier_dict = {"BaggingRegressor": [BaggingRegressor, {'n_estimators': [10, 50, 100, 150, 200]}],
                   "RandomForestRegressor": [RandomForestRegressor, {'n_estimators':[50, 100, 150, 200], 
                                                                      'criterion': ['mse','mae'], 
                                                                      'max_depth': [2,3,4,5,10], 
                                                                      'max_features': ['auto', 'sqrt', 'log2']}],  
                   "SGDRegressor": [SGDRegressor, {'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
                                                   'penalty': ['l1', 'l2', 'elasticnet'],
                                                   'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
                                                   'early_stopping':['True'],
                                                   'max_iter': 2000}], 
                   "MLPRegressor": [MLPRegressor, {'hidden_layer_sizes': [(31, 24, 20, 18, 14, 10, 4), (32, 16, 8, 4), (31, 24, 20, 18, 14, 10, 8, 4)], 
                                                   'activation': ['tanh', 'relu'], 
                                                   'solver': ['lbfgs', 'sgd', 'adam'],
                                                   'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
                                                   'early_stopping':['True'],
                                                   'max_iter': [2000]}]}

In [None]:
for clf_name, [classifier, grid_vals] in classifier_dict.items():
    n_splits = 6
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    mae_test = []
    mae_train = [] 

    clf =  classifier()
    grclf = GridSearchCV(clf, grid_vals) ## intialize the grid search 
    grclf.fit(X_train, y_train)

    y_pred_train = grclf.predict(X_train) 
    y_pred = grclf.predict(X_test)

    ## Calculate MAE error on train and test using the best model 
    mae_train.append(mean_absolute_error(y_pred_train, y_train))
    mae_test.append(mean_absolute_error(y_pred, y_test))

    print("model name : ", clf_name)
    print(mean_absolute_error(y_pred, y_test))

    print("Best parameters: ", grclf.best_params_)
    print("AVG MAE train: ",clf_name, sum(mae_train) / len(mae_train))
    print("AVG MAE test",clf_name, sum(mae_test) / len(mae_test))