In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

Read and pre-process data

In [2]:
df = pd.read_excel('cricket_batting_data.xlsx')
df.head()

Unnamed: 0,Player,Country,Years,Inns,NO,Runs,HS,Ducks,50s,100s,Avg,Position
0,A N Cook,England,2006-2018,278,14,11845,294,9,55,31,44.87,0
1,S M Gavaskar,India,1971-1987,203,12,9607,221,11,42,33,50.3,0
2,D A Warner*,Australia,2011-2024,202,8,8747,335,13,37,26,45.09,0
3,D L Haynes,West Indies,1978-1994,201,25,7472,184,10,39,18,42.45,0
4,M A Atherton,England,1990-2001,197,6,7476,185,17,45,16,39.14,0


In [3]:
print("No.of rows in dataframe are: ", df.shape[0])

No.of rows in dataframe are:  7328


In [4]:
df.isna().sum()

Player        0
Country       0
Years         0
Inns          0
NO            0
Runs          0
HS            0
Ducks         0
50s           0
100s          0
Avg         216
Position      0
dtype: int64

There are 216 NaN values, all of them in the `Avg` column; every other column is complete.

In [5]:
df['Debut'] = pd.to_numeric(df['Years'].str[0:4])

In [6]:
 df['Tenure'] = pd.to_numeric(df['Years'].str[-4:]) -  pd.to_numeric(df['Years'].str[0:4]) + 1

In [7]:
df['Debut']

0       2006
1       1971
2       2011
3       1978
4       1990
        ... 
7323    2003
7324    2001
7325    2000
7326    2011
7327    2016
Name: Debut, Length: 7328, dtype: int64

In [8]:
 df['Tenure']

0       13
1       17
2       14
3       17
4       12
        ..
7323     1
7324     1
7325     1
7326     1
7327     1
Name: Tenure, Length: 7328, dtype: int64

In [9]:
df.head()

Unnamed: 0,Player,Country,Years,Inns,NO,Runs,HS,Ducks,50s,100s,Avg,Position,Debut,Tenure
0,A N Cook,England,2006-2018,278,14,11845,294,9,55,31,44.87,0,2006,13
1,S M Gavaskar,India,1971-1987,203,12,9607,221,11,42,33,50.3,0,1971,17
2,D A Warner*,Australia,2011-2024,202,8,8747,335,13,37,26,45.09,0,2011,14
3,D L Haynes,West Indies,1978-1994,201,25,7472,184,10,39,18,42.45,0,1978,17
4,M A Atherton,England,1990-2001,197,6,7476,185,17,45,16,39.14,0,1990,12


In [10]:
print("Debut year and tenure for S.M. Gavaskar:")
print(df[['Debut', 'Tenure']].iloc[1])

Debut year and tenure for S.M. Gavaskar:
Debut     1971
Tenure      17
Name: 1, dtype: int64


In [11]:
df = df.drop(columns = ['Player','Years','Avg'])

In [12]:
df.head()

Unnamed: 0,Country,Inns,NO,Runs,HS,Ducks,50s,100s,Position,Debut,Tenure
0,England,278,14,11845,294,9,55,31,0,2006,13
1,India,203,12,9607,221,11,42,33,0,1971,17
2,Australia,202,8,8747,335,13,37,26,0,2011,14
3,West Indies,201,25,7472,184,10,39,18,0,1978,17
4,England,197,6,7476,185,17,45,16,0,1990,12


In [13]:
df.shape

(7328, 11)

11 columns in the resulting dataset after dropping Player, Years, Avg

In [14]:
ohe_country = pd.get_dummies(df['Country'], drop_first=True)

In [15]:
ohe_country

Unnamed: 0,Australia,Bangladesh,England,India,Ireland,New Zealand,Pakistan,South Africa,Sri Lanka,West Indies,Zimbabwe
0,False,False,True,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,True,False
4,False,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
7323,False,False,False,False,False,False,False,False,False,False,True
7324,False,False,False,False,False,False,False,False,False,False,True
7325,False,False,False,False,False,False,False,False,False,False,True
7326,False,False,False,False,False,False,False,False,False,False,True


In [16]:
ohe_country = ohe_country.astype(int)

In [17]:
df = df.drop('Country', axis=1)

In [18]:
df = pd.concat([df, ohe_country], axis=1)

In [19]:
df

Unnamed: 0,Inns,NO,Runs,HS,Ducks,50s,100s,Position,Debut,Tenure,...,Bangladesh,England,India,Ireland,New Zealand,Pakistan,South Africa,Sri Lanka,West Indies,Zimbabwe
0,278,14,11845,294,9,55,31,0,2006,13,...,0,1,0,0,0,0,0,0,0,0
1,203,12,9607,221,11,42,33,0,1971,17,...,0,0,1,0,0,0,0,0,0,0
2,202,8,8747,335,13,37,26,0,2011,14,...,0,0,0,0,0,0,0,0,0,0
3,201,25,7472,184,10,39,18,0,1978,17,...,0,0,0,0,0,0,0,0,1,0
4,197,6,7476,185,17,45,16,0,1990,12,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7323,1,0,53,53,0,1,0,8,2003,1,...,0,0,0,0,0,0,0,0,0,1
7324,1,0,52,52,0,1,0,8,2001,1,...,0,0,0,0,0,0,0,0,0,1
7325,1,1,0,0,0,0,0,8,2000,1,...,0,0,0,0,0,0,0,0,0,1
7326,1,0,14,14,0,0,0,8,2011,1,...,0,0,0,0,0,0,0,0,0,1


In [20]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['HS']), df['HS'], test_size=0.20, random_state = 35)

In [21]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only and the
# same fitted transform is then applied to the test split.
sc = StandardScaler()
# Wrap the scaled arrays back into DataFrames so the column names and row index are kept.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its own
# would scale already-scaled data; re-run the train/test split cell first.
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
# Bare expressions: all four are displayed because ast_node_interactivity is set to "all".
X_train
X_test
y_train
y_test

Unnamed: 0,Inns,NO,Runs,Ducks,50s,100s,Position,Debut,Tenure,Australia,Bangladesh,England,India,Ireland,New Zealand,Pakistan,South Africa,Sri Lanka,West Indies,Zimbabwe
6244,0.43,0.63,0.04,1.33,0.14,-0.26,1.22,-0.63,0.92,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
1943,2.67,3.19,2.75,3.78,3.19,2.75,-0.37,0.91,0.23,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
2255,-0.28,-0.40,-0.23,-0.51,-0.11,-0.26,-0.37,-0.30,-0.46,-0.43,-0.17,-0.55,3.07,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
3352,-0.38,-0.40,-0.36,0.10,-0.37,-0.26,0.02,-0.12,0.46,-0.43,-0.17,-0.55,3.07,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
6257,0.38,-0.40,-0.09,1.33,-0.37,-0.26,1.22,0.83,1.38,-0.43,-0.17,-0.55,-0.33,-0.07,-0.32,3.38,-0.34,-0.23,-0.35,-0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7148,-0.43,-0.40,-0.33,-0.51,-0.37,-0.26,1.22,-0.58,-0.70,-0.43,-0.17,1.82,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
7201,-0.43,-0.40,-0.36,-0.51,-0.37,-0.26,1.22,-0.56,-0.70,-0.43,-0.17,-0.55,3.07,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
951,-0.43,0.11,-0.35,-0.51,-0.37,-0.26,-1.97,-0.40,-0.70,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
1295,-0.18,-0.40,-0.06,0.10,-0.37,0.17,-0.77,0.96,0.23,-0.43,-0.17,1.82,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18


Unnamed: 0,Inns,NO,Runs,Ducks,50s,100s,Position,Debut,Tenure,Australia,Bangladesh,England,India,Ireland,New Zealand,Pakistan,South Africa,Sri Lanka,West Indies,Zimbabwe
3805,1.20,0.63,1.05,2.55,0.65,1.89,0.42,0.98,-0.46,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
3822,0.94,0.63,0.75,1.33,1.16,0.17,0.42,1.26,-0.00,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
2064,0.13,-0.40,-0.04,-0.51,-0.11,-0.26,-0.37,-0.99,0.23,-0.43,-0.17,-0.55,3.07,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
4936,3.69,4.22,1.81,3.78,1.41,1.03,0.82,0.88,1.61,-0.43,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,2.85,-0.18
6247,0.43,0.11,-0.16,0.72,-0.37,-0.26,1.22,-0.43,0.92,-0.43,-0.17,1.82,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5920,-0.43,-0.40,-0.32,-0.51,-0.37,-0.26,0.82,-1.63,-0.70,2.30,-0.17,-0.55,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
3153,-0.28,-0.40,-0.25,-0.51,-0.11,-0.26,0.02,-0.99,-0.70,-0.43,-0.17,-0.55,-0.33,-0.07,3.08,-0.30,-0.34,-0.23,-0.35,-0.18
1054,1.45,0.11,1.47,1.33,0.14,2.32,-0.77,0.75,1.15,-0.43,-0.17,1.82,-0.33,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18
5418,-0.33,-0.40,-0.34,0.10,-0.37,-0.26,0.82,-0.04,-0.70,-0.43,-0.17,-0.55,3.07,-0.07,-0.32,-0.30,-0.34,-0.23,-0.35,-0.18


6244     92
1943    146
2255     50
3352      4
6257     49
       ... 
7148     35
7201      8
951      12
1295    177
5833     19
Name: HS, Length: 5862, dtype: int64

3805    128
3822    114
2064     67
4936    166
6247     25
       ... 
5920     41
3153     54
1054    166
5418     19
1613     28
Name: HS, Length: 1466, dtype: int64

In [165]:
# Row counts of the two splits produced by train_test_split.
len_train, len_test = len(X_train), len(X_test)
print(f"Length of train data: {len_train}")
print(f"Length of test data: {len_test}")

Length of train data: 5862
Length of test data: 1466


Multi-method Regression

Linear Regression

In [22]:
linear_model = LinearRegression(fit_intercept = True)
linear_model.fit(X_train, y_train)

In [23]:
linear_model_train_score = linear_model.score(X_train, y_train)
linear_model_test_score = linear_model.score(X_test, y_test)

print("Training Score:", linear_model_train_score)
print("Testing Score:", linear_model_test_score)

Training Score: 0.5684635578598312
Testing Score: 0.5205076381035393


In [24]:
# Fitted slope coefficients: one value per feature column of X_train
# (this is a multiple regression, so there is a whole vector, not a single Beta_1)
linear_model.coef_

# This is the coefficient Beta_0 (the intercept)
linear_model.intercept_

array([-24.1845711 ,  -0.22531292,  75.25663497,   2.39075241,
       -13.92359497, -13.66467649,  -7.32847733,   5.56429779,
        20.42274213,   6.28620682,   1.26857456,   5.81237402,
         2.4420437 ,   0.79760512,   2.75358779,   2.94340336,
         3.17452309,   1.63088103,   3.34166933,   1.4775879 ])

56.245479358580695

In [25]:
linear_model_test_output = pd.DataFrame(linear_model.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [26]:
linear_model_test_output.head()

Unnamed: 0,pred_HS
3805,75.58
3822,83.9
2064,52.66
4936,108.26
6247,53.79


In [27]:
linear_mean_absolute_error = mean_absolute_error(y_test, linear_model_test_output)
print('Mean absolute error is ')
print(linear_mean_absolute_error)

Mean absolute error is 
28.515138694154516


In [28]:
linear_model_error_ratio = linear_mean_absolute_error / y_test.mean()
print("Fraction of Mean Error Ratio:", linear_model_error_ratio)

Fraction of Mean Error Ratio: 0.5055044176940907


Lasso Regression

In [29]:
model_l1 = Lasso(alpha=0.05)
model_l1.fit(X_train, y_train)

In [30]:
train_score_l1 = model_l1.score(X_train, y_train)
test_score_l1 = model_l1.score(X_test, y_test)

print("Lasso regression training Score:", train_score_l1)
print("Lasso regression testing Score:", test_score_l1)

Lasso regression training Score: 0.5674370644948209
Lasso regression testing Score: 0.52114227018153


In [31]:
# Fitted Lasso slope coefficients, one value per feature column of X_train
model_l1.coef_

# This is the coefficient Beta_0
model_l1.intercept_

array([-12.57253043,  -0.53580396,  49.30039993,   1.02874259,
        -7.36617243,  -3.47944572,  -7.28386281,   5.30626064,
        20.19053513,   2.79311644,  -0.21375678,   1.78941512,
        -0.23579476,   0.13232404,   0.        ,   0.39367181,
         0.24645326,  -0.29348571,   0.42134488,  -0.09704281])

56.245479358580695

In [32]:
l1_model_test_output = pd.DataFrame(model_l1.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [33]:
l1_model_test_output.head()

Unnamed: 0,pred_HS
3805,81.76
3822,82.13
2064,53.0
4936,116.84
6247,56.56


In [34]:
l1_mae = mean_absolute_error(y_test, l1_model_test_output)
print('Mean absolute error is ')
print(l1_mae)

Mean absolute error is 
28.401341686058533


In [35]:
l1_model_error_ratio = l1_mae / y_test.mean()
print("Lasso Regression Error Ratio", l1_model_error_ratio)

Lasso Regression Error Ratio 0.5034870720683202


Bagging Regressor

In [36]:
bagging_regressor = BaggingRegressor(max_samples=100, random_state=50)
bagging_regressor.fit(X_train, y_train)

In [37]:
bagging_regressor_train_score = bagging_regressor.score(X_train, y_train)
bagging_regressor_test_score = bagging_regressor.score(X_test, y_test)

print("Bagging regression training Score:", bagging_regressor_train_score)
print("Bagging regression testing Score:", bagging_regressor_test_score)

Bagging regression training Score: 0.8933916958268912
Bagging regression testing Score: 0.8800206927883711


In [38]:
bagging_regressor_test_output = pd.DataFrame(bagging_regressor.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [39]:
bagging_regressor_test_output

Unnamed: 0,pred_HS
3805,174.50
3822,130.10
2064,76.10
4936,146.30
6247,36.00
...,...
5920,32.50
3153,78.40
1054,183.70
5418,16.80


In [40]:
bagging_regressor_mean_absolute_error = mean_absolute_error(y_test, bagging_regressor_test_output)
print('Mean absolute error is ')
print(bagging_regressor_mean_absolute_error)

Mean absolute error is 
10.153683492496588


In [41]:
bagging_regressor_error_ratio = bagging_regressor_mean_absolute_error / y_test.mean()
print("Bagging Regression Error Ratio", bagging_regressor_error_ratio)

Bagging Regression Error Ratio 0.18000024184966623


Random forest regressor

In [42]:
rf = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
rf.fit(X_train, y_train)

In [43]:
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)

print("Random forest regression training Score:", rf_train_score)
print("Random forest  regression testing Score:", rf_test_score)

Random forest regression training Score: 0.9697873039661346
Random forest  regression testing Score: 0.9048626936099803


In [44]:
rf_test_output = pd.DataFrame(rf.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [45]:
rf_test_output.head()

Unnamed: 0,pred_HS
3805,160.9
3822,137.14
2064,71.27
4936,143.88
6247,41.89


In [46]:
rf_mean_absolute_error = mean_absolute_error(y_test, rf_test_output)
print('Mean absolute error is ')
print(rf_mean_absolute_error)

Mean absolute error is 
9.304859184731233


In [47]:
rf_error_ratio = rf_mean_absolute_error / y_test.mean()
print("Random forest Regression Error Ratio", rf_error_ratio)

Random forest Regression Error Ratio 0.16495264057289336


Gradient Boosting Regression

In [48]:
gb = GradientBoostingRegressor(random_state=50,  min_samples_leaf = 2, min_samples_split = 6, max_depth = 5)
gb.fit(X_train, y_train)

In [49]:
gb_train_score = gb.score(X_train, y_train)
gb_test_score = gb.score(X_test, y_test)

print("Gradient Boosting regression training Score:", gb_train_score)
print("Gradient Boosting regression testing Score:", gb_test_score)

Gradient Boosting regression training Score: 0.9731803176788475
Gradient Boosting regression testing Score: 0.9131866280904226


In [50]:
gb_test_output = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [51]:
gb_test_output.head()

Unnamed: 0,pred_HS
3805,157.66
3822,130.84
2064,69.93
4936,138.26
6247,37.51


In [52]:
gb_mean_absolute_error = mean_absolute_error(y_test, gb_test_output)
print('Mean absolute error is ')
print(gb_mean_absolute_error)

Mean absolute error is 
8.655616989031328


In [53]:
gb_error_ratio = gb_mean_absolute_error / y_test.mean()
print("Gradient Boosting Regression Error Ratio", gb_error_ratio)

Gradient Boosting Regression Error Ratio 0.15344314726129352


Alternative Metrics

In [54]:
import numpy as np  # Import numpy library for NaN handling

def calculate_smape(actual_values, predicted_values):
    """Return the symmetric mean absolute percentage error (sMAPE), in percent.

    The two inputs are joined column-wise on their index and rows with a NaN
    in either column are discarded. For every remaining row the ratio
    ``|actual - predicted| / (|actual| + |predicted|)`` is computed; a row
    whose denominator is zero (both values are 0) contributes 0. The result
    is the mean of those ratios multiplied by 100.

    The denominator is the plain sum of magnitudes (it is not halved), so the
    value lies between 0 and 100.

    Parameters
    ----------
    actual_values : pandas.Series or single-column pandas.DataFrame
        Observed target values.
    predicted_values : pandas.Series or single-column pandas.DataFrame
        Model predictions, indexed like ``actual_values``.

    Returns
    -------
    float
        sMAPE in percent, or NaN when no complete (non-NaN) rows remain.
    """
    # Align on the index and keep only rows where both values are present.
    paired = pd.concat([actual_values, predicted_values], axis=1).dropna()
    if paired.empty:
        # Nothing to average; avoid dividing by a row count of zero.
        return float("nan")

    actual = paired.iloc[:, 0].to_numpy(dtype=float)
    predicted = paired.iloc[:, 1].to_numpy(dtype=float)

    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual) + np.abs(predicted)
    # Rows with scale == 0 keep the pre-filled 0 instead of producing 0/0.
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return float(ratios.mean() * 100)

# Example usage:
print("Linear Regression sMAPE:", calculate_smape(y_test, linear_model_test_output))
print("Lasso Regression sMAPE:", calculate_smape(y_test, l1_model_test_output))
print("Bagging Regressor sMAPE:", calculate_smape(y_test, bagging_regressor_test_output))
print("Random Forest Regression sMAPE:", calculate_smape(y_test, rf_test_output))
print("Gradient Boosting Regression sMAPE:", calculate_smape(y_test, gb_test_output))

Linear Regression sMAPE: 33.411934891970866
Lasso Regression sMAPE: 33.37076302496282
Bagging Regressor sMAPE: 8.603323649571495
Random Forest Regression sMAPE: 12.539837323186827
Gradient Boosting Regression sMAPE: 9.723590492389977


In [55]:
def calculate_median_absolute_error_ratio(actual_values, predicted_values):
    """Return the median absolute error divided by the median actual value.

    The two inputs are joined column-wise on their index and rows with a NaN
    in either column are discarded. Both the median absolute error and the
    median actual value are computed on those same remaining rows, so the
    numerator and denominator always describe the same observations.

    Parameters
    ----------
    actual_values : pandas.Series or single-column pandas.DataFrame
        Observed target values.
    predicted_values : pandas.Series or single-column pandas.DataFrame
        Model predictions, indexed like ``actual_values``.

    Returns
    -------
    float
        ``median(|predicted - actual|) / median(actual)``; NaN when no
        complete rows remain. If the median actual value is 0 the division
        yields inf or NaN, following NumPy floating-point rules.
    """
    # Align on the index and keep only rows where both values are present.
    paired = pd.concat([actual_values, predicted_values], axis=1).dropna()
    if paired.empty:
        return float("nan")

    actual = paired.iloc[:, 0]
    predicted = paired.iloc[:, 1]

    median_absolute_error = np.median(np.abs(predicted - actual))
    # Use the filtered actuals (not the raw argument) so rows dropped above
    # do not influence the denominator.
    median_absolute_error_ratio = median_absolute_error / actual.median()
    return median_absolute_error_ratio

In [56]:
# Median-error ratio on the test set for each of the five fitted models.
MdErrorRatio_linear = calculate_median_absolute_error_ratio(y_test, linear_model_test_output)
MdErrorRatio_lasso = calculate_median_absolute_error_ratio(y_test, l1_model_test_output)
MdErrorRatio_bagging = calculate_median_absolute_error_ratio(y_test, bagging_regressor_test_output)
MdErrorRatio_rf = calculate_median_absolute_error_ratio(y_test, rf_test_output)
MdErrorRatio_gb = calculate_median_absolute_error_ratio(y_test, gb_test_output)

# Print one "label value" line per model, in the same order as above.
for metric_label, metric_value in (
    ("Linear Model Median Error Ratio:", MdErrorRatio_linear),
    ("Lasso Regression Median Error Ratio:", MdErrorRatio_lasso),
    ("Bagging Regressor Median Error Ratio:", MdErrorRatio_bagging),
    ("Random Forest Median Error Ratio:", MdErrorRatio_rf),
    ("Gradient Boosting Median Error Ratio:", MdErrorRatio_gb),
):
    print(metric_label, metric_value)

Linear Model Median Error Ratio: 0.5811531593334515
Lasso Regression Median Error Ratio: 0.5719666249183063
Bagging Regressor Median Error Ratio: 0.13076923076923078
Random Forest Median Error Ratio: 0.1162607604866698
Gradient Boosting Median Error Ratio: 0.08826509483997459


In [114]:
def calculate_sMdPE(actual_values, predicted_values):
    """Return the symmetric median absolute percentage error (sMdPE), in percent.

    The two inputs are joined column-wise on their index and rows with a NaN
    in either column are discarded. For every remaining row the ratio
    ``|actual - predicted| / (|actual| + |predicted|)`` is computed; a row
    whose denominator is zero (both values are 0) contributes 0. The result
    is the median of those ratios multiplied by 100.

    The denominator adds the magnitudes of the two values, matching
    ``calculate_smape``. Taking ``|actual + predicted|`` instead would
    let values of opposite sign cancel, which inflates the ratio above 1 or
    collapses it to 0 whenever a prediction is negative.

    Parameters
    ----------
    actual_values : pandas.Series or single-column pandas.DataFrame
        Observed target values.
    predicted_values : pandas.Series or single-column pandas.DataFrame
        Model predictions, indexed like ``actual_values``.

    Returns
    -------
    float
        sMdPE in percent, or NaN when no complete (non-NaN) rows remain.
    """
    # Align on the index and keep only rows where both values are present.
    paired = pd.concat([actual_values, predicted_values], axis=1).dropna()
    if paired.empty:
        return float("nan")

    actual = paired.iloc[:, 0].to_numpy(dtype=float)
    predicted = paired.iloc[:, 1].to_numpy(dtype=float)

    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual) + np.abs(predicted)
    # Rows with scale == 0 keep the pre-filled 0 instead of producing 0/0.
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    smdpe_median = float(np.median(ratios) * 100)
    return smdpe_median

# Example usage:
# Replace actual_values and predicted_values with appropriate data
# For example:
# print("SMDPE:", calculate_sMdPE(y_test, l1_model_test_output))


In [115]:
# sMdPE on the test set for each of the five fitted models.
sMdpe_linear = calculate_sMdPE(y_test, linear_model_test_output)
sMdpe_lasso = calculate_sMdPE(y_test, l1_model_test_output)
sMdpe_bagging = calculate_sMdPE(y_test, bagging_regressor_test_output)
sMdpe_rf = calculate_sMdPE(y_test, rf_test_output)
sMdpe_gb = calculate_sMdPE(y_test, gb_test_output)

# Print one "label value" line per model, in the same order as above.
for metric_label, metric_value in (
    ("Linear Model sMdPE:", sMdpe_linear),
    ("Lasso Regression sMdPE:", sMdpe_lasso),
    ("Bagging Regressor sMdPE:", sMdpe_bagging),
    ("Random Forest sMdPE:", sMdpe_rf),
    ("Gradient Boosting sMdPE:", sMdpe_gb),
):
    print(metric_label, metric_value)

Linear Model sMdPE: 25.575867068414816
Lasso Regression sMdPE: 25.792280592138695
Bagging Regressor sMdPE: 6.82861197633485
Random Forest sMdPE: 6.630292421608624
Gradient Boosting sMdPE: 5.231818248818511


Gradient Boosting is the preferred method among the five because it has the lowest symmetric median percentage error (sMdPE).

Further Improvement Attempts

In [132]:
gb2 = GradientBoostingRegressor(random_state=50,  min_samples_leaf = 2, min_samples_split = 6, max_depth = 5)
gb2.fit(X_train, y_train)

In [133]:
gb_y_hat = pd.DataFrame(gb2.predict(X_train), index = X_train.index, columns = ['pred_HS'])

In [134]:
# Training-set residuals of gb2: actual HS minus gb2's prediction for the same training rows.
# NOTE(review): these are in-sample residuals (gb2 was fitted on these rows), so they are
# probably smaller than gb2's errors on unseen data; consider out-of-fold predictions.
gb_residuals = y_train - gb_y_hat['pred_HS']
print(gb_residuals)

6244    12.79
1943   -18.40
2255   -13.98
3352    -0.01
6257     6.63
        ...  
7148     0.30
7201    -0.02
951     -0.11
1295    29.13
5833     0.14
Length: 5862, dtype: float64


In [135]:
# Second-stage model: a random forest fitted to gb2's training residuals rather than to HS itself.
rf2 = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
rf2.fit(X_train, gb_residuals)

In [136]:
rf2_test_output = pd.DataFrame(rf2.predict(X_test), index = X_test.index, columns = ['pred_HS'])
rf2_test_output

Unnamed: 0,pred_HS
3805,2.48
3822,3.90
2064,-4.64
4936,-1.65
6247,-0.21
...,...
5920,0.78
3153,1.33
1054,-2.79
5418,-0.43


In [137]:
gb2_test_output = pd.DataFrame(gb2.predict(X_test), index = X_test.index, columns = ['pred_HS'])
gb2_test_output

Unnamed: 0,pred_HS
3805,157.66
3822,130.84
2064,69.93
4936,138.26
6247,37.51
...,...
5920,39.78
3153,64.90
1054,192.93
5418,16.42


In [138]:
# Hybrid prediction = gb2's test prediction + rf2's predicted residual.
# Both frames were built with the X_test index and a single 'pred_HS' column, so they add element-wise.
hybrid_output = rf2_test_output + gb2_test_output
hybrid_output

Unnamed: 0,pred_HS
3805,160.15
3822,134.74
2064,65.28
4936,136.61
6247,37.30
...,...
5920,40.56
3153,66.23
1054,190.14
5418,15.99


In [139]:
MdErrorRatio_hybrid = calculate_median_absolute_error_ratio(y_test, hybrid_output)
print("Hybrid Median Error Ratio:", MdErrorRatio_hybrid)

Hybrid Median Error Ratio: 0.08001605664064425


In [140]:
sMdpe_hybrid = calculate_sMdPE(y_test, hybrid_output)
print("Hybrid Model sMdPE:", sMdpe_hybrid)

Hybrid Model sMdPE: 4.981677392372257


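The hybrid model performs better than each of the original five models: its sMdPE (4.98) and median error ratio (0.080) are both lower than those of Gradient Boosting (5.23 and 0.088), the best of the five.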

In [144]:
df3 = pd.read_excel('cricket_batting_data.xlsx')
df3.head()

Unnamed: 0,Player,Country,Years,Inns,NO,Runs,HS,Ducks,50s,100s,Avg,Position
0,A N Cook,England,2006-2018,278,14,11845,294,9,55,31,44.87,0
1,S M Gavaskar,India,1971-1987,203,12,9607,221,11,42,33,50.3,0
2,D A Warner*,Australia,2011-2024,202,8,8747,335,13,37,26,45.09,0
3,D L Haynes,West Indies,1978-1994,201,25,7472,184,10,39,18,42.45,0
4,M A Atherton,England,1990-2001,197,6,7476,185,17,45,16,39.14,0


In [145]:
df3['Debut'] = pd.to_numeric(df3['Years'].str[0:4])

In [146]:
df3['Tenure'] = pd.to_numeric(df3['Years'].str[-4:]) -  pd.to_numeric(df3['Years'].str[0:4]) + 1

In [147]:
df3

Unnamed: 0,Player,Country,Years,Inns,NO,Runs,HS,Ducks,50s,100s,Avg,Position,Debut,Tenure
0,A N Cook,England,2006-2018,278,14,11845,294,9,55,31,44.87,0,2006,13
1,S M Gavaskar,India,1971-1987,203,12,9607,221,11,42,33,50.30,0,1971,17
2,D A Warner*,Australia,2011-2024,202,8,8747,335,13,37,26,45.09,0,2011,14
3,D L Haynes,West Indies,1978-1994,201,25,7472,184,10,39,18,42.45,0,1978,17
4,M A Atherton,England,1990-2001,197,6,7476,185,17,45,16,39.14,0,1990,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7323,S M Ervine,Zimbabwe,2003-2003,1,0,53,53,0,1,0,53.00,8,2003,1
7324,D A Marillier,Zimbabwe,2001-2001,1,0,52,52,0,1,0,52.00,8,2001,1
7325,B A Murphy,Zimbabwe,2000-2000,1,1,0,0,0,0,0,,8,2000,1
7326,N Ncube,Zimbabwe,2011-2011,1,0,14,14,0,0,0,14.00,8,2011,1


In [148]:
df3 = df3.drop(columns = ['Player','Years','Avg', 'Country'])

In [149]:
df3

Unnamed: 0,Inns,NO,Runs,HS,Ducks,50s,100s,Position,Debut,Tenure
0,278,14,11845,294,9,55,31,0,2006,13
1,203,12,9607,221,11,42,33,0,1971,17
2,202,8,8747,335,13,37,26,0,2011,14
3,201,25,7472,184,10,39,18,0,1978,17
4,197,6,7476,185,17,45,16,0,1990,12
...,...,...,...,...,...,...,...,...,...,...
7323,1,0,53,53,0,1,0,8,2003,1
7324,1,0,52,52,0,1,0,8,2001,1
7325,1,1,0,0,0,0,0,8,2000,1
7326,1,0,14,14,0,0,0,8,2011,1


In [150]:
df3.isna().sum()

Inns        0
NO          0
Runs        0
HS          0
Ducks       0
50s         0
100s        0
Position    0
Debut       0
Tenure      0
dtype: int64

In [151]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(df3.drop(columns = ['HS']), df3['HS'], test_size=0.20, random_state = 35)

In [152]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train3 = pd.DataFrame(sc.fit_transform(X_train3), columns = X_train3.columns, index = X_train3.index)
X_test3 = pd.DataFrame(sc.transform(X_test3), columns = X_test3.columns, index = X_test3.index)
X_train3
X_test3
y_train3
y_test3

Unnamed: 0,Inns,NO,Runs,Ducks,50s,100s,Position,Debut,Tenure
6244,0.43,0.63,0.04,1.33,0.14,-0.26,1.22,-0.63,0.92
1943,2.67,3.19,2.75,3.78,3.19,2.75,-0.37,0.91,0.23
2255,-0.28,-0.40,-0.23,-0.51,-0.11,-0.26,-0.37,-0.30,-0.46
3352,-0.38,-0.40,-0.36,0.10,-0.37,-0.26,0.02,-0.12,0.46
6257,0.38,-0.40,-0.09,1.33,-0.37,-0.26,1.22,0.83,1.38
...,...,...,...,...,...,...,...,...,...
7148,-0.43,-0.40,-0.33,-0.51,-0.37,-0.26,1.22,-0.58,-0.70
7201,-0.43,-0.40,-0.36,-0.51,-0.37,-0.26,1.22,-0.56,-0.70
951,-0.43,0.11,-0.35,-0.51,-0.37,-0.26,-1.97,-0.40,-0.70
1295,-0.18,-0.40,-0.06,0.10,-0.37,0.17,-0.77,0.96,0.23


Unnamed: 0,Inns,NO,Runs,Ducks,50s,100s,Position,Debut,Tenure
3805,1.20,0.63,1.05,2.55,0.65,1.89,0.42,0.98,-0.46
3822,0.94,0.63,0.75,1.33,1.16,0.17,0.42,1.26,-0.00
2064,0.13,-0.40,-0.04,-0.51,-0.11,-0.26,-0.37,-0.99,0.23
4936,3.69,4.22,1.81,3.78,1.41,1.03,0.82,0.88,1.61
6247,0.43,0.11,-0.16,0.72,-0.37,-0.26,1.22,-0.43,0.92
...,...,...,...,...,...,...,...,...,...
5920,-0.43,-0.40,-0.32,-0.51,-0.37,-0.26,0.82,-1.63,-0.70
3153,-0.28,-0.40,-0.25,-0.51,-0.11,-0.26,0.02,-0.99,-0.70
1054,1.45,0.11,1.47,1.33,0.14,2.32,-0.77,0.75,1.15
5418,-0.33,-0.40,-0.34,0.10,-0.37,-0.26,0.82,-0.04,-0.70


6244     92
1943    146
2255     50
3352      4
6257     49
       ... 
7148     35
7201      8
951      12
1295    177
5833     19
Name: HS, Length: 5862, dtype: int64

3805    128
3822    114
2064     67
4936    166
6247     25
       ... 
5920     41
3153     54
1054    166
5418     19
1613     28
Name: HS, Length: 1466, dtype: int64

In [157]:
linear_model3 = LinearRegression(fit_intercept = True)
l1_model3 = Lasso(alpha=0.05)
bagging_regressor3 = BaggingRegressor(max_samples=100, random_state=50)
rf3 = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
gb3 = GradientBoostingRegressor(random_state=50,  min_samples_leaf = 2, min_samples_split = 6, max_depth = 5)

In [158]:
linear_model3.fit(X_train3, y_train3)
l1_model3.fit(X_train3, y_train3)
bagging_regressor3.fit(X_train3, y_train3)
rf3.fit(X_train3, y_train3)
gb3.fit(X_train3, y_train3)

In [159]:
linear_model3_test_output = pd.DataFrame(linear_model3.predict(X_test3), index = X_test3.index, columns = ['pred_HS'])
l1_model3_test_output = pd.DataFrame(l1_model3.predict(X_test3), index = X_test3.index, columns = ['pred_HS'])
bagging_regressor3_test_output = pd.DataFrame(bagging_regressor3.predict(X_test3), index = X_test3.index, columns = ['pred_HS'])
rf3_test_output = pd.DataFrame(rf3.predict(X_test3), index = X_test3.index, columns = ['pred_HS'])
gb3_test_output = pd.DataFrame(gb3.predict(X_test3), index = X_test3.index, columns = ['pred_HS'])

In [160]:
# sMdPE on the test set for the five models trained without the country columns.
sMdpe_linear3 = calculate_sMdPE(y_test3, linear_model3_test_output)
sMdpe_lasso3 = calculate_sMdPE(y_test3, l1_model3_test_output)
sMdpe_bagging3 = calculate_sMdPE(y_test3, bagging_regressor3_test_output)
sMdpe_rf3 = calculate_sMdPE(y_test3, rf3_test_output)
sMdpe_gb3 = calculate_sMdPE(y_test3, gb3_test_output)

# Print one "label value" line per model, in the same order as above.
for metric_label, metric_value in (
    ("Linear Model sMdPE:", sMdpe_linear3),
    ("Lasso Regression sMdPE:", sMdpe_lasso3),
    ("Bagging Regressor sMdPE:", sMdpe_bagging3),
    ("Random Forest sMdPE:", sMdpe_rf3),
    ("Gradient Boosting sMdPE:", sMdpe_gb3),
):
    print(metric_label, metric_value)

Linear Model sMdPE: 25.870087496082146
Lasso Regression sMdPE: 25.986106797451274
Bagging Regressor sMdPE: 6.761565836298933
Random Forest sMdPE: 5.2187880893311505
Gradient Boosting sMdPE: 5.130628799601575


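Dropping the country dummy columns slightly improves the tree-based methods (lower sMdPE for Bagging, Random Forest and Gradient Boosting) compared with the larger feature set, while the Linear and Lasso models become marginally worse.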

In [161]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
# 3 * 6 * 5 * 4 = 360 hyperparameter combinations are evaluated.
param_grid = {
    'n_estimators': [1000, 1500, 2000],
    'max_depth': [5, 10, 15, 20, 25, 30],
    'min_samples_split': [10, 20, 30, 40, 50],
    'min_samples_leaf': [8, 16, 20, 30]
}

# Initialize Random Forest Regressor
rf_model_tuned = RandomForestRegressor(random_state=50, max_features='sqrt')

# Initialize GridSearchCV
# With cv=5 every combination is fitted five times (360 * 5 = 1,800 forests of 1000-2000
# trees each), so this cell is slow to run.
# Scoring is negative MAE, so the selected parameters minimise cross-validated mean absolute error.
grid_search = GridSearchCV(estimator=rf_model_tuned, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Fit GridSearchCV on training data
# This uses X_train (the feature set that includes the country dummies), not the reduced X_train3.
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_depth': 30, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 1500}


In [162]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has already
# been refitted on the full training data with the winning parameters; fitting it again
# would only repeat that work.
best_rf_model = grid_search.best_estimator_
# Test-set predictions of the tuned forest, indexed like X_test.
best_rf_pred_test = pd.DataFrame(best_rf_model.predict(X_test), index = X_test.index, columns = ['pred_HS'])

In [164]:
# Calculate sMAPE for the best model.
# calculate_smape averages the per-row ratios (it does not take a median), so the label says "Mean".
print("Best Random Forest Model - Symmetric Mean Absolute Percentage Error (SMAPE)", calculate_smape(y_test, best_rf_pred_test))

Best Random Forest Model - Symmetric Median Absolute Percentage Error (SMAPE) 14.220022377792185


We tuned the Random Forest model using GridSearchCV over various combinations of hyperparameters.
Observed: The best parameters identified by GridSearchCV are max_depth=30, min_samples_leaf=8, min_samples_split=10, and n_estimators=1500. After training the model with these parameters and predicting on the test data, the sMAPE (symmetric mean absolute percentage error) between the predicted and actual values is approximately 14.22. This is higher than the sMAPE of the untuned Random Forest (about 12.54), so by this metric the tuned model did not improve on the original one.