In [2]:
import pandas as pd

# Read the Excel file
data = pd.read_excel('cricket_batting_data.xlsx')

# Quick description of columns
data.dtypes





Player       object
Country      object
Years        object
Inns          int64
NO            int64
Runs          int64
HS            int64
Ducks         int64
50s           int64
100s          int64
Avg         float64
Position      int64
dtype: object

In [3]:
# Get the number of rows in the dataframe
num_rows = len(data)
print(f"Number of rows in the dataframe: {num_rows}")

Number of rows in the dataframe: 7328


In [4]:
# Check for NaN values
nan_values = data.isna().sum()
print("NaN values in the dataframe:")
print(nan_values)

NaN values in the dataframe:
Player        0
Country       0
Years         0
Inns          0
NO            0
Runs          0
HS            0
Ducks         0
50s           0
100s          0
Avg         216
Position      0
dtype: int64


In [5]:
# Create the 'Debut' column
data['Debut'] = data['Years'].astype(str).str[:4].astype(int)

# Create the 'Tenure' column
data['Tenure'] = data['Years'].astype(str).str[-4:].astype(int) - data['Debut'] + 1
data.head()


Unnamed: 0,Player,Country,Years,Inns,NO,Runs,HS,Ducks,50s,100s,Avg,Position,Debut,Tenure
0,A N Cook,England,2006-2018,278,14,11845,294,9,55,31,44.87,0,2006,13
1,S M Gavaskar,India,1971-1987,203,12,9607,221,11,42,33,50.3,0,1971,17
2,D A Warner*,Australia,2011-2024,202,8,8747,335,13,37,26,45.09,0,2011,14
3,D L Haynes,West Indies,1978-1994,201,25,7472,184,10,39,18,42.45,0,1978,17
4,M A Atherton,England,1990-2001,197,6,7476,185,17,45,16,39.14,0,1990,12


In [6]:
# Check the values for A.N. Cook and S.M. Gavaskar
print("Debut year and tenure for A N Cook:")
print(data[['Debut', 'Tenure']].iloc[0])


Debut year and tenure for A N Cook:
Debut     2006
Tenure      13
Name: 0, dtype: int64


In [7]:
print("Debut year and tenure for S.M. Gavaskar:")
print(data[['Debut', 'Tenure']].iloc[1])


Debut year and tenure for S.M. Gavaskar:
Debut     1971
Tenure      17
Name: 1, dtype: int64


In [8]:
# Drop columns Player, Years, and Avg
data.drop(['Player', 'Years', 'Avg'], axis=1, inplace=True)

# Get the number of columns in the resulting dataframe
num_columns = len(data.columns)
print(f"Number of columns in the resulting dataframe: {num_columns}")



Number of columns in the resulting dataframe: 11


In [9]:
# Display the header of the dataframe
data.head()


Unnamed: 0,Country,Inns,NO,Runs,HS,Ducks,50s,100s,Position,Debut,Tenure
0,England,278,14,11845,294,9,55,31,0,2006,13
1,India,203,12,9607,221,11,42,33,0,1971,17
2,Australia,202,8,8747,335,13,37,26,0,2011,14
3,West Indies,201,25,7472,184,10,39,18,0,1978,17
4,England,197,6,7476,185,17,45,16,0,1990,12


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Convert categorical column 'Country' to quantitative using get dummies
data = pd.get_dummies(data, columns=['Country'], drop_first=True)

data.head()

Unnamed: 0,Inns,NO,Runs,HS,Ducks,50s,100s,Position,Debut,Tenure,...,Country_Bangladesh,Country_England,Country_India,Country_Ireland,Country_New Zealand,Country_Pakistan,Country_South Africa,Country_Sri Lanka,Country_West Indies,Country_Zimbabwe
0,278,14,11845,294,9,55,31,0,2006,13,...,False,True,False,False,False,False,False,False,False,False
1,203,12,9607,221,11,42,33,0,1971,17,...,False,False,True,False,False,False,False,False,False,False
2,202,8,8747,335,13,37,26,0,2011,14,...,False,False,False,False,False,False,False,False,False,False
3,201,25,7472,184,10,39,18,0,1978,17,...,False,False,False,False,False,False,False,False,True,False
4,197,6,7476,185,17,45,16,0,1990,12,...,False,True,False,False,False,False,False,False,False,False


In [11]:
# Split the data into train and test sets
X = data.drop('HS', axis=1)  # Features
y = data['HS']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

len_train = len(X_train)
print(f"Length of train data: {len_train}")

Length of train data: 5862


In [12]:
len_test = len(X_test)
print(f"Length of test data: {len_test}")

Length of test data: 1466


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


# (a) Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_r2 = lr_model.score(X_train, y_train)
lr_intercept = lr_model.intercept_
lr_pred = lr_model.predict(X_test)
lr_error_ratio = mean_absolute_error(y_test, lr_pred) / y_test.mean()

print(f"Linear Regression - R2: {lr_r2:.2f}, Intercept: {lr_intercept:.2f}, Error Ratio: {lr_error_ratio:.2f}")

Linear Regression - R2: 0.57, Intercept: -249.59, Error Ratio: 0.51


In [187]:
# (b) Lasso Regression
lasso_model = Lasso(alpha=0.05)
lasso_model.fit(X_train, y_train)
lasso_score = lasso_model.score(X_train, y_train)
lasso_intercept = lasso_model.intercept_
lasso_pred = lasso_model.predict(X_test)
lasso_error_ratio = mean_absolute_error(y_test, lasso_pred) / y_test.mean()

print(f"Lasso Regression - Score: {lasso_score:.2f}, Intercept: {lasso_intercept:.2f}, Error Ratio: {lasso_error_ratio:.2f}")


Lasso Regression - Score: 0.57, Intercept: -224.10, Error Ratio: 0.51


  model = cd_fast.enet_coordinate_descent(


In [188]:
# (c) Bagging Regressor
bagging_model = BaggingRegressor(random_state=50, max_samples=100)
bagging_model.fit(X_train, y_train)
bagging_pred = bagging_model.predict(X_test)
bagging_error_ratio = mean_absolute_error(y_test, bagging_pred) / y_test.mean()

print(f"Bagging Regressor - MAE: {mean_absolute_error(y_test, bagging_pred):.2f}, Error Ratio: {bagging_error_ratio:.2f}")


Bagging Regressor - MAE: 10.12, Error Ratio: 0.18


In [189]:
# (d) Random Forest Regressor
rf_model = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_error_ratio = mean_absolute_error(y_test, rf_pred) / y_test.mean()

print(f"Random Forest Regressor - MAE: {mean_absolute_error(y_test, rf_pred):.2f}, Error Ratio: {rf_error_ratio:.2f}")


Random Forest Regressor - MAE: 9.31, Error Ratio: 0.17


In [190]:
# (e) Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=50, min_samples_split=6, min_samples_leaf=2, max_depth=5)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_error_ratio = mean_absolute_error(y_test, gb_pred) / y_test.mean()

print(f"Gradient Boosting Regressor - MAE: {mean_absolute_error(y_test, gb_pred):.2f}, Error Ratio: {gb_error_ratio:.2f}")


Gradient Boosting Regressor - MAE: 8.65, Error Ratio: 0.15


In [191]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Initialize lists to store error ratios and sMAPE values for all five methods
error_ratios = []
smape_values = []

# (a) Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_error_ratio = mean_absolute_error(y_test, lr_pred) / y_test.mean()
lr_smape = np.mean(np.abs(y_test - lr_pred) / (np.abs(y_test) + np.abs(lr_pred)))
error_ratios.append(lr_error_ratio)
smape_values.append(lr_smape)
print(f"Linear Regression - Error Ratio: {lr_error_ratio:.2f}, SMAPE: {lr_smape:.2f}")

Linear Regression - Error Ratio: 0.51, SMAPE: 0.33


In [192]:
# (b) Lasso Regression
lasso_model = Lasso(alpha=0.05)
lasso_model.fit(X_train, y_train)
lasso_pred = lasso_model.predict(X_test)
lasso_error_ratio = mean_absolute_error(y_test, lasso_pred) / y_test.mean()
lasso_smape = np.mean(np.abs(y_test - lasso_pred) / (np.abs(y_test) + np.abs(lasso_pred)))
error_ratios.append(lasso_error_ratio)
smape_values.append(lasso_smape)
print(f"Lasso Regression - Error Ratio: {lasso_error_ratio:.2f}, SMAPE: {lasso_smape:.2f}")

Lasso Regression - Error Ratio: 0.51, SMAPE: 0.33


  model = cd_fast.enet_coordinate_descent(


In [193]:
# (c) Bagging Regressor
bagging_model = BaggingRegressor(random_state=50, max_samples=100)
bagging_model.fit(X_train, y_train)
bagging_pred = bagging_model.predict(X_test)
bagging_error_ratio = mean_absolute_error(y_test, bagging_pred) / y_test.mean()
bagging_smape = np.mean(np.abs(y_test - bagging_pred) / (np.abs(y_test) + np.abs(bagging_pred)))
error_ratios.append(bagging_error_ratio)
smape_values.append(bagging_smape)
print(f"Bagging Regressor - Error Ratio: {bagging_error_ratio:.2f}, SMAPE: {bagging_smape:.2f}")

Bagging Regressor - Error Ratio: 0.18, SMAPE: 0.09


In [194]:
# (d) Random Forest Regressor
rf_model = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_error_ratio = mean_absolute_error(y_test, rf_pred) / y_test.mean()
rf_smape = np.mean(np.abs(y_test - rf_pred) / (np.abs(y_test) + np.abs(rf_pred)))
error_ratios.append(rf_error_ratio)
smape_values.append(rf_smape)
print(f"Random Forest Regressor - Error Ratio: {rf_error_ratio:.2f}, SMAPE: {rf_smape:.2f}")


Random Forest Regressor - Error Ratio: 0.17, SMAPE: 0.13


In [195]:
# (e) Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=50, min_samples_split=6, min_samples_leaf=2, max_depth=5)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_error_ratio = mean_absolute_error(y_test, gb_pred) / y_test.mean()
gb_smape = np.mean(np.abs(y_test - gb_pred) / (np.abs(y_test) + np.abs(gb_pred)))
error_ratios.append(gb_error_ratio)
smape_values.append(gb_smape)
print(f"Gradient Boosting Regressor - Error Ratio: {gb_error_ratio:.2f}, SMAPE: {gb_smape:.2f}")

Gradient Boosting Regressor - Error Ratio: 0.15, SMAPE: 0.10


In [196]:
# Calculate median error ratio and SMAPE
median_error_ratio = np.median(error_ratios)
median_smape = np.median(smape_values)

print(f"\nMedian Error Ratio for all five methods: {median_error_ratio:.2f}")
print(f"Median SMAPE for all five methods: {median_smape:.2f}")



Median Error Ratio for all five methods: 0.18
Median SMAPE for all five methods: 0.13


The "Gradient Boosting Regressor" could be the preferred method among the five with lower Error ratio and SMAPE.

In [197]:
# Train Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=50, min_samples_split=6, min_samples_leaf=2, max_depth=5)
gb_model.fit(X_train, y_train)
gb_pred_train = gb_model.predict(X_train)
gb_pred_test = gb_model.predict(X_test)

# Train Random Forest Regressor on residuals
rf_residuals = y_train - gb_pred_train
rf_model_residuals = RandomForestRegressor(random_state=50, max_features='sqrt', n_estimators=200, min_samples_leaf=2)
rf_model_residuals.fit(X_train, rf_residuals)

# Predict using hybrid method
hybrid_pred_test = gb_pred_test + rf_model_residuals.predict(X_test)

# Calculate sMAPE for hybrid method
hybrid_smape = np.mean(2 * np.abs(y_test - hybrid_pred_test) / (np.abs(y_test) + np.abs(hybrid_pred_test)))
print(f"Hybrid Method - Symmetric Median Absolute Percentage Error (SMAPE): {hybrid_smape:.2f}")


Hybrid Method - Symmetric Median Absolute Percentage Error (SMAPE): 0.19


In [198]:
# Read the Excel file
data1 = pd.read_excel('cricket_batting_data.xlsx')
# Create the 'Debut' column
data1['Debut'] = data1['Years'].astype(str).str[:4].astype(int)

# Create the 'Tenure' column
data1['Tenure'] = data1['Years'].astype(str).str[-4:].astype(int) - data1['Debut'] + 1


# Drop 'Country' along with 'Player', 'Years', and 'Avg'
data_reduced = data1.drop(['Country', 'Player', 'Years', 'Avg'], axis=1)

# Split into X and y
X_reduced = data_reduced.drop('HS', axis=1)
y_reduced = data_reduced['HS']

# Split the reduced data into train and test sets
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(X_reduced, y_reduced, test_size=0.2, random_state=35)

# Initialize lists to store SMAPE values for all methods with reduced features
smape_values_reduced = []

# Calculate SMAPE for all five methods with reduced features
for model, model_name in [(lr_model, 'Linear Regression'), (lasso_model, 'Lasso Regression'),
                          (bagging_model, 'Bagging Regressor'), (rf_model, 'Random Forest Regressor'),
                          (gb_model, 'Gradient Boosting Regressor'), (hybrid_pred_test, 'Hybrid Method')]:
    if isinstance(model, np.ndarray):  # Hybrid method
        pred_test_reduced = model
    else:
        model.fit(X_train_reduced, y_train_reduced)
        pred_test_reduced = model.predict(X_test_reduced)
    smape_reduced = np.mean(2 * np.abs(y_test_reduced - pred_test_reduced) / (np.abs(y_test_reduced) + np.abs(pred_test_reduced)))
    smape_values_reduced.append(smape_reduced)
    print(f"{model_name} - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: {smape_reduced:.2f}")

# Check for improvement compared to previous features
if all(smape_reduced < smape_full for smape_reduced, smape_full in zip(smape_values_reduced, smape_values)):
    print("There is an improvement in SMAPE with reduced features.")
else:
    print("There is no improvement in SMAPE with reduced features.")


Linear Regression - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.67
Lasso Regression - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.67
Bagging Regressor - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.19


  model = cd_fast.enet_coordinate_descent(


Random Forest Regressor - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.20
Gradient Boosting Regressor - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.19
Hybrid Method - Symmetric Median Absolute Percentage Error (SMAPE) with Reduced Features: 0.19
There is no improvement in SMAPE with reduced features.


In [199]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

# Initialize Random Forest Regressor
rf_model_tuned = RandomForestRegressor(random_state=50, max_features='sqrt')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model_tuned, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Calculate sMAPE for the best model
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)
best_rf_pred_train = best_rf_model.predict(X_train)
best_rf_pred_test = best_rf_model.predict(X_test)
best_rf_smape = np.mean(2 * np.abs(y_test - best_rf_pred_test) / (np.abs(y_test) + np.abs(best_rf_pred_test)))
print(f"Best Random Forest Model - Symmetric Median Absolute Percentage Error (SMAPE): {best_rf_smape:.2f}")


Best Parameters: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Model - Symmetric Median Absolute Percentage Error (SMAPE): 0.37
