In [90]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.exceptions import ConvergenceWarning
import warnings

from tqdm import tqdm

In [47]:
filepath = './FinalDFs/PostEDA/'

batting_df = pd.read_pickle(filepath+'batting_filtered.pkl')
bowling_df = pd.read_pickle(filepath+'bowling_filtered.pkl')

# **Part 1: Data Preprocessing**

## **(I) Dropping Irrelevant Columns**

As we saw in our EDA notebook, there were quite a few of the numeric variables that were very closely related to each other. For example, in `batting_df`, the columns with `balls_faced` and `total_runs` had very high correlation numbers. Therefore, we will exclude one of these sets from our analysis.

In [48]:
bat_cols_to_drop = ['NY_SalaryUSD', 'balls_faced_1', 'balls_faced_2', 'balls_faced_3', 'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3']
batting_df = batting_df.drop(columns=bat_cols_to_drop)

In [50]:
bat_corr = batting_df.corr()

threshold = 0.8 
highly_correlated_features = set()

# Iterate through the correlation matrix
for i in range(len(bat_corr.columns)):
    for j in range(i):
        if abs(bat_corr.iloc[i, j]) > threshold:
            # Add the feature names to the set of highly correlated features
            feature_i = bat_corr.columns[i]
            feature_j = bat_corr.columns[j]
            highly_correlated_features.add((feature_i, feature_j))

print("Highly Correlated Features:")
for feature_pair in highly_correlated_features:
    print(feature_pair)

Highly Correlated Features:


In [49]:
bowl_cols_to_drop = ['NY_SalaryUSD', 'balls_bowled_1', 'balls_bowled_2', 'balls_bowled_3', 'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3']
bowling_df = bowling_df.drop(columns=bowl_cols_to_drop)

In [51]:
bowl_corr = bowling_df.corr()

threshold = 0.8 
highly_correlated_features = set()

# Iterate through the correlation matrix
for i in range(len(bowl_corr.columns)):
    for j in range(i):
        if abs(bowl_corr.iloc[i, j]) > threshold:
            # Add the feature names to the set of highly correlated features
            feature_i = bowl_corr.columns[i]
            feature_j = bowl_corr.columns[j]
            highly_correlated_features.add((feature_i, feature_j))

print("Highly Correlated Features:")
for feature_pair in highly_correlated_features:
    print(feature_pair)

Highly Correlated Features:
('strike_rate_2', 'bowling_avg_2')
('strike_rate_1', 'bowling_avg_1')
('strike_rate_3', 'bowling_avg_3')


In [54]:
bowling_df = bowling_df.drop(columns=['strike_rate_1', 'strike_rate_2', 'strike_rate_3'])

## **(II) Scaling the Numeric Columns**

The next step is to scale the numeric columns using the `StandardScaler`.

In [55]:
scaler = StandardScaler()

bat_num_cols = [col for col in batting_df.select_dtypes(include=[np.number]).columns if col not in ['Season', 'Role', 'changed_teams']]
bowl_num_cols = [col for col in bowling_df.select_dtypes(include=[np.number]).columns if col not in ['Season', 'Role', 'changed_teams']]

In [56]:
batting_df[bat_num_cols] = scaler.fit_transform(batting_df[bat_num_cols])
bowling_df[bowl_num_cols] = scaler.fit_transform(bowling_df[bowl_num_cols])

## **(III) Getting Dummies for the Categorical Columns**

The final step in data preprocessing is getting dummies for the categorical columns.

In [57]:
batting_df = pd.get_dummies(batting_df, columns=['Country', 'Team'])

In [58]:
bowling_df = pd.get_dummies(bowling_df, columns=['Country', 'Team'])

# **Part 2: Train-Test Split**

In [59]:
batting_df = batting_df.sort_values(by='Season')
bowling_df = bowling_df.sort_values(by='Season')

In [74]:
batting_train, batting_test = train_test_split(batting_df, test_size=.2)
bowling_train, bowling_test = train_test_split(bowling_df, test_size=.2)

In [58]:
# batting_train.to_pickle('./FinalDFs/TrainTestSplit/batting_train.pkl')
# batting_test.to_pickle('./FinalDFs/TrainTestSplit/batting_test.pkl')

In [59]:
# bowling_train.to_pickle('./FinalDFs/TrainTestSplit/bowling_train.pkl')
# bowling_test.to_pickle('./FinalDFs/TrainTestSplit/bowling_test.pkl')

In [75]:
X_train_bat = batting_train.drop(columns=['Player', 'salary_diff'])
y_train_bat = batting_train['salary_diff']

X_test_bat = batting_test.drop(columns=['Player', 'salary_diff'])
y_test_bat = batting_test['salary_diff']

In [76]:
X_train_bowl = bowling_train.drop(columns=['Player', 'salary_diff'])
y_train_bowl = bowling_train['salary_diff']

X_test_bowl = bowling_test.drop(columns=['Player', 'salary_diff'])
y_test_bowl = bowling_test['salary_diff']

# **Part 3: Batting Models**

## **(I) Linear Model**

In [77]:
model = LinearRegression()
model.fit(X_train_bat, y_train_bat)
y_pred_bat = model.predict(X_test_bat)

mse = mean_squared_error(y_test_bat, y_pred_bat)
mae = mean_absolute_error(y_test_bat, y_pred_bat)
r2 = r2_score(y_test_bat, y_pred_bat)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.7139749880209585
Mean Absolute Error: 0.5562322266808408
R-squared: -0.11256348682648043


The relatively high MAE and MSE values suggest that the model's predictions have some level of error when compared to the actual salary differences.
The R2 score of 0.164 indicates that the linear model's performance is limited in explaining the variability in the salary differences. There might be other factors and complexities in the data that the linear model cannot capture.

## **(II) Polynomial Models**

In [78]:
degrees = [2,3,4]
alphas = [.01, .1, 1.0, 10.0]

param_grid = {
    'poly__degree': degrees,
    'ridge__alpha': alphas
}

In [79]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge())
])

In [80]:
grid_search = GridSearchCV(pipeline, param_grid, cv=10)

warnings_list = []

with warnings.catch_warnings(record=True) as w:
    warnings.filterwarnings("ignore")  # Ignore all warnings
    try:
        grid_search.fit(X_train_bat, y_train_bat)
    except (UserWarning, Exception):  # Catch both UserWarning and other exceptions
        for warning in w:
            if "hyperparameters" in str(warning.message):
                hyperparameters = warning.message.split(": ")[-1]
                warnings_list.append(hyperparameters)
        print("Hyperparameters causing warnings:")
        for hyperparameter in np.unique(warnings_list):  # Remove duplicates
            print(hyperparameter)

In [81]:
best_model = grid_search.best_estimator_
y_pred_bat = best_model.predict(X_test_bat)

mse = mean_squared_error(y_test_bat, y_pred_bat)
mae = mean_absolute_error(y_test_bat, y_pred_bat)
r2 = r2_score(y_test_bat, y_pred_bat)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")
print("Best Model Parameters:", grid_search.best_params_)

Mean Squared Error (MSE): 1.9142251379837025
Mean Absolute Error (MAE): 0.9494821371150615
R-squared (R2) Score: -1.9828733916705943
Best Model Parameters: {'poly__degree': 2, 'ridge__alpha': 10.0}


## **(III) Tree-based Models**

In [85]:
decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train_bat, y_train_bat)

y_pred_bat = decision_tree.predict(X_test_bat)

mse = mean_squared_error(y_test_bat, y_pred_bat)
mae = mean_absolute_error(y_test_bat, y_pred_bat)
r2 = r2_score(y_test_bat, y_pred_bat)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")

Mean Squared Error (MSE): 2.4084981873837115
Mean Absolute Error (MAE): 0.8704307885866559
R-squared (R2) Score: -2.7530826518144367


In [100]:
param_grid = {
    'n_estimators': [200],                   # Number of trees in the forest
    'max_depth': [None, 5, 10, 15, 20],      # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]            # Minimum number of samples required to be at a leaf node
}

In [101]:
random_forest = RandomForestRegressor()
random_search = RandomizedSearchCV(random_forest, param_grid, n_iter=10, scoring='r2')

random_search.fit(X_train_bat, y_train_bat)
best_model = random_search.best_estimator_

In [102]:
y_pred_bat = best_model.predict(X_test_bat)

# Evaluate the model's performance using metrics like MSE, MAE, and R-squared
mse = mean_squared_error(y_test_bat, y_pred_bat)
mae = mean_absolute_error(y_test_bat, y_pred_bat)
r2 = r2_score(y_test_bat, y_pred_bat)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")
print("Best Model Parameters:", random_search.best_params_)

Mean Squared Error (MSE): 0.6890361605497399
Mean Absolute Error (MAE): 0.5094036085864996
R-squared (R2) Score: -0.07370214110111939
Best Model Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10}
