In [16]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.exceptions import ConvergenceWarning
import warnings

from tqdm import tqdm

In [2]:
# Folder that holds the pickled frames produced by the EDA notebook.
filepath = './FinalDFs/PostEDA/'

batting_path = filepath + 'batting_filtered.pkl'
bowling_path = filepath + 'bowling_filtered.pkl'

# NOTE: unpickling executes arbitrary code, so only load files you produced yourself.
batting_df = pd.read_pickle(batting_path)
bowling_df = pd.read_pickle(bowling_path)

# **Part 1: Data Preprocessing**

## **(I) Dropping Irrelevant Columns**

As we saw in our EDA notebook, several of the numeric variables are very closely related to each other. For example, in `batting_df`, the `balls_faced` and `total_runs` columns are very highly correlated. Therefore, we will keep only one set of columns from each such highly correlated pair in our analysis.

In [3]:
# Batting columns left out of the modelling frame: NY_SalaryUSD plus the three
# balls_faced_* and the three boundary_prob_* columns.
bat_cols_to_drop = [
    'NY_SalaryUSD',
    'balls_faced_1', 'balls_faced_2', 'balls_faced_3',
    'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3',
]
batting_df = batting_df.drop(bat_cols_to_drop, axis=1)

In [4]:
# Pairwise correlations between the numeric/boolean batting columns only.
# Non-numeric columns (Player and the Country/Team columns that are one-hot
# encoded later) are still in the frame here; pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr() and raises instead, so select the
# numeric columns explicitly (this also works on older pandas versions).
bat_corr = batting_df.select_dtypes(include=['number', 'bool']).corr()

# Absolute correlation above which two features are reported as redundant.
threshold = 0.8
highly_correlated_features = set()

# Walk the strictly lower triangle so every unordered pair is tested once
# and a column is never compared with itself.
for i in range(len(bat_corr.columns)):
    for j in range(i):
        if abs(bat_corr.iloc[i, j]) > threshold:
            # Record the pair as (later column, earlier column)
            feature_i = bat_corr.columns[i]
            feature_j = bat_corr.columns[j]
            highly_correlated_features.add((feature_i, feature_j))

print("Highly Correlated Features:")
# Sets have no stable iteration order; sort so the report is reproducible.
for feature_pair in sorted(highly_correlated_features):
    print(feature_pair)

Highly Correlated Features:


In [5]:
# Bowling columns left out of the modelling frame: NY_SalaryUSD plus the three
# balls_bowled_* and the three boundary_prob_* columns.
bowl_cols_to_drop = [
    'NY_SalaryUSD',
    'balls_bowled_1', 'balls_bowled_2', 'balls_bowled_3',
    'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3',
]
bowling_df = bowling_df.drop(bowl_cols_to_drop, axis=1)

In [6]:
# Pairwise correlations between the numeric/boolean bowling columns only.
# Non-numeric columns (Player and the Country/Team columns that are one-hot
# encoded later) are still in the frame here; pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr() and raises instead, so select the
# numeric columns explicitly (this also works on older pandas versions).
bowl_corr = bowling_df.select_dtypes(include=['number', 'bool']).corr()

# Absolute correlation above which two features are reported as redundant.
threshold = 0.8
highly_correlated_features = set()

# Walk the strictly lower triangle so every unordered pair is tested once
# and a column is never compared with itself.
for i in range(len(bowl_corr.columns)):
    for j in range(i):
        if abs(bowl_corr.iloc[i, j]) > threshold:
            # Record the pair as (later column, earlier column)
            feature_i = bowl_corr.columns[i]
            feature_j = bowl_corr.columns[j]
            highly_correlated_features.add((feature_i, feature_j))

print("Highly Correlated Features:")
# Sets have no stable iteration order; sort so the report is reproducible.
for feature_pair in sorted(highly_correlated_features):
    print(feature_pair)

Highly Correlated Features:
('strike_rate_3', 'bowling_avg_3')
('strike_rate_2', 'bowling_avg_2')
('strike_rate_1', 'bowling_avg_1')


In [7]:
# Each strike_rate_N was reported above as > 0.8 correlated with bowling_avg_N,
# so the strike-rate columns are removed and the bowling averages are kept.
strike_rate_cols = ['strike_rate_1', 'strike_rate_2', 'strike_rate_3']
bowling_df = bowling_df.drop(strike_rate_cols, axis=1)

## **(II) Scaling the Numeric Columns**

The next step is to scale the numeric columns using the `StandardScaler`.

In [8]:
scaler = StandardScaler()

# Columns that are left unscaled even when their dtype is numeric.
unscaled_cols = ['Season', 'Role', 'changed_teams']

# Remaining numeric columns of each frame, in their original column order.
# errors='ignore' tolerates excluded names that are not numeric columns.
bat_num_cols = (
    batting_df.select_dtypes(include=[np.number])
    .columns.drop(unscaled_cols, errors='ignore')
    .tolist()
)
bowl_num_cols = (
    bowling_df.select_dtypes(include=[np.number])
    .columns.drop(unscaled_cols, errors='ignore')
    .tolist()
)

In [9]:
# Standardise the numeric columns of each frame with its OWN scaler.
# Previously one scaler was fitted on batting_df and then refitted on
# bowling_df, which discarded the batting means/variances and made it
# impossible to inverse_transform batting values back to original units.
bat_scaler = StandardScaler()
batting_df[bat_num_cols] = bat_scaler.fit_transform(batting_df[bat_num_cols])

# `scaler` still ends up fitted on the bowling columns, exactly as before,
# so any later code relying on it keeps working.
bowl_scaler = scaler
bowling_df[bowl_num_cols] = bowl_scaler.fit_transform(bowling_df[bowl_num_cols])

# NOTE(review): both scalers are fitted on every season, i.e. before the
# train/test split, so test-set statistics leak into the scaling. The column
# lists are "all numeric columns except Season/Role/changed_teams", which
# presumably includes the target salary_diff as well - confirm this is intended.

## **(III) Getting Dummies for the Categorical Columns**

The final step in data preprocessing is to create dummy (one-hot) variables for the categorical columns, `Country` and `Team`.

In [10]:
# One-hot encode the categorical batting columns; only the listed columns are expanded.
batting_categorical_cols = ['Country', 'Team']
batting_df = pd.get_dummies(data=batting_df, columns=batting_categorical_cols)

In [11]:
# One-hot encode the categorical bowling columns; only the listed columns are expanded.
bowling_categorical_cols = ['Country', 'Team']
bowling_df = pd.get_dummies(data=bowling_df, columns=bowling_categorical_cols)

# **Part 2: Train-Test Split**

In [59]:
# Order both frames by Season (ascending) ahead of the season-based split.
batting_df, bowling_df = (
    frame.sort_values(by='Season') for frame in (batting_df, bowling_df)
)

In [12]:
# Disabled random 80/20 split, left here for reference:
# batting_train, batting_test = train_test_split(batting_df, test_size=.2)
# bowling_train, bowling_test = train_test_split(bowling_df, test_size=.2)

# Season-based split: seasons up to and including the cutoff are used for
# training, every later season is held out for testing.
last_train_season = 2017

batting_train = batting_df.loc[batting_df['Season'] <= last_train_season]
batting_test = batting_df.loc[batting_df['Season'] > last_train_season]

In [58]:
# batting_train.to_pickle('./FinalDFs/TrainTestSplit/batting_train.pkl')
# batting_test.to_pickle('./FinalDFs/TrainTestSplit/batting_test.pkl')

In [59]:
# bowling_train.to_pickle('./FinalDFs/TrainTestSplit/bowling_train.pkl')
# bowling_test.to_pickle('./FinalDFs/TrainTestSplit/bowling_test.pkl')

In [13]:
# Player and the target salary_diff are excluded from the feature matrices.
non_feature_cols = ['Player', 'salary_diff']

X_train_bat = batting_train.drop(non_feature_cols, axis=1)
X_test_bat = batting_test.drop(non_feature_cols, axis=1)

# Regression target for each split.
y_train_bat = batting_train.salary_diff
y_test_bat = batting_test.salary_diff

In [76]:
# X_train_bowl = bowling_train.drop(columns=['Player', 'salary_diff'])
# y_train_bowl = bowling_train['salary_diff']

# X_test_bowl = bowling_test.drop(columns=['Player', 'salary_diff'])
# y_test_bowl = bowling_test['salary_diff']

# **Part 3: Batting Models**

In [14]:
# Baseline: predict the training-set mean of the target for every test row.
mean_target = y_train_bat.mean()
y_pred_mean = np.full(len(y_test_bat), mean_target)

# Score the constant prediction on the held-out batting rows.
mae_mean, mse_mean, r2_mean = (
    metric(y_test_bat, y_pred_mean)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("Mean-Based Model:")
for label, value in (("MAE:", mae_mean), ("MSE:", mse_mean), ("R^2:", r2_mean)):
    print(label, value)

Mean-Based Model:
MAE: 0.2398704733761003
MSE: 0.2246048756962985
R^2: -0.19181353807481316


In [15]:
# Ordinary least squares on every batting feature (fit() returns the estimator).
linear_model = LinearRegression().fit(X_train_bat, y_train_bat)
y_pred_linear = linear_model.predict(X_test_bat)

# Score the predictions on the held-out batting rows.
mae_linear, mse_linear, r2_linear = (
    metric(y_test_bat, y_pred_linear)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\nLinear Regression Model:")
for label, value in (("MAE:", mae_linear), ("MSE:", mse_linear), ("R^2:", r2_linear)):
    print(label, value)


Linear Regression Model:
MAE: 0.5583367658800904
MSE: 0.4961853514368858
R^2: -1.6328921729931754


In [24]:
# Despite its name, the mask is True for the columns to KEEP: those whose
# number of distinct non-null training values is anything other than 1.
constant_feature_mask = X_train_bat.nunique() != 1

# Apply the training-derived mask to both splits so their columns stay aligned.
X_train_bat, X_test_bat = (
    frame.loc[:, constant_feature_mask] for frame in (X_train_bat, X_test_bat)
)

In [25]:
# Keep the k features with the highest univariate f_regression score,
# scored on the training split only.
k = 10
selector = SelectKBest(score_func=f_regression, k=k)
selector.fit(X_train_bat, y_train_bat)

# Reduce both splits to the selected columns.
X_train_bat_selected = selector.transform(X_train_bat)
X_test_bat_selected = selector.transform(X_test_bat)

# Positions and names of the retained columns, for reporting.
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X_train_bat.columns[selected_feature_indices]

In [27]:
# Ordinary least squares restricted to the SelectKBest feature subset.
linear_model_selected = LinearRegression().fit(X_train_bat_selected, y_train_bat)
y_pred_linear_selected = linear_model_selected.predict(X_test_bat_selected)

# Score the predictions on the held-out batting rows.
mae_linear_selected, mse_linear_selected, r2_linear_selected = (
    metric(y_test_bat, y_pred_linear_selected)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\nLinear Regression Model with Selected Features:")
print("Selected Features:", selected_feature_names)
for label, value in (
    ("MAE:", mae_linear_selected),
    ("MSE:", mse_linear_selected),
    ("R^2:", r2_linear_selected),
):
    print(label, value)


Linear Regression Model with Selected Features:
Selected Features: Index(['SalaryUSD', 'SeasonCount', 'total_runs_2', 'total_runs_3',
       'batting_avg_3', 'strike_rate_2', 'strike_rate_3', 'changed_teams',
       'Team_Kochi Tuskers Kerala', 'Team_Rajasthan Royals'],
      dtype='object')
MAE: 0.46211888005134716
MSE: 0.38145948588299555
R^2: -1.0241260484351296
