In [54]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [47]:
# Folder containing the filtered frames this notebook starts from.
filepath = './FinalDFs/PostEDA/'

# Read the batting frame first, then the bowling frame, from their pickles.
batting_df, bowling_df = (
    pd.read_pickle(filepath + pkl_name)
    for pkl_name in ('batting_filtered.pkl', 'bowling_filtered.pkl')
)

# **Part 1: Data Preprocessing**

## **(I) Dropping Irrelevant Columns**

As we saw in our EDA notebook, several of the numeric variables were very closely related to each other. For example, in `batting_df`, the `balls_faced` and `total_runs` columns were very highly correlated. Therefore, we will exclude one set of columns from each such pair from our analysis.

In [48]:
# Columns to remove from the batting frame, grouped by kind.
bat_cols_to_drop = [
    'SalaryUSD', 'NY_SalaryUSD', 'Season',
    'balls_faced_1', 'balls_faced_2', 'balls_faced_3',
    'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3',
]
batting_df = batting_df.drop(bat_cols_to_drop, axis='columns')

In [49]:
# Columns to remove from the bowling frame, grouped by kind.
bowl_cols_to_drop = [
    'SalaryUSD', 'NY_SalaryUSD', 'Season',
    'balls_bowled_1', 'balls_bowled_2', 'balls_bowled_3',
    'boundary_prob_1', 'boundary_prob_2', 'boundary_prob_3',
]
bowling_df = bowling_df.drop(bowl_cols_to_drop, axis='columns')

## **(II) Scaling the Numeric Columns**

The next step is to scale the numeric columns using the `StandardScaler`.

In [50]:
scaler = StandardScaler()

# Columns kept out of the numeric-column lists even when their dtype is numeric.
skip_scaling = ('Role', 'changed_teams')

# Numeric-dtype columns of each frame, in frame order, minus the skipped ones.
bat_numeric = batting_df.select_dtypes(include=[np.number]).columns
bat_num_cols = [name for name in bat_numeric if name not in skip_scaling]

bowl_numeric = bowling_df.select_dtypes(include=[np.number]).columns
bowl_num_cols = [name for name in bowl_numeric if name not in skip_scaling]

In [51]:
def scale_with_train_statistics(df, num_cols, test_size=.2, random_state=42):
    """Standardise ``num_cols`` of ``df`` using statistics from the training rows only.

    Fitting the scaler on every row before the train/test split lets the
    test rows influence the mean and standard deviation applied to the
    training features (data leakage). Here the scaler is fitted on the rows
    that the split will assign to the training set, and the fitted transform
    is then applied to all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be scaled. It is not modified.
    num_cols : list
        Names of the columns to standardise.
    test_size, random_state
        Must match the arguments given to ``train_test_split`` when the full
        frame is split later in this notebook, so that the same rows are
        treated as training rows in both places.

    Returns
    -------
    (pandas.DataFrame, StandardScaler)
        A copy of ``df`` with ``num_cols`` scaled, and the fitted scaler.
    """
    # Without stratification, train_test_split picks row positions from the
    # number of rows and random_state alone, so splitting the positions here
    # selects the same rows as splitting the frame itself with these arguments.
    train_pos, _ = train_test_split(
        np.arange(len(df)), test_size=test_size, random_state=random_state
    )
    fitted_scaler = StandardScaler().fit(df.iloc[train_pos][num_cols])

    scaled_df = df.copy()
    scaled_df[num_cols] = fitted_scaler.transform(df[num_cols])
    return scaled_df, fitted_scaler


# Each frame gets its own fitted scaler so neither fit is overwritten.
batting_df, bat_scaler = scale_with_train_statistics(batting_df, bat_num_cols)
bowling_df, bowl_scaler = scale_with_train_statistics(bowling_df, bowl_num_cols)

# Keep the original name bound to the most recently fitted scaler, as before.
scaler = bowl_scaler

## **(III) Getting Dummies for the Categorical Columns**

The final step in data preprocessing is getting dummies for the categorical columns.

In [52]:
# One-hot encode the batting frame's categorical columns.
bat_cat_cols = ['Country', 'Team']
batting_df = pd.get_dummies(data=batting_df, columns=bat_cat_cols)

In [53]:
# One-hot encode the bowling frame's categorical columns.
bowl_cat_cols = ['Country', 'Team']
bowling_df = pd.get_dummies(data=bowling_df, columns=bowl_cat_cols)

# **Part 2: Train-Test Split**

In [57]:
# Both frames are split with the same hold-out fraction and seed.
split_options = {'test_size': .2, 'random_state': 42}

batting_train, batting_test = train_test_split(batting_df, **split_options)
bowling_train, bowling_test = train_test_split(bowling_df, **split_options)

In [58]:
# Write the batting train and test frames to the split output folder.
split_dir = './FinalDFs/TrainTestSplit/'
batting_train.to_pickle(split_dir + 'batting_train.pkl')
batting_test.to_pickle(split_dir + 'batting_test.pkl')

In [59]:
# Write the bowling train and test frames to the split output folder.
split_dir = './FinalDFs/TrainTestSplit/'
bowling_train.to_pickle(split_dir + 'bowling_train.pkl')
bowling_test.to_pickle(split_dir + 'bowling_test.pkl')