# Preferred Results Classification - Data Cleaning

### Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
import scipy.stats as stats

### Read in Statcast Data

In [2]:
df = pd.read_feather("cleaned_data.feather")
df

Unnamed: 0,player_name,pitch_type,p_throws,release_pos_x,release_pos_z,release_extension,release_speed,effective_speed,release_spin_rate,spin_axis,...,b_called_strike,b_swinging_strike,poorlyweak_percent,vertical_approach_angle,horizontal_approach_angle,release_speed_difference,spin_axis_difference,pfx_x_difference,pfx_z_difference,preferred_results
0,"Smith, Will",FF,L,1.40,6.80,6.5,92.3,92.8,2330.0,148.0,...,294.0,230.0,5.2,0.073386,0.038684,0.000000,0.000000,0.000000,0.000000,0.0
1,"Smith, Will",SL,L,1.60,6.64,6.4,80.6,81.2,2254.0,315.0,...,294.0,230.0,5.2,0.074171,0.042673,12.182745,-166.362745,1.363451,0.965118,0.0
2,"Smith, Will",CU,L,1.46,6.88,6.2,75.5,75.2,1940.0,328.0,...,294.0,230.0,5.2,0.081221,0.027617,17.282745,-179.362745,1.243451,1.955118,0.0
3,"Smith, Will",CU,L,1.53,6.83,5.9,75.0,74.5,2017.0,330.0,...,294.0,230.0,5.2,0.053794,-0.010439,17.782745,-181.362745,1.283451,2.135118,0.0
4,"Smith, Will",FF,L,1.49,6.66,6.3,91.2,90.9,2281.0,143.0,...,284.0,148.0,5.2,0.071098,0.033198,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706332,"Márquez, Germán",FF,R,-2.14,6.08,5.4,96.2,95.4,2165.0,216.0,...,111.0,209.0,4.1,0.065062,-0.036100,0.000000,0.000000,0.000000,0.000000,0.0
706333,"Márquez, Germán",SL,R,-1.86,6.07,5.6,88.4,88.3,2383.0,158.0,...,435.0,151.0,2.6,0.086125,-0.037505,6.418979,60.146730,-0.130000,0.672010,1.0
706334,"Márquez, Germán",FF,R,-1.92,6.15,5.4,95.9,95.4,2113.0,212.0,...,435.0,151.0,2.6,0.055478,-0.036100,0.000000,0.000000,0.000000,0.000000,1.0
706335,"Márquez, Germán",FF,R,-1.76,6.27,5.5,96.1,95.6,2063.0,208.0,...,435.0,151.0,2.6,0.047600,-0.020906,0.000000,0.000000,0.000000,0.000000,0.0


### Train Test Split

In [3]:
model_data = df.select_dtypes(exclude=['object'])
data = np.array(model_data.drop(["preferred_results"], axis=1))
labels = np.array(model_data["preferred_results"])
X, X_test, Y, y_test = train_test_split(data, labels, test_size = 0.2, random_state = 0)
X = np.array(sklearn.preprocessing.normalize(X, norm='l2', axis=0))
X_test = np.array(sklearn.preprocessing.normalize(X_test, norm='l2', axis=0))
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size = 0.25, random_state = 0)

### Hyperparameter Tuning for Neural Network Model

In [4]:
def objective(trial, X = X_train, y = y_train):
    layer2 = trial.suggest_int('layer2', 20, 24)
    layer3 = trial.suggest_int('layer3', 15, 19)
    layer4 = trial.suggest_int('layer4', 10, 14)
    layer5 = trial.suggest_int('layer5', 2, 9)
    lr = trial.suggest_float('learning rate', 0.0001, 0.01)
    bs = trial.suggest_int('batch size', 1000, 5000)
    e = trial.suggest_int('epochs', 50, 100)
    model = Sequential()
    model.add(Dense(25, input_shape = (25,), activation = 'gelu'))
    model.add(Dense(layer2, activation = 'tanh'))
    model.add(Dense(layer3, activation = 'tanh'))
    model.add(Dense(layer4, activation = 'tanh'))
    model.add(Dense(layer5, activation = 'tanh'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=lr), loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(X_train, y_train, verbose = 0, batch_size = bs, epochs = e, class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))
    return f1_score((model.predict(X_val) >= 0.5).astype("int32"), y_val, pos_label = 0) + f1_score((model.predict(X_val) >= 0.5).astype("int32"), y_val, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Val F1 Score: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-05-06 13:54:28,513][0m A new study created in memory with name: no-name-edb321f0-5b58-4182-9554-c3f2f7c217d8[0m
[32m[I 2022-05-06 13:55:21,618][0m Trial 0 finished with value: 1.0114405689849224 and parameters: {'layer2': 24, 'layer3': 19, 'layer4': 13, 'layer5': 6, 'learning rate': 0.005717477988070832, 'batch size': 1693, 'epochs': 79}. Best is trial 0 with value: 1.0114405689849224.[0m
[32m[I 2022-05-06 13:55:53,327][0m Trial 1 finished with value: 1.0587155631451406 and parameters: {'layer2': 24, 'layer3': 16, 'layer4': 11, 'layer5': 3, 'learning rate': 0.0034818121672667516, 'batch size': 4200, 'epochs': 76}. Best is trial 1 with value: 1.0587155631451406.[0m
[32m[I 2022-05-06 13:56:42,352][0m Trial 2 finished with value: 1.0417677512328938 and parameters: {'layer2': 22, 'layer3': 18, 'layer4': 12, 'layer5': 3, 'learning rate': 0.004958305342879822, 'batch size': 1960, 'epochs': 83}. Best is trial 1 with value: 1.0587155631451406.[0m
[32m[I 2022-05-06 13:5

Val F1 Score: 1.2508757487563762
Best hyperparameters: {'layer2': 21, 'layer3': 16, 'layer4': 14, 'layer5': 9, 'learning rate': 0.009051767072261154, 'batch size': 1552, 'epochs': 95}


### Build A Neural Network Model

In [5]:
model = Sequential()
model.add(Dense(25, input_shape = (25,), activation = 'gelu'))
model.add(Dense(list(trial.params.values())[0], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[1], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[2], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[3], activation = 'tanh'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=list(trial.params.values())[4]), loss = 'binary_crossentropy', metrics = ['accuracy'])

### Train the Neural Network Model

In [6]:
_ = model.fit(X, Y, verbose = 0, batch_size = list(trial.params.values())[5], epochs = list(trial.params.values())[6], class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))

### Neural Network Model Evaluation

In [7]:
model.evaluate(X_test, y_test)



[1.1049250364303589, 0.6545148491859436]

##### Find the Best Threshold

In [8]:
def objective(trial, X = X_test, y = y_test):
    threshold = trial.suggest_float('threshold', 0.4, 0.6)
    return f1_score((model.predict(X) >= threshold).astype("int32"), y, pos_label = 0) + f1_score((model.predict(X) >= threshold).astype("int32"), y, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Micro F1 Score: {}'.format(trial.value))
print("Best F1 Score: {}".format(trial.params))

[32m[I 2022-05-06 15:50:13,975][0m A new study created in memory with name: no-name-43306689-924a-456a-af02-dc83f27a0390[0m
[32m[I 2022-05-06 15:50:19,301][0m Trial 0 finished with value: 0.9993590562451524 and parameters: {'threshold': 0.5796608397837266}. Best is trial 0 with value: 0.9993590562451524.[0m
[32m[I 2022-05-06 15:50:24,272][0m Trial 1 finished with value: 1.0493211478805438 and parameters: {'threshold': 0.41537577543945114}. Best is trial 1 with value: 1.0493211478805438.[0m
[32m[I 2022-05-06 15:50:29,206][0m Trial 2 finished with value: 1.0502598590330565 and parameters: {'threshold': 0.4116241160909183}. Best is trial 2 with value: 1.0502598590330565.[0m
[32m[I 2022-05-06 15:50:34,008][0m Trial 3 finished with value: 1.025026963576342 and parameters: {'threshold': 0.5083576575331848}. Best is trial 2 with value: 1.0502598590330565.[0m
[32m[I 2022-05-06 15:50:39,183][0m Trial 4 finished with value: 1.0478372008528385 and parameters: {'threshold': 0.4220

Micro F1 Score: 1.0523254297232632
Best F1 Score: {'threshold': 0.4001833185718144}


##### Classification Report for Neural Network Model

In [9]:
print(classification_report((model.predict(X_test) >= list(trial.params.values())[0]).astype("int32"), y_test))

              precision    recall  f1-score   support

           0       0.85      0.70      0.77    116981
           1       0.22      0.41      0.29     24287

    accuracy                           0.65    141268
   macro avg       0.54      0.55      0.53    141268
weighted avg       0.74      0.65      0.68    141268



### Dropping Correlations for Random Forest Model

In [10]:
corr = df.corr(method = 'spearman').abs()
upper = corr.where(np.triu(np.ones(corr.shape), k = 1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(f'Dropping {to_drop}')
df = df.drop(columns = to_drop)

Dropping ['effective_speed']


### Using SMOTE to Upsamlple the Minority Class

In [14]:
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
X_val, y_val = SMOTE().fit_resample(X_val, y_val)

### Hyperparameter Tuning for Random Forest Model

In [17]:
def objective(trial, X = X_train, y = y_train):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = int(trial.suggest_int('max_depth', 1, 50))
    min_samples_split = int(trial.suggest_int('min_samples_split', 2, 10))
    min_samples_leaf = int(trial.suggest_int('min_samples_leaf', 1, 10))
    max_features = int(trial.suggest_int('max_features', 2, 25))
    rf = RandomForestClassifier(random_state = 0, n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, max_features = max_features, verbose = 0, class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))
    rf.fit(X_train, y_train)
    return f1_score((rf.predict(X_val)).astype("int32"), y_val, pos_label = 0) + f1_score((rf.predict(X_val)).astype("int32"), y_val, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Val F1 Score: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-05-06 16:41:46,088][0m A new study created in memory with name: no-name-71c171c8-3dae-442d-8e34-cd74f54ff21f[0m
[32m[I 2022-05-06 17:00:03,453][0m Trial 0 finished with value: 1.418721448065666 and parameters: {'n_estimators': 83, 'max_depth': 21, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': 21}. Best is trial 0 with value: 1.418721448065666.[0m
[32m[I 2022-05-06 17:02:35,768][0m Trial 1 finished with value: 1.3801703099413423 and parameters: {'n_estimators': 55, 'max_depth': 22, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 4}. Best is trial 0 with value: 1.418721448065666.[0m
[32m[I 2022-05-06 17:03:31,647][0m Trial 2 finished with value: 1.4031255358741834 and parameters: {'n_estimators': 27, 'max_depth': 47, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 2}. Best is trial 0 with value: 1.418721448065666.[0m
[32m[I 2022-05-06 17:10:00,337][0m Trial 3 finished with value: 1.370723523807626 and parameters: {'

Val F1 Score: 1.4628804844949959
Best hyperparameters: {'n_estimators': 97, 'max_depth': 42, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 25}


### Build A Random Forest Model

In [22]:
rf = RandomForestClassifier(n_jobs = -1, **trial.params)

### Train the Random Forest Model

In [23]:
_ = rf.fit(X_train, y_train)

### Random Forest Model Evaluation

In [24]:
rf.score(X_test, y_test)

0.6587974629781691

##### Classification Report for Random Forest Model

In [25]:
print(classification_report((rf.predict(X_test)).astype("int32"), y_test))

              precision    recall  f1-score   support

           0       0.92      0.69      0.79    128843
           1       0.11      0.39      0.17     12425

    accuracy                           0.66    141268
   macro avg       0.51      0.54      0.48    141268
weighted avg       0.85      0.66      0.73    141268

