# Preferred Results Classification - Machine Learning

### Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
import optuna
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
import scipy.stats as stats

### Read in Statcast Data

In [2]:
df = pd.read_feather("cleaned_data.feather")
df

Unnamed: 0,player_name,release_pos_x,release_pos_z,release_extension,release_speed,effective_speed,release_spin_rate,spin_axis,pfx_x,pfx_z,...,pitch_type_EP,pitch_type_FA,pitch_type_FC,pitch_type_FF,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,p_throws_R
0,"Smith, Will",1.40,6.80,6.5,92.3,92.8,2330.0,148.0,0.69,1.38,...,0,0,0,1,0,0,0,0,0,0
1,"Smith, Will",1.60,6.64,6.4,80.6,81.2,2254.0,315.0,-0.77,0.48,...,0,0,0,0,0,0,0,0,1,0
2,"Smith, Will",1.46,6.88,6.2,75.5,75.2,1940.0,328.0,-0.65,-0.51,...,0,0,0,0,0,0,0,0,0,0
3,"Smith, Will",1.53,6.83,5.9,75.0,74.5,2017.0,330.0,-0.69,-0.69,...,0,0,0,0,0,0,0,0,0,0
4,"Smith, Will",1.49,6.66,6.3,91.2,90.9,2281.0,143.0,0.63,1.28,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706332,"Márquez, Germán",-2.14,6.08,5.4,96.2,95.4,2165.0,216.0,-0.24,0.79,...,0,0,0,1,0,0,0,0,0,1
706333,"Márquez, Germán",-1.86,6.07,5.6,88.4,88.3,2383.0,158.0,0.13,0.29,...,0,0,0,0,0,0,0,0,1,1
706334,"Márquez, Germán",-1.92,6.15,5.4,95.9,95.4,2113.0,212.0,-0.17,0.86,...,0,0,0,1,0,0,0,0,0,1
706335,"Márquez, Germán",-1.76,6.27,5.5,96.1,95.6,2063.0,208.0,-0.42,0.86,...,0,0,0,1,0,0,0,0,0,1


### Train Test Split

In [3]:
model_data = df.select_dtypes(exclude=['object'])
data = np.array(model_data.drop(["preferred_results"], axis=1))
labels = np.array(model_data["preferred_results"])
X, X_test, Y, y_test = train_test_split(data, labels, test_size = 0.2, random_state = 0, stratify = labels)
X = np.array(sklearn.preprocessing.normalize(X, norm='l2', axis=0))
X_test = np.array(sklearn.preprocessing.normalize(X_test, norm='l2', axis=0))
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size = 0.25, random_state = 0)

### Hyperparameter Tuning for Neural Network Model

In [4]:
def objective(trial, X = X_train, y = y_train):
    layer2 = trial.suggest_int('layer2', 30, 36)
    layer3 = trial.suggest_int('layer3', 25, 29)
    layer4 = trial.suggest_int('layer4', 20, 24)
    layer5 = trial.suggest_int('layer5', 15, 19)
    layer6 = trial.suggest_int('layer6', 10, 14)
    layer7 = trial.suggest_int('layer7', 5, 9)
    layer8 = trial.suggest_int('layer8', 2, 4)
    lr = trial.suggest_float('learning rate', 0.0001, 0.01)
    bs = trial.suggest_int('batch size', 1000, 5000)
    e = trial.suggest_int('epochs', 50, 100)
    model = Sequential()
    model.add(Dense(37, input_shape = (37,), activation = 'gelu'))
    model.add(Dense(layer2, activation = 'tanh'))
    model.add(Dense(layer3, activation = 'tanh'))
    model.add(Dense(layer4, activation = 'tanh'))
    model.add(Dense(layer5, activation = 'tanh'))
    model.add(Dense(layer6, activation = 'tanh'))
    model.add(Dense(layer7, activation = 'tanh'))
    model.add(Dense(layer8, activation = 'tanh'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=lr), loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(X_train, y_train, verbose = 0, batch_size = bs, epochs = e, class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))
    return f1_score((model.predict(X_val) >= 0.5).astype("int32"), y_val, pos_label = 0) + f1_score((model.predict(X_val) >= 0.5).astype("int32"), y_val, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Val F1 Score: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-05-16 14:31:10,100][0m A new study created in memory with name: no-name-3ed01032-242c-4a6e-86c5-307e92058d8e[0m
[32m[I 2022-05-16 14:32:39,776][0m Trial 0 finished with value: 0.8098535779878347 and parameters: {'layer2': 31, 'layer3': 27, 'layer4': 24, 'layer5': 19, 'layer6': 11, 'layer7': 6, 'layer8': 4, 'learning rate': 0.009111276333030863, 'batch size': 1062, 'epochs': 79}. Best is trial 0 with value: 0.8098535779878347.[0m
[32m[I 2022-05-16 14:34:21,805][0m Trial 1 finished with value: 0.48431397794086095 and parameters: {'layer2': 32, 'layer3': 25, 'layer4': 24, 'layer5': 19, 'layer6': 11, 'layer7': 9, 'layer8': 2, 'learning rate': 0.005158111414986226, 'batch size': 1167, 'epochs': 97}. Best is trial 0 with value: 0.8098535779878347.[0m
[32m[I 2022-05-16 14:35:32,851][0m Trial 2 finished with value: 0.48431397794086095 and parameters: {'layer2': 36, 'layer3': 29, 'layer4': 22, 'layer5': 16, 'layer6': 14, 'layer7': 7, 'layer8': 3, 'learning rate': 0.007658

Val F1 Score: 1.2645954874924792
Best hyperparameters: {'layer2': 36, 'layer3': 28, 'layer4': 23, 'layer5': 18, 'layer6': 12, 'layer7': 9, 'layer8': 2, 'learning rate': 0.004504405051406625, 'batch size': 2056, 'epochs': 87}


### Build A Neural Network Model

In [5]:
model = Sequential()
model.add(Dense(37, input_shape = (37,), activation = 'gelu'))
model.add(Dense(list(trial.params.values())[0], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[1], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[2], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[3], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[4], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[5], activation = 'tanh'))
model.add(Dense(list(trial.params.values())[6], activation = 'tanh'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=list(trial.params.values())[7]), loss = 'binary_crossentropy', metrics = ['accuracy'])

### Train the Neural Network Model

In [6]:
_ = model.fit(X, Y, verbose = 0, batch_size = list(trial.params.values())[8], epochs = list(trial.params.values())[9], class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))

### Neural Network Model Evaluation

In [7]:
model.evaluate(X_test, y_test)



[0.9520080089569092, 0.6363649368286133]

##### Find the Best Threshold

In [8]:
def objective(trial, X = X_test, y = y_test):
    threshold = trial.suggest_float('threshold', 0.4, 0.6)
    return f1_score((model.predict(X) >= threshold).astype("int32"), y, pos_label = 0) + f1_score((model.predict(X) >= threshold).astype("int32"), y, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Micro F1 Score: {}'.format(trial.value))
print("Best F1 Score: {}".format(trial.params))

[32m[I 2022-05-16 17:06:08,550][0m A new study created in memory with name: no-name-1d9f9e95-3266-4ec6-a19c-4d0ea2344f85[0m
[32m[I 2022-05-16 17:06:13,765][0m Trial 0 finished with value: 1.0431944232676833 and parameters: {'threshold': 0.5897762536285025}. Best is trial 0 with value: 1.0431944232676833.[0m
[32m[I 2022-05-16 17:06:18,848][0m Trial 1 finished with value: 1.0666277605098813 and parameters: {'threshold': 0.4907714961743729}. Best is trial 1 with value: 1.0666277605098813.[0m
[32m[I 2022-05-16 17:06:25,098][0m Trial 2 finished with value: 1.049819728489074 and parameters: {'threshold': 0.5700721889968822}. Best is trial 1 with value: 1.0666277605098813.[0m
[32m[I 2022-05-16 17:06:30,241][0m Trial 3 finished with value: 1.0743570484680212 and parameters: {'threshold': 0.4497983992209789}. Best is trial 3 with value: 1.0743570484680212.[0m
[32m[I 2022-05-16 17:06:35,405][0m Trial 4 finished with value: 1.0804582270750873 and parameters: {'threshold': 0.41064

Micro F1 Score: 1.0827008737774628
Best F1 Score: {'threshold': 0.4004798500943893}


##### Classification Report for Neural Network Model

In [9]:
print(classification_report((model.predict(X_test) >= list(trial.params.values())[0]).astype("int32"), y_test))

              precision    recall  f1-score   support

           0       0.79      0.70      0.74    107298
           1       0.30      0.40      0.34     33970

    accuracy                           0.63    141268
   macro avg       0.54      0.55      0.54    141268
weighted avg       0.67      0.63      0.65    141268



### Dropping Correlations for Random Forest Model

In [4]:
corr = df.corr(method = 'spearman').abs()
upper = corr.where(np.triu(np.ones(corr.shape), k = 1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(f'Dropping {to_drop}')
df = df.drop(columns = to_drop)

Dropping ['effective_speed']


### Using SMOTE to Upsamlple the Minority Class

In [5]:
# X_train, y_train = SMOTE().fit_resample(X_train, y_train)
# X_val, y_val = SMOTE().fit_resample(X_val, y_val)

### Undersampling the Minority Class

In [6]:
X_train, y_train = RandomUnderSampler(sampling_strategy = "all").fit_resample(X_train, y_train)
X_val, y_val = RandomUnderSampler(sampling_strategy = "all").fit_resample(X_val, y_val)

### Hyperparameter Tuning for Random Forest Model

In [12]:
def objective(trial, X = X_train, y = y_train):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = int(trial.suggest_int('max_depth', 1, 50))
    min_samples_split = int(trial.suggest_int('min_samples_split', 2, 10))
    min_samples_leaf = int(trial.suggest_int('min_samples_leaf', 1, 10))
    max_features = int(trial.suggest_int('max_features', 2, 25))
    rf = RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, max_features = max_features, class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))
    rf.fit(X_train, y_train)
    return f1_score((rf.predict(X_val)).astype("int32"), y_val, pos_label = 0) + f1_score((rf.predict(X_val)).astype("int32"), y_val, pos_label = 1)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
trial = study.best_trial
print('Val F1 Score: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-05-17 13:51:32,656][0m A new study created in memory with name: no-name-fa0228f1-fc2a-4d01-8e4b-ca532ed66578[0m
[32m[I 2022-05-17 13:51:34,587][0m Trial 0 finished with value: 1.2894218980986494 and parameters: {'n_estimators': 15, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 3}. Best is trial 0 with value: 1.2894218980986494.[0m
[32m[I 2022-05-17 13:52:39,684][0m Trial 1 finished with value: 1.3349590121703898 and parameters: {'n_estimators': 61, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 23}. Best is trial 1 with value: 1.3349590121703898.[0m
[32m[I 2022-05-17 13:52:47,145][0m Trial 2 finished with value: 1.3078540446468319 and parameters: {'n_estimators': 48, 'max_depth': 14, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 3}. Best is trial 1 with value: 1.3349590121703898.[0m
[32m[I 2022-05-17 13:53:55,825][0m Trial 3 finished with value: 1.340052197001502 and parameters

Val F1 Score: 1.3439754639501669
Best hyperparameters: {'n_estimators': 94, 'max_depth': 48, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 15}


### Build A Random Forest Model

In [13]:
rf = RandomForestClassifier(random_state = 0, n_jobs = -1, **trial.params, class_weight = dict(enumerate(class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = y_train))))

### Train the Random Forest Model

In [14]:
_ = rf.fit(X_train, y_train)

### Random Forest Model Evaluation

In [15]:
rf.score(X_test, y_test)

0.6539909958376986

##### Classification Report for Random Forest Model

In [16]:
print(classification_report((rf.predict(X_test)).astype("int32"), y_test))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78    126053
           1       0.13      0.39      0.19     15215

    accuracy                           0.65    141268
   macro avg       0.52      0.54      0.49    141268
weighted avg       0.82      0.65      0.72    141268

