In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, mean_squared_error, roc_auc_score
from sklearn.feature_selection import RFE
from itertools import combinations
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.svm import SVC


In [125]:
import statsmodels.api as sm

In [126]:
X_train=pd.read_pickle('Datasets/X_train2.plk')
X_test=pd.read_pickle('Datasets/X_test.plk')
y_train=pd.read_pickle('Datasets/y_train2.plk')
y_test=pd.read_pickle('Datasets/y_test.plk')

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4415, 69)
(9880, 69)
(4415,)
(9880,)


In [127]:
X_train.drop('cum_bats_left', axis=1, inplace=True)
X_test.drop('cum_bats_left', axis=1, inplace=True)

X_train.drop('constant', axis=1, inplace=True)
X_test.drop('constant', axis=1, inplace=True)

Three algorithms were tested, to know if they are suitable for capturing the differences between the last plate apperances for the pitchers, and a "normal" plate appearance.
They are simple algorithms, whose decision boundaries may be calculated by hand.

Different parameters were randomly tested and all of the variables were used. Here, the "best" results are shown.

# Training the model

### Logistic Regression

In [128]:
lr=LogisticRegression(random_state=42, penalty='none', max_iter=10000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
y_pred=lr.predict(X_test)

pd.crosstab(y_test, y_pred, margins=True)

col_0,0,1,All
last_batter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8354,1131,9485
1,58,337,395
All,8412,1468,9880


In [130]:
print( accuracy_score(y_test, (y_pred>0.5)+0 ))
print( precision_score(y_test, (y_pred>0.5)+0 ))
print( roc_auc_score(y_test, (y_pred>0.5)+0 ))
print( f1_score(y_test, (y_pred>0.5)+0 ))

0.8796558704453441
0.2295640326975477
0.8669618251336221
0.36178207192699946


### SVM 

In [131]:
svm=SVC(kernel='linear', C=0.5)
svm.fit(X_train, y_train)

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [132]:
y_pred2=svm.predict(X_test)

pd.crosstab(y_test, np.round(y_pred2,0), margins=True)

col_0,0,1,All
last_batter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8325,1160,9485
1,56,339,395
All,8381,1499,9880


In [133]:
print( accuracy_score(y_test, (y_pred2>0.5)+0 ))
print( precision_score(y_test, (y_pred2>0.5)+0 ))
print( roc_auc_score(y_test, (y_pred2>0.5)+0 ))
print( f1_score(y_test, (y_pred2>0.5)+0 ))

0.8769230769230769
0.22615076717811874
0.8679647411302324
0.3579725448785639


### Tree

In [134]:
tree=DecisionTreeClassifier(max_depth=3)#, criterion='gini', random_state=42)
tree.fit(X_train, y_train)

## ¡¡VOLVER A ENCONTRAR DATOS QUE PRODUCEN F1=40!!!
## ¡¡GUARDAR ESE MODELO!!

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [135]:
y_pred3=tree.predict(X_test)

pd.crosstab(y_test, (y_pred3>0.5)+0, margins=True)

col_0,0,1,All
last_batter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7957,1528,9485
1,51,344,395
All,8008,1872,9880


In [136]:
print( accuracy_score(y_test, (y_pred3>0.5)+0 ))
print( precision_score(y_test, (y_pred3>0.5)+0 ))
print( roc_auc_score(y_test, (y_pred3>0.5)+0 ))
print( f1_score(y_test, (y_pred3>0.5)+0 ))

0.8401821862348178
0.18376068376068377
0.8548948039209144
0.30348478164975745


In [137]:
X_train.columns[tree.feature_importances_>0]

Index(['pbp_idx', 'end_of_inning', 'field_out', 'inning_cum_bats_left',
       'cum_points_allowed'],
      dtype='object')

The three algorithms performed similar. In particular, the SVM and the Logistic Regression had a similar prediction on the testing set. That was expected, as they both aim to generate the best “hyperplane” that separates the two classes. However, the logistic regression runs noticeably faster, and has a more concise formula to determine the decision boundary. Therefore, it will be chosen over the SVM with a linear kernel.

On the other hand, the Decision Tree performed slightly better on most of the metrics while using less variables. That is because the maximum depth was set at four. Thus, the Decision Tree will be used as a base model, while we attempt to have a better model by using a Logistic Regression. 


In [138]:
print( export_text(tree,  feature_names=X_train.columns.tolist()) )

|--- pbp_idx <= 18.50
|   |--- cum_points_allowed <= 4.50
|   |   |--- pbp_idx <= 11.50
|   |   |   |--- class: 0
|   |   |--- pbp_idx >  11.50
|   |   |   |--- class: 0
|   |--- cum_points_allowed >  4.50
|   |   |--- inning_cum_bats_left <= 0.50
|   |   |   |--- class: 0
|   |   |--- inning_cum_bats_left >  0.50
|   |   |   |--- class: 1
|--- pbp_idx >  18.50
|   |--- end_of_inning <= 0.50
|   |   |--- field_out <= 0.50
|   |   |   |--- class: 1
|   |   |--- field_out >  0.50
|   |   |   |--- class: 0
|   |--- end_of_inning >  0.50
|   |   |--- pbp_idx <= 20.50
|   |   |   |--- class: 1
|   |   |--- pbp_idx >  20.50
|   |   |   |--- class: 1

