# Additional Models For Baseball Predictions
The following code fits several alternative classifiers and compares their accuracy
with the Random Forest models built previously for each inning.  (Reference:  Class Activity 13.3-1)


In [1]:
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the scrubbed nine-inning statistics CSV into a DataFrame
df_scores = pd.read_csv(filepath_or_buffer="baseball_output/Scrubbed_9_Inning_Stats_2016to2021.csv")

# Preview the first five rows
df_scores.head(n=5)

Unnamed: 0,2,3,4,5,6,7,8,9,W-L,Score_1,...,Score_9,delta_1,delta_2,delta_3,delta_4,delta_5,delta_6,delta_7,delta_8,delta_9
0,0,0,0,0,0,0,0,1,L,0,...,1,0,-2,-2,-2,-2,-3,-3,-4,-3
1,2,0,0,0,1,0,1,0,W,0,...,4,0,2,2,2,2,3,3,4,3
2,0,0,1,0,0,0,2,0,W,2,...,5,2,2,1,2,2,2,2,4,2
3,0,1,0,0,0,0,0,2,L,0,...,3,-2,-2,-1,-2,-2,-2,-2,-4,-2
4,0,0,0,0,0,0,3,0,L,0,...,3,-1,-1,-1,-2,-2,-4,-4,-1,-1


## Preprocess the data

In [3]:
# Check the data types of every column.
# Any column reported as `object` (non-numeric) has to be encoded before it can be
# passed to the scikit-learn estimators used below.
df_scores.dtypes

2           int64
3           int64
4           int64
5           int64
6           int64
7           int64
8           int64
9           int64
W-L        object
Score_1     int64
Score_2     int64
Score_3     int64
Score_4     int64
Score_5     int64
Score_6     int64
Score_7     int64
Score_8     int64
Score_9     int64
delta_1     int64
delta_2     int64
delta_3     int64
delta_4     int64
delta_5     int64
delta_6     int64
delta_7     int64
delta_8     int64
delta_9     int64
dtype: object

### First, check the nine-inning model as a baseline.

In [4]:
# Build the target vector from the "W-L" column.
# The column holds string labels (dtype object), so it is mapped to integer class
# codes with a LabelEncoder. Classes are coded in sorted order, i.e. "L" -> 0 and
# "W" -> 1 (assuming these are the only two labels present).
le = LabelEncoder()
le.fit(df_scores["W-L"])

y = le.transform(df_scores["W-L"])

# Display the encoded labels
y

array([0, 1, 1, ..., 0, 1, 0])

In [5]:
# Feature matrix for the nine-inning baseline: every column except the "W-L" target.
# `drop` returns a new DataFrame, so df_scores itself is left untouched.
# NOTE(review): this keeps Score_9 / delta_9, which look like end-of-game values and
# would leak the outcome into the features — confirm this is intended for the baseline.
X = df_scores.drop(columns=["W-L"])
X.head()

Unnamed: 0,2,3,4,5,6,7,8,9,Score_1,Score_2,...,Score_9,delta_1,delta_2,delta_3,delta_4,delta_5,delta_6,delta_7,delta_8,delta_9
0,0,0,0,0,0,0,0,1,0,0,...,1,0,-2,-2,-2,-2,-3,-3,-4,-3
1,2,0,0,0,1,0,1,0,0,2,...,4,0,2,2,2,2,3,3,4,3
2,0,0,1,0,0,0,2,0,2,2,...,5,2,2,1,2,2,2,2,4,2
3,0,1,0,0,0,0,0,2,0,0,...,3,-2,-2,-1,-2,-2,-2,-2,-4,-2
4,0,0,0,0,0,0,3,0,0,0,...,3,-1,-1,-1,-2,-2,-4,-4,-1,-1


In [6]:
# Partition features and labels into train/test subsets (shuffle seeded with 13)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=13,
)

## Model and Fit to a Logistic Regression Classifier

In [7]:
# Instantiate a logistic regression classifier seeded with random_state=1
lr_model = LogisticRegression(random_state=1)

# Train it on the training split; the fitted estimator is the cell's displayed value
lr_model.fit(X=X_train, y=y_train)

In [8]:
# Score the fitted logistic regression on both splits with model.score
lr_train_accuracy = lr_model.score(X_train, y_train)
lr_test_accuracy = lr_model.score(X_test, y_test)

# Report both accuracies to three decimal places
print('Train Accuracy: %.3f' % lr_train_accuracy)
print('Test Accuracy: %.3f' % lr_test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 1.000


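### Note: Both accuracy scores are 1.000 (100%). This was the ninth-inning model.
This is expected if `delta_9` is the final run differential, since it would then determine the win/loss label directly. Next, try innings 1 through 8.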

In [9]:
# Rebuild the target vector from the "W-L" column for the per-inning models.
# The string labels (dtype object) are converted to integer class codes with a
# fresh LabelEncoder instance.
le = LabelEncoder()
le.fit(df_scores["W-L"])
y = le.transform(df_scores["W-L"])
y

array([0, 1, 1, ..., 0, 1, 0])

# 1st Inning:  Logistic Regression Classifier

In [10]:
# Get the features for 1st Inning.
# Everything that is not first-inning information is dropped: the per-inning columns
# '2'..'9', the "W-L" target, and the Score_/delta_ columns for innings 2 through 9.
# The list is generated in the same order as it would be written out by hand.
later_innings = range(2, 10)
columns_to_drop = (
    [str(inning) for inning in later_innings]
    + ['W-L']
    + [f'Score_{inning}' for inning in later_innings]
    + [f'delta_{inning}' for inning in later_innings]
)

# `drop` returns a new DataFrame, so df_scores is not modified
X = df_scores.drop(columns=columns_to_drop)
X.head()

Unnamed: 0,Score_1,delta_1
0,0,0
1,0,0
2,2,2
3,0,-2
4,0,-1


In [11]:
# Partition the first-inning features and labels into train/test subsets
# (shuffle seeded with 13)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=13,
)

In [12]:
# Instantiate a fresh logistic regression classifier seeded with random_state=1
lr_model = LogisticRegression(random_state=1)

# Train it on the first-inning training split; the fitted estimator is displayed
lr_model.fit(X=X_train, y=y_train)

In [13]:
# Score the fitted logistic regression on both splits with model.score
lr_train_accuracy = lr_model.score(X_train, y_train)
lr_test_accuracy = lr_model.score(X_test, y_test)

# Report both accuracies to three decimal places
print('Train Accuracy: %.3f' % lr_train_accuracy)
print('Test Accuracy: %.3f' % lr_test_accuracy)

Train Accuracy: 0.598
Test Accuracy: 0.597


### This is similar to the Random Forest model.

## Model and Fit to a Support Vector Machine

In [14]:
# Instantiate a support vector classifier that uses a linear kernel
svm_model = SVC(kernel='linear')

# Train it on the training split; the fitted estimator is the cell's displayed value
svm_model.fit(X=X_train, y=y_train)

In [15]:
# Score the fitted SVM on both splits with model.score
svm_train_accuracy = svm_model.score(X_train, y_train)
svm_test_accuracy = svm_model.score(X_test, y_test)

# Report both accuracies to three decimal places
print('Train Accuracy: %.3f' % svm_train_accuracy)
print('Test Accuracy: %.3f' % svm_test_accuracy)

Train Accuracy: 0.598
Test Accuracy: 0.597


### This is similar to the Random Forest model.

## Model and Fit to a KNN model

In [16]:
# Instantiate a k-nearest-neighbours classifier that votes over 5 neighbours
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train it on the training split; the fitted estimator is the cell's displayed value
knn_model.fit(X=X_train, y=y_train)

In [17]:
# Score the fitted KNN model on both splits with model.score
knn_train_accuracy = knn_model.score(X_train, y_train)
knn_test_accuracy = knn_model.score(X_test, y_test)

# Report both accuracies to three decimal places
print('Train Accuracy: %.3f' % knn_train_accuracy)
print('Test Accuracy: %.3f' % knn_test_accuracy)

Train Accuracy: 0.542
Test Accuracy: 0.550


### This is lower than the Random Forest model.

## Model and Fit to a Decision Tree Classifier

In [18]:
# Create the decision tree classifier model.
# random_state is pinned (to 1, the same seed used for the logistic regression models)
# because scikit-learn randomly permutes the candidate features at each split; without
# a seed the fitted tree, and therefore the reported accuracies, can change between runs.
dt_model = DecisionTreeClassifier(random_state=1)

# Fit the model to the training data
dt_model.fit(X_train, y_train)

In [19]:
# Score the fitted decision tree on both splits with model.score
dt_train_accuracy = dt_model.score(X_train, y_train)
dt_test_accuracy = dt_model.score(X_test, y_test)

# Report both accuracies to three decimal places
print('Train Accuracy: %.3f' % dt_train_accuracy)
print('Test Accuracy: %.3f' % dt_test_accuracy)

Train Accuracy: 0.599
Test Accuracy: 0.593


### This is similar to the Random Forest model.
Conclusions:  None of these models improved on the Random Forest in the early innings, where
a better model would be most beneficial, so the remaining innings were not re-evaluated.