In [1]:
import os
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression

from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore')

In [2]:
aws_psql_conn = psycopg2.connect(
    database=os.getenv("AWS_PSQL_DB"),
    user=os.getenv("AWS_PSQL_USER"),
    password=os.getenv("AWS_PSQL_PASSWORD"),
    host=os.getenv("AWS_PSQL_HOST"),
    port=os.getenv("AWS_PSQL_PORT"),
)

cursor = aws_psql_conn.cursor()

In [3]:
sql = f"select * from games"

cursor.execute(sql)

rows = cursor.fetchall()

column_names = [desc[0] for desc in cursor.description]

In [4]:
df = pd.DataFrame(rows, columns=column_names)

df.head()

Unnamed: 0,game_id,home_team_id,home_team_name,away_team_id,away_team_name,home_pitcher,home_pitcher_id,home_pitcher_era,home_pitcher_win_percentage,home_pitcher_wins,...,home_pitcher_bb_nine,home_pitcher_k_bb_diff,home_pitcher_whip,home_pitcher_babip,away_pitcher_k_nine,away_pitcher_bb_nine,away_pitcher_k_bb_diff,away_pitcher_whip,away_pitcher_babip,predicted_winner
0,718698,158,Milwaukee Brewers,121,New York Mets,Corbin Burnes,669203.0,7.2,0.0,0.0,...,5.4,0.0,1.4,0.25,9.0,1.8,0.181818,1.8,0.466667,1.0
1,718781,147,New York Yankees,137,San Francisco Giants,Gerrit Cole,543037.0,,,,...,,,,,,,,,,
2,718782,111,Boston Red Sox,110,Baltimore Orioles,Corey Kluber,446372.0,,,,...,,,,,,,,,,
3,718777,112,Chicago Cubs,158,Milwaukee Brewers,Marcus Stroman,573186.0,,,,...,,,,,,,,,,
4,718776,139,Tampa Bay Rays,116,Detroit Tigers,Shane McClanahan,663556.0,,,,...,,,,,,,,,,


In [5]:
columns_to_remove = [
    "home_team_name",
    "away_team_name",
    "home_pitcher",
    "home_pitcher_id",
    "away_pitcher",
    "away_pitcher_id",
]

df = df.drop(columns=columns_to_remove)

df.head()

Unnamed: 0,game_id,home_team_id,away_team_id,home_pitcher_era,home_pitcher_win_percentage,home_pitcher_wins,home_pitcher_losses,home_pitcher_innings_pitched,away_pitcher_era,away_pitcher_win_percentage,...,home_pitcher_bb_nine,home_pitcher_k_bb_diff,home_pitcher_whip,home_pitcher_babip,away_pitcher_k_nine,away_pitcher_bb_nine,away_pitcher_k_bb_diff,away_pitcher_whip,away_pitcher_babip,predicted_winner
0,718698,158,121,7.2,0.0,0.0,1.0,5.0,1.8,0.0,...,5.4,0.0,1.4,0.25,9.0,1.8,0.181818,1.8,0.466667,1.0
1,718781,147,137,,,,,,,,...,,,,,,,,,,
2,718782,111,110,,,,,,,,...,,,,,,,,,,
3,718777,112,158,,,,,,,,...,,,,,,,,,,
4,718776,139,116,,,,,,,,...,,,,,,,,,,


In [6]:
df = df.dropna()

df.head()

Unnamed: 0,game_id,home_team_id,away_team_id,home_pitcher_era,home_pitcher_win_percentage,home_pitcher_wins,home_pitcher_losses,home_pitcher_innings_pitched,away_pitcher_era,away_pitcher_win_percentage,...,home_pitcher_bb_nine,home_pitcher_k_bb_diff,home_pitcher_whip,home_pitcher_babip,away_pitcher_k_nine,away_pitcher_bb_nine,away_pitcher_k_bb_diff,away_pitcher_whip,away_pitcher_babip,predicted_winner
0,718698,158,121,7.2,0.0,0.0,1.0,5.0,1.8,0.0,...,5.4,0.0,1.4,0.25,9.0,1.8,0.181818,1.8,0.466667,1.0
72,718703,119,115,3.0,1.0,1.0,0.0,6.0,3.0,1.0,...,0.0,0.285714,0.67,0.285714,7.5,0.0,0.208333,0.83,0.294118,1.0
73,718704,120,139,6.0,0.0,0.0,1.0,3.0,0.0,1.0,...,9.0,0.0,3.33,0.5,9.0,1.5,0.227273,0.83,0.266667,0.0
74,718701,146,142,0.0,1.0,1.0,0.0,5.6,0.0,1.0,...,6.35,0.047619,1.06,0.166667,13.5,5.06,0.238095,0.94,0.222222,1.0
77,717821,116,109,5.23,0.429,3.0,4.0,53.3,5.4,0.4,...,3.71,0.134199,1.33,0.285714,5.4,3.21,0.056391,1.49,0.314721,1.0


In [7]:
df.columns

Index(['game_id', 'home_team_id', 'away_team_id', 'home_pitcher_era',
       'home_pitcher_win_percentage', 'home_pitcher_wins',
       'home_pitcher_losses', 'home_pitcher_innings_pitched',
       'away_pitcher_era', 'away_pitcher_win_percentage', 'away_pitcher_wins',
       'away_pitcher_losses', 'away_pitcher_innings_pitched', 'winning_team',
       'home_pitcher_k_nine', 'home_pitcher_bb_nine', 'home_pitcher_k_bb_diff',
       'home_pitcher_whip', 'home_pitcher_babip', 'away_pitcher_k_nine',
       'away_pitcher_bb_nine', 'away_pitcher_k_bb_diff', 'away_pitcher_whip',
       'away_pitcher_babip', 'predicted_winner'],
      dtype='object')

In [8]:
X = []
Y = []

for index, row in df.iterrows():
    pitcher_innings_pitched_comp = row['away_pitcher_innings_pitched'] - row['home_pitcher_innings_pitched']
    pitcher_k_nine_comp = row['away_pitcher_k_nine'] - row['home_pitcher_k_nine']
    pitcher_bb_nine_comp = row['away_pitcher_bb_nine'] - row['home_pitcher_bb_nine']
    pitcher_k_bb_diff_comp = row['away_pitcher_k_bb_diff'] - row['home_pitcher_k_bb_diff']
    pitcher_whip_comp = row['away_pitcher_whip'] - row['home_pitcher_whip']
    pitcher_babip_comp = row['away_pitcher_babip'] - row['home_pitcher_babip']

    comparison = [pitcher_innings_pitched_comp, pitcher_k_nine_comp, pitcher_bb_nine_comp, pitcher_k_bb_diff_comp, pitcher_whip_comp, pitcher_babip_comp]

    X.append(comparison)
    Y.append(1 if row['winning_team'] == row['home_team_id'] else 0)


In [9]:
# Convert the lists to NumPy arrays for compatibility with scikit-learn
X = pd.np.array(X)
Y = pd.np.array(Y)
X = X.reshape(X.shape[0], -1)

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print the shape of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (921, 6)
X_test shape: (231, 6)
Y_train shape: (921,)
Y_test shape: (231,)


In [11]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [12]:
model = Sequential()
model.add(Dense(units=10, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=1, activation='sigmoid'))

In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [14]:
model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x292d6a0d0>

In [15]:
loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')


Test Loss: 0.7072
Test Accuracy: 0.4978
