# Neural Network for Predicting World Series Champions
The procedure has been borrowed in part from the machine learning class examples

# Read in data and pre-process

In [1]:
# import dependencies
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
# read in data
pd.set_option('display.max_columns', None)
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [3]:
# change the League column to be binary instead of string
for i, row in mlb_df.iterrows():
#     print(i,row['Lg'])
    if row['Lg'] == 'AL':
        mlb_df.at[i,'Lg'] = 0
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    elif row['Lg'] == 'NL':
        mlb_df.at[i,'Lg'] = 1
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    

In [4]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Odd years will be used for training, while even years will be used for testing

In [5]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [6]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [7]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [10]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

Using TensorFlow backend.


# Create model

In [11]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [12]:
X_train.shape

(300, 19)

In [13]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(4))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 80        
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 10        
Total params: 110
Trainable params: 110
Non-trainable params: 0
_________________________________________________________________


In [16]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [17]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

(300, 19)
(300, 2)


In [18]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=0
)

<tensorflow.python.keras.callbacks.History at 0x20913ed0ac8>

In [19]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

330/330 - 0s - loss: 1.2238 - acc: 0.9424
Loss: 1.2238410682389231, Accuracy: 0.9424242377281189


In [20]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

# Post-process and analyze results

In [21]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [22]:
winners_df = predictions_df.loc[predictions_df['Actual']]
# winners_df

In [23]:
# connect predictions to the team and season using iloc and test_df
teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)

In [24]:
# display predictions by team
winners_df['Team'] = teams
winners_df['Year'] = years
# winners_df

In [25]:
# using probabilities 
probs = model.predict_proba(X_test_scaled)
# probs

In [26]:
# dataframe cleanup
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:'False', 1:'True'}, inplace=True)
# probs_df.head()

In [27]:
# connect predictions to the team and season using iloc and test_df
all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
# print(all_years)

In [28]:
# add teams to dataframe
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year','True'],ascending=False)
# probs_df.head()

Unnamed: 0,False,True,Year,Team
285,100.0,5.091336e-21,2020,LAD
313,100.0,1.2365239999999999e-26,2020,TBR
317,100.0,3.548343e-33,2020,OAK
326,100.0,2.74134e-33,2020,CLE
301,100.0,8.436959e-35,2020,SDP
329,100.0,7.766978e-35,2020,STL
305,100.0,6.810755e-35,2020,MIN
316,100.0,1.424783e-35,2020,CHC
297,100.0,0.0,2020,ATL
300,100.0,0.0,2020,CHW


In [29]:
# add top three most likely teams for each season to a dictionary
seasons = probs_df['Year'].unique()
top_three = {}
for season in seasons:
    df = probs_df.loc[probs_df['Year'] == season]
    df = df.sort_values(by='True', ascending=False)
    top_three[season] = {'first_prediction':df.iloc[0,3], 'second_prediction': df.iloc[1,3], 'third_prediction': df.iloc[2,3]}
    

In [30]:
# put dictionary into dataframe
top_three_df = pd.DataFrame(top_three)
top_three_df = top_three_df.transpose()

In [31]:
# sort by year
top_three_df = top_three_df.sort_index()

In [32]:
# adding year as a column
top_three_df.reset_index(inplace=True)
top_three_df = top_three_df.rename(columns={'index':'Year'})

In [33]:
# top_three_df

In [34]:
# bring in actual winners
even_df = top_three_df.merge(winners_df, on='Year')

In [35]:
# dataframe cleanup
even_df.drop(columns=['Prediction','Actual'], inplace=True)

In [36]:
even_df.head()

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,Team
0,2000,SEA,ATL,STL,NYY
1,2002,ANA,NYY,BOS,ANA
2,2004,BOS,STL,SFG,BOS
3,2006,NYM,MIN,LAA,STL
4,2008,BOS,LAA,NYM,PHI


In [37]:
even_df.rename(columns={'Team':'actual'}, inplace=True)

In [38]:
even_df.head()

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,actual
0,2000,SEA,ATL,STL,NYY
1,2002,ANA,NYY,BOS,ANA
2,2004,BOS,STL,SFG,BOS
3,2006,NYM,MIN,LAA,STL
4,2008,BOS,LAA,NYM,PHI


# Part 2 - Repeat for odd years

In [39]:
# import dependencies
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [40]:
# read in data
pd.set_option('display.max_columns', None)
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [41]:
# change the League column to be binary instead of string
for i, row in mlb_df.iterrows():
#     print(i,row['Lg'])
    if row['Lg'] == 'AL':
        mlb_df.at[i,'Lg'] = 0
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    elif row['Lg'] == 'NL':
        mlb_df.at[i,'Lg'] = 1
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    

In [42]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Even years will be used for training, while odd years will be used for testing

In [43]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [44]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [45]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [46]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [48]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

# Create model

In [49]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [50]:
X_train.shape

(330, 19)

In [51]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(4))

In [52]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [53]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 4)                 80        
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 10        
Total params: 110
Trainable params: 110
Non-trainable params: 0
_________________________________________________________________


In [54]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [55]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

(330, 19)
(330, 2)


In [56]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=0
)

<tensorflow.python.keras.callbacks.History at 0x209153cabe0>

In [57]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

300/300 - 0s - loss: 1.0833 - acc: 0.9400
Loss: 1.0833356810268016, Accuracy: 0.9399999976158142


In [58]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

# Post-process and analyze results

In [59]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [60]:
winners_df = predictions_df.loc[predictions_df['Actual']]
# winners_df

In [61]:
# connect predictions to the team and season using iloc and test_df
teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)

In [62]:
# display predictions by team
winners_df['Team'] = teams
winners_df['Year'] = years
# winners_df

In [63]:
# using probabilities 
probs = model.predict_proba(X_test_scaled)
# probs

In [64]:
# dataframe cleanup
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:'False', 1:'True'}, inplace=True)
# probs_df.head()

In [65]:
# connect predictions to the team and season using iloc and test_df
all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
# print(all_years)

In [66]:
# add teams to dataframe
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year','True'],ascending=False)
# probs_df.head()

Unnamed: 0,False,True,Year,Team
25,100.0,6.472664e-21,2019,WSN
3,100.0,1.64379e-22,2019,LAD
2,100.0,2.019826e-26,2019,HOU
63,100.0,4.131683e-30,2019,STL
5,100.0,2.481988e-31,2019,OAK
0,100.0,9.046191e-34,2019,MIN
1,100.0,0.0,2019,NYY
6,100.0,0.0,2019,CHC
7,100.0,0.0,2019,MIL
8,100.0,0.0,2019,ATL


In [67]:
# add top three most likely teams for each season to a dictionary
seasons = probs_df['Year'].unique()
top_three = {}
for season in seasons:
    df = probs_df.loc[probs_df['Year'] == season]
    df = df.sort_values(by='True', ascending=False)
    top_three[season] = {'first_prediction':df.iloc[0,3], 'second_prediction': df.iloc[1,3], 'third_prediction': df.iloc[2,3]}
    

In [68]:
# put dictionary into dataframe
top_three_df = pd.DataFrame(top_three)
top_three_df = top_three_df.transpose()

In [69]:
# sort by year
top_three_df = top_three_df.sort_index()

In [70]:
# adding year as a column
top_three_df.reset_index(inplace=True)
top_three_df = top_three_df.rename(columns={'index':'Year'})

In [71]:
# top_three_df

In [72]:
# bring in actual winners
odd_df = top_three_df.merge(winners_df, on='Year')

In [73]:
# dataframe cleanup
odd_df.drop(columns=['Prediction','Actual'], inplace=True)

In [74]:
odd_df.head()

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,Team
0,2001,SEA,CHC,CLE,ARI
1,2003,SFG,HOU,TOR,FLA
2,2005,OAK,ATL,STL,CHW
3,2007,SEA,COL,DET,BOS
4,2009,LAA,STL,FLA,NYY


In [75]:
odd_df.rename(columns={'Team':'actual'}, inplace=True)

In [76]:
# odd_df.head()

In [77]:
# even_df.head()

In [78]:
df = pd.concat([even_df, odd_df])
# df.head()

In [79]:
df = df.sort_values(by='Year')
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,actual
0,2000,SEA,ATL,STL,NYY
1,2001,SEA,CHC,CLE,ARI
2,2002,ANA,NYY,BOS,ANA
3,2003,SFG,HOU,TOR,FLA
4,2004,BOS,STL,SFG,BOS


In [82]:
# add full names to df
teams =  ['ANA','ARI','ATL','BAL','BOS','CHC','CHW','CIN','CLE','COL','DET','FLA','HOU','KCR','LAA','LAD','MIA','MIL','MIN','MON','NYM','NYY','OAK','PHI','PIT','SDP','SEA','SFG','STL','TBD','TBR','TEX','TOR','WSN']
full_names = ['Anahiem Angels','Arizona Diamondbacks','Atlanta Braves','Baltimore Orioles','Boston Red Sox','Chicago Cubs','Chicago White Sox','Cincinnati Reds','Cleveland Indians','Colorado Rockies','Detroit Tigers','Florida Marlins','Houston Astros','Kansas City Royals','Los Angeles Angels','Los Angeles Dodgers','Miami Marlins','Milwaukee Brewers','Minnesota Twins','Montreal Expos','New York Mets','New York Yankees','Oakland Athletics','Philadelphia Phillies','Pittsburgh Pirates','San Diego Padres','Seattle Mariners','San Francisco Giants','St. Louis Cardinals','Tampa Bay Devil Rays','Tampa Bay Rays','Texas Rangers','Toronto Blue Jays','Washington Nationals']
full_names_df = pd.DataFrame({'abbr':teams,'fn':full_names})
# full_names_df

In [85]:
df = df.merge(full_names_df, right_on='abbr', left_on='actual')

In [87]:
df.rename(columns={'fn':'actual_fn'},inplace=True)
df.drop(columns='abbr',inplace=True)

In [88]:
df

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,actual,first_prediction_fn
0,2000,SEA,ATL,STL,NYY,New York Yankees
1,2009,LAA,STL,FLA,NYY,New York Yankees
2,2001,SEA,CHC,CLE,ARI,Arizona Diamondbacks
3,2002,ANA,NYY,BOS,ANA,Anahiem Angels
4,2003,SFG,HOU,TOR,FLA,Florida Marlins
5,2004,BOS,STL,SFG,BOS,Boston Red Sox
6,2007,SEA,COL,DET,BOS,Boston Red Sox
7,2013,STL,BOS,CIN,BOS,Boston Red Sox
8,2018,BOS,CLE,COL,BOS,Boston Red Sox
9,2005,OAK,ATL,STL,CHW,Chicago White Sox


In [81]:
df.to_json('nn_predictions.json')