# Neural Network for Predicting World Series Champions
The procedure has been borrowed in part from the machine learning class examples

# Read in data and pre-process

In [1]:
# import dependencies
import pandas as pd

In [2]:
# read in data
pd.set_option('display.max_columns', None)
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [3]:
# change the League column to be binary instead of string
for i, row in mlb_df.iterrows():
#     print(i,row['Lg'])
    if row['Lg'] == 'AL':
        mlb_df.at[i,'Lg'] = 0
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    elif row['Lg'] == 'NL':
        mlb_df.at[i,'Lg'] = 1
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    

In [4]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Odd years will be used for training, while even years will be used for testing

In [5]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [6]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [7]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [10]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Create model

In [11]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [12]:
X_train.shape

(300, 19)

In [13]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 80        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 90
Trainable params: 90
Non-trainable params: 0
_________________________________________________________________


In [16]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [17]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

(300, 19)
(300, 2)


In [18]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=2
)

Epoch 1/1000
300/300 - 0s - loss: 1.3970 - acc: 0.2967
Epoch 2/1000
300/300 - 0s - loss: 1.3001 - acc: 0.3333
Epoch 3/1000
300/300 - 0s - loss: 1.2184 - acc: 0.3700
Epoch 4/1000
300/300 - 0s - loss: 1.1428 - acc: 0.4200
Epoch 5/1000
300/300 - 0s - loss: 1.0726 - acc: 0.4400
Epoch 6/1000
300/300 - 0s - loss: 1.0077 - acc: 0.4633
Epoch 7/1000
300/300 - 0s - loss: 0.9482 - acc: 0.4900
Epoch 8/1000
300/300 - 0s - loss: 0.8935 - acc: 0.5133
Epoch 9/1000
300/300 - 0s - loss: 0.8437 - acc: 0.5533
Epoch 10/1000
300/300 - 0s - loss: 0.7977 - acc: 0.5733
Epoch 11/1000
300/300 - 0s - loss: 0.7549 - acc: 0.5967
Epoch 12/1000
300/300 - 0s - loss: 0.7155 - acc: 0.6233
Epoch 13/1000
300/300 - 0s - loss: 0.6789 - acc: 0.6400
Epoch 14/1000
300/300 - 0s - loss: 0.6448 - acc: 0.6500
Epoch 15/1000
300/300 - 0s - loss: 0.6132 - acc: 0.6833
Epoch 16/1000
300/300 - 0s - loss: 0.5845 - acc: 0.6833
Epoch 17/1000
300/300 - 0s - loss: 0.5582 - acc: 0.7067
Epoch 18/1000
300/300 - 0s - loss: 0.5338 - acc: 0.7267
E

Epoch 147/1000
300/300 - 0s - loss: 0.0969 - acc: 0.9667
Epoch 148/1000
300/300 - 0s - loss: 0.0963 - acc: 0.9667
Epoch 149/1000
300/300 - 0s - loss: 0.0957 - acc: 0.9667
Epoch 150/1000
300/300 - 0s - loss: 0.0951 - acc: 0.9667
Epoch 151/1000
300/300 - 0s - loss: 0.0946 - acc: 0.9667
Epoch 152/1000
300/300 - 0s - loss: 0.0941 - acc: 0.9667
Epoch 153/1000
300/300 - 0s - loss: 0.0936 - acc: 0.9667
Epoch 154/1000
300/300 - 0s - loss: 0.0932 - acc: 0.9667
Epoch 155/1000
300/300 - 0s - loss: 0.0928 - acc: 0.9667
Epoch 156/1000
300/300 - 0s - loss: 0.0924 - acc: 0.9667
Epoch 157/1000
300/300 - 0s - loss: 0.0920 - acc: 0.9667
Epoch 158/1000
300/300 - 0s - loss: 0.0916 - acc: 0.9667
Epoch 159/1000
300/300 - 0s - loss: 0.0913 - acc: 0.9667
Epoch 160/1000
300/300 - 0s - loss: 0.0909 - acc: 0.9667
Epoch 161/1000
300/300 - 0s - loss: 0.0905 - acc: 0.9667
Epoch 162/1000
300/300 - 0s - loss: 0.0901 - acc: 0.9667
Epoch 163/1000
300/300 - 0s - loss: 0.0897 - acc: 0.9667
Epoch 164/1000
300/300 - 0s - l

Epoch 291/1000
300/300 - 0s - loss: 0.0631 - acc: 0.9767
Epoch 292/1000
300/300 - 0s - loss: 0.0629 - acc: 0.9767
Epoch 293/1000
300/300 - 0s - loss: 0.0627 - acc: 0.9767
Epoch 294/1000
300/300 - 0s - loss: 0.0627 - acc: 0.9767
Epoch 295/1000
300/300 - 0s - loss: 0.0625 - acc: 0.9767
Epoch 296/1000
300/300 - 0s - loss: 0.0624 - acc: 0.9767
Epoch 297/1000
300/300 - 0s - loss: 0.0622 - acc: 0.9767
Epoch 298/1000
300/300 - 0s - loss: 0.0620 - acc: 0.9767
Epoch 299/1000
300/300 - 0s - loss: 0.0619 - acc: 0.9767
Epoch 300/1000
300/300 - 0s - loss: 0.0617 - acc: 0.9767
Epoch 301/1000
300/300 - 0s - loss: 0.0616 - acc: 0.9767
Epoch 302/1000
300/300 - 0s - loss: 0.0615 - acc: 0.9767
Epoch 303/1000
300/300 - 0s - loss: 0.0613 - acc: 0.9767
Epoch 304/1000
300/300 - 0s - loss: 0.0612 - acc: 0.9767
Epoch 305/1000
300/300 - 0s - loss: 0.0611 - acc: 0.9767
Epoch 306/1000
300/300 - 0s - loss: 0.0608 - acc: 0.9767
Epoch 307/1000
300/300 - 0s - loss: 0.0608 - acc: 0.9767
Epoch 308/1000
300/300 - 0s - l

Epoch 435/1000
300/300 - 0s - loss: 0.0474 - acc: 0.9800
Epoch 436/1000
300/300 - 0s - loss: 0.0473 - acc: 0.9800
Epoch 437/1000
300/300 - 0s - loss: 0.0471 - acc: 0.9800
Epoch 438/1000
300/300 - 0s - loss: 0.0470 - acc: 0.9800
Epoch 439/1000
300/300 - 0s - loss: 0.0469 - acc: 0.9800
Epoch 440/1000
300/300 - 0s - loss: 0.0469 - acc: 0.9800
Epoch 441/1000
300/300 - 0s - loss: 0.0467 - acc: 0.9800
Epoch 442/1000
300/300 - 0s - loss: 0.0465 - acc: 0.9800
Epoch 443/1000
300/300 - 0s - loss: 0.0465 - acc: 0.9800
Epoch 444/1000
300/300 - 0s - loss: 0.0463 - acc: 0.9800
Epoch 445/1000
300/300 - 0s - loss: 0.0462 - acc: 0.9800
Epoch 446/1000
300/300 - 0s - loss: 0.0461 - acc: 0.9800
Epoch 447/1000
300/300 - 0s - loss: 0.0460 - acc: 0.9800
Epoch 448/1000
300/300 - 0s - loss: 0.0458 - acc: 0.9800
Epoch 449/1000
300/300 - 0s - loss: 0.0458 - acc: 0.9800
Epoch 450/1000
300/300 - 0s - loss: 0.0457 - acc: 0.9800
Epoch 451/1000
300/300 - 0s - loss: 0.0456 - acc: 0.9800
Epoch 452/1000
300/300 - 0s - l

Epoch 579/1000
300/300 - 0s - loss: 0.0358 - acc: 0.9800
Epoch 580/1000
300/300 - 0s - loss: 0.0357 - acc: 0.9800
Epoch 581/1000
300/300 - 0s - loss: 0.0356 - acc: 0.9800
Epoch 582/1000
300/300 - 0s - loss: 0.0357 - acc: 0.9800
Epoch 583/1000
300/300 - 0s - loss: 0.0356 - acc: 0.9800
Epoch 584/1000
300/300 - 0s - loss: 0.0355 - acc: 0.9800
Epoch 585/1000
300/300 - 0s - loss: 0.0355 - acc: 0.9800
Epoch 586/1000
300/300 - 0s - loss: 0.0355 - acc: 0.9800
Epoch 587/1000
300/300 - 0s - loss: 0.0354 - acc: 0.9800
Epoch 588/1000
300/300 - 0s - loss: 0.0353 - acc: 0.9800
Epoch 589/1000
300/300 - 0s - loss: 0.0352 - acc: 0.9800
Epoch 590/1000
300/300 - 0s - loss: 0.0353 - acc: 0.9800
Epoch 591/1000
300/300 - 0s - loss: 0.0352 - acc: 0.9800
Epoch 592/1000
300/300 - 0s - loss: 0.0351 - acc: 0.9800
Epoch 593/1000
300/300 - 0s - loss: 0.0350 - acc: 0.9800
Epoch 594/1000
300/300 - 0s - loss: 0.0351 - acc: 0.9800
Epoch 595/1000
300/300 - 0s - loss: 0.0350 - acc: 0.9800
Epoch 596/1000
300/300 - 0s - l

Epoch 723/1000
300/300 - 0s - loss: 0.0287 - acc: 0.9800
Epoch 724/1000
300/300 - 0s - loss: 0.0287 - acc: 0.9800
Epoch 725/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 726/1000
300/300 - 0s - loss: 0.0287 - acc: 0.9800
Epoch 727/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 728/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 729/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 730/1000
300/300 - 0s - loss: 0.0285 - acc: 0.9800
Epoch 731/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 732/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 733/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9800
Epoch 734/1000
300/300 - 0s - loss: 0.0285 - acc: 0.9800
Epoch 735/1000
300/300 - 0s - loss: 0.0284 - acc: 0.9800
Epoch 736/1000
300/300 - 0s - loss: 0.0284 - acc: 0.9800
Epoch 737/1000
300/300 - 0s - loss: 0.0283 - acc: 0.9800
Epoch 738/1000
300/300 - 0s - loss: 0.0285 - acc: 0.9800
Epoch 739/1000
300/300 - 0s - loss: 0.0284 - acc: 0.9800
Epoch 740/1000
300/300 - 0s - l

Epoch 867/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 868/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 869/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 870/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 871/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 872/1000
300/300 - 0s - loss: 0.0243 - acc: 0.9867
Epoch 873/1000
300/300 - 0s - loss: 0.0244 - acc: 0.9867
Epoch 874/1000
300/300 - 0s - loss: 0.0243 - acc: 0.9867
Epoch 875/1000
300/300 - 0s - loss: 0.0243 - acc: 0.9867
Epoch 876/1000
300/300 - 0s - loss: 0.0242 - acc: 0.9867
Epoch 877/1000
300/300 - 0s - loss: 0.0243 - acc: 0.9867
Epoch 878/1000
300/300 - 0s - loss: 0.0242 - acc: 0.9867
Epoch 879/1000
300/300 - 0s - loss: 0.0242 - acc: 0.9867
Epoch 880/1000
300/300 - 0s - loss: 0.0243 - acc: 0.9867
Epoch 881/1000
300/300 - 0s - loss: 0.0242 - acc: 0.9867
Epoch 882/1000
300/300 - 0s - loss: 0.0242 - acc: 0.9867
Epoch 883/1000
300/300 - 0s - loss: 0.0241 - acc: 0.9867
Epoch 884/1000
300/300 - 0s - l

<tensorflow.python.keras.callbacks.History at 0x1f3eb1fe9e8>

In [19]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

330/330 - 0s - loss: 6.7193 - acc: 0.8667
Loss: 6.719335439698735, Accuracy: 0.8666666746139526


In [20]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

In [28]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [22]:
winners_df = predictions_df.loc[predictions_df['Actual']]
winners_df

Unnamed: 0,Prediction,Actual
23,False,True
39,False,True
48,True,True
54,False,True
69,False,True
103,False,True
173,False,True
210,False,True
257,False,True
285,True,True


In [29]:
# connect predictions to the team and season using iloc and test_df

teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)


['BOS', 'PHI', 'BOS', 'NYY', 'CHC', 'STL', 'SFG', 'ANA', 'SFG', 'LAD', 'SFG']
[2004, 2008, 2018, 2000, 2016, 2006, 2010, 2002, 2014, 2020, 2012]


In [30]:
winners_df['Team'] = teams
winners_df['Year'] = years
winners_df

Unnamed: 0,Prediction,Actual,Team,Year
23,False,True,BOS,2004
39,False,True,PHI,2008
48,True,True,BOS,2018
54,False,True,NYY,2000
69,False,True,CHC,2016
103,False,True,STL,2006
173,False,True,SFG,2010
210,False,True,ANA,2002
257,False,True,SFG,2014
285,True,True,LAD,2020


In [39]:
# can we have it predict the most likely winner by season? Maybe use some sort of "confidence" metric
probs = model.predict_proba(X_test_scaled)
probs

array([[1.0000000e+00, 2.7390731e-08],
       [9.9999726e-01, 2.7623105e-06],
       [1.0000000e+00, 3.1595673e-10],
       [1.0000000e+00, 1.5135898e-11],
       [9.8911357e-01, 1.0886383e-02],
       [1.0000000e+00, 5.3547696e-12],
       [1.0000000e+00, 5.9243413e-12],
       [9.9999988e-01, 1.4492335e-07],
       [1.0000000e+00, 2.6976830e-08],
       [1.0000000e+00, 6.8433677e-09],
       [1.0000000e+00, 5.2957247e-09],
       [1.0000000e+00, 8.2878962e-14],
       [9.9999392e-01, 6.1278661e-06],
       [9.9993587e-01, 6.4112559e-05],
       [5.1752961e-01, 4.8247042e-01],
       [1.0000000e+00, 8.1101581e-10],
       [1.0000000e+00, 2.0431631e-10],
       [1.0000000e+00, 8.1938831e-11],
       [4.2560124e-01, 5.7439876e-01],
       [1.0000000e+00, 1.8422434e-09],
       [9.9999988e-01, 6.9052440e-08],
       [9.7550893e-01, 2.4491118e-02],
       [9.9999893e-01, 1.0520904e-06],
       [9.8306262e-01, 1.6937401e-02],
       [1.0000000e+00, 4.6910520e-10],
       [9.9999940e-01, 5.

In [56]:
import numpy as np
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:False, 1:True}, inplace=True)
probs_df.head()

Unnamed: 0,False,True
0,100.0,2.739073e-06
1,99.999726,0.000276231
2,100.0,3.159567e-08
3,100.0,1.51359e-09
4,98.911357,1.088638


In [57]:
# connect predictions to the team and season using iloc and test_df

all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
print(all_years)

[2018, 2010, 2016, 2000, 2012, 2000, 2004, 2004, 2000, 2006, 2000, 2018, 2008, 2004, 2000, 2002, 2018, 2004, 2000, 2016, 2016, 2002, 2006, 2004, 2016, 2000, 2018, 2016, 2018, 2006, 2002, 2018, 2016, 2006, 2000, 2016, 2004, 2018, 2012, 2008, 2004, 2016, 2014, 2012, 2010, 2000, 2018, 2006, 2018, 2016, 2008, 2018, 2018, 2002, 2000, 2016, 2016, 2006, 2004, 2012, 2004, 2010, 2004, 2016, 2012, 2008, 2006, 2002, 2000, 2016, 2006, 2016, 2012, 2008, 2002, 2000, 2000, 2006, 2012, 2018, 2016, 2012, 2008, 2004, 2006, 2002, 2018, 2004, 2016, 2006, 2016, 2004, 2018, 2010, 2012, 2008, 2004, 2002, 2018, 2014, 2016, 2004, 2008, 2006, 2004, 2000, 2016, 2006, 2004, 2000, 2018, 2010, 2006, 2010, 2008, 2008, 2006, 2000, 2000, 2004, 2000, 2016, 2014, 2010, 2002, 2000, 2000, 2018, 2018, 2018, 2012, 2006, 2002, 2008, 2006, 2010, 2008, 2000, 2018, 2012, 2008, 2008, 2006, 2008, 2018, 2012, 2016, 2004, 2002, 2016, 2000, 2018, 2008, 2002, 2002, 2000, 2018, 2012, 2010, 2006, 2012, 2012, 2002, 2002, 2002, 2016, 200

In [58]:
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year',True],ascending=False)

Unnamed: 0,False,True,Year,Team
285,0.0,100.0,2020,LAD
297,1.5050670000000001e-33,100.0,2020,ATL
300,0.0,100.0,2020,CHW
301,0.0,100.0,2020,SDP
303,2.711991e-31,100.0,2020,NYY
305,0.0,100.0,2020,MIN
306,8.240673e-34,100.0,2020,CIN
307,8.209675e-26,100.0,2020,TOR
308,3.417332e-33,100.0,2020,NYM
309,2.605635e-27,100.0,2020,LAA
