# Neural Network for Predicting World Series Champions
The procedure has been borrowed in part from the machine learning class examples

# Read in data and pre-process

In [1]:
# import dependencies
import pandas as pd

In [2]:
# read in data
pd.set_option('display.max_columns', None)
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [3]:
# change the League column to be binary instead of string
for i, row in mlb_df.iterrows():
#     print(i,row['Lg'])
    if row['Lg'] == 'AL':
        mlb_df.at[i,'Lg'] = 0
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    elif row['Lg'] == 'NL':
        mlb_df.at[i,'Lg'] = 1
#         print(f'new value at {i} is {mlb_df.at[i,"Lg"]}')
    

In [4]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Odd years will be used for training, while even years will be used for testing

In [5]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [6]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [7]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [10]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Create model

In [11]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [12]:
X_train.shape

(300, 19)

In [13]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 80        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 90
Trainable params: 90
Non-trainable params: 0
_________________________________________________________________


In [16]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [17]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

(300, 19)
(300, 2)


In [18]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=2
)

Epoch 1/1000
300/300 - 0s - loss: 0.6605 - acc: 0.7200
Epoch 2/1000
300/300 - 0s - loss: 0.5962 - acc: 0.7333
Epoch 3/1000
300/300 - 0s - loss: 0.5421 - acc: 0.7633
Epoch 4/1000
300/300 - 0s - loss: 0.4942 - acc: 0.7900
Epoch 5/1000
300/300 - 0s - loss: 0.4521 - acc: 0.8100
Epoch 6/1000
300/300 - 0s - loss: 0.4155 - acc: 0.8400
Epoch 7/1000
300/300 - 0s - loss: 0.3838 - acc: 0.8633
Epoch 8/1000
300/300 - 0s - loss: 0.3566 - acc: 0.8833
Epoch 9/1000
300/300 - 0s - loss: 0.3334 - acc: 0.8967
Epoch 10/1000
300/300 - 0s - loss: 0.3134 - acc: 0.9233
Epoch 11/1000
300/300 - 0s - loss: 0.2960 - acc: 0.9233
Epoch 12/1000
300/300 - 0s - loss: 0.2807 - acc: 0.9333
Epoch 13/1000
300/300 - 0s - loss: 0.2673 - acc: 0.9333
Epoch 14/1000
300/300 - 0s - loss: 0.2555 - acc: 0.9433
Epoch 15/1000
300/300 - 0s - loss: 0.2449 - acc: 0.9433
Epoch 16/1000
300/300 - 0s - loss: 0.2353 - acc: 0.9500
Epoch 17/1000
300/300 - 0s - loss: 0.2266 - acc: 0.9500
Epoch 18/1000
300/300 - 0s - loss: 0.2187 - acc: 0.9500
E

Epoch 147/1000
300/300 - 0s - loss: 0.0685 - acc: 0.9667
Epoch 148/1000
300/300 - 0s - loss: 0.0682 - acc: 0.9667
Epoch 149/1000
300/300 - 0s - loss: 0.0679 - acc: 0.9667
Epoch 150/1000
300/300 - 0s - loss: 0.0676 - acc: 0.9667
Epoch 151/1000
300/300 - 0s - loss: 0.0673 - acc: 0.9667
Epoch 152/1000
300/300 - 0s - loss: 0.0671 - acc: 0.9667
Epoch 153/1000
300/300 - 0s - loss: 0.0668 - acc: 0.9667
Epoch 154/1000
300/300 - 0s - loss: 0.0665 - acc: 0.9667
Epoch 155/1000
300/300 - 0s - loss: 0.0662 - acc: 0.9667
Epoch 156/1000
300/300 - 0s - loss: 0.0660 - acc: 0.9667
Epoch 157/1000
300/300 - 0s - loss: 0.0657 - acc: 0.9667
Epoch 158/1000
300/300 - 0s - loss: 0.0654 - acc: 0.9667
Epoch 159/1000
300/300 - 0s - loss: 0.0652 - acc: 0.9667
Epoch 160/1000
300/300 - 0s - loss: 0.0649 - acc: 0.9667
Epoch 161/1000
300/300 - 0s - loss: 0.0646 - acc: 0.9667
Epoch 162/1000
300/300 - 0s - loss: 0.0644 - acc: 0.9667
Epoch 163/1000
300/300 - 0s - loss: 0.0642 - acc: 0.9667
Epoch 164/1000
300/300 - 0s - l

Epoch 291/1000
300/300 - 0s - loss: 0.0436 - acc: 0.9733
Epoch 292/1000
300/300 - 0s - loss: 0.0435 - acc: 0.9733
Epoch 293/1000
300/300 - 0s - loss: 0.0434 - acc: 0.9733
Epoch 294/1000
300/300 - 0s - loss: 0.0434 - acc: 0.9733
Epoch 295/1000
300/300 - 0s - loss: 0.0433 - acc: 0.9733
Epoch 296/1000
300/300 - 0s - loss: 0.0432 - acc: 0.9733
Epoch 297/1000
300/300 - 0s - loss: 0.0429 - acc: 0.9767
Epoch 298/1000
300/300 - 0s - loss: 0.0428 - acc: 0.9767
Epoch 299/1000
300/300 - 0s - loss: 0.0427 - acc: 0.9767
Epoch 300/1000
300/300 - 0s - loss: 0.0426 - acc: 0.9767
Epoch 301/1000
300/300 - 0s - loss: 0.0424 - acc: 0.9767
Epoch 302/1000
300/300 - 0s - loss: 0.0424 - acc: 0.9767
Epoch 303/1000
300/300 - 0s - loss: 0.0422 - acc: 0.9767
Epoch 304/1000
300/300 - 0s - loss: 0.0422 - acc: 0.9767
Epoch 305/1000
300/300 - 0s - loss: 0.0419 - acc: 0.9767
Epoch 306/1000
300/300 - 0s - loss: 0.0418 - acc: 0.9767
Epoch 307/1000
300/300 - 0s - loss: 0.0418 - acc: 0.9767
Epoch 308/1000
300/300 - 0s - l

Epoch 435/1000
300/300 - 0s - loss: 0.0296 - acc: 0.9900
Epoch 436/1000
300/300 - 0s - loss: 0.0294 - acc: 0.9933
Epoch 437/1000
300/300 - 0s - loss: 0.0294 - acc: 0.9867
Epoch 438/1000
300/300 - 0s - loss: 0.0294 - acc: 0.9933
Epoch 439/1000
300/300 - 0s - loss: 0.0292 - acc: 0.9900
Epoch 440/1000
300/300 - 0s - loss: 0.0291 - acc: 0.9933
Epoch 441/1000
300/300 - 0s - loss: 0.0292 - acc: 0.9900
Epoch 442/1000
300/300 - 0s - loss: 0.0292 - acc: 0.9900
Epoch 443/1000
300/300 - 0s - loss: 0.0290 - acc: 0.9933
Epoch 444/1000
300/300 - 0s - loss: 0.0289 - acc: 0.9933
Epoch 445/1000
300/300 - 0s - loss: 0.0288 - acc: 0.9933
Epoch 446/1000
300/300 - 0s - loss: 0.0288 - acc: 0.9900
Epoch 447/1000
300/300 - 0s - loss: 0.0288 - acc: 0.9933
Epoch 448/1000
300/300 - 0s - loss: 0.0287 - acc: 0.9933
Epoch 449/1000
300/300 - 0s - loss: 0.0287 - acc: 0.9933
Epoch 450/1000
300/300 - 0s - loss: 0.0286 - acc: 0.9933
Epoch 451/1000
300/300 - 0s - loss: 0.0284 - acc: 0.9933
Epoch 452/1000
300/300 - 0s - l

Epoch 579/1000
300/300 - 0s - loss: 0.0223 - acc: 0.9933
Epoch 580/1000
300/300 - 0s - loss: 0.0223 - acc: 0.9933
Epoch 581/1000
300/300 - 0s - loss: 0.0222 - acc: 0.9933
Epoch 582/1000
300/300 - 0s - loss: 0.0223 - acc: 0.9933
Epoch 583/1000
300/300 - 0s - loss: 0.0222 - acc: 0.9933
Epoch 584/1000
300/300 - 0s - loss: 0.0221 - acc: 0.9933
Epoch 585/1000
300/300 - 0s - loss: 0.0220 - acc: 0.9933
Epoch 586/1000
300/300 - 0s - loss: 0.0221 - acc: 0.9933
Epoch 587/1000
300/300 - 0s - loss: 0.0221 - acc: 0.9933
Epoch 588/1000
300/300 - 0s - loss: 0.0221 - acc: 0.9933
Epoch 589/1000
300/300 - 0s - loss: 0.0220 - acc: 0.9933
Epoch 590/1000
300/300 - 0s - loss: 0.0220 - acc: 0.9933
Epoch 591/1000
300/300 - 0s - loss: 0.0219 - acc: 0.9933
Epoch 592/1000
300/300 - 0s - loss: 0.0218 - acc: 0.9933
Epoch 593/1000
300/300 - 0s - loss: 0.0217 - acc: 0.9933
Epoch 594/1000
300/300 - 0s - loss: 0.0218 - acc: 0.9933
Epoch 595/1000
300/300 - 0s - loss: 0.0217 - acc: 0.9933
Epoch 596/1000
300/300 - 0s - l

Epoch 723/1000
300/300 - 0s - loss: 0.0188 - acc: 0.9933
Epoch 724/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 725/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 726/1000
300/300 - 0s - loss: 0.0188 - acc: 0.9933
Epoch 727/1000
300/300 - 0s - loss: 0.0188 - acc: 0.9933
Epoch 728/1000
300/300 - 0s - loss: 0.0188 - acc: 0.9933
Epoch 729/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 730/1000
300/300 - 0s - loss: 0.0186 - acc: 0.9933
Epoch 731/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 732/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 733/1000
300/300 - 0s - loss: 0.0186 - acc: 0.9933
Epoch 734/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 735/1000
300/300 - 0s - loss: 0.0187 - acc: 0.9933
Epoch 736/1000
300/300 - 0s - loss: 0.0186 - acc: 0.9933
Epoch 737/1000
300/300 - 0s - loss: 0.0185 - acc: 0.9933
Epoch 738/1000
300/300 - 0s - loss: 0.0185 - acc: 0.9933
Epoch 739/1000
300/300 - 0s - loss: 0.0185 - acc: 0.9933
Epoch 740/1000
300/300 - 0s - l

Epoch 867/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 868/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 869/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 870/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 871/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 872/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 873/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 874/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 875/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 876/1000
300/300 - 0s - loss: 0.0172 - acc: 0.9933
Epoch 877/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 878/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 879/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 880/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 881/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 882/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 883/1000
300/300 - 0s - loss: 0.0171 - acc: 0.9933
Epoch 884/1000
300/300 - 0s - l

<tensorflow.python.keras.callbacks.History at 0x1825043ad30>

In [19]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

330/330 - 0s - loss: 0.6761 - acc: 0.9121
Loss: 0.6760580225424333, Accuracy: 0.9121212363243103


In [20]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

In [21]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [22]:
winners_df = predictions_df.loc[predictions_df['Actual']]
winners_df

Unnamed: 0,Prediction,Actual
23,False,True
39,False,True
48,False,True
54,False,True
69,False,True
103,False,True
173,False,True
210,False,True
257,False,True
285,True,True


In [23]:
# connect predictions to the team and season using iloc and test_df

teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)


In [24]:
winners_df['Team'] = teams
winners_df['Year'] = years
winners_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Prediction,Actual,Team,Year
23,False,True,BOS,2004
39,False,True,PHI,2008
48,False,True,BOS,2018
54,False,True,NYY,2000
69,False,True,CHC,2016
103,False,True,STL,2006
173,False,True,SFG,2010
210,False,True,ANA,2002
257,False,True,SFG,2014
285,True,True,LAD,2020


In [25]:
# can we have it predict the most likely winner by season? Maybe use some sort of "confidence" metric
probs = model.predict_proba(X_test_scaled)
# probs

In [26]:
import numpy as np
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:'False', 1:'True'}, inplace=True)
probs_df.head()

Unnamed: 0,False,True
0,100.0,8.627349e-14
1,100.0,6.203078e-15
2,100.0,3.661308e-14
3,100.0,3.554561e-18
4,99.965084,0.03491151


In [27]:
# connect predictions to the team and season using iloc and test_df

all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
# print(all_years)

In [28]:
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year','True'],ascending=False)

Unnamed: 0,False,True,Year,Team
326,10.310418,89.68958,2020,CLE
285,25.373086,74.62691,2020,LAD
300,25.373086,74.62691,2020,CHW
301,25.373086,74.62691,2020,SDP
305,25.373086,74.62691,2020,MIN
306,25.373086,74.62691,2020,CIN
313,25.373086,74.62691,2020,TBR
315,25.373086,74.62691,2020,MIL
316,25.373086,74.62691,2020,CHC
317,25.373086,74.62691,2020,OAK


In [29]:
seasons = probs_df['Year'].unique()
top_three = {}
for season in seasons:
    df = probs_df.loc[probs_df['Year'] == season]
    df = df.sort_values(by='True', ascending=False)
    top_three[season] = {'first_prediction':df.iloc[0,3], 'second_prediction': df.iloc[1,3], 'third_prediction': df.iloc[2,3]}
    

In [30]:
top_three_df = pd.DataFrame(top_three)
top_three_df = top_three_df.transpose()

In [31]:
top_three_df = top_three_df.sort_index()

In [32]:
top_three_df.reset_index(inplace=True)
top_three_df = top_three_df.rename(columns={'index':'Year'})

In [33]:
top_three_df

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction
0,2000,ATL,SEA,STL
1,2002,BOS,STL,ANA
2,2004,STL,HOU,SFG
3,2006,NYM,LAA,LAD
4,2008,BOS,CHC,PHI
5,2010,MIN,TEX,CIN
6,2012,TEX,DET,LAA
7,2014,DET,KCR,LAD
8,2016,CLE,BOS,PIT
9,2018,CLE,BOS,SEA


In [34]:
df = top_three_df.merge(winners_df, on='Year')

In [35]:
df.drop(columns=['Prediction','Actual'], inplace=True)
df.rename(columns={'Team':'actual'})

Unnamed: 0,Year,first_prediction,second_prediction,third_prediction,actual
0,2000,ATL,SEA,STL,NYY
1,2002,BOS,STL,ANA,ANA
2,2004,STL,HOU,SFG,BOS
3,2006,NYM,LAA,LAD,STL
4,2008,BOS,CHC,PHI,PHI
5,2010,MIN,TEX,CIN,SFG
6,2012,TEX,DET,LAA,SFG
7,2014,DET,KCR,LAD,SFG
8,2016,CLE,BOS,PIT,CHC
9,2018,CLE,BOS,SEA,BOS
