# Neural Network for Predicting World Series Champions
The procedure has been borrowed in part from the machine learning class examples.

# Read in data and pre-process

In [None]:
# import dependencies
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [None]:
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)
# Load the dataset; the path is relative to the notebook's working directory.
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [None]:
# Change the League column to be numeric instead of string: 'AL' -> 0, 'NL' -> 1.
# One vectorized map replaces the row-by-row iterrows()/.at loop. Any value
# other than 'AL' or 'NL' is passed through unchanged, as the loop did.
league_codes = {'AL': 0, 'NL': 1}
mlb_df['Lg'] = mlb_df['Lg'].map(lambda league: league_codes.get(league, league))
    

In [None]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Odd years will be used for training, while even years will be used for testing.

In [None]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [None]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [None]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [None]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

# Create model

In [None]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [None]:
X_train.shape

In [None]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(4))

In [None]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [None]:
model.summary()

In [None]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

In [None]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=0
)

In [None]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. Take the index
# of the highest softmax probability from predict() instead, then map the
# integer codes back to the original labels.
encoded_predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

# Post-process and analyze results

In [None]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [None]:
winners_df = predictions_df.loc[predictions_df['Actual']]
# winners_df

In [None]:
# connect predictions to the team and season using iloc and test_df
teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)

In [None]:
# display predictions by team
winners_df['Team'] = teams
winners_df['Year'] = years
# winners_df

In [None]:
# using probabilities 
probs = model.predict_proba(X_test_scaled)
# probs

In [None]:
# dataframe cleanup
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:'False', 1:'True'}, inplace=True)
# probs_df.head()

In [None]:
# connect predictions to the team and season using iloc and test_df
all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
# print(all_years)

In [None]:
# add teams to dataframe
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year','True'],ascending=False)
# probs_df.head()

In [None]:
# add top three most likely teams for each season to a dictionary
seasons = probs_df['Year'].unique()
top_three = {}
for season in seasons:
    df = probs_df.loc[probs_df['Year'] == season]
    df = df.sort_values(by='True', ascending=False)
    top_three[season] = {'first_prediction':df.iloc[0,3], 'second_prediction': df.iloc[1,3], 'third_prediction': df.iloc[2,3]}
    

In [None]:
# put dictionary into dataframe
top_three_df = pd.DataFrame(top_three)
top_three_df = top_three_df.transpose()

In [None]:
# sort by year
top_three_df = top_three_df.sort_index()

In [None]:
# adding year as a column
top_three_df.reset_index(inplace=True)
top_three_df = top_three_df.rename(columns={'index':'Year'})

In [None]:
# top_three_df

In [None]:
# bring in actual winners
even_df = top_three_df.merge(winners_df, on='Year')

In [None]:
# dataframe cleanup
even_df.drop(columns=['Prediction','Actual'], inplace=True)

In [None]:
even_df.head()

In [None]:
even_df.rename(columns={'Team':'actual'}, inplace=True)

In [None]:
even_df.head()

# Part 2 - Repeat for odd years

In [None]:
# import dependencies
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [None]:
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)
# Reload the dataset from scratch so Part 2 starts from the raw file.
mlb_df = pd.read_csv('resources/mlb_data.csv')
# mlb_df.head()

In [None]:
# Change the League column to be numeric instead of string: 'AL' -> 0, 'NL' -> 1.
# One vectorized map replaces the row-by-row iterrows()/.at loop. Any value
# other than 'AL' or 'NL' is passed through unchanged, as the loop did.
league_codes = {'AL': 0, 'NL': 1}
mlb_df['Lg'] = mlb_df['Lg'].map(lambda league: league_codes.get(league, league))
    

In [None]:
# league is now a 1 or 0
# mlb_df.head()
# mlb_df.dtypes

# Split into training and testing set
Manually split the data by season. Even years will be used for training, while odd years will be used for testing.

In [None]:
# get X and Y train for making the model
train_df = mlb_df.loc[mlb_df['Year'] % 2 == 0]
train_data = train_df.values
X_train = train_data[:, 3:22]
y_train = train_data[:,23]

In [None]:
# get X and Y for making the model
test_df = mlb_df.loc[mlb_df['Year'] % 2 == 1]
test_data = test_df.values
X_test = test_data[:, 3:22]
y_test = test_data[:,23]

# Scaling and One-hot encoding

In [None]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# label encode the winner column
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

In [None]:
# One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
# y_train_categorical

# Create model

In [None]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [None]:
X_train.shape

In [None]:
from tensorflow.keras.layers import Dense
number_inputs = 19
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(4))

In [None]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [None]:
model.summary()

In [None]:
# # Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
print(X_train_scaled.shape)
print(y_train_categorical.shape)

In [None]:
# Fit the model
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=1000,
    shuffle=False,
    verbose=0
)

In [None]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. Take the index
# of the highest softmax probability from predict() instead, then map the
# integer codes back to the original labels.
encoded_predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test)}")

# Post-process and analyze results

In [None]:
pd.set_option('display.max_rows', None)
predictions_df = pd.DataFrame({'Prediction':prediction_labels, 'Actual':list(y_test)})
# predictions_df.head()

In [None]:
winners_df = predictions_df.loc[predictions_df['Actual']]
# winners_df

In [None]:
# connect predictions to the team and season using iloc and test_df
teams = []
years = []
for i, row in winners_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    teams.append(team)
    years.append(year)

In [None]:
# display predictions by team
winners_df['Team'] = teams
winners_df['Year'] = years
# winners_df

In [None]:
# using probabilities 
probs = model.predict_proba(X_test_scaled)
# probs

In [None]:
# dataframe cleanup
probs_df = pd.DataFrame.from_records(probs)
probs_df[0] = probs_df[0] * 100
probs_df[1] = probs_df[1] * 100
probs_df.rename(columns={0:'False', 1:'True'}, inplace=True)
# probs_df.head()

In [None]:
# connect predictions to the team and season using iloc and test_df
all_teams = []
all_years = []
for i, row in probs_df.iterrows():
    team = test_df.iloc[i,2]
    year = test_df.iloc[i,1]
    all_teams.append(team)
    all_years.append(year)
# print(all_years)

In [None]:
# add teams to dataframe
probs_df['Year'] = all_years
probs_df['Team'] = all_teams
probs_df.sort_values(by=['Year','True'],ascending=False)
# probs_df.head()

In [None]:
# add top three most likely teams for each season to a dictionary
seasons = probs_df['Year'].unique()
top_three = {}
for season in seasons:
    df = probs_df.loc[probs_df['Year'] == season]
    df = df.sort_values(by='True', ascending=False)
    top_three[season] = {'first_prediction':df.iloc[0,3], 'second_prediction': df.iloc[1,3], 'third_prediction': df.iloc[2,3]}
    

In [None]:
# put dictionary into dataframe
top_three_df = pd.DataFrame(top_three)
top_three_df = top_three_df.transpose()

In [None]:
# sort by year
top_three_df = top_three_df.sort_index()

In [None]:
# adding year as a column
top_three_df.reset_index(inplace=True)
top_three_df = top_three_df.rename(columns={'index':'Year'})

In [None]:
# top_three_df

In [None]:
# bring in actual winners
odd_df = top_three_df.merge(winners_df, on='Year')

In [None]:
# dataframe cleanup
odd_df.drop(columns=['Prediction','Actual'], inplace=True)

In [None]:
odd_df.head()

In [None]:
odd_df.rename(columns={'Team':'actual'}, inplace=True)

In [None]:
# odd_df.head()

In [None]:
# even_df.head()

In [None]:
df = pd.concat([even_df, odd_df])
# df.head()

In [None]:
df = df.sort_values(by='Year')
df.reset_index(inplace=True, drop=True)
df.head()

In [None]:
# add full names to df
# Keep each abbreviation next to its full name in a single mapping so the
# pairs cannot drift out of alignment. This also corrects the
# 'Anahiem Angels' typo, which would otherwise be written to the JSON export.
team_names = {
    'ANA': 'Anaheim Angels',
    'ARI': 'Arizona Diamondbacks',
    'ATL': 'Atlanta Braves',
    'BAL': 'Baltimore Orioles',
    'BOS': 'Boston Red Sox',
    'CHC': 'Chicago Cubs',
    'CHW': 'Chicago White Sox',
    'CIN': 'Cincinnati Reds',
    'CLE': 'Cleveland Indians',
    'COL': 'Colorado Rockies',
    'DET': 'Detroit Tigers',
    'FLA': 'Florida Marlins',
    'HOU': 'Houston Astros',
    'KCR': 'Kansas City Royals',
    'LAA': 'Los Angeles Angels',
    'LAD': 'Los Angeles Dodgers',
    'MIA': 'Miami Marlins',
    'MIL': 'Milwaukee Brewers',
    'MIN': 'Minnesota Twins',
    'MON': 'Montreal Expos',
    'NYM': 'New York Mets',
    'NYY': 'New York Yankees',
    'OAK': 'Oakland Athletics',
    'PHI': 'Philadelphia Phillies',
    'PIT': 'Pittsburgh Pirates',
    'SDP': 'San Diego Padres',
    'SEA': 'Seattle Mariners',
    'SFG': 'San Francisco Giants',
    'STL': 'St. Louis Cardinals',
    'TBD': 'Tampa Bay Devil Rays',
    'TBR': 'Tampa Bay Rays',
    'TEX': 'Texas Rangers',
    'TOR': 'Toronto Blue Jays',
    'WSN': 'Washington Nationals',
}
# The parallel lists are kept under their original names.
teams = list(team_names)
full_names = list(team_names.values())
full_names_df = pd.DataFrame({'abbr':teams,'fn':full_names})
# full_names_df

In [None]:
df = df.merge(full_names_df, right_on='abbr', left_on='actual')

In [None]:
df.rename(columns={'fn':'actual_fn'},inplace=True)
df.drop(columns='abbr',inplace=True)

In [None]:
df = df.merge(full_names_df, right_on='abbr', left_on='first_prediction')

In [None]:
df.rename(columns={'fn':'first_prediction_fn'},inplace=True)
df.drop(columns='abbr',inplace=True)

In [None]:
df = df.merge(full_names_df, right_on='abbr', left_on='second_prediction')

In [None]:
df.rename(columns={'fn':'second_prediction_fn'},inplace=True)
df.drop(columns='abbr',inplace=True)

In [None]:
df = df.merge(full_names_df, right_on='abbr', left_on='third_prediction')

In [None]:
df.rename(columns={'fn':'third_prediction_fn'},inplace=True)
df.drop(columns='abbr',inplace=True)

In [None]:
# Put the rows back in season order after the name lookups.
df = df.sort_values('Year')

In [None]:
# Display the final combined frame.
df

In [None]:
# Write the final frame to a JSON file in the current working directory.
df.to_json('nn_predictions.json')