In [39]:
# import dependencies here
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [40]:
# pull in csv file and read into a pandas data frame
players_df = pd.read_csv("Resources/player_data.csv")
players_df

Unnamed: 0,player_id,name,at_bats,runs,hits,runs_batted_in,bases_on_balls,batting_average,doubles,triples,home_runs,stolen_bases,hall_of_fame
0,abreubr01,Bryan Abreu,no,3,5,no,10,no,no,no,0,1,no
1,adamswi02,Willie Adams,no,92,149,no,55,no,no,no,20,13,no
2,adenhni01,Nick Adenhart,no,12,25,no,16,no,no,no,0,0,no
3,adkinst01,Steve Adkins,no,18,19,no,29,no,no,no,4,3,no
4,ahearpa01,Pat Ahearne,no,13,20,no,5,no,no,no,2,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18801,zimmejo01,Jordan Zimmerman,0,0,0,0,0,no,0,0,0,0,no
18802,zimmeky01,Kyle Zimmer,0,0,0,0,0,no,0,0,0,0,no
18803,zinsebi01,Bill Zinser,0,0,0,0,0,no,0,0,0,0,no
18804,zumayjo01,Joel Zumaya,0,0,0,0,0,no,0,0,0,0,no


In [41]:
# clean up the data frame set minimum at bats to 100
clean_df = players_df.loc[(players_df["stolen_bases"] != 'no') & (players_df["batting_average"] != 'no') & (players_df["hall_of_fame"] != "EXEC") & (players_df["hall_of_fame"] != "EXEC/PIO") & (players_df["hall_of_fame"] != "MGR") & (players_df["hall_of_fame"] != "PIO") & (players_df["hall_of_fame"] != "UMP") & (players_df["hits"] != "0.0")]

dtype_change_df = clean_df.astype({"at_bats": "float64"})

new_df = dtype_change_df.loc[dtype_change_df["at_bats"] > 100]
new_df

Unnamed: 0,player_id,name,at_bats,runs,hits,runs_batted_in,bases_on_balls,batting_average,doubles,triples,home_runs,stolen_bases,hall_of_fame
750,rosepe01,Pete Rose,14053.0,2165,4256,1314,1566,0.303,746,135,160,198,no
751,aaronha01,Henry Aaron,12364.0,2174,3771,2297,1402,0.305,624,98,755,240,RF
752,yastrca01,Carl Yastrzemski,11988.0,1816,3419,1844,1845,0.285,646,59,452,168,LF
753,ripkeca01,Cal Ripken Jr.,11551.0,1647,3184,1695,1129,0.276,603,44,431,36,SS
754,cobbty01,Ty Cobb,11440.0,2245,4189,1944,1249,0.366,724,295,117,897,CF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9567,raymocl01,Claude Raymond,101.0,6,11,8,3,0.109,4,0,0,0,no
9568,suthele01,Leo Sutherland,101.0,15,25,5,4,0.248,3,0,0,6,no
9569,tayloch02,Chuck Taylor,101.0,5,16,10,3,0.158,4,1,0,0,no
9570,tovarwi01,Wilfredo Tovar,101.0,6,19,7,6,0.188,5,0,0,1,no


In [59]:
# convert HOF column to yes/no format
replaced_df = new_df.replace(to_replace=["P", "C", "1B", "2B", "3B", "SS", "LF", "CF", "RF", "DH"], value=["yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes"])
replaced_df.to_csv("Resources/player_data_update.csv")
replaced_df

Unnamed: 0,player_id,name,at_bats,runs,hits,runs_batted_in,bases_on_balls,batting_average,doubles,triples,home_runs,stolen_bases,hall_of_fame
750,rosepe01,Pete Rose,14053.0,2165,4256,1314,1566,0.303,746,135,160,198,no
751,aaronha01,Henry Aaron,12364.0,2174,3771,2297,1402,0.305,624,98,755,240,yes
752,yastrca01,Carl Yastrzemski,11988.0,1816,3419,1844,1845,0.285,646,59,452,168,yes
753,ripkeca01,Cal Ripken Jr.,11551.0,1647,3184,1695,1129,0.276,603,44,431,36,yes
754,cobbty01,Ty Cobb,11440.0,2245,4189,1944,1249,0.366,724,295,117,897,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9567,raymocl01,Claude Raymond,101.0,6,11,8,3,0.109,4,0,0,0,no
9568,suthele01,Leo Sutherland,101.0,15,25,5,4,0.248,3,0,0,6,no
9569,tayloch02,Chuck Taylor,101.0,5,16,10,3,0.158,4,1,0,0,no
9570,tovarwi01,Wilfredo Tovar,101.0,6,19,7,6,0.188,5,0,0,1,no


In [43]:
# Store x and y variables and print the shapes
X = replaced_df[["at_bats", "runs", "hits", "runs_batted_in",  "bases_on_balls", "doubles", "triples", "home_runs", "stolen_bases"]]

y = replaced_df["hall_of_fame"]
print(X.shape, y.shape)

(8661, 9) (8661,)


In [44]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [45]:
# scale the x/y variables
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#y_scaler = StandardScaler().fit(y_train)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)

In [46]:
# label encode the y variable 
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [47]:
# create a sequential model
model = Sequential()

In [48]:
# add the first layer of neural network
model.add(Dense(units=4, activation='relu', input_dim=9))

In [49]:
# add the output layer
model.add(Dense(units=2, activation='softmax'))

In [50]:
# compile the model
recall = tf.keras.metrics.Recall()
precision = tf.keras.metrics.Precision()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', recall, precision])

In [51]:
# get the model summary before training
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 40        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 50
Trainable params: 50
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 6495 samples
Epoch 1/100
6495/6495 - 20s - loss: 0.4002 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 2/100
6495/6495 - 1s - loss: 0.2052 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 3/100
6495/6495 - 1s - loss: 0.1429 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 4/100
6495/6495 - 1s - loss: 0.1161 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 5/100
6495/6495 - 1s - loss: 0.1042 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 6/100
6495/6495 - 1s - loss: 0.0986 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 7/100
6495/6495 - 1s - loss: 0.0955 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 8/100
6495/6495 - 1s - loss: 0.0941 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 9/100
6495/6495 - 1s - loss: 0.0934 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9726
Epoch 10/100
6495/6495 - 1s - loss: 0.0929 - accuracy: 0.9726 - recall: 0.9726 - precision: 0.9

<tensorflow.python.keras.callbacks.History at 0x7fb3d08a3320>

In [53]:
# evaluate the model 
model_loss, model_accuracy, model_recall, model_precision = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - LOSS: {round(model_loss, 3)}, ACCURACY: {round(model_accuracy, 3)}, RECALL: {round(model_recall, 3)}, PRECISION: {round(model_precision, 3)}")

2166/1 - 2s - loss: 0.0524 - accuracy: 0.9783 - recall: 0.9783 - precision: 0.9783
Normal Neural Network - LOSS: 0.078, ACCURACY: 0.9779999852180481, RECALL: 0.9779999852180481, PRECISION: 0.9779999852180481


In [57]:
# test the model using a sample player
player_predict = model.predict(replaced_df.loc[replaced_df["name"] == "Babe Rutz", ["at_bats", "runs", "hits", "runs_batted_in",  "bases_on_balls", "doubles", "triples", "home_runs", "stolen_bases"]])
player_predict



ValueError: Structure is a scalar but len(flat_sequence) == 0 > 1