In [1]:
# import dependencies here
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# pull in csv file and read into a pandas data frame and clean data frame
players_df = pd.read_csv("Resources/player_data.csv")

clean_df = players_df.loc[(players_df["stolen_bases"] != 'no') & (players_df["batting_average"] != 'no') & (players_df["hall_of_fame"] != "EXEC") & (players_df["hall_of_fame"] != "EXEC/PIO") & (players_df["hall_of_fame"] != "MGR") & (players_df["hall_of_fame"] != "PIO") & (players_df["hall_of_fame"] != "UMP") & (players_df["hits"] != "0.0")]

clean_df

Unnamed: 0,player_id,name,at_bats,runs,hits,runs_batted_in,bases_on_balls,batting_average,doubles,triples,home_runs,stolen_bases,hall_of_fame
1,aaronha01,Henry Aaron,12364,2174.0,3771.0,2297,1402.0,0.305,624,98,755.0,240,no
2,aaronto01,Tommie Aaron,944,102.0,216.0,94,86.0,0.229,42,6,13.0,9,no
4,abadan01,Andy Abad,21,1.0,2.0,0,4.0,0.095,0,0,0.0,0,no
5,abadfe01,Fernando Abad,9,0.0,1.0,0,0.0,0.111,0,0,0.0,0,no
6,abbated01,Ed Abbaticchio,3044,355.0,772.0,324,289.0,0.254,99,43,11.0,142,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18800,zupcibo01,Bob Zupcic,795,99.0,199.0,80,57.0,0.25,47,4,7.0,7,no
18801,zupofr01,Frank Zupo,18,3.0,3.0,0,2.0,0.167,1,0,0.0,0,no
18802,zuvelpa01,Paul Zuvella,491,41.0,109.0,20,34.0,0.222,17,2,2.0,2,no
18803,zuverge01,George Zuverink,142.0,5.0,21.0,7.0,9.0,0.148,2.0,1.0,0.0,0.0,no


In [3]:
# convert HOF column to yes/no format
replaced_df = clean_df.replace(to_replace=["P", "C", "1B", "2B", "3B", "SS", "LF", "CF", "RF", "DH"], value=["yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes"])
replaced_df

Unnamed: 0,player_id,name,at_bats,runs,hits,runs_batted_in,bases_on_balls,batting_average,doubles,triples,home_runs,stolen_bases,hall_of_fame
1,aaronha01,Henry Aaron,12364,2174.0,3771.0,2297,1402.0,0.305,624,98,755.0,240,no
2,aaronto01,Tommie Aaron,944,102.0,216.0,94,86.0,0.229,42,6,13.0,9,no
4,abadan01,Andy Abad,21,1.0,2.0,0,4.0,0.095,0,0,0.0,0,no
5,abadfe01,Fernando Abad,9,0.0,1.0,0,0.0,0.111,0,0,0.0,0,no
6,abbated01,Ed Abbaticchio,3044,355.0,772.0,324,289.0,0.254,99,43,11.0,142,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18800,zupcibo01,Bob Zupcic,795,99.0,199.0,80,57.0,0.25,47,4,7.0,7,no
18801,zupofr01,Frank Zupo,18,3.0,3.0,0,2.0,0.167,1,0,0.0,0,no
18802,zuvelpa01,Paul Zuvella,491,41.0,109.0,20,34.0,0.222,17,2,2.0,2,no
18803,zuverge01,George Zuverink,142.0,5.0,21.0,7.0,9.0,0.148,2.0,1.0,0.0,0.0,no


In [4]:
# Store x and y variables and print the shapes
X = replaced_df[["at_bats", "runs", "hits", "runs_batted_in",  "bases_on_balls", "doubles", "triples", "home_runs", "stolen_bases"]]

y = replaced_df["hall_of_fame"]
print(X.shape, y.shape)

(13978, 9) (13978,)


In [5]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# scale the x/y variables
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#y_scaler = StandardScaler().fit(y_train)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)

In [7]:
# label encode the y variable 
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [8]:
# create a sequential model
model = Sequential()

In [9]:
# add the first layer of neural network
model.add(Dense(units=4, activation='relu', input_dim=9))

In [10]:
# add the output layer
model.add(Dense(units=2, activation='softmax'))

In [11]:
# compile the model
recall = tf.keras.metrics.Recall()
precision = tf.keras.metrics.Precision()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', recall, precision])

In [12]:
# get the model summary before training
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 40        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 10        
Total params: 50
Trainable params: 50
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 10483 samples
Epoch 1/100
10483/10483 - 7s - loss: 0.3776 - accuracy: 0.9658 - recall: 0.9655 - precision: 0.9657
Epoch 2/100
10483/10483 - 2s - loss: 0.1193 - accuracy: 0.9829 - recall: 0.9829 - precision: 0.9829
Epoch 3/100
10483/10483 - 1s - loss: 0.0764 - accuracy: 0.9829 - recall: 0.9829 - precision: 0.9829
Epoch 4/100
10483/10483 - 2s - loss: 0.0648 - accuracy: 0.9830 - recall: 0.9830 - precision: 0.9830
Epoch 5/100
10483/10483 - 2s - loss: 0.0611 - accuracy: 0.9829 - recall: 0.9829 - precision: 0.9829
Epoch 6/100
10483/10483 - 2s - loss: 0.0596 - accuracy: 0.9835 - recall: 0.9835 - precision: 0.9835
Epoch 7/100
10483/10483 - 2s - loss: 0.0588 - accuracy: 0.9846 - recall: 0.9846 - precision: 0.9846
Epoch 8/100
10483/10483 - 2s - loss: 0.0584 - accuracy: 0.9843 - recall: 0.9843 - precision: 0.9843
Epoch 9/100
10483/10483 - 1s - loss: 0.0581 - accuracy: 0.9845 - recall: 0.9845 - precision: 0.9845
Epoch 10/100
10483/10483 - 1s - loss: 0.0578 - accuracy: 0.9849 - recall: 0.9

<tensorflow.python.keras.callbacks.History at 0x7fa96645c048>

In [14]:
# evaluate the model 
model_loss, model_accuracy, model_recall, model_precision = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - LOSS: {round(model_loss, 3)}, ACCURACY: {round(model_accuracy, 3)}, RECALL: {round(model_recall, 3)}, PRECISION: {round(model_precision, 3)}")

3495/1 - 1s - loss: 0.3872 - accuracy: 0.9854 - recall: 0.9854 - precision: 0.9854
Normal Neural Network - LOSS: 0.054, ACCURACY: 0.9850000143051147, RECALL: 0.9850000143051147, PRECISION: 0.9850000143051147


In [21]:
# test the model using a sample player
player_predict = model.predict(replaced_df.loc[replaced_df["name"] == "Andy Abad", ["at_bats", "runs", "hits", "runs_batted_in",  "bases_on_balls", "doubles", "triples", "home_runs", "stolen_bases"]])
player_predict



array([[5.6051066e-16, 1.0000000e+00]], dtype=float32)