# Neural Network Prediction of Career Batting Average

We will use a neural network to predict each player's career batting average from their other career statistics.

In [35]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [24]:
# Load both source tables from the shared data directory.
master_df = pd.read_csv(filepath_or_buffer="../data/Master.csv")
batting_df = pd.read_csv(filepath_or_buffer="../data/Batting.csv")
# Echo the batting table so its columns can be inspected below.
batting_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
1,addybo01,1871,1,RC1,,25,118.0,30.0,32.0,6.0,...,13.0,8.0,1.0,4.0,0.0,,,,,
2,allisar01,1871,1,CL1,,29,137.0,28.0,40.0,4.0,...,19.0,3.0,1.0,2.0,5.0,,,,,
3,allisdo01,1871,1,WS3,,27,133.0,28.0,44.0,10.0,...,27.0,1.0,1.0,0.0,2.0,,,,,
4,ansonca01,1871,1,RC1,,25,120.0,29.0,39.0,11.0,...,16.0,6.0,2.0,2.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101327,zitoba01,2015,1,OAK,AL,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101328,zobribe01,2015,1,OAK,AL,67,235.0,39.0,63.0,20.0,...,33.0,1.0,1.0,33.0,26.0,2.0,0.0,0.0,3.0,5.0
101329,zobribe01,2015,2,KCA,AL,59,232.0,37.0,66.0,16.0,...,23.0,2.0,3.0,29.0,30.0,1.0,1.0,0.0,2.0,3.0
101330,zuninmi01,2015,1,SEA,AL,112,350.0,28.0,61.0,11.0,...,28.0,0.0,1.0,21.0,132.0,0.0,5.0,8.0,2.0,6.0


In [28]:
# Build one row of career totals per player, then separate target from features.
batting_df2 = (
    batting_df
    .drop(columns=['yearID', 'stint', 'teamID', 'lgID'])  # season/team identifiers are not summed
    .fillna(0)                                            # missing counts are treated as zero
    .groupby(['playerID'])
    .sum()
)

# Target: career batting average = career hits / career at-bats.
# The totals are still indexed by playerID here, so the result is too.
batting_averages = batting_df2['H'] / batting_df2['AB']

# Features: discard the playerID index and the two columns the target is derived from.
batting_df2 = batting_df2.reset_index(drop=True).drop(columns=['H', 'AB'])
batting_df2

Unnamed: 0,G,R,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
1,3298,2174.0,624.0,98.0,755.0,2297.0,240.0,73.0,1402.0,1383.0,293.0,32.0,21.0,121.0,328.0
2,437,102.0,42.0,6.0,13.0,94.0,9.0,8.0,86.0,145.0,3.0,0.0,9.0,6.0,36.0
3,448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,15,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18654,16,3.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0
18655,209,41.0,17.0,2.0,2.0,20.0,2.0,0.0,34.0,50.0,1.0,2.0,18.0,0.0,8.0
18656,266,5.0,2.0,1.0,0.0,7.0,0.0,1.0,9.0,39.0,0.0,0.0,16.0,0.0,3.0
18657,366,167.0,76.0,15.0,30.0,202.0,46.0,0.0,128.0,139.0,0.0,4.0,31.0,0.0,0.0


In [29]:
# A player with zero career at-bats gets NaN from the H/AB division (0/0);
# such players are assigned a batting average of 0.
batting_averages = batting_averages.fillna(value=0)
batting_averages

playerID
aardsda01    0.000000
aaronha01    0.304998
aaronto01    0.228814
aasedo01     0.000000
abadan01     0.095238
               ...   
zupofr01     0.166667
zuvelpa01    0.221996
zuverge01    0.147887
zwilldu01    0.284375
zychto01     0.000000
Length: 18659, dtype: float64

In [32]:
# Min-max scale every feature column (the batting-average target is not scaled).
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so held-out rows influence each column's min/max -- consider fitting on the
# training rows only.
min_max_scaler = preprocessing.MinMaxScaler()
columns = batting_df2.columns
batting_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(batting_df2),
    columns=columns,  # re-attach the original feature names to the scaled values
)
batting_scaled

Unnamed: 0,G,R,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,0.092925,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000770,0.000000,0.000000,0.001953,0.000000,0.000000
1,0.925884,0.947277,0.787879,0.317152,0.990814,1.000000,0.170697,0.217910,0.548084,0.532538,0.425872,0.111498,0.041016,0.945312,0.937143
2,0.122684,0.044444,0.053030,0.019417,0.017060,0.040923,0.006401,0.023881,0.033620,0.055834,0.004360,0.000000,0.017578,0.046875,0.102857
3,0.125772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001155,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.004211,0.000436,0.000000,0.000000,0.000000,0.000000,0.000000,0.002985,0.001564,0.001925,0.000000,0.000000,0.000000,0.000000,0.002857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18654,0.004492,0.001307,0.001263,0.000000,0.000000,0.000000,0.000000,0.000000,0.000782,0.002310,0.000000,0.000000,0.000000,0.000000,0.000000
18655,0.058675,0.017865,0.021465,0.006472,0.002625,0.008707,0.001422,0.000000,0.013292,0.019253,0.001453,0.006969,0.035156,0.000000,0.022857
18656,0.074677,0.002179,0.002525,0.003236,0.000000,0.003047,0.000000,0.002985,0.003518,0.015017,0.000000,0.000000,0.031250,0.000000,0.008571
18657,0.102751,0.072767,0.095960,0.048544,0.039370,0.087941,0.032717,0.000000,0.050039,0.053523,0.000000,0.013937,0.060547,0.000000,0.000000


In [33]:
# Hold out 33% of the players for evaluation; the fixed random_state keeps
# the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    batting_scaled,
    batting_averages,
    test_size=0.33,
    random_state=42,
)

In [49]:
# create model
# Fix TensorFlow's global seed so that weight initialisation -- and therefore
# the reported error -- is repeatable when the notebook is re-run from a fresh
# kernel. 42 matches the seed already used for the train/test split.
tf.random.set_seed(42)

# Fully connected regressor: two ReLU hidden layers (15 and 64 units) feeding a
# single linear output unit that emits the predicted batting average.
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(15, activation='relu', dtype='float64'),
  tf.keras.layers.Dense(64, activation='relu', dtype='float64'),
  tf.keras.layers.Dense(1, activation='linear', dtype='float64')
])

In [50]:
# Optimise with Adam against mean absolute error. The same quantity is also
# registered as a metric, so it is reported twice by fit/evaluate.
model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

In [51]:
# Train on the training split for 50 epochs.
model.fit(x=X_train, y=y_train, epochs=50)
# Score the held-out split; the result is displayed as [loss, mean_absolute_error].
model.evaluate(x=X_test, y=y_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.039998169988393784, 0.03999817371368408]