In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [2]:
print(tf.__version__)

2.0.0


In [3]:
def data_processing(file_name, file_name_D):
    """
    Read the csv files and create game characterstic features and player statistic features
    """
    df = pd.read_csv(file_name)
    df_D = pd.read_csv(file_name_D)
    df.sort_values(by = ['Name', 'Unnamed: 2_level_0 G#']) # for rolling average
    # All box score player stats, except defensive statistics
#     stats = ['Passing AY/A', 'Passing Att', 'Passing Cmp', 'Passing Cmp%',
#             'Passing Int', 'Passing Rate','Passing Yds','FPoints']
    stats = ['Passing Yds','FPoints']
    # Opponent Characteristics
    df, opp_features = get_opp_d(df, df_D)
    # Game Characteristic Indicators, e.g. home/away, opponent, team
#     df, game_features = get_game_char_indicators(df)
    # Player Statistic Features, e.g. Season, last 4 weeks, previous week
    df, player_features = get_player_averages(df, stats)
#     features = game_features + player_features
#     features = player_features
    features = player_features + opp_features
    df = df.fillna(0)
    return df, features


def get_game_char_indicators(df):
    """
    Transform str cols into game categorical variables
    Returns transformed and columns
    """
    df['home'] = 1 * df['Unnamed: 6_level_0 Unnamed: 6_level_1'] == '0'
    oppts = pd.get_dummies(df['Unnamed: 7_level_0 Opp'], prefix='Oppt')
    teams = pd.DataFrame()
    team_list = pd.Series('ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET',\
                           'GB', 'HOU', 'IND', 'JAC', 'KC', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ',\
                           'OAK', 'PHI', 'PIT', 'SD', 'SEA', 'SF', 'STL', 'TB', 'TEN', 'WAS')
    for team in df['Unnamed: 5_level_0 Tm']:
        temp = (team_list == team)
        teams = teams.append(temp, ignore_index=True)
    teams.index = range(len(df['Unnamed: 5_level_0 Tm']))
    teams.columns = list(team_list)
    df = pd.concat([df, oppts, teams], axis=1)
    return df, ['home'] + list(oppts.columns) + list(team_list)

def get_opp_d(df, df_D):

    d_stats = ['Passing Yds', 'Passing 1stD','Sc%','Unnamed: 26_level_0 TO%']
    df_D['Season']= df_D['Season'].astype(str)
#     for stat in df[d_stats + ['FPoints']]:
#         opp = abbrev_dict[df['Unnamed: 7_level_0 Opp']]
#         df['D_'+stat] = df_D.loc[df_D['Tm'] == opp][stat]
    for stat in d_stats:
#         print(stat)
        df[stat] = df.apply(lambda row: get_team_stat(row,stat,df_D), axis=1)
    return df, d_stats
def get_team_stat(row,stat,df_D):
    abbrev_dict = {'ARI':'Arizona Cardinals', 'ATL':'Atlanta Falcons', 'BAL':'Baltimore Ravens', 'BUF':'Buffalo Bills', 
                          'CAR':'Carolina Panthers', 'CHI':'Chicago Bears', 'CIN':'Cincinnati Bengals', 
                          'CLE':'Cleveland Browns', 'DAL':'Dallas Cowboys', 'DEN':'Denver Broncos', 'DET':'Detroit Lions', 
                          'GB':'Green Bay Packers','GNB':'Green Bay Packers', 'HOU': 'Houston Texans', 'IND':'Indianapolis Colts', 
                          'JAC':'Jacksonville Jaguars', 'JAX':'Jacksonville Jaguars','KAN':'Kansas City Chiefs','KC':'Kansas City Chiefs','LAC':'Los Angeles Chargers',
                          'LAR':'Los Angeles Rams', 'MIA':'Miami Dolphins', 'MIN':'Minnesota Vikings','NE':'New England Patriots','NWE':'New England Patriots',
                          'NO':'New Orleans Saints','NOR':'New Orleans Saints', 'NYG':'New York Giants', 'NYJ':'New York Jets','OAK':'Oakland Raiders', 
                          'PHI':'Philadelphia Eagles', 'PIT':'Pittsburgh Steelers', 'SD':'San Diego Chargers', 'SDG':'San Diego Chargers', 'SEA':'Seattle Seahawks', 
                          'SF':'San Francisco 49ers', 'SFO':'San Francisco 49ers','STL':'St. Louis Rams', 'TB':'Tampa Bay Buccaneers',
                           'TEN':'Tennessee Titans', 'TAM':'Tampa Bay Buccaneers',
                          'WAS':'Washington Redskins'}
#     print(row)
#     print(df_D['Season'])
    df_D.index=df_D['Tm']+df_D['Season']
#     print(df_D)
    opp = abbrev_dict[row['Unnamed: 7_level_0 Opp']]+str(row['Season'])
#     print(type(df_D.loc[df_D['Tm']==opp][stat]))
#     print(opp,stat)
    return df_D.at[opp,stat]

def rolling_average(df, window):
    return df.rolling(min_periods=1, window=window).mean().shift(1)

def get_player_averages(df, stats):
    """
    Estimate player averages for all stats and FanDuel point histories,
    for season-to-date, last 4 weeeks, and previous week
    """
    feature_names = []
    for stat in df[stats + ['FPoints']]:
        df['season_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 16))
        df['recent_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 4))
        df['prev_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 1))
        feature_names = feature_names + [time + "_" + stat for time in ['season', 'recent', 'prev']]
    return df, feature_names


path = "data/"
train, features = data_processing(path + 'QB_all.csv', path + 'teams_all.csv')
# test, features2 = data_processing(path + 'gamelog_QB_2018.csv', path + 'teams_2018.csv')
# if (features != features2):
#     print("Debug error about feature inconsistency")
#     exit()

In [4]:
train.to_csv('train.csv')

In [5]:
features.append('FPoints')
# features.remove('home')
features.remove('season_FPoints')
features.remove('recent_FPoints')
features.remove('prev_FPoints')
print(features)
dataset = train[features]
# dataset
train_dataset = dataset.sample(frac=0.8,random_state=0)
# print(train_dataset)
test_dataset = dataset.drop(train_dataset.index)
train_stats = train_dataset.describe(include='all')
train_stats.pop('FPoints')
train_stats = train_stats.transpose()
train_stats

['season_Passing Yds', 'recent_Passing Yds', 'prev_Passing Yds', 'season_Passing Rate', 'recent_Passing Rate', 'prev_Passing Rate', 'season_FPoints', 'recent_FPoints', 'prev_FPoints', 'Passing Yds', 'Passing 1stD', 'Sc%', 'Unnamed: 26_level_0 TO%', 'FPoints']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season_Passing Yds,4705.0,3577.187646,597.565648,0.0,3510.0,3671.75,3816.875,4681.0
recent_Passing Yds,4705.0,3590.444757,614.186345,0.0,3489.0,3674.5,3857.25,4681.0
prev_Passing Yds,4705.0,3596.502657,687.417622,0.0,3382.0,3688.0,3949.0,4796.0
season_Passing Rate,4705.0,83.62387,20.715257,0.0,77.066667,86.875,95.55625,158.3
recent_Passing Rate,4705.0,84.015399,23.212702,0.0,74.35,86.825,98.55,158.3
prev_Passing Rate,4705.0,84.324336,32.498469,0.0,67.1,86.1,105.5,158.3
season_FPoints,4705.0,14.193563,4.887679,-1.52,11.6525,14.5225,17.385,30.68
recent_FPoints,4705.0,14.280995,5.692056,-1.52,10.97,14.54,17.945,31.335
prev_FPoints,4705.0,14.275439,8.331501,-6.6,8.5,14.2,19.64,49.32
Passing Yds,4705.0,3683.977683,393.646384,2459.0,3423.0,3697.0,3947.0,4796.0


In [6]:
train_labels = train_dataset.pop('FPoints')
test_labels = test_dataset.pop('FPoints')
train_labels

840     19.28
3161     0.00
3065    12.24
3067    16.92
3924    15.26
        ...  
3436    11.50
4256     0.00
5692    17.16
5612     7.34
5308    13.50
Name: FPoints, Length: 4705, dtype: float64

In [93]:
def build_model():
    model = keras.Sequential([
        layers.Dense(16, activation='relu', input_shape=[len(train_dataset.keys())]),
        layers.Dense(8, activation='relu'),
        layers.Dense(1)
    ])


#     model=keras.Sequential()
# #     model.add(layers.Dense(64,input_shape=[len(train_dataset.keys())]))
#     model.add(layers.Dense(16,input_shape=[len(train_dataset.keys())]))
#     model.add(layers.LeakyReLU(alpha=1))
# #     model.add(layers.Dense(64))
# #     model.add(layers.LeakyReLU(alpha=0.6))
#     model.add(layers.Dense(1))

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(loss='mse',
                    optimizer=optimizer,
                    metrics=['mae', 'mse'])
    return model
model = build_model()

In [94]:
def norm(x):
  return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [95]:
example_batch = normed_train_data[:10]
example_batch

Unnamed: 0,season_Passing Yds,recent_Passing Yds,prev_Passing Yds,season_Passing Rate,recent_Passing Rate,prev_Passing Rate,season_FPoints,recent_FPoints,prev_FPoints,Passing Yds,Passing 1stD,Sc%,Unnamed: 26_level_0 TO%
840,0.042777,0.020035,-0.312041,-0.103734,-0.10944,-0.646933,-0.53984,-0.478912,-1.362952,-1.183747,-1.056429,-1.733884,2.831649
3161,0.169646,0.201983,0.522968,-0.384324,-0.33884,-1.099262,-0.570529,-0.387557,-0.517967,-0.007564,-0.660038,-0.508162,-0.678031
3065,0.658191,0.618795,0.789182,0.726572,0.631534,1.580864,0.60999,0.508429,0.679897,0.706274,-0.015902,-1.29182,1.093209
3067,0.537423,0.551633,-0.073467,-0.286771,-0.178368,-1.803911,0.405735,0.439912,0.221396,2.418471,2.461544,-0.246942,0.962007
3924,0.702328,0.596407,0.118556,0.48098,0.565837,0.75621,0.822003,1.691657,1.265626,-0.12188,-0.858234,0.55681,-2.219666
5176,-0.0296,0.136449,-0.55207,-0.247348,0.228737,0.140796,-0.766941,-0.116126,0.298213,5.7e-05,-0.015902,0.456341,-1.793256
842,-0.01677,0.061553,0.258209,0.078499,-0.192369,0.383885,-0.053924,-0.189034,0.456648,0.617362,0.727332,-0.046004,-0.546828
1789,0.025311,0.149068,0.620434,0.300859,0.23843,0.156182,0.488256,0.378423,1.030374,2.588674,2.758837,-0.267036,1.55242
3768,0.347869,0.57809,0.38477,0.555201,0.40321,-0.074598,1.285157,1.24015,-0.722011,0.96793,0.132745,-0.427787,-0.087617
2030,0.15264,0.163721,0.146195,-0.872973,-0.897801,-0.726937,-1.538064,-1.121914,-0.923656,-0.972898,-1.155527,0.396059,-1.137241


In [96]:
example_result = model.predict(example_batch)
example_result



array([[ 0.1662177 ],
       [ 0.07789403],
       [ 0.43464684],
       [-0.33066067],
       [ 1.0541307 ],
       [ 0.01641579],
       [-0.44709554],
       [-1.0062178 ],
       [ 0.79395205],
       [-0.09718367]], dtype=float32)

In [None]:
# Display training progress by printing a single dot for each completed epoch
class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')

EPOCHS = 100

print(normed_train_data.shape, train_labels.shape)

history = model.fit(
  normed_train_data, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[PrintDot()])

(4705, 13) (4705,)


In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
def plot_history(history):
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [points]')
  plt.plot(hist['epoch'], hist['mae'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mae'],
           label = 'Val Error')
  plt.ylim([5,10])
  plt.legend()

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [points^2]')
  plt.plot(hist['epoch'], hist['mse'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mse'],
           label = 'Val Error')
  plt.ylim([50,100])
  plt.legend()
  plt.show()
plot_history(history)

In [None]:
model = build_model()

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(normed_train_data, train_labels, epochs=EPOCHS,
                    validation_split = 0.2, verbose=0, callbacks=[early_stop, PrintDot()])

plot_history(history)

In [None]:
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print("Testing set Mean Abs Error: {:5.2f} points".format(mae))

In [None]:
test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [points]')
plt.ylabel('Predictions [points]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])

In [None]:
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [points]")
_ = plt.ylabel("Count")