In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from sklearn.linear_model import Ridge, BayesianRidge, ElasticNet, RidgeCV, ElasticNetCV
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np

In [3]:
def data_processing(file_name, file_name_D):
    """
    Read the csv files and create game characterstic features and player statistic features
    """
    df = pd.read_csv(file_name)
    df_D = pd.read_csv(file_name_D)
    df.sort_values(by = ['Name', 'Unnamed: 2_level_0 G#']) # for rolling average
    # All box score player stats, except defensive statistics
    stats = ['Passing AY/A', 'Passing Att', 'Passing Cmp', 'Passing Cmp%',
            'Passing Int', 'Passing Rate', 'Passing Sk', 'Passing TD',
            'Passing Y/A', 'Passing Yds', 'Passing Yds.1', 'Rushing Att',
            'Rushing TD', 'Rushing Y/A', 'Rushing Yds','FPoints']
    # Opponent Characteristics
    df, opp_features = get_opp_d(df, df_D)
    # Game Characteristic Indicators, e.g. home/away, opponent, team
#     df, game_features = get_game_char_indicators(df)
    # Player Statistic Features, e.g. Season, last 4 weeks, previous week
    df, player_features = get_player_averages(df, stats)
#     features = game_features + player_features
#     features = player_features
    features = player_features + opp_features
    df = df.fillna(0)
    return df, features


def get_game_char_indicators(df):
    """
    Transform str cols into game categorical variables
    Returns transformed and columns
    """
    df['home'] = 1 * df['Unnamed: 6_level_0 Unnamed: 6_level_1'] == '0'
    oppts = pd.get_dummies(df['Unnamed: 7_level_0 Opp'], prefix='Oppt')
    teams = pd.DataFrame()
    team_list = pd.Series('ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET',\
                           'GB', 'HOU', 'IND', 'JAC', 'KC', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ',\
                           'OAK', 'PHI', 'PIT', 'SD', 'SEA', 'SF', 'STL', 'TB', 'TEN', 'WAS')
    for team in df['Unnamed: 5_level_0 Tm']:
        temp = (team_list == team)
        teams = teams.append(temp, ignore_index=True)
    teams.index = range(len(df['Unnamed: 5_level_0 Tm']))
    teams.columns = list(team_list)
    df = pd.concat([df, oppts, teams], axis=1)
    return df, ['home'] + list(oppts.columns) + list(team_list)

def get_opp_d(df, df_D):

    d_stats = ['PF', 'Yds','Tot Yds & TO Ply','Tot Yds & TO Y/P','Tot Yds & TO TO','FL','1stD','Passing Cmp','Passing Att',
             'Passing Yds','Passing TD','Passing Int','Passing NY/A','Passing 1stD','Penalties Pen','Penalties Yds',
             'Penalties 1stPy','Sc%','Unnamed: 26_level_0 TO%','EXP']
    df_D['Season']= df_D['Season'].astype(str)
#     for stat in df[d_stats + ['FPoints']]:
#         opp = abbrev_dict[df['Unnamed: 7_level_0 Opp']]
#         df['D_'+stat] = df_D.loc[df_D['Tm'] == opp][stat]
    for stat in d_stats:
#         print(stat)
        df[stat] = df.apply(lambda row: get_team_stat(row,stat,df_D), axis=1)
    return df, d_stats
def get_team_stat(row,stat,df_D):
    abbrev_dict = {'ARI':'Arizona Cardinals', 'ATL':'Atlanta Falcons', 'BAL':'Baltimore Ravens', 'BUF':'Buffalo Bills', 
                          'CAR':'Carolina Panthers', 'CHI':'Chicago Bears', 'CIN':'Cincinnati Bengals', 
                          'CLE':'Cleveland Browns', 'DAL':'Dallas Cowboys', 'DEN':'Denver Broncos', 'DET':'Detroit Lions', 
                          'GB':'Green Bay Packers','GNB':'Green Bay Packers', 'HOU': 'Houston Texans', 'IND':'Indianapolis Colts', 
                          'JAC':'Jacksonville Jaguars', 'JAX':'Jacksonville Jaguars','KAN':'Kansas City Chiefs','KC':'Kansas City Chiefs','LAC':'Los Angeles Chargers',
                          'LAR':'Los Angeles Rams', 'MIA':'Miami Dolphins', 'MIN':'Minnesota Vikings','NE':'New England Patriots','NWE':'New England Patriots',
                          'NO':'New Orleans Saints','NOR':'New Orleans Saints', 'NYG':'New York Giants', 'NYJ':'New York Jets','OAK':'Oakland Raiders', 
                          'PHI':'Philadelphia Eagles', 'PIT':'Pittsburgh Steelers', 'SD':'San Diego Chargers', 'SDG':'San Diego Chargers', 'SEA':'Seattle Seahawks', 
                          'SF':'San Francisco 49ers', 'SFO':'San Francisco 49ers','STL':'St. Louis Rams', 'TB':'Tampa Bay Buccaneers',
                           'TEN':'Tennessee Titans', 'TAM':'Tampa Bay Buccaneers',
                          'WAS':'Washington Redskins'}
#     print(row)
#     print(df_D['Season'])
    df_D.index=df_D['Tm']+df_D['Season']
#     print(df_D)
    opp = abbrev_dict[row['Unnamed: 7_level_0 Opp']]+str(row['Season'])
#     print(type(df_D.loc[df_D['Tm']==opp][stat]))
#     print(opp,stat)
    return df_D.at[opp,stat]

def rolling_average(df, window):
    return df.rolling(min_periods=1, window=window).mean().shift(1)

def get_player_averages(df, stats):
    """
    Estimate player averages for all stats and FanDuel point histories,
    for season-to-date, last 4 weeeks, and previous week
    """
    feature_names = []
    for stat in df[stats + ['FPoints']]:
        df['season_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 16))
        df['recent_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 4))
        df['prev_{}'.format(stat)] = df.groupby('Name')[stat].apply(lambda x: rolling_average(x, 1))
        feature_names = feature_names + [time + "_" + stat for time in ['season', 'recent', 'prev']]
    return df, feature_names


path = "data/"
train, features = data_processing(path + 'QB_all.csv', path + 'teams_all.csv')
# test, features2 = data_processing(path + 'gamelog_QB_2018.csv', path + 'teams_2018.csv')
# if (features != features2):
#     print("Debug error about feature inconsistency")
#     exit()



In [4]:
test, features2 = data_processing(path + 'gamelog_QB_2018.csv', path + 'teams_2018.csv')

In [6]:
positions = sorted(train['Position'].unique())
estimators = ["Ridge",
              "ElasticNet",
              "RandomForestRegressor"
              # "GradientBoostingRegressor"
              # "SVM"
              ]
types = ['train', 'cv', 'test']
# Dataframe index, e.g. Ridge_train
rmse_names = [x + '_' + y for y in types for x in estimators]
df_rmse = pd.DataFrame([[0.0] for j in range(len(rmse_names))], 
    index = rmse_names)

In [11]:
for position in positions:
    # Iterate through all positions
    print ('Learning for Position %s ...' % position)
    df_pos_train = train.loc[train['Position'] == position,]
    df_pos_test = test.loc[test['Position'] == position,]

    for i in range(len(estimators)):
        est = estimators[i]

        if(est == "GradientBoostingRegressor"):
            n_estimators = [50]
            learning_rate = [0.1]
            param_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
            grid_search = GridSearchCV(GradientBoostingRegressor(max_depth=3), param_grid, cv=5)
            grid_search.fit(df_pos_train[features], df_pos_train['FPoints'])

        elif(est == "RandomForestRegressor"):
            n_estimators = [50]
            param_grid = {'n_estimators': n_estimators}
            grid_search = GridSearchCV(RandomForestRegressor(max_depth=3), param_grid, cv=5)
            grid_search.fit(df_pos_train[features], df_pos_train['FPoints'])

        elif(est == "ElasticNet"):
            grid_search = ElasticNetCV().fit(df_pos_train[features], df_pos_train['FPoints'])

        elif(est == "BayesianRidge"):
            alpha_1 = [1e-6, 1e-5, 1e-7]
            alpha_2 = [1e-6, 1e-5, 1e-7]
            lambda_1 = [1e-6, 1e-5, 1e-7]
            lambda_2 = [1e-6, 1e-5, 1e-7]
            param_grid = {'alpha_1': alpha_1, 'alpha_2':alpha_2, 'lambda_1':lambda_1, 'lambda_2':lambda_2}
            grid_search = GridSearchCV(BayesianRidge(), param_grid, cv=5)
            grid_search.fit(df_pos_train[features], df_pos_train[target])

        elif(est == "Ridge"):
            grid_search = RidgeCV().fit(df_pos_train[features], df_pos_train['FPoints'])

        elif(est == "SVM"):
            C = [50]
            gamma = [0.3]
            param_grid = {'C': C, 'gamma': gamma}
            grid_search = GridSearchCV(SVC(), param_grid, cv=5)
            grid_search.fit(df_pos_train[features], df_pos_train['FPoints'])

        else:
            print(est)
            print("Cannot find the algorithm")
            exit()

        train_rmse = np.sqrt(np.mean( (df_pos_train['FPoints'] - \
                    grid_search.predict(df_pos_train[features]))**2.0 ))
        test_rmse = np.sqrt(np.mean( (df_pos_test['FPoints'] - \
                    grid_search.predict(df_pos_test[features]))**2.0 ))
        # Deprecating "mean_squared_error". Use "neg_mean_squared_error" instead.
        cv_rmse = np.sqrt(np.abs( cross_val_score(grid_search, train[features], train['FPoints'],\
            cv = 5, scoring = 'neg_mean_squared_error').mean() ))

        # Given the variable name in a string, get the variable value and import into dataframe
        for val in types:
            df_rmse.loc[estimators[i] + "_" + val, position] = eval(val + '_rmse')

Learning for Position QB ...




In [14]:
""" save rmse into csv """

df_rmse.to_csv('rmse.csv', header = True, index=True)

"""
MSE of FD_2016_Projections.csv (Fantasydata.com)
"""

test['diff'] = (test['proj'] - test['FPoints']) ** 2.0
FantasyData_rmse = (test.groupby(['Position'])['diff'].mean()) ** 0.5
FantasyData_rmse.to_csv('FantasyData_rmse.csv', header = True, index = True)

print("Program finished normally")

KeyError: 'proj'