In [1]:
# import here
import pandas as pd
import numpy as np
import os.path
from sqlalchemy import create_engine
from NFL_RefMaps import TeamDictionary
from NFL_Metrics import SkillPoints
from scrapers import PFR_Gamelinks,PFR_Gamepage

# open one connection per MySQL schema used by this notebook
print("Setting up database connections . . .")
# NOTE(review): both URLs use the root account with an empty password on localhost
kaggle_engine = create_engine('mysql+pymysql://root:@localhost:3306/kaggle')
kaggle_conn = kaggle_engine.connect()
nfldb_engine = create_engine('mysql+pymysql://root:@localhost:3306/main_stats')
# despite its name, main_engine holds the connection returned by nfldb_engine.connect()
main_engine = nfldb_engine.connect()
file = "D:\\NFLDB\\game_info.csv"

# load the raw game-info csv into a DataFrame
print("Reading db csv file . . .")
gameinfo = pd.read_csv(file)

print("Dropping unused columns . . .")
# remove the playoff rows
indexNames = gameinfo.loc[gameinfo['schedule_playoff'] == True].index
gameinfo.drop(indexNames, inplace=True)

# remove every season before 2009
indexNames = gameinfo.loc[gameinfo['schedule_season'] < 2009].index
gameinfo.drop(indexNames, inplace=True)

# remove the unused columns in a single call
gameinfo.drop(
    columns=[
        'schedule_playoff',
        'stadium',
        'stadium_neutral',
        'weather_temperature',
        'weather_wind_mph',
        'weather_humidity',
        'weather_detail',
    ],
    inplace=True,
)

def get_home_favorite(row):
    """Classify the home team's betting role for one game row.

    The home team's name (``row['team_home']``) is translated to an
    abbreviation through ``TeamDictionary().nfl_api`` and compared with
    ``row['team_favorite_id']``.

    Returns:
        1 if the home team is the favorite, 0 if the favorite id is the
        literal ``'PICK'``, and -1 in every other case.
    """
    home_code = TeamDictionary().nfl_api[row['team_home']]
    favorite_id = row['team_favorite_id']
    if home_code == favorite_id:
        return 1
    return 0 if favorite_id == 'PICK' else -1

def get_spread_result(row):
    """Grade the favorite against the spread for one game row.

    ``row['spread_favorite']`` is negated to obtain the margin the favorite
    has to beat. The home team is treated as the favorite only when
    ``row['home_favorite'] == 1``; for any other value the away team's
    score is used as the favorite's score.

    Returns:
        1 if the favorite's margin exceeds the line, -1 if it falls short,
        and 0 otherwise.
    """
    line = row['spread_favorite'] * -1
    if row['home_favorite'] == 1:
        favorite_pts, underdog_pts = row['score_home'], row['score_away']
    else:
        favorite_pts, underdog_pts = row['score_away'], row['score_home']

    margin = favorite_pts - underdog_pts
    if margin > line:
        return 1
    if margin < line:
        return -1
    return 0

def get_OU_result(row):
    """Grade the combined score against the over/under line for one game row.

    ``row['over_under_line']`` is converted with ``float`` before comparing
    it with ``row['score_home'] + row['score_away']``.

    Returns:
        1 if the total is over the line, -1 if it is under, 0 otherwise.
    """
    line = float(row['over_under_line'])
    points = row['score_home'] + row['score_away']
    if points > line:
        return 1
    if points < line:
        return -1
    return 0

def get_index(row):
    """Build the join key ``YYYY-MM-DD<home abbreviation>`` for one game row.

    ``row['schedule_date']`` is split on ``'/'`` and read as
    month/day/year. Month and day are zero-padded to two digits so the
    key has the same shape as the keys produced by ``get_pbpindex`` and
    ``map_pfrlinks``; without the padding a date such as ``9/10/2009``
    would yield ``2009-9-10...`` and fail to align with those tables.

    The home team's name (``row['team_home']``) is translated to an
    abbreviation through ``TeamDictionary().nfl_api``.

    Returns:
        str: the ISO-style date immediately followed by the abbreviation.

    Raises:
        IndexError: if the date has fewer than three ``'/'``-separated parts.
        KeyError: if the home team is not present in the team dictionary.
    """
    comps = row['schedule_date'].split('/')
    # zfill is a no-op for values that are already two digits wide
    date = '-'.join([comps[2], comps[0].zfill(2), comps[1].zfill(2)])
    return date + TeamDictionary().nfl_api[row['team_home']]

# derive the odds metrics one column at a time, then index the table by 'idx'
print("Generating odds metrics for gameinfo table . . .")
# order matters: get_spread_result reads the 'home_favorite' column
for _column, _metric in (
    ('home_favorite', get_home_favorite),
    ('spread_result', get_spread_result),
    ('OU_result', get_OU_result),
    ('idx', get_index),
):
    gameinfo[_column] = gameinfo.apply(_metric, axis=1)
gameinfo.set_index('idx', inplace=True)

def get_pbpindex(row):
    """Build the join key ``YYYY-MM-DD<home_team>`` for one play-by-play row.

    ``row['game_date']`` is split on ``'/'`` and read as month/day/year;
    month and day are zero-padded to two digits. ``row['home_team']`` is
    appended unchanged.
    """
    home = row['home_team']
    parts = row['game_date'].split('/')
    iso_date = '-'.join([parts[2], parts[0].zfill(2), parts[1].zfill(2)])
    return iso_date + home

def map_pfrlinks():
    """Map every game index to its pro-football-reference.com game link.

    Walks seasons 2009-2018 and weeks 1-17 and asks ``PFR_Gamelinks`` for
    the game links of each week. A link already listed in the local cache
    file ``pfrlinks.txt`` (one ``idx,link`` pair per line) reuses the cached
    index. Any other link is resolved through
    ``PFR_Gamepage(link).get_gameinfo()`` and the new pair is appended to
    the cache file.

    Both file handles are managed with ``with`` so they are closed even when
    a scrape raises part-way through.

    Returns:
        pandas.DataFrame: indexed by ``idx`` with the columns ``gamelinks``,
        ``season`` and ``week``.
    """
    cache_path = 'pfrlinks.txt'
    data = {'idx': [], 'gamelinks': [], 'season': [], 'week': []}

    # link -> idx for every game resolved by an earlier run; a dict gives
    # constant-time membership tests inside the scraping loop
    cache_indexes = {}
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as cached:
            for line in cached:
                comps = line.split(',')
                if len(comps) < 2:
                    # blank or truncated line (e.g. left by an interrupted run)
                    continue
                cache_indexes[comps[1].rstrip()] = comps[0].strip()

    # the team-name lookup does not change between games, so build it once
    teams = TeamDictionary().nfl_api

    with open(cache_path, 'a') as cache_file:
        for season in range(2009, 2019):
            for week in range(1, 18):
                print("\tScraping ",str(season),"-",str(week)," . . .")
                for game in PFR_Gamelinks(season, week).get_game_links():
                    idx = cache_indexes.get(game)
                    if idx is None:
                        print("\t\tGame: ",game)
                        info = PFR_Gamepage(game).get_gameinfo()
                        # info[1] is sliced as YYYYMMDD; info[2] is looked up
                        # in the team dictionary as the home team
                        raw_date = info[1]
                        date = raw_date[0:4]+"-"+raw_date[4:6]+"-"+raw_date[6:8]
                        idx = date+teams[info[2]]
                        cache_file.write(idx+','+game+'\n')
                        # remember it so a repeated link is not scraped twice
                        cache_indexes[game] = idx
                    data['gamelinks'].append(game)
                    data['idx'].append(idx)
                    data['season'].append(season)
                    data['week'].append(week)

    df = pd.DataFrame.from_dict(data)
    df.set_index('idx',inplace=True)
    return df

print("Getting gameids from nfl_api . . .")
# one row per distinct (game_id, home_team, game_date) from the play-by-play table
sql = "select distinct(pbp.game_id) as game_id, pbp.home_team as home_team, pbp.game_date as game_date \
       from nfl_pbp pbp \
       order by pbp.game_id"
gameinfo_gameids = pd.read_sql_query(sql, kaggle_conn, index_col=None)

# key the game ids the same way gameinfo is keyed, then copy them across;
# the column assignment aligns the two tables on that shared index
gameinfo_gameids['idx'] = gameinfo_gameids.apply(get_pbpindex, axis=1)
gameinfo_gameids = gameinfo_gameids.set_index('idx')
gameinfo['game_id'] = gameinfo_gameids['game_id']

print("Grabbing the matching pro-football-reference.com game links for each game . . .")
pfrlinks = map_pfrlinks()
# report both table shapes for a quick comparison
print(pfrlinks.shape)
print(gameinfo.shape)


Setting up database connections . . .
Reading db csv file . . .
Dropping unused columns . . .
Generating odds metrics for gameinfo table . . .
Getting gameids from nfl_api . . .
Grabbing the matching pro-football-reference.com game links for each game . . .
	Scraping  2009 - 1  . . .
	Scraping  2009 - 2  . . .
	Scraping  2009 - 3  . . .
	Scraping  2009 - 4  . . .
	Scraping  2009 - 5  . . .
	Scraping  2009 - 6  . . .
	Scraping  2009 - 7  . . .
	Scraping  2009 - 8  . . .
	Scraping  2009 - 9  . . .
	Scraping  2009 - 10  . . .
	Scraping  2009 - 11  . . .
	Scraping  2009 - 12  . . .
	Scraping  2009 - 13  . . .
	Scraping  2009 - 14  . . .
	Scraping  2009 - 15  . . .
	Scraping  2009 - 16  . . .
	Scraping  2009 - 17  . . .
	Scraping  2010 - 1  . . .
	Scraping  2010 - 2  . . .
	Scraping  2010 - 3  . . .
	Scraping  2010 - 4  . . .
	Scraping  2010 - 5  . . .
	Scraping  2010 - 6  . . .
	Scraping  2010 - 7  . . .
	Scraping  2010 - 8  . . .
	Scraping  2010 - 9  . . .
	Scraping  2010 - 10  . . .
	Scr

In [2]:
# Copy the scraped links onto gameinfo. Both frames are indexed by 'idx', and
# pandas aligns the assigned Series on that index, so rows of gameinfo with no
# matching key receive NaN.
# NOTE(review): presumably 'idx' is unique in pfrlinks -- confirm, since
# duplicate keys would make this alignment fail.
gameinfo['pfr_gamelinks'] = pfrlinks['gamelinks']

https://www.pro-football-reference.com/boxscores/200909100pit.htm
https://www.pro-football-reference.com/boxscores/200909130crd.htm
https://www.pro-football-reference.com/boxscores/200909130atl.htm
https://www.pro-football-reference.com/boxscores/200909130rav.htm
https://www.pro-football-reference.com/boxscores/200909130car.htm
https://www.pro-football-reference.com/boxscores/200909130cin.htm
https://www.pro-football-reference.com/boxscores/200909130cle.htm
https://www.pro-football-reference.com/boxscores/200909130gnb.htm
https://www.pro-football-reference.com/boxscores/200909130htx.htm
https://www.pro-football-reference.com/boxscores/200909130clt.htm
https://www.pro-football-reference.com/boxscores/200909130nor.htm
https://www.pro-football-reference.com/boxscores/200909130nyg.htm
https://www.pro-football-reference.com/boxscores/200909130sea.htm
https://www.pro-football-reference.com/boxscores/200909130tam.htm
https://www.pro-football-reference.com/boxscores/200909140nwe.htm
https://ww

https://www.pro-football-reference.com/boxscores/201701010nyj.htm
https://www.pro-football-reference.com/boxscores/201701010phi.htm
https://www.pro-football-reference.com/boxscores/201701010pit.htm
https://www.pro-football-reference.com/boxscores/201701010sdg.htm
https://www.pro-football-reference.com/boxscores/201701010sfo.htm
https://www.pro-football-reference.com/boxscores/201701010tam.htm
https://www.pro-football-reference.com/boxscores/201701010oti.htm
https://www.pro-football-reference.com/boxscores/201701010was.htm
https://www.pro-football-reference.com/boxscores/201709070nwe.htm
https://www.pro-football-reference.com/boxscores/201709100buf.htm
https://www.pro-football-reference.com/boxscores/201709100chi.htm
https://www.pro-football-reference.com/boxscores/201709100cin.htm
https://www.pro-football-reference.com/boxscores/201709100cle.htm
https://www.pro-football-reference.com/boxscores/201709100dal.htm
https://www.pro-football-reference.com/boxscores/201709100det.htm
https://ww