In [17]:
"""
Created on Tue Jul  7 15:01:19 2020
@author: Nick Bohall
"""

'\nCreated on Tue Jul  7 15:01:19 2020\n@author: Nick Bohall\n'

In [1]:
#packages
import pandas as pd
import numpy as np
import sklearn as sk 
from sklearn.metrics import f1_score, make_scorer, classification_report
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
results = pd.read_csv("Z:\\Coding\\Myproj\\NFL\\Data\\spreadspoke_scores_reversed.csv")
results.replace(r'^\s*$', np.nan, regex=True) #replace Blanks with nan
results.drop(['stadium', 'stadium_neutral'], 1, inplace = True) #dropping columns
results.columns = ['date', 'season', 'week', 'playoff', 'homeTeam', 'homeScore', 'awayScore', 'awayTeam',  #Rename columns
              'favorite', 'spread', 'total', 'temp', 'wind', 'humidity', 'detail']

In [3]:
#This is a modifier for a specific season of the dataset. If the whole dataset is needed, just comment out
year = input('What year are we looking at?: ')
results = results[results['season'] == int(year)]

What year are we looking at?: 2020


In [4]:
# Setting larger display for Data Frame
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
dict = {                    #This is a dictionary of all of the teams and their Abbreviations
'Arizona Cardinals':'ARI',
'Atlanta Falcons':'ATL',
'Baltimore Ravens':'BAL',
'Buffalo Bills':'BUF',
'Carolina Panthers':'CAR',
'Chicago Bears':'CHI',
'Cincinnati Bengals':'CIN',
'Cleveland Browns':'CLE',
'Dallas Cowboys':'DAL',
'Denver Broncos':'DEN',
'Detroit Lions':'DET',
'Green Bay Packers':'GB',
'Houston Texans':'HOU',
'Indianapolis Colts':'IND',
'Jacksonville Jaguars':'JAX',
'Kansas City Chiefs':'KC',
'Los Angeles Chargers':'LAC',
'Los Angeles Rams':'LAR',
'Miami Dolphins':'MIA',
'Minnesota Vikings':'MIN',
'New England Patriots':'NE',
'New Orleans Saints':'NO',
'New York Giants':'NYG',
'New York Jets':'NYJ',
'Oakland Raiders':'OAK',
'Philadelphia Eagles':'PHI',
'Pittsburgh Steelers':'PIT',
'Seattle Seahawks':'SEA',
'San Francisco 49ers':'SF',
'Tampa Bay Buccaneers':'TB',
'Tennessee Titans':'TEN',
'Washington Redskins':'WAS',
'Houston Oilers':'TEN',
'Tennessee Oilers':'TEN',
'Los Angeles Raiders':'OAK',
'San Diego Chargers':'LAC',
'St. Louis Rams':'LAR',
'Phoenix Cardinals':'ARI',
'St. Louis Cardinals':'ARI',
'Baltimore Colts':'IND', 
'Boston Patriots':'NE'
}

In [6]:
#This changes the team name to the abbreviation so for easier reference
results['homeTeam'] = results['homeTeam'].replace(dict)
results['awayTeam'] = results['awayTeam'].replace(dict)

results['total'] = pd.to_numeric(results['total'], errors='coerce')  #This converts values to numeric and NaN for blanks
results['total'] = results['total'].astype(float) # This converts the type to float

In [7]:
#exploratory

homeWin = results['homeScore'] > results['awayScore']  #Boolean list of home wins
awayWin = results['homeScore'] < results['awayScore']  #Boolean list of away wins

tie = results['homeScore'] == results['awayScore']     #Boolean list of ties

homeFav = (results.homeTeam == results.favorite)  #Boolean list of Home favorites
awayFav = (results.awayTeam == results.favorite)  #Boolean list of Away favorites

pick = (results.favorite == 'PICK')

In [8]:
# More Explore

homeFavWin = homeWin & homeFav #When the home favorite wins game
awayFavWin = awayWin & awayFav #When the away favorite wins game

homeFavLose = ~homeWin & homeFav #When the home favorite loses game
awayFavLose = ~awayWin & awayFav #When the away favorite loses game

all = (homeFavWin | awayFavWin | homeFavLose | awayFavLose | pick) #combines all of the possibilities

In [9]:
#Percentages. These give the decimal values of all 4 scenarios of game outcomes where there is a favorite (does not include pickems)

homeFavWinP = homeFavWin.sum() / homeFav.sum()
awayFavWinP = awayFavWin.sum() / awayFav.sum()
homeFavLoseP = homeFavLose.sum() / homeFav.sum()
awayFavLoseP = awayFavLose.sum() / awayFav.sum()

  homeFavWinP = homeFavWin.sum() / homeFav.sum()
  awayFavWinP = awayFavWin.sum() / awayFav.sum()
  homeFavLoseP = homeFavLose.sum() / homeFav.sum()
  awayFavLoseP = awayFavLose.sum() / awayFav.sum()


In [10]:
#This fuction just turns the decimals above to percentages for easy reading. Just pass the argument into 'Input'
def percentage(input):
    new = str(round(input * 100, 2)) + ' %'
    return new

In [28]:
def seasonStats(season, playoff = False): #Needs work
    stats = results.query("schedule_season == @season & schedule_playoff == @playoff")
    print(stats.head())
    print(sum(stats.score_home))
    print(sum(stats.score_away))

In [29]:
#Counts the number of times each team is favored
favor = results.favorite.value_counts()

In [30]:
#Beginning the NBA analysis recreation

results['homeWin'] = homeWin
y_true = results['homeWin'].values #this takes the true false column and turns it into a list
n_games = results['homeWin'].count()  #this is the number of games played
n_homewins = results['homeWin'].sum()  #this is the number of Trues in the column (aka home wins)
win_percentage = n_homewins / n_games  #calculating win %
print('Home Win percentage: {0:.1f}%'.format(100 * win_percentage))  

Home Win percentage: 52.1%


In [31]:
y_pred = [1] * len(y_true) #This is for the modeling later
print('Baseline --> F1: {:.4f}'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted'))) #This F1 score is the score to beat

Baseline --> F1: 0.3565


In [32]:
results['homeLastWin'] = False 
results['awayLastWin'] = False

from collections import defaultdict #like a regular dictionary but doesn't bring errors
won_last = defaultdict(int) #defaultdict makes sets everything in the dictionary to a value of 0 initially

for index, row in results.iterrows(): #iterrows is something that iterrates over a df --> Not efficient
    home_team = row['homeTeam'] #This creates a temporary variable that calls on the home team
    away_team = row['awayTeam'] #This creates a temporary variable that calls on the away team
    row['homeLastWin'] = won_last[home_team] #This ammends the homeLastWin row to the defaultdict of each team
    row['awayLastWin'] = won_last[away_team]
    results.loc[index] = row #Not sure
    won_last[home_team] = row['homeWin']  #This looks at if the team won the game and inputs it into the lastwin columns
    won_last[away_team] = not row['homeWin'] #if homeWin = False, then the away team won
    
print(results[['season', 'week', 'homeTeam', 'homeScore', 'awayScore', 'awayTeam', 'favorite', 'spread', 'total', 'homeWin', 'homeLastWin', 'awayLastWin']])

       season        week homeTeam  homeScore  awayScore awayTeam favorite  spread  total  homeWin homeLastWin awayLastWin
12411    2019           1      CHI          3         10       GB      CHI    -3.0   46.5    False           0           0
12412    2019           1      ARI         27         27      DET      DET    -3.0   45.5    False           0           0
12413    2019           1      CAR         27         30      LAR      LAR    -2.0   49.5    False           0           0
12414    2019           1      CLE         13         43      TEN      CLE    -5.5   44.0    False           0           0
12415    2019           1      DAL         35         17      NYG      DAL    -7.0   44.0     True           0           0
12416    2019           1      JAX         26         40       KC       KC    -3.5   49.0    False           0           0
12417    2019           1      LAC         30         24      IND      LAC    -6.0   44.5     True           0           0
12418    2019   

In [33]:
#Checking the new score after adding the above columns
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state = 14)
scorer = make_scorer (f1_score, pos_label = None, average = 'weighted')

from sklearn.model_selection import cross_val_score  #This just uses a decision tree classifier to look at if the team won their last game or not
X_previouswins = results[['homeLastWin', 'awayLastWin']].values 
clf = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring = scorer)
print('Using just the last result from home and away teams - F1: {0:.4f}'.format(np.mean(scores))) #Using the new information, the F score does go up

Using just the last result from home and away teams - F1: 0.5754


In [34]:
#What about win streaks
results['homeWinStreak'] = 0
results['awayWinStreak'] = 0
win_streak = defaultdict(int)

for index, row in results.iterrows(): 
    home_team = row['homeTeam'] 
    away_team = row['awayTeam'] 
    row['homeWinStreak'] = win_streak[home_team] 
    row['awayWinStreak'] = win_streak[away_team]
    results.loc[index] = row 
    if row['homeWin']:
        win_streak[home_team] += 1
        win_streak[away_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[away_team] += 1
        
print(results[['homeTeam', 'homeScore', 'awayScore', 'awayTeam', 'favorite', 'spread', 'total', 'homeWin', 'homeWinStreak', 'awayWinStreak']])

      homeTeam  homeScore  awayScore awayTeam favorite  spread  total  homeWin  homeWinStreak  awayWinStreak
12411      CHI          3         10       GB      CHI    -3.0   46.5    False              0              0
12412      ARI         27         27      DET      DET    -3.0   45.5    False              0              0
12413      CAR         27         30      LAR      LAR    -2.0   49.5    False              0              0
12414      CLE         13         43      TEN      CLE    -5.5   44.0    False              0              0
12415      DAL         35         17      NYG      DAL    -7.0   44.0     True              0              0
12416      JAX         26         40       KC       KC    -3.5   49.0    False              0              0
12417      LAC         30         24      IND      LAC    -6.0   44.5     True              0              0
12418      MIA         10         59      BAL      BAL    -7.0   40.5    False              0              0
12419      MIN     

In [35]:
#Checking new score after adding win streaks
X_winstreak = results[['homeLastWin', 'awayLastWin', 'homeWinStreak', 'awayWinStreak']].values
scores = cross_val_score(clf, X_winstreak, y_true, scoring = scorer)
print('Using whether the hometeam is on a winstreak or not - F1: {0:.4f}'.format(np.mean(scores)))

Using whether the hometeam is on a winstreak or not - F1: 0.5464


In [36]:
#Bringing in 2018 Rankings
rankings = pd.read_csv("Z:\\Coding\\Myproj\\NFL\\Data\\2018_nfl_rankings.csv")
rankings['Tm'] = rankings['Tm'].replace(dict)
rankings['rank'] = rankings['W'].rank(method='first', ascending = False, pct = False)
rankings = rankings.sort_values('rank')
rankings.set_index('Tm', inplace = True)
print(rankings)

      W   L  T   W-L%   PF   PA   PD   MoV  SoS   SRS  OSRS  DSRS  rank
Tm                                                                     
NO   13   3  0  0.813  504  353  151   9.4  0.6  10.1   7.9   2.2   1.0
LAR  13   3  0  0.813  527  384  143   8.9 -0.4   8.5   9.5  -1.1   2.0
KC   12   4  0  0.750  565  421  144   9.0 -0.1   8.9  12.6  -3.8   3.0
LAC  12   4  0  0.750  428  329   99   6.2 -0.2   6.0   3.0   2.9   4.0
CHI  12   4  0  0.750  421  283  138   8.6 -2.3   6.3   1.5   4.8   5.0
NE   11   5  0  0.688  436  325  111   6.9 -1.8   5.2   3.1   2.1   6.0
HOU  11   5  0  0.688  402  316   86   5.4 -1.5   3.8   2.4   1.4   7.0
BAL  10   6  0  0.625  389  287  102   6.4  0.6   7.0   0.6   6.4   8.0
IND  10   6  0  0.625  433  344   89   5.6 -2.2   3.4   3.9  -0.6   9.0
DAL  10   6  0  0.625  339  324   15   0.9  0.2   1.1  -1.9   2.9  10.0
SEA  10   6  0  0.625  428  347   81   5.1 -0.6   4.5   3.0   1.5  11.0
PIT   9   6  1  0.594  428  360   68   4.3  1.3   5.6   3.9   1.

In [37]:
#Creating a new feature -- HomeTeamRanksHigher
def home_team_ranked_higher(row):
    home_team = row['homeTeam']
    away_team = row['awayTeam']
    home_rank = rankings.loc[home_team]['rank']
    away_rank = rankings.loc[away_team]['rank']
    return home_rank < away_rank #ranking higher == lower

results['homeTeamRankedHigher'] = results.apply(home_team_ranked_higher, axis = 1)
print(results.head(40))

            date  season week  playoff homeTeam  homeScore  awayScore awayTeam favorite  spread  total  temp  wind humidity detail  homeWin homeLastWin awayLastWin  homeWinStreak  awayWinStreak  homeTeamRankedHigher
12411   9/5/2019    2019    1    False      CHI          3         10       GB      CHI    -3.0   46.5   NaN   NaN      NaN    NaN    False           0           0              0              0                  True
12412   9/8/2019    2019    1    False      ARI         27         27      DET      DET    -3.0   45.5  72.0   0.0      NaN   DOME    False           0           0              0              0                 False
12413   9/8/2019    2019    1    False      CAR         27         30      LAR      LAR    -2.0   49.5   NaN   NaN      NaN    NaN    False           0           0              0              0                 False
12414   9/8/2019    2019    1    False      CLE         13         43      TEN      CLE    -5.5   44.0   NaN   NaN      NaN    NaN    Fa

In [38]:
X_homehigher = results[['homeLastWin', 'awayLastWin', 'homeTeamRankedHigher']].values
scores = cross_val_score(clf, X_homehigher, y_true, scoring = scorer)
print('Using whether the home team is ranked higher last season - F1: {0:.4f}'.format(np.mean(scores)))

Using whether the home team is ranked higher last season - F1: 0.6707


In [39]:
#Next feature looks at which team won in the last encounter between the two teams

last_match_winner = defaultdict(int)

def home_team_won_last (row):
    home_team = row['homeTeam']
    away_team = row['awayTeam']
    #sort for consistent ordering
    teams = tuple(sorted([home_team, away_team]))
    result = 1 if last_match_winner[teams] == row['homeTeam'] else 0
    #update record for next encounter
    winner = row['homeTeam'] if row['homeWin'] else row['awayTeam']
    last_match_winner[teams] = winner
    return result 

results['homeTeamWonLast'] = results.apply(home_team_won_last, axis=1)
print(results)

             date  season        week  playoff homeTeam  homeScore  awayScore awayTeam favorite  spread  total  temp  wind humidity detail  homeWin homeLastWin awayLastWin  homeWinStreak  awayWinStreak  homeTeamRankedHigher  homeTeamWonLast
12411    9/5/2019    2019           1    False      CHI          3         10       GB      CHI    -3.0   46.5   NaN   NaN      NaN    NaN    False           0           0              0              0                  True                0
12412    9/8/2019    2019           1    False      ARI         27         27      DET      DET    -3.0   45.5  72.0   0.0      NaN   DOME    False           0           0              0              0                 False                0
12413    9/8/2019    2019           1    False      CAR         27         30      LAR      LAR    -2.0   49.5   NaN   NaN      NaN    NaN    False           0           0              0              0                 False                0
12414    9/8/2019    2019           

In [40]:
X_home_higher = results[['homeLastWin', 'awayLastWin', 'homeTeamRankedHigher', 'homeTeamWonLast']].values
clf = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring = scorer)
print('Using whether the home team won last encounter - Accuracy: {0:.4f}'.format(np.mean(scores)))

Using whether the home team won last encounter - Accuracy: 0.6779


In [41]:
#my attempt at creating a spread column. not sure if it works yet. 

def home_spread (row):
    home_team = row['homeTeam']
    favorite = row['favorite']
    spread = row['spread']
    if home_team == favorite:
        return spread
    else: return spread*(-1)
    
results['homeSpread'] = results.apply(home_spread, axis=1)
print(results[['season', 'week', 'homeTeam', 'homeScore', 'awayScore', 'awayTeam', 'favorite', 'spread', 'total', 'homeWin', 'homeLastWin', 'awayLastWin']])

       season        week homeTeam  homeScore  awayScore awayTeam favorite  spread  total  homeWin homeLastWin awayLastWin
12411    2019           1      CHI          3         10       GB      CHI    -3.0   46.5    False           0           0
12412    2019           1      ARI         27         27      DET      DET    -3.0   45.5    False           0           0
12413    2019           1      CAR         27         30      LAR      LAR    -2.0   49.5    False           0           0
12414    2019           1      CLE         13         43      TEN      CLE    -5.5   44.0    False           0           0
12415    2019           1      DAL         35         17      NYG      DAL    -7.0   44.0     True           0           0
12416    2019           1      JAX         26         40       KC       KC    -3.5   49.0    False           0           0
12417    2019           1      LAC         30         24      IND      LAC    -6.0   44.5     True           0           0
12418    2019   

In [42]:
def homeFav (row):
    home_spread = row['homeSpread']
    if home_spread < 0:
        return 1
    else:
        return 0

results['homeFav'] = results.apply(homeFav, axis =1)

In [43]:
X_homeFav = results[['homeFav']].values
scores = cross_val_score(clf, X_homeFav, y_true, scoring = scorer, cv = 4)
print('Using the favorite - F1: {0:.4f}'.format(np.mean(scores)))

Using the favorite - F1: 0.6410


In [44]:
results.to_csv("Z:\\Coding\\Myproj\\NFL\\Data\\resultsTest.csv")