In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from sklearn.metrics import f1_score, make_scorer, classification_report

# Scorer object wrapping f1_score with weighted averaging.
scorer = make_scorer(f1_score, average='weighted', pos_label=None)

# Box scores; the 'Date' column is parsed into datetimes while the file is read.
df = pd.read_csv(
    'data/2017_2018BoxScores.csv',
    parse_dates=['Date'],
)

# Per-team lookup tables, both indexed by the 'Team' column.
# The first row of the standings file is skipped while reading.
rankingsDF = pd.read_csv(
    'data/NBAExpandedStandings2017_2018.csv',
    index_col='Team',
    skiprows=[0],
)
teamPerGameStats = pd.read_csv(
    'data/TeamPerGameStats2017_2018.csv',
    index_col='Team',
)

# Positional rename of the box-score columns to more descriptive names.
df.columns = [
    'Date', 'StartTime',
    'VisitorTeam', 'VisitorPoints',
    'HomeTeam', 'HomePoints',
    'ScoreType', 'OT', 'Attendance', 'Notes',
]

# Target: True when the home team scored more points than the visitor.
df['HomeWins'] = df['HomePoints'].gt(df['VisitorPoints'])
y_true = df['HomeWins'].values

In [3]:
# Naive baseline: how often does the home team win in this data set?
n_games = df['HomeWins'].count()
n_homewins = df['HomeWins'].sum()
win_percentage = (n_homewins / n_games) * 100

print(n_games)
print(n_homewins)
print('Home Team Win Percentage' , win_percentage)

544
316
Home Team Win Percentage 58.0882352941


In [4]:
# Score to beat: the weighted F1 obtained by predicting a home win for every game.
from sklearn.metrics import f1_score
y_pred = [1 for _ in y_true]
f1_score(y_true, y_pred, average='weighted', pos_label=None)

  'precision', 'predicted', average, warn_for)


0.4268809849521204

In [5]:
#Flagging, for every game, whether the home team and the visitor team played the day before
from datetime import datetime, timedelta

#Stack the schedule into one row per (game, team) so that a team's previous game is found
#whether it was played at home or away. Both halves keep the index of df, which is what
#lets the flags be aligned back onto the original games further down.
team_games = pd.concat([
    df[['Date', 'HomeTeam']].rename(columns={'HomeTeam': 'Team'}).assign(Side='Home'),
    df[['Date', 'VisitorTeam']].rename(columns={'VisitorTeam': 'Team'}).assign(Side='Visitor'),
])

#diff() compares each row with the previous row of the same team, so order by date first
team_games = team_games.sort_values('Date', kind='mergesort')

#Gap between a team's consecutive games (NaT for its first game, which compares as False)
#Subtracting timedelta(days=1) from this gap gives the number of days of rest
days_since_last_game = team_games.groupby('Team')['Date'].diff()
played_previous_day = days_since_last_game == timedelta(days=1)

#Plain boolean arrays are used as masks; the column assignment then aligns on df's index
home_rest_holder = played_previous_day[(team_games['Side'] == 'Home').values]
df['HomeBackToBack'] = home_rest_holder

visitor_rest_holder = played_previous_day[(team_games['Side'] == 'Visitor').values]
df['VisitorBackToBack'] = visitor_rest_holder



In [6]:
def home_team_rankings_higher(row, rankings=None):
    """Return whether the home team is ranked above the visitor team.

    Parameters
    ----------
    row : mapping-like
        One game; must provide 'HomeTeam' and 'VisitorTeam' entries
        (for example a row passed in by ``df.apply(..., axis=1)``).
    rankings : DataFrame, optional
        Table indexed by team name with an 'Rk' column. When omitted, the
        module-level ``rankingsDF`` is looked up at call time, which is the
        original behaviour. Passing it explicitly avoids depending on
        whichever season is currently bound to that global.

    Returns
    -------
    Result of ``home_rank < visitor_rank``: a lower 'Rk' value means a
    higher-ranked team.

    Raises
    ------
    KeyError
        If either team is missing from the rankings index.
    """
    if rankings is None:
        rankings = rankingsDF
    # Single .loc[row, column] lookup instead of chained .loc[row][column]
    home_rank = rankings.loc[row['HomeTeam'], 'Rk']
    visitor_rank = rankings.loc[row['VisitorTeam'], 'Rk']
    return home_rank < visitor_rank

#Call home_team_rankings_higher once per row (axis = 1) and store the result as a new column
df['HomeTeamRanksHigher'] = df.apply(home_team_rankings_higher, axis = 1)


In [7]:
def home_team_field_goal_percent_higher(row, column_name, stats=None):
    """Return whether the home team's value in a per-game stat column beats the visitor's.

    Despite the name, the comparison works for any column of the stats
    table (the notebook uses it for 'FG%', '3P%' and 'PTS').

    Parameters
    ----------
    row : mapping-like
        One game; must provide 'HomeTeam' and 'VisitorTeam' entries.
    column_name : str
        Column of the stats table to compare.
    stats : DataFrame, optional
        Table indexed by team name. When omitted, the module-level
        ``teamPerGameStats`` is looked up at call time, which is the
        original behaviour.

    Returns
    -------
    Result of ``home_value > visitor_value`` (a tie is not "higher").

    Raises
    ------
    KeyError
        If a team or the column is missing from the stats table.
    """
    if stats is None:
        stats = teamPerGameStats
    # Single .loc[row, column] lookup instead of chained .loc[row][column]
    home_value = stats.loc[row['HomeTeam'], column_name]
    visitor_value = stats.loc[row['VisitorTeam'], column_name]
    return home_value > visitor_value


def home_team_per_assist_turnover_ratio_higher(row, stats=None):
    """Return whether the home team's AST/TOV ratio is higher than the visitor's.

    Parameters
    ----------
    row : mapping-like
        One game; must provide 'HomeTeam' and 'VisitorTeam' entries.
    stats : DataFrame, optional
        Table indexed by team name with 'AST' and 'TOV' columns. When
        omitted, the module-level ``teamPerGameStats`` is looked up at call
        time, which is the original behaviour.

    Returns
    -------
    Result of ``home_ratio > visitor_ratio`` (a tie is not "higher").

    Raises
    ------
    KeyError
        If a team or one of the two columns is missing from the stats table.
    """
    if stats is None:
        stats = teamPerGameStats
    # Fetch each team's row once instead of once per stat
    home_stats = stats.loc[row['HomeTeam']]
    visitor_stats = stats.loc[row['VisitorTeam']]
    home_assist_turnover_ratio = home_stats['AST'] / home_stats['TOV']
    visitor_assist_turnover_ratio = visitor_stats['AST'] / visitor_stats['TOV']
    return home_assist_turnover_ratio > visitor_assist_turnover_ratio


#One boolean feature per per-game stat column, filled in row by row
for feature_column, stat_column in (
    ('HomeTeamFG%Higher', 'FG%'),
    ('HomeTeam3P%Higher', '3P%'),
    ('HomeTeamPTSHigher', 'PTS'),
):
    df[feature_column] = df.apply(
        home_team_field_goal_percent_higher, args=(stat_column,), axis=1
    )

#The assist/turnover ratio has its own comparison function
df['HomeTeamAssistTurnOverRatioHigher'] = df.apply(
    home_team_per_assist_turnover_ratio_higher, axis=1
)

In [8]:
#Predictor columns fed to the model
#TODO: rename X_features (carried over from the original note)
training_feature_columns = [
    'HomeTeamRanksHigher',
    'HomeTeamFG%Higher',
    'HomeTeamPTSHigher',
    'HomeTeamAssistTurnOverRatioHigher',
    'HomeTeam3P%Higher',
]
X_features = df[training_feature_columns]


In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score
# from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 10-fold stratified cross-validation splitter shared by the whole search.
# NOTE(review): random_state should only matter when shuffle=True, and `iid` is
# believed to be rejected by recent scikit-learn releases - confirm against the
# installed version before upgrading.
strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

# Candidate regularisation strengths: C = 1 .. 9
c_values = list(np.arange(1, 10))

# l1 is only paired with liblinear; l2 is tried with three solvers
l1_grid = {
    'C': c_values,
    'penalty': ['l1'],
    'solver': ['liblinear'],
    'multi_class': ['ovr'],
}
l2_grid = {
    'C': c_values,
    'penalty': ['l2'],
    'solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'multi_class': ['ovr'],
}
param_grid = [l1_grid, l2_grid]

# Exhaustive search scored by accuracy; best cross-validated score is printed
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=strat_k_fold,
    iid=False,
)
grid.fit(X_features, y_true)
print(grid.best_score_)

0.633504224636


In [10]:
#Box scores for the second data set; 'Date' is parsed into datetimes while reading
df2 = pd.read_csv(
    'data/2018_2019BoxScores.csv',
    parse_dates=['Date'],
)

#Positional rename of the box-score columns to more descriptive names
df2.columns = [
    'Date', 'StartTime',
    'VisitorTeam', 'VisitorPoints',
    'HomeTeam', 'HomePoints',
    'ScoreType', 'OT', 'Attendance', 'Notes',
]

#Rebinds the two lookup tables that the feature functions read by default
rankingsDF = pd.read_csv(
    'data/NBAExpandedStandings2018_2019.csv',
    index_col='Team',
    skiprows=[0],
)
teamPerGameStats = pd.read_csv(
    'data/TeamPerGameStats2018_2019.csv',
    index_col='Team',
)

In [11]:
#Target for the second data set: True when the home team outscored the visitor
df2['HomeWins'] = df2['HomePoints'].gt(df2['VisitorPoints'])
y_test = df2['HomeWins'].values

#Naive baseline: share of games won by the home team
n_games = df2['HomeWins'].count()
n_homewins = df2['HomeWins'].sum()
win_percentage = (n_homewins / n_games) * 100

print(n_games)
print(n_homewins)
print('Home Team Win Percentage' , win_percentage)

535
324
Home Team Win Percentage 60.5607476636


In [12]:
#Same feature columns as built for df, computed here for df2
df2['HomeTeamRanksHigher'] = df2.apply(home_team_rankings_higher, axis=1)

for feature_column, stat_column in (
    ('HomeTeamFG%Higher', 'FG%'),
    ('HomeTeam3P%Higher', '3P%'),
    ('HomeTeamPTSHigher', 'PTS'),
):
    df2[feature_column] = df2.apply(
        home_team_field_goal_percent_higher, args=(stat_column,), axis=1
    )

df2['HomeTeamAssistTurnOverRatioHigher'] = df2.apply(
    home_team_per_assist_turnover_ratio_higher, axis=1
)

In [13]:
#Flagging, for every game in df2, whether the home team and the visitor team played the day before

#Stack the schedule into one row per (game, team) so that a team's previous game is found
#whether it was played at home or away. Both halves keep the index of df2, which is what
#lets the flags be aligned back onto the original games further down.
team_games_2 = pd.concat([
    df2[['Date', 'HomeTeam']].rename(columns={'HomeTeam': 'Team'}).assign(Side='Home'),
    df2[['Date', 'VisitorTeam']].rename(columns={'VisitorTeam': 'Team'}).assign(Side='Visitor'),
])

#diff() compares each row with the previous row of the same team, so order by date first
team_games_2 = team_games_2.sort_values('Date', kind='mergesort')

#Gap between a team's consecutive games (NaT for its first game, which compares as False)
#Subtracting timedelta(days=1) from this gap gives the number of days of rest
days_since_last_game_2 = team_games_2.groupby('Team')['Date'].diff()
played_previous_day_2 = days_since_last_game_2 == timedelta(days=1)

#Plain boolean arrays are used as masks; the column assignment then aligns on df2's index
home_rest_holder = played_previous_day_2[(team_games_2['Side'] == 'Home').values]
df2['HomeBackToBack'] = home_rest_holder

visitor_rest_holder = played_previous_day_2[(team_games_2['Side'] == 'Visitor').values]
df2['VisitorBackToBack'] = visitor_rest_holder


In [14]:
#Predictor columns for df2, listed in the same order as the columns used to fit the model
#TODO: rename X_prediction_features (carried over from the original note)
prediction_feature_columns = [
    'HomeTeamRanksHigher',
    'HomeTeamFG%Higher',
    'HomeTeamPTSHigher',
    'HomeTeamAssistTurnOverRatioHigher',
    'HomeTeam3P%Higher',
]
X_prediction_features = df2[prediction_feature_columns]


In [15]:
from sklearn.metrics import classification_report

#Baseline: predict a home win for every game in the test set. It is kept in its own
#variable so the training labels in y_true are not overwritten; re-running the
#grid-search cell after this one would otherwise fit on the wrong labels.
y_baseline = np.ones(len(y_test), dtype=bool)

#Predictions from the best estimator found by the grid search
y_pred = grid.predict(X_prediction_features)
print(classification_report(y_test,y_pred))

#Accuracy of the always-home baseline versus accuracy of the fitted model
print('Number to beat', np.mean(y_test==y_baseline))
print ('My Predictor', np.mean(y_test==y_pred))

             precision    recall  f1-score   support

      False       0.56      0.56      0.56       211
       True       0.71      0.71      0.71       324

avg / total       0.65      0.65      0.65       535

Number to beat 0.605607476636
My Predictor 0.65046728972
