In [1]:
import pandas as pd
df = pd.read_csv('nba_games_test.csv', index_col=0)
df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]




In [2]:
def add_target_ou(group):
    group["target_ou"] = group["ou_bet"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target_ou)

def add_target_spread(group):
    group["target_spread"] = group["spread_bet"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target_spread)

def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors="ignore")

dex = df[df.target_spread.isna()]['target_spread'].index
df.loc[dex,'target_spread'] = 2

dex = df[df.target_ou.isna()]['target_ou'].index
df.loc[dex,'target_ou'] = 2


nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [3]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

rr = RidgeClassifier(alpha=1)

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            min_samples_leaf=4,
                            n_estimators=400,
                            max_depth=70, 
                            )

split = TimeSeriesSplit(n_splits=3)

sfs_tar = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )
sfs_ou = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )
sfs_spread = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [4]:
removed_columns = ["season", "date", "won", "target",'target_spread','target_ou', "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [6]:
# added target as param to function, need to update rest of code
def backtest(data, model, predictors, target, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train[target])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test[target], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [7]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

df = df.dropna()

In [8]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

df["ou_next"] = add_col(df, "ou")
df["spread_next"] = add_col(df, "spread")
df["fav_next"] = add_col(df, "fav")

In [9]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], 
                right_on=["team_opp_next", "date_next"])

In [10]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [11]:
sfs_tar.fit(full[selected_columns], full["target"])


SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [12]:
sfs_ou.fit(full[selected_columns], full["target_ou"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [13]:
sfs_spread.fit(full[selected_columns], full["target_spread"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [14]:
# For moneyline
predictors = list(selected_columns[sfs_tar.get_support()])
predictions = backtest(full, rr, predictors, 'target')
from sklearn.metrics import accuracy_score
accuracy_score(predictions["actual"], predictions["prediction"])

0.6754662582214614

In [15]:
# For spread
predictors = list(selected_columns[sfs_spread.get_support()])
predictions = backtest(full, rr, predictors, 'target_spread')
accuracy_score(predictions["actual"], predictions["prediction"])

0.5148211550590144

In [16]:
# For ou
predictors = list(selected_columns[sfs_ou.get_support()])
predictions = backtest(full, rr, predictors, 'target_ou')
accuracy_score(predictions["actual"], predictions["prediction"])

0.5089647716010451

In [17]:
next_30 = df[df.target==2]
next_game = pd.read_csv('next_game_30.csv',index_col=0)
make_pred = next_30.merge(next_game,how='left',on=['team'])

make_pred.home_next = make_pred.HOME
make_pred.team_opp_next = make_pred.OPPONENT
make_pred.date_next = make_pred.DATE

del make_pred['DATE']
del make_pred['OPPONENT']
del make_pred['HOME']

import numpy as np
rows = df[df['target']==2].index
df.loc[rows,['target','team','team_opp_next','date_next','home_next']] = np.array(make_pred[['target','team','team_opp_next','date_next','home_next']])

In [18]:
next_30 = df[df.target==2]
odds_tonight = pd.read_csv('odds_tonight.csv',index_col=0)
combined = next_30.merge(odds_tonight,how='left',on=['team'])
combined.fav_next = combined.favO
combined.ou_next = combined.ouO
combined.spread_next = combined.spreadO

del combined['favO']
del combined['ouO']
del combined['spreadO']

import numpy as np
rows = df[df['target']==2].index
df.loc[rows,['spread_next','fav_next','ou_next']] = np.array(combined[['spread_next','fav_next','ou_next']])

In [19]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]],  left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

In [20]:
predictions = backtest(full, rr, predictors,'target')
ml = pd.concat([full,predictions],axis=1)
import datetime
today = str(datetime.date.today())
ml[(ml.target==2) & (ml.date_next==today)][['date_next','team_x','team_y','prediction']].sort_values(['date_next','team_x'])

Unnamed: 0,date_next,team_x,team_y,prediction
15346,2023-01-24,BOS,MIA,1.0
15345,2023-01-24,CHA,PHX,1.0
15331,2023-01-24,CHI,IND,1.0
15351,2023-01-24,CLE,NYK,1.0
15329,2023-01-24,DAL,WAS,0.0
15335,2023-01-24,DEN,NOP,1.0
15353,2023-01-24,IND,CHI,0.0
15327,2023-01-24,LAC,LAL,0.0
15344,2023-01-24,LAL,LAC,1.0
15341,2023-01-24,MIA,BOS,0.0


In [21]:
predictions = backtest(full, rr, predictors,'target_ou')
ou = pd.concat([full,predictions],axis=1)
import datetime
today = str(datetime.date.today())
ou[(ou.target==2) & (ou.date_next==today)][['date_next','team_x','team_y','prediction']].sort_values(['date_next','team_x'])

Unnamed: 0,date_next,team_x,team_y,prediction
15346,2023-01-24,BOS,MIA,1.0
15345,2023-01-24,CHA,PHX,1.0
15331,2023-01-24,CHI,IND,1.0
15351,2023-01-24,CLE,NYK,0.0
15329,2023-01-24,DAL,WAS,0.0
15335,2023-01-24,DEN,NOP,0.0
15353,2023-01-24,IND,CHI,0.0
15327,2023-01-24,LAC,LAL,0.0
15344,2023-01-24,LAL,LAC,1.0
15341,2023-01-24,MIA,BOS,1.0


In [22]:
predictions = backtest(full, rr, predictors,'target_spread')
spread = pd.concat([full,predictions],axis=1)
import datetime
today = str(datetime.date.today())
spread[(spread.target==2) & (spread.date_next==today)][['date_next','team_x','team_y','prediction']].sort_values(['date_next','team_x'])

Unnamed: 0,date_next,team_x,team_y,prediction
15346,2023-01-24,BOS,MIA,0.0
15345,2023-01-24,CHA,PHX,0.0
15331,2023-01-24,CHI,IND,0.0
15351,2023-01-24,CLE,NYK,0.0
15329,2023-01-24,DAL,WAS,1.0
15335,2023-01-24,DEN,NOP,1.0
15353,2023-01-24,IND,CHI,1.0
15327,2023-01-24,LAC,LAL,1.0
15344,2023-01-24,LAL,LAC,0.0
15341,2023-01-24,MIA,BOS,0.0
