In [1]:
import pandas as pd
df = pd.read_csv('nba_games_updated.csv', index_col=0)
df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]




In [2]:
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors="ignore")

nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [3]:
df.target.value_counts()

0    9699
1    9695
2      30
Name: target, dtype: int64

In [4]:
# from sklearn import datasets, ensemble


# params = {
#     "n_estimators": 500,
#     "max_depth": 4,
#     "min_samples_split": 5,
#     "learning_rate": 0.01,
#     "loss": "squared_error",
# }

# reg = ensemble.GradientBoostingRegressor(**params)

In [5]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

rr = RidgeClassifier(alpha=1)

rf = RandomForestClassifier(max_depth=3, max_samples = .4, n_estimators = 100)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rf, 
                                n_features_to_select=30, 
                                direction="backward",
                                cv=split,
                                n_jobs=1
                               )

In [6]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [8]:
def backtest(data, model, predictors, target, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train[target])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test[target], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [9]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

df = df.dropna()

In [10]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])


In [11]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [12]:
#Using saved pickle in next cell
#sfs.fit(full[selected_columns], full["target"])

In [13]:
import pickle
filename = 'rr_model.sav'
#pickle.dump(sfs, open(filename, 'wb'))
sfs = pickle.load(open(filename, 'rb'))

In [14]:
# Train rf 'forward'
#sfs.fit(full[selected_columns], full["target"])
#import pickle
#filename = 'rf_model.sav'
#pickle.dump(sfs, open(filename, 'wb'))
#sfs = pickle.load(open(filename, 'rb'))

In [15]:
predictors = list(selected_columns[sfs.get_support()])
predictions = backtest(full, rr, predictors, 'target')

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions["actual"], predictions["prediction"])

0.6384498615947852

In [17]:
# .6216 accuracy

In [18]:
next_game = pd.read_csv('next_game_30.csv',index_col=0)

In [19]:
# Need to do this before merging into 'Full'
df.team = df.team.str.replace('CHO','CHA').str.replace('PHO','PHX').str.replace('BRK','BKN')
df.team_opp = df.team_opp.str.replace('CHO','CHA').str.replace('PHO','PHX').str.replace('BRK','BKN')
df.team_opp_next = df.team_opp_next.str.replace('CHO','CHA').str.replace('PHO','PHX').str.replace('BRK','BKN')

In [20]:
next_30 = df[df.target==2]

In [21]:
make_pred = next_30.merge(next_game,how='left',on=['team'])

In [22]:
make_pred.home_next = make_pred.HOME
make_pred.team_opp_next = make_pred.OPPONENT
make_pred.date_next = make_pred.DATE

del make_pred['DATE']
del make_pred['OPPONENT']
del make_pred['HOME']


In [23]:
import numpy as np
rows = df[df['target']==2].index
df.loc[rows,['target','team','team_opp_next','date_next','home_next']] = np.array(make_pred[['target','team','team_opp_next','date_next','home_next']])

In [24]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]],  left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

In [25]:
predictions = backtest(full, rr, predictors,'target')

In [26]:
ml = pd.concat([full,predictions],axis=1)

In [27]:
import datetime
today = str(datetime.date.today())
ml[(ml.target==2) & (ml.date_next==today)][['date_next','team_x','team_y','prediction']].sort_values(['date_next','team_x']).sort_values('prediction',ascending=False)

Unnamed: 0,date_next,team_x,team_y,prediction
17104,2023-01-24,BOS,MIA,1.0
17105,2023-01-24,CHI,IND,1.0
17089,2023-01-24,DAL,WAS,1.0
17098,2023-01-24,DEN,NOP,1.0
17088,2023-01-24,LAL,LAC,1.0
17093,2023-01-24,PHX,CHA,1.0
17102,2023-01-24,CHA,PHX,0.0
17077,2023-01-24,CLE,NYK,0.0
17076,2023-01-24,IND,CHI,0.0
17090,2023-01-24,LAC,LAL,0.0
