In [1]:
import pandas as pd

In [None]:
# Load the games table from nba_games.csv; the file's first column becomes the index.
df = pd.read_csv("nba_games.csv", index_col=0)

In [None]:
# Display the frame as loaded.
df

In [None]:
# Order the rows by the "date" column (ascending is the sort_values default).
# NOTE(review): if "date" is stored as text this is a lexicographic sort —
# confirm the stored format also sorts chronologically.
df = df.sort_values(by="date")

In [None]:
# Replace the index with a fresh positional one; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [None]:
# Remove three columns from the frame in place.
for unwanted in ("mp.1", "mp_opp.1", "index_opp"):
    del df[unwanted]

In [None]:
def add_target(group):
    """Return a copy of ``group`` with a ``target`` column added.

    ``target`` holds the value of ``won`` from the following row, so each row
    is labelled with the outcome of the next row in the same group. The last
    row has no successor and therefore gets a missing value.

    The input frame is left untouched: pandas documents that functions passed
    to ``groupby.apply`` must not mutate the group they receive.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain a ``won`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns plus ``target``.
    """
    return group.assign(target=group["won"].shift(-1))

# Label every game with the result of the same team's next game.
# A column-wise grouped shift is used instead of groupby(...).apply(add_target):
# apply on a function that needs the grouping column is deprecated in
# pandas 2.2 and, once grouping columns are excluded from the groups, the
# "team" column would disappear from the result.
df["target"] = df.groupby("team")["won"].shift(-1)

In [None]:
# Show only the rows whose team is "WAS".
df[df["team"] == "WAS"]

In [None]:
# Rows without a following game have a missing target; mark them with the
# placeholder class 2. A single .loc assignment is used because the chained
# form df["target"][mask] = 2 writes to a temporary object under pandas
# copy-on-write and would leave df unchanged.
df.loc[pd.isnull(df["target"]), "target"] = 2
# Cast to int; errors="ignore" returns the column unchanged if the cast fails.
df["target"] = df["target"].astype(int, errors="ignore")

In [None]:
# Frequency of each value in "won".
df["won"].value_counts()

In [None]:
# Frequency of each value in "target" (including the placeholder 2).
df["target"].value_counts()

In [None]:
# Number of missing values in each column.
nulls = df.isnull().sum()

In [None]:
# Keep only the columns that actually contain missing values.
nulls = nulls[nulls.gt(0)]

In [None]:
# Column labels that do not appear among the columns with missing values.
has_missing = df.columns.isin(nulls.index)
valid_columns = df.columns[~has_missing]

In [None]:
# Display the labels of the columns that have no missing values.
valid_columns

In [None]:
# Restrict the frame to the fully populated columns; copy() makes the result
# independent of the original frame.
df = df.loc[:, valid_columns].copy()

In [None]:
# Display the frame after restricting it to columns without missing values.
df

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Ridge classifier used by the feature selector and, later, by the backtest.
rr = RidgeClassifier(alpha=1)

# Three-fold time-series splitter used for the selector's cross-validation.
split = TimeSeriesSplit(n_splits=3)

# Forward sequential selection of 30 features, run in a single process.
sfs = SequentialFeatureSelector(
    estimator=rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
    n_jobs=1,
)

In [None]:
# Columns that must not be offered to the model as features.
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

# Every remaining column is a candidate feature.
is_removed = df.columns.isin(removed_columns)
selected_columns = df.columns[~is_removed]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the candidate feature columns with a min-max scaler and write the
# scaled values back into the same columns.
# NOTE(review): the scaler is fitted on the whole frame, so seasons that are
# later used as backtest test sets also influence the scaling — confirm this
# is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[selected_columns])
df[selected_columns] = scaled_values

In [None]:
# Display the frame after scaling the feature columns.
df

In [None]:
# Run the sequential feature selection on the candidate columns against "target".
sfs.fit(X=df[selected_columns], y=df["target"])

In [None]:
# Names of the columns the selector kept (get_support() is a boolean mask).
predictors = selected_columns[sfs.get_support()].tolist()

In [None]:
# Display the selected feature names.
predictors

In [None]:
def backtest(data, model, predictors, start=2, step=1):
    """Evaluate ``model`` season by season on an expanding training window.

    The distinct values of ``data["season"]`` are sorted; for each position
    ``start, start + step, ...`` in that ordering the model is fitted on all
    rows from strictly earlier seasons and used to predict the rows of the
    season at that position.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season``, ``target`` and every column in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    predictors : list
        Column names used as model inputs.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` and ``prediction``, indexed like the predicted rows,
        with the per-season results stacked in season order.
    """
    ordered_seasons = sorted(data["season"].unique())
    season_of_row = data["season"]

    per_season_results = []
    for position in range(start, len(ordered_seasons), step):
        current = ordered_seasons[position]
        history = data[season_of_row < current]
        holdout = data[season_of_row == current]

        model.fit(history[predictors], history["target"])

        # Re-index the raw predictions so they line up with the holdout rows.
        guesses = pd.Series(model.predict(holdout[predictors]), index=holdout.index)
        per_season_results.append(
            pd.concat(
                [holdout["target"], guesses],
                axis=1,
                keys=["actual", "prediction"],
            )
        )

    return pd.concat(per_season_results)

In [None]:
# Backtest the ridge classifier on the selected features.
predictions = backtest(data=df, model=rr, predictors=predictors)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of backtest rows where the prediction equals the actual target.
# NOTE(review): backtest does not filter out rows whose target is the
# placeholder value 2, so they count toward this score — confirm that is intended.
accuracy_score(predictions["actual"], predictions["prediction"])

In [None]:
# Share of games with won == 1 for each value of "home".
# Computed as a vectorised group mean of the boolean mask rather than a Python
# lambda through groupby.apply, which is slower and triggers the pandas 2.2
# deprecation warning about apply operating on the grouping columns.
(df["won"] == 1).groupby(df["home"]).mean()

In [None]:
# Display the current frame.
df

In [None]:
# Feature columns plus the result and the two grouping keys needed for the
# rolling averages.
df_rolling = df[[*selected_columns, "won", "team", "season"]]

def find_team_averages(team, window=10):
    """Return the rolling mean of the numeric and boolean columns of ``team``.

    Non-numeric columns (for example a string team code) are left out before
    the rolling mean is taken: recent pandas versions raise ``DataError``
    instead of silently skipping columns that cannot be averaged.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of one group, already in the order the window should slide over.
    window : int, default 10
        Number of consecutive rows in each window.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``; each of the first ``window - 1`` rows is
        missing because its window is not yet full.
    """
    averageable = team.select_dtypes(include=["number", "bool"])
    return averageable.rolling(window).mean()

# Compute the rolling averages separately for every (team, season) pair so a
# window never spans two teams or two seasons.
# NOTE: this rebinds df_rolling, so the cell cannot simply be re-run on its own.
per_team_season = df_rolling.groupby(["team", "season"], group_keys=False)
df_rolling = per_team_season.apply(find_team_averages)

In [None]:
# Display the rolling averages.
df_rolling

In [None]:
# Give every rolling-average column a "_10" suffix so it cannot clash with the
# per-game column of the same name.
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the rolling columns next to the per-game columns (aligned on the index).
df = pd.concat([df, df_rolling], axis="columns")

In [None]:
# Drop every row that still has a missing value; this includes the rows whose
# rolling window was not yet full.
df = df.dropna()

In [None]:
# Display the frame after dropping rows with missing values.
df

In [None]:
def shift_col(team, col_name):
    """Return column ``col_name`` of ``team`` moved up by one row.

    Each position receives the value of the following row; the last position
    has no successor and becomes missing. The index is unchanged and ``team``
    itself is not modified.
    """
    return team[col_name].shift(-1)

def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    Rows are grouped by ``team`` and the column is shifted up by one inside
    each group, so the last row of every team gets a missing value.

    A grouped ``shift`` is used instead of ``groupby.apply`` with a function
    returning a Series: that form yields a wide DataFrame rather than a Series
    when all groups share the same index (for example when only one team is
    present), and it depends on how ``apply`` treats the grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``team`` and ``col_name``.
    col_name : str
        Column to look ahead in.

    Returns
    -------
    pandas.Series
        Indexed like ``df`` and named ``col_name``.
    """
    return df.groupby("team")[col_name].shift(-1)

# Attach, for every row, three facts about the same team's next row:
# its "home" value, its opponent and its date.
for new_name, source in (
    ("home_next", "home"),
    ("team_opp_next", "team_opp"),
    ("date_next", "date"),
):
    df[new_name] = add_col(df, source)

In [None]:
# Display the frame with the next-game columns added.
df

In [None]:
# Pair every row with the rolling columns of the row whose "team_opp_next" is
# this row's team and whose "date_next" is the same. The merge uses the default
# inner join, so rows without such a partner are dropped; columns present on
# both sides receive the default _x / _y suffixes.
opponent_side = df[rolling_cols + ["team_opp_next", "date_next", "team"]]
full = df.merge(
    opponent_side,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)

In [None]:
# Display the merged frame.
full

In [None]:
# Show the team / next-opponent columns from both sides of the merge together
# with the shared next-game date.
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

In [None]:
# Exclude every object-dtype column of the merged frame in addition to the
# columns excluded earlier.
# NOTE: re-running this cell prepends the object columns again, because
# removed_columns is extended from its own previous value.
object_columns = full.columns[full.dtypes == "object"].tolist()
removed_columns = object_columns + removed_columns

In [None]:
# Display the full list of excluded columns.
removed_columns

In [None]:
# Candidate features of the merged frame: everything not explicitly excluded.
is_excluded = full.columns.isin(removed_columns)
selected_columns = full.columns[~is_excluded]

# Re-run the sequential feature selection on the merged frame.
sfs.fit(X=full[selected_columns], y=full["target"])

In [None]:
# Names of the columns kept by the second feature-selection run.
predictors = selected_columns[sfs.get_support()].tolist()

In [None]:
# Display the newly selected feature names.
predictors

In [None]:
# Backtest the ridge classifier on the merged frame with the new features.
predictions = backtest(data=full, model=rr, predictors=predictors)

In [None]:
# Fraction of backtest rows on the merged frame where the prediction equals
# the actual target.
accuracy_score(predictions["actual"], predictions["prediction"])