# Predict NBA Games Tutorial

The following is an exploratory notebook for predicting the results of historical NBA games. This notebook follows the tutorial by Vikas Paruchuri of Dataquest, which can be found [here](https://www.youtube.com/watch?v=egTylm6C2is) and in [this repo](https://github.com/dataquestio/project-walkthroughs/tree/master/nba_games).

The dataset for this notebook can be downloaded [here](https://drive.google.com/uc?export=download&id=1YyNpERG0jqPlpxZvvELaNcMHTiKVpfWe) and should be saved as "../data/raw/tutorial_nba_games.csv".

In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the raw game data; the CSV's first column becomes the DataFrame index.
# NOTE: this raises FileNotFoundError unless the dataset has been saved at
# exactly this relative path.
df = pd.read_csv("../data/raw/tutorial_nba_games.csv", index_col=0) # read NBA data, first col is our pandas index
df

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/tutorial_nba_games.csv'

In [None]:
df = df.sort_values("date")
df = df.reset_index(drop=True) # drop old index
df

In [None]:
# Remove extraneous columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]
df

In [None]:
# This adds a "target" column to our data frame, which is what we should predict as the outcome of the subsequent game
def add_target(team):
    """Return a copy of ``team`` with a ``target`` column appended.

    ``target`` holds the ``won`` value of the following row (``won`` shifted
    up by one position), so the last row, which has no successor, gets a
    missing value.  The frame passed in is not modified.
    """
    next_result = team["won"].shift(-1)
    return team.assign(target=next_result)

# Per team, label each game with the result of that team's next game.
# The shift is done on the "won" column only: handing whole group frames
# (grouping column included) to DataFrameGroupBy.apply is deprecated since
# pandas 2.2.  Row order and index of df are unchanged.
df = df.assign(target=df.groupby("team")["won"].shift(-1))

In [None]:
df["team"]

In [None]:
df[df["team"] == "TOR"]

In [None]:
# A team's last game has no following game, so its shifted target is null.
# Mark those rows with the sentinel class 2 so the column has no missing values.
df.loc[pd.isnull(df["target"]), "target"] = 2

In [None]:
# Cast the labels to plain integers.  Nulls have already been replaced with
# the sentinel 2, so a failure here means unexpected data and should surface
# rather than silently leaving the column unconverted.
df["target"] = df["target"].astype(int)

In [None]:
# Check our dataset is balanced
df["won"].value_counts()

In [None]:
df["target"].value_counts()

In [None]:
nulls = pd.isnull(df)

In [None]:
nulls = nulls.sum()

In [None]:
nulls = nulls[nulls > 0]
nulls

In [None]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

In [None]:
df = df[valid_columns].copy()

In [None]:
df  # should have 142 columns

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Ridge classifier; the same instance is used for feature selection and
# for the season-by-season backtests further down.
rr = RidgeClassifier(alpha=1)
# Time-ordered cross-validation with 3 splits, used by the feature selector.
split = TimeSeriesSplit(n_splits=3)

# Forward selection: start from no features and add one at a time until
# 30 are selected, scoring candidates with the time-series splits.
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

In [None]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

In [None]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every candidate feature column with min-max scaling (default
# output range is [0, 1]) and write the scaled values back into df.
# NOTE(review): the scaler is fitted on all rows at once, so later seasons
# influence how earlier seasons are scaled — confirm this is acceptable.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [None]:
df

In [None]:
sfs.fit(df[selected_columns], df["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
predictors

In [None]:
def backtest(data, model, predictors, start=2, step=1):
    """Evaluate ``model`` season by season, training only on earlier seasons.

    The distinct values of ``data["season"]`` are sorted.  For every position
    ``start, start + step, ...`` in that sorted list, the model is refit on
    all rows whose season is strictly earlier and then predicts the rows of
    that season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``season`` column, a ``target`` column and every
        column listed in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.  It is refit in
        place on each iteration.
    predictors : list of str
        Names of the feature columns passed to the model.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict;
        the seasons before it are only ever used for training.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` (the ``target`` values) and ``prediction``,
        indexed like the predicted rows of ``data``.  If there is no season
        to predict (fewer than ``start + 1`` distinct seasons), an empty
        frame with those two columns is returned.
    """
    all_predictions = []
    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Strictly earlier seasons only, so no future games leak into training.
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])

        # Re-attach the test index so predictions line up with the actuals.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat raises ValueError on an empty list; return an empty result.
        return pd.DataFrame(columns=["actual", "prediction"])

    return pd.concat(all_predictions)

In [None]:
predictions = backtest(df, rr, predictors)

In [None]:
predictions

In [None]:
from sklearn.metrics import accuracy_score

# Rows whose actual label is the sentinel 2 have no next game to score
# against, so drop them before measuring accuracy.
predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])

In [None]:
# Share of games won for each value of the home flag (a baseline to beat).
# "won" is selected before aggregating so only that column is processed and
# the grouping column is not passed to the applied function (deprecated
# since pandas 2.2).
df.groupby("home")["won"].apply(lambda won: (won == 1).mean())

In [None]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [None]:
df_rolling

In [None]:
def find_team_averages(team, window=10):
    """Return trailing rolling means of the numeric and boolean columns.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of one group, in chronological order.
    window : int, default 10
        Number of consecutive rows averaged, current row included.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``, holding only its numeric and boolean columns
        in their original order.  The first ``window - 1`` rows are NaN
        because a full window is not yet available.  ``team`` itself is not
        modified.
    """
    # Boolean columns (e.g. a True/False "won" flag) are not matched by
    # include="number", so request them explicitly and cast to float so their
    # rolling mean is the fraction of True values.  Text columns are skipped.
    averaged_cols = team.select_dtypes(include=["number", "bool"]).astype("float64")
    return averaged_cols.rolling(window).mean()

# Compute a rolling average of past 10 games, grouped by team and season
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

In [None]:
df_rolling

In [None]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [None]:
df

In [None]:
df = df.dropna()
df

In [None]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up one row.

    Each position then holds the value from the following row; the last
    position becomes missing.  The index is unchanged.
    """
    return team[col_name].shift(periods=-1)

def add_col(df, col_name):
    """Return, for every row, the value of ``col_name`` in the same team's next row.

    Rows are grouped by the ``team`` column and ``col_name`` is shifted up by
    one within each group, so the last row of every team gets a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``team`` column and ``col_name``; rows are expected to
        be in chronological order.
    col_name : str
        Column whose next-row value is wanted.

    Returns
    -------
    pandas.Series
        Indexed exactly like ``df`` (same labels, same order), so it can be
        assigned straight back as a new column.
    """
    # GroupBy.shift is vectorised and always yields a Series aligned to df,
    # unlike groupby.apply with a per-group callback, whose result shape
    # depends on what the groups return.
    return df.groupby("team")[col_name].shift(-1)

# This will add information about the next future game (if we are the home team, who is the opp, and the date)

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [None]:
df

In [None]:
# Self-merge: match each row (left) with the row (right) whose team_opp_next
# equals this row's team and whose date_next is the same, and bring in that
# row's rolling-average columns.
# merge() defaults to an inner join, so rows without a match are dropped.
# Column names present on both sides get pandas' default _x (left) and
# _y (right) suffixes.
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]], 
    left_on=["team", "date_next"], 
    right_on=["team_opp_next", "date_next"]
)

In [None]:
full

In [None]:
# Check if the merge worked properly
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

In [None]:
# Exclude every object-dtype column of the merged frame, in addition to the
# columns already excluded earlier, from the set of candidate features.
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [None]:
removed_columns

In [None]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [None]:
sfs.fit(full[selected_columns], full["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
predictions = backtest(full, rr, predictors)

In [None]:
accuracy_score(predictions["actual"], predictions["prediction"])