<a href="https://colab.research.google.com/github/ryan-montoyo/NBA-Predict/blob/main/NBA_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**NBA Predictive Model**

##**Load Data**

In [None]:
import pandas as pd

# read our csv file using pandas

df = pd.read_csv('nba_games.csv', index_col = 0)



##**Data Preparation**

###**Reorganizing Data**

In [None]:
# sort by date for chronology
df = df.sort_values('date')

#reset indices
df = df.reset_index(drop=True)


#remove extraneous columns

del df['mp.1']
del df['mp_opp.1']
del df['index_opp']

In [None]:
# function adds a column to csv called target indicating if the team won or lost the next game (True = won, False = lost)

def add_target(team):
  team['target'] = team['won'].shift(-1)
  return team

df = df.groupby('team', group_keys = False).apply(add_target)

In [None]:
df[df['team'] == "NYK"]


In [None]:
# we are looking for null values under target which indicate no next game
# turn nulls into 2 and dont worry about errors that come from converting bool to int

df["target"][pd.isnull(df['target'])] = 2

df['target'] = df['target'].astype(int, errors="ignore")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df['target'])] = 2


In [None]:
df['won'].value_counts()

won
False    8886
True     8886
Name: count, dtype: int64

In [None]:
df['target'].value_counts()

target
1    8872
0    8870
2      30
Name: count, dtype: int64

In [None]:
#remove any nulls in our data set
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls = nulls[nulls > 0]

valid_columns = df.columns[~df.columns.isin(nulls.index)]

valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [None]:
# copy with no nulls
# data is ready

df = df[valid_columns].copy()


##**Train Model**

###**Feature Selector**

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits = 3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction = 'forward', cv = split)


In [None]:
removed_columns = ['season', 'date', 'won', 'target', 'team', 'team_opp']

selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])



In [None]:
sfs.fit(df[selected_columns], df['target'])

In [None]:
predictors = list(selected_columns[sfs.get_support()])


In [None]:
def backtest(data, model, predictors, start = 2, step = 1):
  all_predictions = []

  seasons = sorted(data['season'].unique())

  for i in range(start, len(seasons), step):
    season = seasons[i]

    train = data[data['season'] < season]
    test = data[data['season'] == season]

    model.fit(train[predictors], train['target'])

    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index = test.index)

    combined = pd.concat([test['target'], preds], axis = 1)
    combined.columns = ['actual', 'prediction']

    all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
predictions = backtest(df, rr, predictors)
predictions

In [None]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions['actual'] !=2]
accuracy_score(predictions['actual'], predictions['prediction'])

0.5487804878048781

###Improving Model with rolling averages

In [None]:
df.groupby('home').apply(lambda x: x[x['won'] == 1].shape[0] / x.shape[0])

df_rolling = df[list(selected_columns) + ['won', 'team', 'season']]

df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.079,0.679245,0.277279,0.554502,0.317647,0.451923,1.0,False,NOP,2016
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.140,0.509434,0.160462,0.345972,0.317647,0.317308,1.0,False,CLE,2016
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.185,0.270440,0.088575,0.232227,0.329412,0.298077,0.0,True,CHI,2016
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.063,0.344864,0.215661,0.530806,0.505882,0.298077,0.0,True,GSW,2016
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.047,0.300839,0.019255,0.203791,0.317647,0.403846,0.0,False,ATL,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.103,0.401468,0.182285,0.208531,0.411765,0.413462,0.0,False,BOS,2022
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.124,0.423480,0.928113,1.000000,0.411765,0.288462,0.0,True,GSW,2022
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.076,0.300839,0.181001,0.630332,0.352941,0.384615,1.0,False,BOS,2022
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.063,0.300839,0.120668,0.459716,0.400000,0.375000,0.0,False,BOS,2022


In [None]:
def find_team_averages(team):
  rolling = team.rolling(10).mean()
  return rolling

df_rolling = df_rolling.groupby(['team', 'season'], group_keys=False).apply(find_team_averages)
df_rolling

#38:48

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'