In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. src.process) are reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from src.process import processor

# Box-score stat column names, grouped by kind for readability.
# NOTE(review): this list is not referenced by any cell visible in this notebook —
# presumably it mirrors the stats handled inside the processor; confirm or remove.
STATS_TO_ADJUST = [
    # scoring and tempo
    'PTS', 'PACE',
    # field goals (all, then three-point)
    'FGM', 'FGA', '3PT_FGM', '3PT_FGA',
    # free throws
    'FTM', 'FTA',
    # rebounds
    'ORB', 'DRB', 'TRB',
    # remaining counting stats
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    # ratings
    'ORTG', 'DRTG',
    # two-point field goals
    '2PT_FGM', '2PT_FGA',
]

In [3]:
# Read box_reshaped_regular.csv and keep only rows whose SEASON label is
# '2020-21' or later. The comparison is on the raw label, so it relies on
# season labels of the form 'YYYY-YY' ordering chronologically as strings.
df = pd.read_csv("data/game/box_raw/box_reshaped_regular.csv").loc[
    lambda games: games['SEASON'] >= '2020-21'
]

In [4]:
# Instantiate the project's BaseModel from src.process.processor; its forward()
# and backward() methods are called in the cells below.
model = processor.BaseModel()

In [5]:
# Run the forward pass over the filtered box scores. Per the log printed below,
# this fits seasonal slopes and then calculates home, rest and opponent
# adjustments, season by season.
processed_df = model.forward(df)

fitting seasonal slopes for 2020-21
fitting seasonal slopes for 2021-22
fitting seasonal slopes for 2022-23
fitting seasonal slopes for 2023-24
calculating home adjustment for 2020-21
calculating home adjustment for 2021-22
calculating home adjustment for 2022-23
calculating home adjustment for 2023-24
calculating rest adjustment for 2020-21
calculating rest adjustment for 1in3
calculating rest adjustment for 2in3
calculating rest adjustment for 2in2
calculating rest adjustment for 2021-22
calculating rest adjustment for 1in3
calculating rest adjustment for 2in3
calculating rest adjustment for 2in2
calculating rest adjustment for 2022-23
calculating rest adjustment for 1in3
calculating rest adjustment for 2in3
calculating rest adjustment for 2in2
calculating rest adjustment for 2023-24
calculating rest adjustment for 1in3
calculating rest adjustment for 2in3
calculating rest adjustment for 2in2
calculating opponent adjustment for 2020-21
2020-21 opponent effect magnitude for iteration 

In [6]:
# Build the mean-filled frame from the forward-pass output.
# NOTE(review): judging by the helper's name, missing values are presumably
# replaced with means — confirm in src.process.processor.
mean_filled_df = processor.make_mean_filled_df(processed_df)

In [7]:
# Run the backward pass on the mean-filled frame. Per the log printed below, it
# applies the opponent, rest, home and seasonal-slope adjustments in that order.
# NOTE(review): this rebinds processed_df, so the forward-pass result from the
# earlier cell is no longer reachable under that name.
processed_df = model.backward(mean_filled_df)

applying opponent adjustment backward
applying rest adjustment backward
applying home adjustment backward
applying seasonal slopes backward


In [29]:
# Load the same CSV a second time and apply the same '2020-21'-or-later SEASON
# filter, giving an unprocessed frame to pair with the processed one.
original_df = pd.read_csv("data/game/box_raw/box_reshaped_regular.csv")
recent_season_mask = original_df['SEASON'].ge('2020-21')
original_df = original_df.loc[recent_season_mask]

In [30]:
# Combine the backward-processed frame with the reloaded raw box scores into the
# training table.
# NOTE(review): the columns used by later cells (HOME_*/AWAY_* stats, HOME_WIN,
# HOME_REST, AWAY_REST, HOME_Y, AWAY_Y) are presumably created here — confirm in
# processor.make_training_df.
training_df = processor.make_training_df(processed_df, original_df)

In [31]:
# Drop the team, season and date columns so only the remaining columns
# are carried forward into the modelling frame.
simplified_training_df = training_df.drop(
    ['HOME_TEAM', 'AWAY_TEAM', 'SEASON', 'DATE'],
    axis='columns',
)

In [32]:
# Add PTS_DIFF as HOME_PTS minus AWAY_PTS (positive when the home value is larger).
simplified_training_df['PTS_DIFF'] = simplified_training_df['HOME_PTS'].sub(
    simplified_training_df['AWAY_PTS']
)

In [33]:
# Column typing: the two rest columns become categoricals, HOME_WIN becomes a
# boolean, and every other column is cast to float.
non_numeric_cols = ['HOME_REST', 'AWAY_REST', 'HOME_WIN']
numeric_cols = [
    col for col in simplified_training_df.columns
    if col not in non_numeric_cols
]

# Single astype call driven by a per-column dtype map.
dtype_by_column = {'HOME_REST': 'category', 'AWAY_REST': 'category', 'HOME_WIN': bool}
dtype_by_column.update({col: float for col in numeric_cols})
simplified_training_df = simplified_training_df.astype(dtype_by_column)

In [34]:
# Scale every numeric value by the largest numeric value found in its own row
# (max taken over axis=1, division aligned on the row index).
# NOTE(review): this is a per-row scaling rather than the more common per-column
# one, and the row maximum is computed while HOME_Y / AWAY_Y are still part of
# numeric_cols — confirm this is intended.
numeric_block = simplified_training_df[numeric_cols]
simplified_training_df[numeric_cols] = numeric_block.div(numeric_block.max(axis=1), axis=0)

In [35]:
# Remove the HOME_Y / AWAY_Y columns, rebinding the name to the returned frame
# rather than mutating the existing one in place.
simplified_training_df = simplified_training_df.drop(['HOME_Y', 'AWAY_Y'], axis='columns')

In [61]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split

# Separate the HOME_WIN label from the predictor columns.
y = simplified_training_df['HOME_WIN']
X = simplified_training_df.drop('HOME_WIN', axis='columns')

# Hyper-parameters for the boosted classifier. enable_categorical is switched on,
# presumably for the category-typed HOME_REST / AWAY_REST columns.
params = dict(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.05,
    min_child_weight=30,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=1,
    random_state=2002,
    enable_categorical=True,
)
xgb_model = xgb.XGBClassifier(**params)

# Hold out 20% of the rows; cross-validation below only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-fold cross-validation on the training split, reported as mean and 2x std.
scores = cross_val_score(xgb_model, X_train, y_train, cv=5)
print(f"cross-validation accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Fit on the full training split, then score it on both splits.
xgb_model.fit(X_train, y_train)
train_score = xgb_model.score(X_train, y_train)
test_score = xgb_model.score(X_test, y_test)
print(f"train set accuracy: {train_score:.3f}")
print(f"test set accuracy: {test_score:.3f}")


cross-validation accuracy: 0.660 (+/- 0.026)
train set accuracy: 0.708
test set accuracy: 0.666


In [23]:
# Rank the predictor columns by the importance scores of the fitted XGBoost
# classifier. The estimator is `xgb_model`; the bare name `model` is the
# processor.BaseModel instance created near the top of the notebook, so reading
# `model.feature_importances_` only worked against stale kernel state.
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(importance.head(10))

Top 10 most important features:
      feature  importance
43   AWAY_PTS    0.069608
19   HOME_PTS    0.052587
10   HOME_FGM    0.040921
16  HOME_ORTG    0.024466
40  AWAY_ORTG    0.023359
37   AWAY_FTR    0.022145
13   HOME_FTR    0.021723
18    HOME_PF    0.020826
33   AWAY_FGA    0.020658
17  HOME_PACE    0.020516


In [17]:
# Correlation of each predictor with the HOME_WIN target, skipping the two
# categorical rest columns. Sorting is by signed value, so the head of the table
# shows the most positive correlations only.
numeric_features = [col for col in X.columns if col not in ['HOME_REST', 'AWAY_REST']]
correlations = pd.DataFrame({
    'feature': numeric_features,
    'correlation': [X[col].corr(y) for col in numeric_features],
}).sort_values('correlation', ascending=False)

print("\nTop correlations with HOME_WIN:")
print(correlations.head(10))


Top correlations with HOME_WIN:
         feature  correlation
19      HOME_PTS     0.262993
10      HOME_FGM     0.244255
16     HOME_ORTG     0.215648
17     HOME_PACE     0.162781
5       HOME_AST     0.160585
1   HOME_2PT_FGM     0.149037
9       HOME_FGA     0.138534
7       HOME_DRB     0.112954
4   HOME_3PT_FGM     0.112155
22      HOME_TRB     0.109395
