In [39]:
import pandas as pd
# Load the prepared match table from the local parquet file.
df = pd.read_parquet("df_final.parquet")

In [40]:
# Chronological split: rows with game_date in [begin_date, cutoff_date) form
# the training set; rows on or after cutoff_date are held out for testing.
begin_date = "2020-01-01"
cutoff_date = "2020-04-01"

train = df.loc[(df["game_date"] >= begin_date) & (df["game_date"] < cutoff_date)]
test = df.loc[df["game_date"] >= cutoff_date]

In [41]:
def make_training_rows(df):
    """Expand every match into two mirrored training rows.

    For each row of ``df`` two rows are emitted, in this order:

    * winner perspective: differences computed as ``winner - loser`` and
      ``h2h_pre`` / ``recent_form_diff`` copied as-is, with ``label`` = 1;
    * loser perspective: differences computed as ``loser - winner`` and
      ``h2h_pre`` / ``recent_form_diff`` negated, with ``label`` = 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table containing the columns ``winner_elo_pre``,
        ``loser_elo_pre``, ``winner_elo_surf_pre``, ``loser_elo_surf_pre``,
        ``h2h_pre`` and ``recent_form_diff``. The last two are presumably
        expressed from the winner's side (they are sign-flipped for the
        loser row) -- verify against the code that builds them.

    Returns
    -------
    pandas.DataFrame
        ``2 * len(df)`` rows with columns ``elo_diff``, ``elo_surf_diff``,
        ``h2h_pre``, ``recent_form_diff`` and ``label``, indexed 0..2n-1.
        An empty input yields an empty frame that still carries these
        columns, so callers can drop/select ``label`` without a KeyError.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Work on a positional index so the two perspectives can be interleaved
    # by a stable sort on that index further down.
    matches = df.reset_index(drop=True)

    winner_view = pd.DataFrame({
        "elo_diff": matches["winner_elo_pre"] - matches["loser_elo_pre"],
        "elo_surf_diff": matches["winner_elo_surf_pre"] - matches["loser_elo_surf_pre"],
        "h2h_pre": matches["h2h_pre"],
        "recent_form_diff": matches["recent_form_diff"],
    })
    winner_view["label"] = 1  # winner perspective

    loser_view = pd.DataFrame({
        "elo_diff": matches["loser_elo_pre"] - matches["winner_elo_pre"],
        "elo_surf_diff": matches["loser_elo_surf_pre"] - matches["winner_elo_surf_pre"],
        "h2h_pre": -matches["h2h_pre"],  # flip perspective
        "recent_form_diff": -matches["recent_form_diff"],
    })
    loser_view["label"] = 0  # loser perspective

    # mergesort is stable: for each match position the winner row (listed
    # first in the concat) stays ahead of its loser row, giving the
    # winner, loser, winner, loser, ... ordering.
    return (
        pd.concat([winner_view, loser_view])
        .sort_index(kind="mergesort")
        .reset_index(drop=True)
    )

In [42]:
# Mirror every match into a winner-view and a loser-view row for both splits.
train_data, test_data = make_training_rows(train), make_training_rows(test)

# Separate the feature columns from the binary target.
X_train, y_train = train_data.drop(columns="label"), train_data["label"]
X_test, y_test = test_data.drop(columns="label"), test_data["label"]

print(y_train.value_counts())  # should now show both 0 and 1

label
1    469
0    469
Name: count, dtype: int64


In [43]:
# Resample the training rows with SMOTE (fixed seed for repeatability).
# NOTE(review): the recorded output shows identical shapes and class counts
# before and after (469 / 469), so SMOTE added no synthetic rows in that run
# -- confirm whether this step is still needed.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the shape and per-class counts on either side of the resampling.
for _stage, _X, _y in (
    ("Before SMOTE:", X_train, y_train),
    ("After SMOTE:", X_train_res, y_train_res),
):
    print(_stage, _X.shape, dict(pd.Series(_y).value_counts()))

# Reassemble features and target into one frame; the target is stored under
# the "Winner" column.
df_train_res = pd.DataFrame(X_train_res, columns=X_train.columns).assign(
    Winner=y_train_res.values
)

# Write the resampled training set to CSV without the index column.
df_train_res.to_csv("basketball_train_balanced.csv", index=False)
print("✅ Balanced training dataset saved as basketball_train_balanced.csv")

Before SMOTE: (938, 4) {1: np.int64(469), 0: np.int64(469)}
After SMOTE: (938, 4) {1: np.int64(469), 0: np.int64(469)}
✅ Balanced training dataset saved as basketball_train_balanced.csv


In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [45]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline on the resampled training rows, then
# predict on the test rows (which were not resampled).
model = LogisticRegression(random_state=42).fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6134777376654633
              precision    recall  f1-score   support

           0       0.61      0.61      0.61      4155
           1       0.61      0.61      0.61      4155

    accuracy                           0.61      8310
   macro avg       0.61      0.61      0.61      8310
weighted avg       0.61      0.61      0.61      8310

