In [1]:
import pandas as pd
import requests
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import numpy as np
from sklearn.preprocessing import StandardScaler
import sklearn as sk

In [2]:
# Make sure s3fs is installed: pip install s3fs
# s3_path = "s3://my-cfb-data-894398043980/filtered-data/"

# Read all Parquet files in that folder into a single DataFrame
# df = pd.read_parquet(s3_path, engine="pyarrow")

In [2]:
# Website information
api_key = os.environ["API_KEY"]

df = pd.DataFrame()

for i in range(2021, 2025):
    url = "https://api.collegefootballdata.com/stats/game/advanced"
    params = {"year": i, "excludeGarbageTime": True}
    headers = {"Authorization": f"Bearer {api_key}"}

    # Call the API
    r = requests.get(url, params=params, headers=headers)
    data = pd.json_normalize(r.json())

    df = pd.concat([df, data], axis=0, ignore_index=True)

In [3]:
game_results = pd.DataFrame()

for i in range(2021, 2025):
    url = "https://api.collegefootballdata.com/games"
    params = {"year": i}
    headers = {"Authorization": f"Bearer {api_key}"}

    # Call the API
    r = requests.get(url, params=params, headers=headers)
    data = pd.json_normalize(r.json())

    game_results = pd.concat([game_results, data], axis=0, ignore_index=True)

In [4]:
main_df = pd.merge(df, game_results, left_on="gameId", right_on="id", how="left")

In [6]:
main_df = main_df.drop_duplicates(subset='gameId', keep='first')

main_df["team_win"] = (
    ((main_df["team"] == main_df["homeTeam"]) & (main_df["homePoints"] > main_df["awayPoints"])) |
    ((main_df["team"] == main_df["awayTeam"]) & (main_df["awayPoints"] > main_df["homePoints"]))
).astype(int)

main_df["home_indicator"] = (main_df["homeTeam"] == main_df["team"]).astype(int)

columns_to_extract = [
   "offense.totalPPA",
    "offense.successRate",
   "offense.explosiveness",
   "offense.rushingPlays.totalPPA",
   "offense.passingPlays.totalPPA",
    "home_indicator",
    "defense.totalPPA",
    "defense.successRate",
    "defense.explosiveness",
    "defense.rushingPlays.totalPPA",
    "defense.passingPlays.totalPPA"
]

# Extract those columns
X = main_df[columns_to_extract]

y = main_df["team_win"]


In [7]:
from sklearn.preprocessing import StandardScaler

# Suppose your DataFrame is df
numeric_cols = X.select_dtypes(include='number').columns

scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X.insert(0, "const", 1.0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


In [8]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Training accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

Training accuracy: 0.8514825505116768
Test accuracy: 0.8396572827417381


In [9]:
y_pred_prob = model.predict_proba(X_test)[:, 1]
main_df['prediction'] = model.predict(X)
y_pred_class = (y_pred_prob >= 0.5).astype(int)

y_true = np.array(y_test)
y_pred = np.array(y_pred_class)
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

tn, fp, fn, tp = cm.ravel()

# Metrics
accuracy = (tp + tn) / (tn + fp + fn + tp)
print(f"Accuracy: {accuracy:.3f}")
print(f"True Positives: {tp}, True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}")

Confusion Matrix:
 [[682 134]
 [128 690]]
Accuracy: 0.840
True Positives: 690, True Negatives: 682, False Positives: 134, False Negatives: 128


In [10]:
bucket = "my-cfb-data-894398043980"
key = "filtered-data/"
s3_path = f"s3://{bucket}/{key}"

# Read parquet directly from S3
new_data = pd.read_parquet(s3_path, engine="pyarrow")  # or engine="fastparquet"

In [11]:
# Suppose your DataFrame is df

columns_to_extract = [
    "offense.totalppa",
    "offense.successrate",
    "offense.explosiveness",
    "offense.rushingplays.totalppa",
    "offense.passingplays.totalppa",
    "home_indicator",
    "defense.totalppa",
    "defense.successrate",
    "defense.explosiveness",
    "defense.rushingplays.totalppa",
    "defense.passingplays.totalppa"
]

new_data["home_indicator"] = (new_data["hometeam"] == new_data["team"]).astype(int)


X_new = new_data[columns_to_extract]

numeric_cols = X_new.select_dtypes(include='number').columns

scaler = StandardScaler()
X_new[numeric_cols] = scaler.fit_transform(X_new[numeric_cols])

X_new = sm.add_constant(X_new)

new_data['prediction'] = result.predict(X_new)

new_data["team_win"] = (
    ((new_data["team"] == new_data["hometeam"]) & (new_data["homepoints"] > new_data["awaypoints"])) |
    ((new_data["team"] == new_data["awayteam"]) & (new_data["awaypoints"] > new_data["homepoints"]))
).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new[numeric_cols] = scaler.fit_transform(X_new[numeric_cols])


In [14]:
import json

# Save model coefficients and scaler params
model_data = {
    "coef": model.coef_.tolist(),
    "intercept": model.intercept_.tolist(),
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_scale": scaler.scale_.tolist(),
    "feature_order": list(X_train.columns)
}

with open("model.json", "w") as f:
    json.dump(model_data, f)
