In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Load and Prepare Data ---
# Read the box-score CSV, fill missing ranks, and keep only games that have
# both final scores recorded.
df = pd.read_csv('../data/cfb_box-scores_2002-2024.csv')

# A missing rank is replaced with 26 on both sides.
# NOTE(review): 26 presumably means "unranked, one slot past a top-25 poll" — confirm.
df['rank_home'] = df['rank_home'].fillna(26)
df['rank_away'] = df['rank_away'].fillna(26)

# Rows without a score cannot supply a target; the remaining scores are cast
# to integers in one pass.
df = (
    df.dropna(subset=['score_home', 'score_away'])
      .astype({'score_home': int, 'score_away': int})
)

# --- 2. Feature Engineering for the Differential Model ---
# Both quantities are computed as home minus away.
df['score_difference'] = df['score_home'].sub(df['score_away'])
df['rank_difference'] = df['rank_home'].sub(df['rank_away'])

# --- 3. Build and Train the Model ---
# Single-feature linear regression: rank differential -> score differential.
X = df[['rank_difference']]
y = df['score_difference']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model = LinearRegression().fit(X_train, y_train)

# --- 4. Evaluate the Model's Performance ---
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("--- Regression Model Evaluation ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Collapse the margins to a binary outcome: 1 when the (predicted or actual)
# margin is strictly positive, i.e. the home side is ahead, otherwise 0.
predicted_winner = (predictions > 0).astype(int)
actual_winner = (y_test > 0).to_numpy().astype(int)
accuracy = (predicted_winner == actual_winner).mean()
print(f"\nModel Accuracy for Predicting the Winner: {accuracy * 100:.2f}%")

--- Regression Model Evaluation ---
Mean Squared Error (MSE): 424.64
R-squared (R2): 0.14

Model Accuracy for Predicting the Winner: 66.53%


In [5]:
# --- Improving the Model: add yardage differentials ---

# Home-minus-away differentials for passing and rushing yards.
# NOTE(review): these columns look like same-game box-score totals; if so they
# are not known before kickoff, and the fit below explains finished games
# rather than forecasting them — confirm before quoting the accuracy as predictive.
df['pass_yards_diff'] = df['pass_yards_home'].sub(df['pass_yards_away'])
df['rush_yards_diff'] = df['rush_yards_home'].sub(df['rush_yards_away'])

# Expanded feature set fed to the regression.
feature_columns = ['rank_difference', 'pass_yards_diff', 'rush_yards_diff']

# The regression cannot be fit on NaN inputs, so rows missing any selected
# feature are removed from `df` itself (in place).
# NOTE(review): if rows are dropped here, the split below is drawn from a
# different row set than any split made before this cell, so the metrics are
# not a like-for-like comparison with earlier results.
df.dropna(subset=feature_columns, inplace=True)

# Build the design matrix and target from the filtered frame.
X, y = df[feature_columns], df['score_difference']

# Same split settings as before; the existing `model` object is re-fit, which
# replaces whatever it had learned previously.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model.fit(X_train, y_train)

# --- Evaluate on the held-out rows ---
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# 1 when the margin is strictly positive (home side ahead), otherwise 0.
predicted_winner = (predictions > 0).astype(int)
actual_winner = (y_test > 0).to_numpy().astype(int)
accuracy = (predicted_winner == actual_winner).mean()

print("--- Evaluation of Improved Model ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print(f"Model Accuracy for Predicting the Winner: {accuracy * 100:.2f}%")

--- Evaluation of Improved Model ---
Mean Squared Error (MSE): 162.05
R-squared (R2): 0.67
Model Accuracy for Predicting the Winner: 81.02%
