In [4]:
# Imports
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load Data
# Read the full `games_full` table from the local SQLite database into a DataFrame.
CFB_DB_PATH = "cfb_data.db"

conn = sqlite3.connect(CFB_DB_PATH)
try:
    games_full_df = pd.read_sql_query("SELECT * FROM games_full", conn)
finally:
    # Always release the database handle, even when the query fails
    # (e.g. the table is missing), so the kernel does not keep it open.
    conn.close()

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(games_full_df.shape)
games_full_df.head()

(9817, 149)


Unnamed: 0,id,season,week,season_type,start_date,start_time_tbd,completed,neutral_site,conference_game,attendance,...,away_defense_rushingPlays_successRate,away_defense_rushingPlays_explosiveness,away_defense_passingPlays_ppa,away_defense_passingPlays_totalPPA,away_defense_passingPlays_successRate,away_defense_passingPlays_explosiveness,home_turnovers,home_possessionTime,away_turnovers,away_possessionTime
0,332410006,2013,1,regular,2013-08-29T23:30:00.000Z,0.0,1,1,0,15240.0,...,0.382353,0.588519,0.149096,4.025596,0.37037,1.382872,1,1575.0,0,2025.0
1,332410023,2013,1,regular,2013-08-30T02:00:00.000Z,0.0,1,0,0,13136.0,...,0.315789,0.79178,-0.077537,-1.938417,0.32,1.288211,0,1432.0,2,2168.0
2,332410041,2013,1,regular,2013-08-29T23:30:00.000Z,0.0,1,0,0,30689.0,...,0.363636,0.843752,0.232041,5.801016,0.4,1.617091,2,2141.0,1,2183.0
3,332410062,2013,1,regular,2013-08-30T03:00:00.000Z,,1,0,0,39058.0,...,0.208333,0.961625,-0.047574,-1.855368,0.282051,1.443332,4,1565.0,2,2830.0
4,332410084,2013,1,regular,2013-08-29T23:00:00.000Z,0.0,1,0,0,40278.0,...,0.259259,1.149174,0.340396,5.78673,0.411765,1.846431,3,1949.0,2,1662.0


In [None]:
# Correlation analysis with the closing spread
# Drop columns that are not needed for the analysis.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to `games_full_df`, a second run would otherwise raise KeyError for
# columns that were already removed by the first run.
games_full_df = games_full_df.drop(
    columns=['start_date', 'start_time_tbd', 'venue_id', 'venue', 'home_id',
             'away_id', 'excitement_index', 'highlights', 'notes'],
    errors="ignore",
)

# Column every feature is correlated against.
target_col = 'avg_closing_spread'

# Candidate features: every numeric column except identifiers / calendar
# fields and the target itself.
numeric_cols = games_full_df.select_dtypes(include=[np.number]).columns
# NOTE(review): 'season_type' looks like a string column in the preview, in
# which case it is never in `numeric_cols` and listing it here is a no-op.
exclude_cols = ['id', 'season', 'week', 'season_type']
features = [col for col in numeric_cols if col not in exclude_cols and col != target_col]

# Correlation of each feature with the target, ordered by absolute strength
# (strongest first, regardless of sign).
correlations = (
    games_full_df[features]
    .corrwith(games_full_df[target_col])
    .sort_values(key=abs, ascending=False)
)

# Display top correlations
print("Top 20 Features by Correlation with avg_closing_spread:")
print(correlations.head(20))

# Visualize correlations
# Bar chart of the 20 features with the largest absolute correlation.
top20_corr = correlations.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
top20_corr.plot(kind='bar', ax=ax)
ax.set_title("Top 20 Feature Correlations with Closing Spread")
ax.set_xlabel("Feature")
ax.set_ylabel("Correlation Coefficient")
# Slant the feature names so the tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Optional: pairwise correlation heatmap among the 10 strongest features,
# with the colour scale fixed to the range [-1, 1].
top_features = correlations.head(10).index
corr_matrix = games_full_df[top_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Top Features")
plt.show()