In [19]:
import os
import sqlite3
import warnings

import numpy as np
import pandas as pd
import requests

In [20]:
#API Setup
BASE_URL = 'https://api.collegefootballdata.com/'

# The bearer token is read from the CFBD_API_KEY environment variable so the
# secret never lives in the notebook source, its saved outputs, or version
# control. Any key that was previously committed in this cell should be
# treated as leaked and rotated.
API_KEY = os.environ.get("CFBD_API_KEY", "")
if not API_KEY:
    warnings.warn(
        "CFBD_API_KEY is not set; requests will be sent without a valid bearer token."
    )

def get_data(endpoint, params=None, timeout=30):
    """Call one endpoint of the API rooted at ``BASE_URL`` and return its JSON.

    Args:
        endpoint: Path relative to ``BASE_URL`` (e.g. ``"games"``). Leading
            slashes are ignored.
        params: Optional mapping of query-string parameters. Defaults to no
            parameters.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection raises instead of hanging the kernel.

    Returns:
        The decoded JSON body (``response.json()``).

    Raises:
        Exception: If the response status code is not 200; the message
            carries the status code and response text.
    """
    headers = {"Authorization": f"Bearer {API_KEY}"}
    # Join with exactly one slash regardless of whether BASE_URL ends with
    # one or the endpoint starts with one.
    url = f"{BASE_URL.rstrip('/')}/{endpoint.lstrip('/')}"
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"API Error: {response.status_code} - {response.text}")
    return response.json()

In [21]:
# Initial Data Pull (Games)
# One request per season (2013 through 2024); the per-season lists returned
# by get_data are concatenated into a single list of game records.
games_data = []
for year in range(2013, 2025):
    print(f"Pulling games for {year}...")
    games_data += get_data("games", {"year": year, "division": "fbs"})

# Build the frame and discard the two line-score columns in a single step.
games_df = pd.DataFrame(games_data).drop(
    columns=['home_line_scores', 'away_line_scores']
)
games_df.head()  # Quick check

Pulling games for 2013...
Pulling games for 2014...
Pulling games for 2015...
Pulling games for 2016...
Pulling games for 2017...
Pulling games for 2018...
Pulling games for 2019...
Pulling games for 2020...
Pulling games for 2021...
Pulling games for 2022...
Pulling games for 2023...
Pulling games for 2024...


Unnamed: 0,id,season,week,season_type,start_date,start_time_tbd,completed,neutral_site,conference_game,attendance,...,away_team,away_conference,away_division,away_points,away_post_win_prob,away_pregame_elo,away_postgame_elo,excitement_index,highlights,notes
0,332412579,2013,1,regular,2013-08-29T22:00:00.000Z,,True,False,False,81572.0,...,North Carolina,ACC,fbs,10.0,0.3444138448668871,1638.0,1612.0,,,
1,332412309,2013,1,regular,2013-08-29T22:00:00.000Z,False,True,False,False,20790.0,...,Liberty,Big South,fcs,10.0,0.6038425390589921,1467.0,1461.0,,,
2,332410154,2013,1,regular,2013-08-29T22:30:00.000Z,False,True,False,False,26202.0,...,Presbyterian,Big South,fcs,7.0,0.0002109028649118,,,,,
3,332412050,2013,1,regular,2013-08-29T23:00:00.000Z,False,True,False,False,16327.0,...,Illinois State,MVFC,fcs,28.0,0.086375149540862,,,,,
4,332410189,2013,1,regular,2013-08-29T23:00:00.000Z,,True,False,False,18142.0,...,Tulsa,Conference USA,fbs,7.0,0.3514652360707426,1635.0,1542.0,,,


In [22]:
#SQLite Setup
# Write games_df to the "games" table of the local cfb_data.db file,
# replacing the table if it already exists.
conn = sqlite3.connect("cfb_data.db")
try:
    games_df.to_sql("games", conn, if_exists="replace", index=False)
finally:
    # Always release the database handle, even when the write raises.
    conn.close()

In [23]:
#Pull Lines Data
# One request per season (2013 through 2024), accumulated into one list.
lines_data = []
for year in range(2013, 2025):
    print(f"Pulling lines for {year}...")
    lines_data += get_data("lines", {"year": year, "division": "fbs"})

# Flatten the nested structure: one output row per (game, sportsbook line).
flat_lines = []
for game in lines_data:
    # Game-level fields are read up front and repeated on every line row.
    game_fields = {
        'game_id': game['id'],
        'home_team': game['homeTeam'],
        'away_team': game['awayTeam'],
    }
    # A falsy 'lines' value (empty list, None) contributes no rows.
    flat_lines.extend(
        {
            **game_fields,
            'sportsbook': line['provider'],
            'spread_open': line.get('spreadOpen'),
            'spread': line.get('spread'),  # Closing spread
        }
        for line in (game['lines'] or [])
    )

lines_df = pd.DataFrame(flat_lines)
print(lines_df.shape)
lines_df.head()  # Check it out

Pulling lines for 2013...
Pulling lines for 2014...
Pulling lines for 2015...
Pulling lines for 2016...
Pulling lines for 2017...
Pulling lines for 2018...
Pulling lines for 2019...
Pulling lines for 2020...
Pulling lines for 2021...
Pulling lines for 2022...
Pulling lines for 2023...
Pulling lines for 2024...
(33390, 6)


Unnamed: 0,game_id,home_team,away_team,sportsbook,spread_open,spread
0,332412579,South Carolina,North Carolina,teamrankings,,-11.0
1,332412579,South Carolina,North Carolina,numberfire,,-11.0
2,332412579,South Carolina,North Carolina,consensus,,-11.5
3,332410062,Hawai'i,USC,teamrankings,,23.5
4,332410062,Hawai'i,USC,numberfire,,23.5


In [25]:
#Save to SQLite
# Write lines_df to the "lines" table of cfb_data.db, replacing the table if
# it already exists.
conn = sqlite3.connect("cfb_data.db")
try:
    lines_df.to_sql("lines", conn, if_exists="replace", index=False)
finally:
    # Always release the database handle, even when the write raises.
    conn.close()

In [26]:
# Average Spreads and Update Games Table

# Query to average closing spreads: every games row plus the mean of the
# matching rows' `spread` in the lines table. The LEFT JOIN keeps games that
# have no lines at all (their average is NULL).
query = """
SELECT
    g.*,
    AVG(l.spread) AS avg_closing_spread
FROM
    games g
LEFT JOIN
    lines l ON g.id = l.game_id
GROUP BY
    g.id, g.season, g.week, g.start_date, g.home_team, g.home_points,
    g.away_team, g.away_points
"""

conn = sqlite3.connect("cfb_data.db")
try:
    # Load into DataFrame
    merged_df = pd.read_sql_query(query, conn)

    # This cell overwrites the table it reads from. On a re-run `g.*` already
    # contains avg_closing_spread, so the result carries that name twice and
    # the write-back would fail. Keep only the freshly computed (last) copy
    # so the cell can be run repeatedly.
    merged_df = merged_df.loc[:, ~merged_df.columns.duplicated(keep="last")]

    # Overwrite games table with new data (no spread_open)
    merged_df.to_sql("games", conn, if_exists="replace", index=False)
finally:
    # Always release the database handle, even when the query or write raises.
    conn.close()

# Verify -- head() must be the cell's last expression to be rendered.
print(merged_df.shape)
merged_df.head()

(9817, 32)


In [None]:
# Cell 9: Pull Advanced Stats
# One request per season (2013 through 2024), accumulated into one list.
advanced_stats_data = []
for year in range(2013, 2025):
    print(f"Pulling advanced stats for {year}...")
    advanced_stats_data += get_data(
        "stats/game/advanced",
        {"year": year, "excludeGarbageTime": "true", "division": "fbs"},
    )

# Create DataFrame
# NOTE(review): if the returned records contain nested objects, those will
# land as dict-valued columns here -- confirm whether flattening is needed.
advanced_stats_df = pd.DataFrame(advanced_stats_data)
print(advanced_stats_df.shape)
advanced_stats_df.head()  # Check the data