In [94]:
import requests
import pandas as pd
import numpy as np
import re
import random
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import timedelta
import time
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 100)
os.chdir(r'C:\Users\wws5213\CFB Email Scores Scraping Project')

API_KEY = "LtePW/LMN8WGSsjhRYy2cE61Gf/sZvOUIqBCKfktT/TyH4r/xF4MmWwwkbOFBmJ1"  # <--- Put your actual key here
N_GAMES = None  # <--- CHANGE THIS to pull this many games (set to None for all)

In [64]:
def to_title_case_underscores(name):
    """Converts camelCase or mixed names to Title_Case_With_Underscores"""
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1)
    result = s2.replace("__", "_").replace("-", "_").replace(" ", "_")
    result = result.strip("_").lower()
    return "_".join([w.capitalize() for w in result.split("_")])

def boxscore_wide_cleaned(df):
    id_cols = ['Game_ID', 'homeAway', 'team', 'teamId', 'conference', 'points']
    stat_cols = [col for col in df.columns if col not in id_cols]
    keep_cols = id_cols + stat_cols
    df = df[keep_cols].copy()
    def prefix_cols(row):
        prefix = 'Home_' if row['homeAway'] == 'home' else 'Away_'
        out = {}
        for col in row.index:
            if col == 'Game_ID' or col == 'homeAway':
                continue
            if col == 'teamId':
                col_title = f"{prefix}TeamId"
            else:
                col_title = f"{prefix}{col[0].upper() + col[1:]}" if col not in ['team', 'conference'] else f"{prefix}{col.capitalize()}"
            out[col_title] = row[col]
        out['Game_ID'] = row['Game_ID']
        return pd.Series(out)
    wide_df = df.apply(prefix_cols, axis=1)
    wide_final = wide_df.groupby('Game_ID').first().reset_index()
    wide_final.columns = [to_title_case_underscores(c) if c != 'Game_ID' else c for c in wide_final.columns]
    for col in wide_final.columns:
        if col.endswith('Teamid'):
            wide_final[col] = pd.to_numeric(wide_final[col], errors='coerce').fillna(0).astype(int)
    return wide_final


In [65]:
def get_fbs_scores(season):
    url = "https://api.collegefootballdata.com/games"
    params = {"year": season, "seasonType": "regular"}
    headers = {"Authorization": f"Bearer {API_KEY}"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    games = response.json()
    df = pd.DataFrame(games)
    cols = [
        "id", "week", "seasonType", "startDate", "completed", "neutralSite", "venue", "conferenceGame",
        "homeTeam", "homeClassification", "homeConference", "homePoints", "homeLineScores",
        "awayTeam", "awayClassification", "awayConference", "awayPoints", "awayLineScores", "excitementIndex"
    ]
    df_out = df[cols]
    return df_out







def clean_scores_df(df):
    df = df.copy()
    
    # Rename columns (including Venue)
    df = df.rename(columns={
        'id': 'Game_ID',
        'week': 'Week',
        'seasonType': 'Season_Type',
        'startDate': 'Start_Date',
        'venue': 'Venue',
        'completed': 'Completed',
        'neutralSite': 'Neutral_Site',
        'conferenceGame': 'Conference_Game',
        'homeTeam': 'Home_Team',
        'homeClassification': 'Home_Classification',
        'homeConference': 'Home_Conference',
        'homePoints': 'Home_Pts',
        'homeLineScores': 'Home_Line_Scores',
        'awayTeam': 'Away_Team',
        'awayClassification': 'Away_Classification',
        'awayConference': 'Away_Conference',
        'awayPoints': 'Away_Pts',
        'awayLineScores': 'Away_Line_Scores',
        'excitementIndex': 'Excitement'
    })
    
    # Capitalize all values in Season_Type
    df['Season_Type'] = df['Season_Type'].str.title()
    
    # Convert Start_Date to datetime (UTC)
    df['Start_DateTime'] = pd.to_datetime(df['Start_Date'], utc=True)
    df['Start_DateTime_EST'] = df['Start_DateTime'].dt.tz_convert('US/Eastern')

    # Create separate Start_Date and Start_Time columns in Eastern
    # Format date as "Fri, Aug. 29, 2024"
    df['Start_Date'] = df['Start_DateTime_EST'].dt.strftime('%a, %b. %d, %Y')
    df['Start_Time'] = df['Start_DateTime_EST'].dt.strftime('%H:%M')

    # Completed and Neutral_Site to Yes/No
    for col in ['Completed', 'Neutral_Site']:
        df[col] = df[col].apply(lambda x: 'Yes' if x else 'No')
    
    # Conference_Game: Non-Conf if False, else use conference name (from home or away)
    def get_conf(row):
        if not row['Conference_Game']:
            return 'Non-Conf'
        return row['Home_Conference'] if row['Home_Conference'] else row['Away_Conference']
    df['Conference_Game'] = df.apply(get_conf, axis=1)
    
    # Home/Away_Classification fully uppercase
    df['Home_Classification'] = df['Home_Classification'].str.upper()
    df['Away_Classification'] = df['Away_Classification'].str.upper()
    
    # Filter: at least one FBS team
    df = df[(df['Home_Classification'] == 'FBS') | (df['Away_Classification'] == 'FBS')]
    
    # Points as integer (remove decimals)
    df['Home_Pts'] = pd.to_numeric(df['Home_Pts'], errors='coerce').fillna(0).astype(int)
    df['Away_Pts'] = pd.to_numeric(df['Away_Pts'], errors='coerce').fillna(0).astype(int)
    
    # Excitement_Index: one decimal
    df['Excitement'] = pd.to_numeric(df['Excitement'], errors='coerce').round(1)
    
    # Break out line scores (handle missing or short lists gracefully)
    def get_q(scores, idx):
        try:
            return int(scores[idx])
        except Exception:
            return 0

    # Home line scores
    df['Home_1Q'] = df['Home_Line_Scores'].apply(lambda x: get_q(x, 0) if isinstance(x, list) else 0)
    df['Home_2Q'] = df['Home_Line_Scores'].apply(lambda x: get_q(x, 1) if isinstance(x, list) else 0)
    df['Home_3Q'] = df['Home_Line_Scores'].apply(lambda x: get_q(x, 2) if isinstance(x, list) else 0)
    df['Home_4Q'] = df['Home_Line_Scores'].apply(lambda x: get_q(x, 3) if isinstance(x, list) else 0)

    # Away line scores
    df['Away_1Q'] = df['Away_Line_Scores'].apply(lambda x: get_q(x, 0) if isinstance(x, list) else 0)
    df['Away_2Q'] = df['Away_Line_Scores'].apply(lambda x: get_q(x, 1) if isinstance(x, list) else 0)
    df['Away_3Q'] = df['Away_Line_Scores'].apply(lambda x: get_q(x, 2) if isinstance(x, list) else 0)
    df['Away_4Q'] = df['Away_Line_Scores'].apply(lambda x: get_q(x, 3) if isinstance(x, list) else 0)

    mask = df['Season_Type'] == 'Regular'
    df_reg = df[mask].sort_values('Start_DateTime_EST')
    week = 0
    week_cuts = []

    # Find the first week start (first game's date, but with time set to 00:00:00, next Tuesday)
    first_date = df_reg.iloc[0]['Start_DateTime_EST']
    # Move to Tuesday midnight of that week
    first_tuesday = first_date + timedelta(days=(1 - first_date.weekday()) % 7)
    first_tuesday = first_tuesday.replace(hour=0, minute=0, second=0, microsecond=0)
    if first_date > first_tuesday:
        current_week_cut = first_tuesday
    else:
        # If first game is before Tuesday, that's week 0
        current_week_cut = first_tuesday

    week_list = []
    for dt in df_reg['Start_DateTime_EST']:
        if dt >= current_week_cut:
            week += 1
            current_week_cut += timedelta(days=7)
        week_list.append(week)
    df.loc[mask, 'Week'] = week_list

    # For all non-regular season, set week as 'Post-Season'
    df['Week'] = df['Week'].astype(str)
    df.loc[df['Season_Type'] != 'Regular', 'Week'] = 'Post-Season'
    df['Week'] = df['Week'].astype(str)

    # Reorder columns (optional)
    final_cols = [
        'Game_ID','Week', 'Start_Date', 'Start_Time', 'Venue',
        'Completed', 'Neutral_Site', 'Conference_Game', 'Home_Team', 'Home_Conference', 'Home_Pts',
        'Home_1Q', 'Home_2Q', 'Home_3Q', 'Home_4Q', 'Away_Team', 'Away_Conference', 'Away_Pts',
        'Away_1Q', 'Away_2Q', 'Away_3Q', 'Away_4Q', 'Excitement'
    ]
    df = df[final_cols]

    return df















def get_team_records(df):
    home_records = []
    away_records = []
    home_conf_records = []
    away_conf_records = []
    team_results = {}
    
    for idx, row in df.iterrows():
        week = row['Week']
        home = row['Home_Team']
        away = row['Away_Team']
        home_conf = row['Home_Conference']
        away_conf = row['Away_Conference']
        try:
            home_pts = int(row['Home_Pts'])
            away_pts = int(row['Away_Pts'])
        except:
            home_pts, away_pts = None, None

        # Only count games that are completed and have valid scores
        if row['Completed'] == "Yes" and home_pts is not None and away_pts is not None:
            # Was this a conference game for each team?
            is_home_conf_game = (home_conf == away_conf) and (home_conf not in ["Non-Conf", "FBS Indep.", "FCS"])
            is_away_conf_game = is_home_conf_game  # same for both
            # Who won?
            home_win = home_pts > away_pts
            away_win = away_pts > home_pts
            # Store for home (current game included)
            team_results.setdefault(home, []).append((home_win, is_home_conf_game, home_win and is_home_conf_game, week))
            # Store for away (current game included)
            team_results.setdefault(away, []).append((away_win, is_away_conf_game, away_win and is_away_conf_game, week))

        # Now, calculate records (INCLUDING this game!)
        def record_for(team, conf):
            results = team_results.get(team, [])
            wins = sum(1 for r in results if r[0])
            losses = sum(1 for r in results if not r[0])
            conf_results = [r for r in results if r[1]]
            conf_wins = sum(1 for r in conf_results if r[2])
            conf_losses = sum(1 for r in conf_results if not r[2])
            return f"{wins}-{losses}", f"{conf_wins}-{conf_losses}"
        
        home_record, home_conf_record = record_for(home, home_conf)
        away_record, away_conf_record = record_for(away, away_conf)
        home_records.append(home_record)
        home_conf_records.append(home_conf_record)
        away_records.append(away_record)
        away_conf_records.append(away_conf_record)
    
    # Add columns
    df['Home_Record'] = home_records
    df['Home_Conf_Record'] = home_conf_records
    df['Away_Record'] = away_records
    df['Away_Conf_Record'] = away_conf_records
    return df
# For this cell, assume you finish with a df3 with your usual columns
df1 = get_fbs_scores(2024)
df2 = clean_scores_df(df1)   # <--- Your custom function, paste it above or import!
df3 = get_team_records(df2.copy())  # <--- Your custom function, paste it above or import!


In [66]:
def get_team_boxscores_df(game_ids, api_key, checkpoint_path=None, checkpoint_every=50):
    boxscore_url = "https://api.collegefootballdata.com/games/teams"
    boxscores = []
    for i, gid in enumerate(tqdm(game_ids, desc="Scraping boxscores")):
        params = {"id": gid}
        try:
            resp = requests.get(boxscore_url, headers={"Authorization": f"Bearer {api_key}"}, params=params)
            resp.raise_for_status()
            data = resp.json()
            for item in data:
                item['Game_ID'] = gid
                boxscores.append(item)
        except Exception as e:
            print(f"Failed for Game_ID {gid}: {e}")
        # Save a checkpoint every X games if path given
        if checkpoint_path and (i + 1) % checkpoint_every == 0:
            temp_df = pd.DataFrame(boxscores)
            temp_df.to_csv(checkpoint_path, index=False)
        time.sleep(random.uniform(0.8, 1.3))
    # Save final checkpoint
    if checkpoint_path:
        pd.DataFrame(boxscores).to_csv(checkpoint_path, index=False)
    # Process as before
    if not boxscores:
        return pd.DataFrame()
    boxscores_df = pd.DataFrame(boxscores)
    expanded = boxscores_df.explode('teams').reset_index(drop=True)
    team_stats_df = pd.concat(
        [expanded.drop(['teams'], axis=1), expanded['teams'].apply(pd.Series)], axis=1
    )
    def unpack_stats(row):
        stat_dict = {}
        if isinstance(row, list):
            for stat in row:
                stat_dict[stat['category']] = stat['stat']
        return stat_dict
    stats_df = team_stats_df['stats'].apply(unpack_stats).apply(pd.Series)
    final_df = pd.concat(
        [team_stats_df.drop(['stats'], axis=1).reset_index(drop=True),
         stats_df.reset_index(drop=True)],
        axis=1
    )
    return final_df

# # Usage
# # from tqdm import tqdm  # Already imported
# game_ids = df3['Game_ID'].tolist()  # or set N_GAMES for testing
# checkpoint_path = "team_boxscores_checkpoint.csv"
# team_boxscores = get_team_boxscores_df(game_ids, API_KEY, checkpoint_path=checkpoint_path)


Scraping boxscores:  74%|███████▍  | 646/874 [12:55<04:33,  1.20s/it]


KeyboardInterrupt: 

In [68]:
# import pandas as pd
# import os

# checkpoint_path = "team_boxscores_checkpoint.csv"

# # Load your checkpoint (if it exists)
# if os.path.exists(checkpoint_path):
#     df_checkpoint = pd.read_csv(checkpoint_path)
#     # Dedupe to get all unique Game_IDs already scraped
#     completed_games = set(df_checkpoint['Game_ID'])
#     print(f"Found checkpoint with {len(completed_games)} unique games already scraped.")
# else:
#     completed_games = set()
#     print("No checkpoint found; starting from scratch.")

# game_ids = df3['Game_ID'].tolist()   # Your master list
# remaining_game_ids = [gid for gid in game_ids if gid not in completed_games]
# print(f"Games left to scrape: {len(remaining_game_ids)}")

# # Only scrape the games you haven't done yet!
# team_boxscores_remaining = get_team_boxscores_df(
#     remaining_game_ids, API_KEY, checkpoint_path=checkpoint_path
# )


Found checkpoint with 599 unique games already scraped.


In [76]:
print(boxscores.columns.tolist())

['id', 'teams', 'Game_ID']


In [77]:
df4 = pd.read_csv('team_boxscores_checkpoint.csv')
df5 = pd.read_csv('team_boxscores_checkpoint_2.csv')

# Concatenate and drop duplicates (by Game_ID and team, just in case)
boxscores = pd.concat([df4, df5], ignore_index=True)
boxscores = boxscores.drop_duplicates(subset=['Game_ID', 'teams'], keep='last')
print("Combined boxscores shape:", boxscores.shape)

Combined boxscores shape: (873, 3)


In [82]:
import pandas as pd
import ast

# If your DataFrame is called boxscores:
# Rename the weird column to 'teams' (optional but makes code clearer)
boxscores = boxscores.rename(columns={0: 'teams'})

# Convert string representation of list to real list
boxscores['teams'] = boxscores['teams'].apply(ast.literal_eval)

# Now explode and expand as before
expanded = boxscores.explode('teams').reset_index(drop=True)

# Remove any non-dict teams just in case
expanded = expanded[expanded['teams'].apply(lambda x: isinstance(x, dict))].reset_index(drop=True)

# Expand the team dict into columns
team_stats_df = pd.concat(
    [expanded.drop(['teams'], axis=1), expanded['teams'].apply(pd.Series)], axis=1
)

# Unpack stats as before
def unpack_stats(row):
    stat_dict = {}
    if isinstance(row, list):
        for stat in row:
            stat_dict[stat['category']] = stat['stat']
    return stat_dict

stats_df = team_stats_df['stats'].apply(unpack_stats).apply(pd.Series)

final_df = pd.concat(
    [team_stats_df.drop(['stats'], axis=1).reset_index(drop=True),
     stats_df.reset_index(drop=True)],
    axis=1
)

print("Final shape:", final_df.shape)
print(final_df.head())


Final shape: (1746, 42)
          id    Game_ID  teamId           team     conference homeAway  \
0  401635525  401635525      59   Georgia Tech            ACC     home   
1  401635525  401635525      52  Florida State            ACC     away   
2  401643697  401643697     167     New Mexico  Mountain West     home   
3  401643697  401643697     147  Montana State        Big Sky     away   
4  401643696  401643696    2567            SMU            ACC     away   

   points rushingTDs puntReturnYards puntReturnTDs puntReturns passingTDs  \
0      24          3               0             0           1          0   
1      21          2               3             0           1          0   
2      31          1             NaN           NaN         NaN          1   
3      35          3              19             0           2          2   
4      29          2             NaN           NaN         NaN          1   

  kickReturnYards kickReturnTDs kickReturns kickingPoints fumblesRec

In [85]:
df6 = boxscore_wide_cleaned(final_df)
print(df6.shape)
df6.head()


(873, 81)


Unnamed: 0,Game_ID,Away_Completion_Attempts,Away_Conference,Away_Defensive_T_Ds,Away_First_Downs,Away_Fourth_Down_Eff,Away_Fumbles_Lost,Away_Fumbles_Recovered,Away_Id,Away_Interception_T_Ds,Away_Interception_Yards,Away_Interceptions,Away_Kick_Return_T_Ds,Away_Kick_Return_Yards,Away_Kick_Returns,Away_Kicking_Points,Away_Net_Passing_Yards,Away_Passes_Deflected,Away_Passes_Intercepted,Away_Passing_T_Ds,Away_Points,Away_Possession_Time,Away_Punt_Return_T_Ds,Away_Punt_Return_Yards,Away_Punt_Returns,Away_Qb_Hurries,Away_Rushing_Attempts,Away_Rushing_T_Ds,Away_Rushing_Yards,Away_Sacks,Away_Tackles,Away_Tackles_For_Loss,Away_Team,Away_Team_Id,Away_Third_Down_Eff,Away_Total_Fumbles,Away_Total_Penalties_Yards,Away_Total_Yards,Away_Turnovers,Away_Yards_Per_Pass,Away_Yards_Per_Rush_Attempt,Home_Completion_Attempts,Home_Conference,Home_Defensive_T_Ds,Home_First_Downs,Home_Fourth_Down_Eff,Home_Fumbles_Lost,Home_Fumbles_Recovered,Home_Id,Home_Interception_T_Ds,Home_Interception_Yards,Home_Interceptions,Home_Kick_Return_T_Ds,Home_Kick_Return_Yards,Home_Kick_Returns,Home_Kicking_Points,Home_Net_Passing_Yards,Home_Passes_Deflected,Home_Passes_Intercepted,Home_Passing_T_Ds,Home_Points,Home_Possession_Time,Home_Punt_Return_T_Ds,Home_Punt_Return_Yards,Home_Punt_Returns,Home_Qb_Hurries,Home_Rushing_Attempts,Home_Rushing_T_Ds,Home_Rushing_Yards,Home_Sacks,Home_Tackles,Home_Tackles_For_Loss,Home_Team,Home_Team_Id,Home_Third_Down_Eff,Home_Total_Fumbles,Home_Total_Penalties_Yards,Home_Total_Yards,Home_Turnovers,Home_Yards_Per_Pass,Home_Yards_Per_Rush_Attempt
0,401628319,22-40,Conference USA,0,10,2-3,0,1,401628319.0,,,2,0,38,2,0.0,103,1,,0,0.0,29:25,,,,0,27,0,42,1,43,2,Western Kentucky,98.0,3-18,0.0,4-40,145,2,2.6,1.6,12-18,SEC,0,25,1-1,1,0,401628319.0,0.0,87.0,0,,,,9,266,8,2.0,3,63.0,30:35,0,42,5,3,47,6,334,1,31,7,Alabama,333.0,8-13,3.0,7-59,600,1,14.8,7.1
1,401628320,11-23,SWAC,0,10,0-0,0,0,401628320.0,,,0,0,10,1,,123,2,,0,0.0,27:45,,,,0,23,0,7,0,26,2,Arkansas-Pine Bluff,2029.0,1-10,,5-35,130,0,5.3,0.3,28-37,SEC,0,34,0-0,0,0,401628320.0,,,0,0.0,30.0,1.0,10,408,4,,2,70.0,32:15,0,33,1,5,33,8,279,4,22,10,Arkansas,8.0,9-9,2.0,6-48,687,0,11.0,8.5
2,401628321,16-34,SWAC,0,13,1-2,0,2,401628321.0,,,0,0,94,5,3.0,204,2,,0,3.0,36:34,,,,0,36,0,36,0,24,0,Alabama A&M,2010.0,2-16,1.0,7-60,240,0,6.0,1.0,18-28,SEC,1,18,0-0,2,0,401628321.0,,,0,0.0,16.0,1.0,13,451,3,,6,73.0,13:26,1,70,5,1,18,3,177,3,47,11,Auburn,2.0,4-6,2.0,5-60,628,2,16.1,9.8
3,401628322,26-36,ACC,0,25,0-0,0,0,401628322.0,0.0,67.0,1,0,0,1,11.0,385,5,2.0,3,41.0,34:30,0.0,7.0,1.0,1,33,2,144,3,29,8,Miami,2390.0,5-10,,2-26,529,1,10.7,4.4,14-26,SEC,0,17,1-2,0,0,401628322.0,0.0,0.0,2,0.0,35.0,1.0,5,122,3,1.0,0,17.0,25:30,0,28,2,4,28,2,139,1,36,2,Florida,57.0,1-9,1.0,2-25,261,2,4.7,5.0
4,401628323,18-29,ACC,0,13,0-1,0,0,401628323.0,,,1,0,19,1,3.0,142,3,,0,3.0,27:18,,,,0,22,0,47,1,44,5,Clemson,228.0,4-13,,3-15,189,1,4.9,2.1,23-33,SEC,0,19,0-0,0,0,401628323.0,0.0,0.0,0,,,,10,278,2,1.0,2,34.0,32:42,0,23,3,0,27,2,169,2,32,6,Georgia,61.0,4-11,,7-70,447,0,8.4,6.3


In [87]:
def remove_xy(col):
    return col.replace('_x', '').replace('_y', '')

# Merge your main summary (df3) with wide-format boxscores (df5)
df7 = df3.merge(df6, on="Game_ID", how="inner")
df7.columns = [remove_xy(col) for col in df7.columns]

# Only keep columns that are present in both the keep list and your merged DataFrame
df7 = df7[[col for col in keep_cols if col in df7.columns]]

# Replace _T_Ds with _TDs in column names for standardization
df7.columns = [col.replace('_T_Ds', '_TDs') for col in df7.columns]

print(df7.shape)
df7.head()


(873, 106)


Unnamed: 0,Game_ID,Week,Start_Date,Start_Time,Venue,Completed,Neutral_Site,Conference_Game,Home_Team,Home_Team.1,Home_Conference,Home_Conference.1,Home_Pts,Home_1Q,Home_2Q,Home_3Q,Home_4Q,Away_Team,Away_Team.1,Away_Conference,Away_Conference.1,Away_Pts,Away_1Q,Away_2Q,Away_3Q,Away_4Q,Excitement,Home_Record,Home_Conf_Record,Away_Record,Away_Conf_Record,Away_Completion_Attempts,Away_Defensive_TDs,Away_First_Downs,Away_Fourth_Down_Eff,Away_Fumbles_Lost,Away_Fumbles_Recovered,Away_Id,Away_Interception_TDs,Away_Interception_Yards,Away_Interceptions,Away_Kick_Return_TDs,Away_Kick_Return_Yards,Away_Kick_Returns,Away_Kicking_Points,Away_Net_Passing_Yards,Away_Passes_Deflected,Away_Passes_Intercepted,Away_Passing_TDs,Away_Points,Away_Possession_Time,Away_Punt_Return_TDs,Away_Punt_Return_Yards,Away_Punt_Returns,Away_Qb_Hurries,Away_Rushing_Attempts,Away_Rushing_TDs,Away_Rushing_Yards,Away_Sacks,Away_Tackles,Away_Tackles_For_Loss,Away_Team_Id,Away_Third_Down_Eff,Away_Total_Fumbles,Away_Total_Penalties_Yards,Away_Total_Yards,Away_Turnovers,Away_Yards_Per_Pass,Away_Yards_Per_Rush_Attempt,Home_Completion_Attempts,Home_Defensive_TDs,Home_First_Downs,Home_Fourth_Down_Eff,Home_Fumbles_Lost,Home_Fumbles_Recovered,Home_Id,Home_Interception_TDs,Home_Interception_Yards,Home_Interceptions,Home_Kick_Return_TDs,Home_Kick_Return_Yards,Home_Kick_Returns,Home_Kicking_Points,Home_Net_Passing_Yards,Home_Passes_Deflected,Home_Passes_Intercepted,Home_Passing_TDs,Home_Points,Home_Possession_Time,Home_Punt_Return_TDs,Home_Punt_Return_Yards,Home_Punt_Returns,Home_Qb_Hurries,Home_Rushing_Attempts,Home_Rushing_TDs,Home_Rushing_Yards,Home_Sacks,Home_Tackles,Home_Tackles_For_Loss,Home_Third_Down_Eff,Home_Total_Fumbles,Home_Total_Penalties_Yards,Home_Total_Yards,Home_Turnovers,Home_Yards_Per_Pass,Home_Yards_Per_Rush_Attempt
0,401635525,0,"Sat, Aug. 24, 2024",12:00,Aviva Stadium,Yes,Yes,ACC,Georgia Tech,Georgia Tech,ACC,ACC,24,7,7,0,10,Florida State,Florida State,ACC,ACC,21,8,6,0,7,7.8,1-0,1-0,0-1,0-1,19-27,0,20,2-3,0,0,401635525.0,,,0,0,16,1,7,193,1,,0,21.0,30:39,0.0,3.0,1.0,1,31,2,98,0,35,3,52.0,5-12,,1-10,291,0,7.1,3.2,11-16,0,18,0-0,0,0,401635525.0,,,0,0.0,20.0,1.0,6,146,1,,0,24.0,29:21,0.0,0.0,1.0,1,36,3,190,1,35,7,5-9,2.0,3-35,336,0,9.1,5.3
1,401643697,0,"Sat, Aug. 24, 2024",16:00,University Stadium (NM),Yes,No,Non-Conf,New Mexico,New Mexico,Mountain West,Mountain West,31,10,14,7,0,Montana State,Montana State,Big Sky,Big Sky,35,0,14,0,21,4.9,0-1,0-0,1-0,0-0,21-32,0,27,0-3,2,1,401643697.0,,,0,0,40,3,5,205,2,,2,35.0,34:40,0.0,19.0,2.0,2,47,3,362,0,33,6,147.0,6-12,2.0,6-72,567,2,6.4,7.7,18-26,2,19,0-0,1,2,401643697.0,,,0,,,,7,172,2,,1,31.0,25:20,,,,6,28,1,152,3,39,4,1-8,1.0,4-30,324,1,6.6,5.4
2,401643696,0,"Sat, Aug. 24, 2024",20:00,Mackay Stadium,Yes,No,Non-Conf,Nevada,Nevada,Mountain West,Mountain West,24,7,10,7,0,SMU,SMU,ACC,ACC,29,0,10,3,16,7.7,0-1,0-0,1-0,0-0,21-35,0,22,0-1,0,0,401643696.0,,,1,0,97,4,7,308,2,,1,29.0,23:43,,,,2,34,2,100,1,41,5,2567.0,5-12,3.0,11-125,408,1,8.8,2.9,15-28,0,16,0-2,0,0,401643696.0,0.0,0.0,0,0.0,37.0,3.0,6,150,8,1.0,2,24.0,36:17,0.0,31.0,2.0,0,43,1,148,0,39,5,8-18,2.0,11-81,298,0,5.4,3.4
3,401643858,0,"Sat, Aug. 24, 2024",23:59,Clarence T.C. Ching Athletics Complex,Yes,No,Non-Conf,Hawai'i,Hawai'i,Mountain West,Mountain West,35,14,0,7,14,Delaware State,Delaware State,MEAC,MEAC,14,0,7,7,0,5.2,1-0,0-0,0-1,0-0,17-29,0,11,0-0,0,0,401643858.0,,,0,0,16,2,2,156,4,,0,14.0,33:05,0.0,11.0,2.0,1,34,2,104,1,30,2,2169.0,5-18,2.0,9-91,260,0,5.4,3.1,17-34,1,19,0-0,0,0,401643858.0,,,0,0.0,38.0,3.0,5,203,3,,2,35.0,26:55,1.0,61.0,3.0,3,27,2,128,5,40,11,5-13,,5-48,331,0,6.0,4.7
4,401628458,1,"Thu, Aug. 29, 2024",18:00,SHI Stadium,Yes,No,Non-Conf,Rutgers,Rutgers,Big Ten,Big Ten,44,7,10,14,13,Howard,Howard,MEAC,MEAC,7,0,7,0,0,4.7,1-0,0-0,0-1,0-0,14-25,0,14,1-4,0,0,401628458.0,,,1,0,22,1,1,116,2,,0,7.0,26:46,,,,1,36,1,145,0,31,2,47.0,6-16,,4-30,261,1,4.6,4.0,15-24,1,29,1-3,0,0,401628458.0,1.0,52.0,0,0.0,18.0,1.0,8,147,1,1.0,3,44.0,33:14,0.0,19.0,1.0,4,46,2,329,1,27,6,5-10,,3-40,476,0,6.1,7.2


In [95]:
def clean_column_names(df):
 # 1. Rename all _x columns to drop the _x suffix
    rename_map = {col: col[:-2] for col in df.columns if col.endswith('_x')}
    df = df.rename(columns=rename_map)

    # 2. Drop all _y columns
    drop_cols = [col for col in df.columns if col.endswith('_y')]
    df = df.drop(columns=drop_cols)

    # 3. Replace all _T_Ds (case-insensitive) with _TDs in column names
    def fix_td(col):
        col = re.sub(r'_T_Ds', '_TDs', col)
        col = re.sub(r'_t_ds', '_tds', col)
        return col
    df.columns = [fix_td(col) for col in df.columns]

    # 4. Drop Away_Id and Home_Id if present
    for col in ['Away_Id', 'Home_Id']:
        if col in df.columns:
            df = df.drop(columns=[col])

    # 5. Convert Away_Team_Id and Home_Team_Id to integers (if present)
    for col in ['Away_Team_Id', 'Home_Team_Id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    # 6. Drop duplicate columns, keeping the first occurrence
    _, idx = np.unique(df.columns, return_index=True)
    df = df.iloc[:, np.sort(idx)]

    return df




df8 = clean_column_names(df7)
print(df8.shape)  # No parentheses after 'shape'
df8.head()        # If you want to preview the data


(873, 100)


Unnamed: 0,Game_ID,Week,Start_Date,Start_Time,Venue,Completed,Neutral_Site,Conference_Game,Home_Team,Home_Conference,Home_Pts,Home_1Q,Home_2Q,Home_3Q,Home_4Q,Away_Team,Away_Conference,Away_Pts,Away_1Q,Away_2Q,Away_3Q,Away_4Q,Excitement,Home_Record,Home_Conf_Record,Away_Record,Away_Conf_Record,Away_Completion_Attempts,Away_Defensive_TDs,Away_First_Downs,Away_Fourth_Down_Eff,Away_Fumbles_Lost,Away_Fumbles_Recovered,Away_Interception_TDs,Away_Interception_Yards,Away_Interceptions,Away_Kick_Return_TDs,Away_Kick_Return_Yards,Away_Kick_Returns,Away_Kicking_Points,Away_Net_Passing_Yards,Away_Passes_Deflected,Away_Passes_Intercepted,Away_Passing_TDs,Away_Points,Away_Possession_Time,Away_Punt_Return_TDs,Away_Punt_Return_Yards,Away_Punt_Returns,Away_Qb_Hurries,Away_Rushing_Attempts,Away_Rushing_TDs,Away_Rushing_Yards,Away_Sacks,Away_Tackles,Away_Tackles_For_Loss,Away_Team_Id,Away_Third_Down_Eff,Away_Total_Fumbles,Away_Total_Penalties_Yards,Away_Total_Yards,Away_Turnovers,Away_Yards_Per_Pass,Away_Yards_Per_Rush_Attempt,Home_Completion_Attempts,Home_Defensive_TDs,Home_First_Downs,Home_Fourth_Down_Eff,Home_Fumbles_Lost,Home_Fumbles_Recovered,Home_Interception_TDs,Home_Interception_Yards,Home_Interceptions,Home_Kick_Return_TDs,Home_Kick_Return_Yards,Home_Kick_Returns,Home_Kicking_Points,Home_Net_Passing_Yards,Home_Passes_Deflected,Home_Passes_Intercepted,Home_Passing_TDs,Home_Points,Home_Possession_Time,Home_Punt_Return_TDs,Home_Punt_Return_Yards,Home_Punt_Returns,Home_Qb_Hurries,Home_Rushing_Attempts,Home_Rushing_TDs,Home_Rushing_Yards,Home_Sacks,Home_Tackles,Home_Tackles_For_Loss,Home_Third_Down_Eff,Home_Total_Fumbles,Home_Total_Penalties_Yards,Home_Total_Yards,Home_Turnovers,Home_Yards_Per_Pass,Home_Yards_Per_Rush_Attempt
0,401635525,0,"Sat, Aug. 24, 2024",12:00,Aviva Stadium,Yes,Yes,ACC,Georgia Tech,ACC,24,7,7,0,10,Florida State,ACC,21,8,6,0,7,7.8,1-0,1-0,0-1,0-1,19-27,0,20,2-3,0,0,,,0,0,16,1,7,193,1,,0,21.0,30:39,0.0,3.0,1.0,1,31,2,98,0,35,3,52,5-12,,1-10,291,0,7.1,3.2,11-16,0,18,0-0,0,0,,,0,0.0,20.0,1.0,6,146,1,,0,24.0,29:21,0.0,0.0,1.0,1,36,3,190,1,35,7,5-9,2.0,3-35,336,0,9.1,5.3
1,401643697,0,"Sat, Aug. 24, 2024",16:00,University Stadium (NM),Yes,No,Non-Conf,New Mexico,Mountain West,31,10,14,7,0,Montana State,Big Sky,35,0,14,0,21,4.9,0-1,0-0,1-0,0-0,21-32,0,27,0-3,2,1,,,0,0,40,3,5,205,2,,2,35.0,34:40,0.0,19.0,2.0,2,47,3,362,0,33,6,147,6-12,2.0,6-72,567,2,6.4,7.7,18-26,2,19,0-0,1,2,,,0,,,,7,172,2,,1,31.0,25:20,,,,6,28,1,152,3,39,4,1-8,1.0,4-30,324,1,6.6,5.4
2,401643696,0,"Sat, Aug. 24, 2024",20:00,Mackay Stadium,Yes,No,Non-Conf,Nevada,Mountain West,24,7,10,7,0,SMU,ACC,29,0,10,3,16,7.7,0-1,0-0,1-0,0-0,21-35,0,22,0-1,0,0,,,1,0,97,4,7,308,2,,1,29.0,23:43,,,,2,34,2,100,1,41,5,2567,5-12,3.0,11-125,408,1,8.8,2.9,15-28,0,16,0-2,0,0,0.0,0.0,0,0.0,37.0,3.0,6,150,8,1.0,2,24.0,36:17,0.0,31.0,2.0,0,43,1,148,0,39,5,8-18,2.0,11-81,298,0,5.4,3.4
3,401643858,0,"Sat, Aug. 24, 2024",23:59,Clarence T.C. Ching Athletics Complex,Yes,No,Non-Conf,Hawai'i,Mountain West,35,14,0,7,14,Delaware State,MEAC,14,0,7,7,0,5.2,1-0,0-0,0-1,0-0,17-29,0,11,0-0,0,0,,,0,0,16,2,2,156,4,,0,14.0,33:05,0.0,11.0,2.0,1,34,2,104,1,30,2,2169,5-18,2.0,9-91,260,0,5.4,3.1,17-34,1,19,0-0,0,0,,,0,0.0,38.0,3.0,5,203,3,,2,35.0,26:55,1.0,61.0,3.0,3,27,2,128,5,40,11,5-13,,5-48,331,0,6.0,4.7
4,401628458,1,"Thu, Aug. 29, 2024",18:00,SHI Stadium,Yes,No,Non-Conf,Rutgers,Big Ten,44,7,10,14,13,Howard,MEAC,7,0,7,0,0,4.7,1-0,0-0,0-1,0-0,14-25,0,14,1-4,0,0,,,1,0,22,1,1,116,2,,0,7.0,26:46,,,,1,36,1,145,0,31,2,47,6-16,,4-30,261,1,4.6,4.0,15-24,1,29,1-3,0,0,1.0,52.0,0,0.0,18.0,1.0,8,147,1,1.0,3,44.0,33:14,0.0,19.0,1.0,4,46,2,329,1,27,6,5-10,,3-40,476,0,6.1,7.2


In [8]:
def get_espn_top25(season, week):
    url = f'https://www.espn.com/college-football/rankings/_/week/{week}/year/{season}/seasontype/2'
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    if not resp.ok:
        print(f"Failed for week {week}")
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')

    for section in soup.find_all('section'):
        poll_title = section.find(['h2','span'])
        table = section.find('table')
        if not poll_title or not table:
            continue
        headers = [th.text.strip() for th in table.find_all('th')]
        rows = table.find_all('tr')[1:]
        poll_data = []
        for row in rows:
            cols = [td.text.strip() for td in row.find_all('td')]
            if len(cols) == len(headers):
                poll_data.append(dict(zip(headers, cols)))
        if poll_data:
            df = pd.DataFrame(poll_data)
            df['Poll'] = poll_title.text.strip()
            df['Week'] = week
            df['Season'] = season
            return df.head(25)
    print(f"No rankings found for week {week}")
    return None

# Loop through weeks 1-16
season = 2024
all_weeks = []

for week in range(1, 17):
    print(f"Scraping week {week}...")
    df = get_espn_top25(season, week)
    if df is not None and not df.empty:
        all_weeks.append(df)

# Combine all into a single DataFrame
if all_weeks:
    all_rankings = pd.concat(all_weeks, ignore_index=True)
    # Optional: Save to CSV
    # all_rankings.to_csv(f"espn_top25_weeks1-16_{season}.csv", index=False)
else:
    print("No data found for any week.")


Scraping week 1...
Scraping week 2...
Scraping week 3...
Scraping week 4...
Scraping week 5...
Scraping week 6...
Scraping week 7...
Scraping week 8...
Scraping week 9...
Scraping week 10...
Scraping week 11...
Scraping week 12...
Scraping week 13...
Scraping week 14...
Scraping week 15...
Scraping week 16...


In [14]:
def clean_team_name(team):
    team = re.sub(r'\s*\(\d+\)', '', str(team))         # Remove (xx) votes
    team = re.sub(r'^[A-Z]{2,4}\s+', '', team)          # Remove 3-4 letter code
    return team.strip()

def attach_rankings(df2_clean, all_rankings):
    df = df2_clean.copy()
    ranks = all_rankings.copy()

    # Make Week columns string for matching, and create Week_for_merge for week 0 logic
    df['Week'] = df['Week'].astype(str)
    ranks['Week'] = ranks['Week'].astype(str)
    df['Week_for_merge'] = df['Week'].replace({'0': '1'})
    ranks['Week_for_merge'] = ranks['Week']

    # Clean team names
    df['Home_Team_Clean'] = df['Home_Team'].apply(clean_team_name)
    df['Away_Team_Clean'] = df['Away_Team'].apply(clean_team_name)
    ranks['Team_Clean'] = ranks['Team'].apply(clean_team_name)

    # Remove any prior rank columns
    for col in ['Home_Rank', 'Away_Rank']:
        df = df.drop(columns=[c for c in df.columns if c.startswith(col)], errors='ignore')

    # Merge home ranks
    df = df.merge(
        ranks[['Week_for_merge', 'Team_Clean', 'RK']].rename(
            columns={'Team_Clean': 'Home_Team_Clean', 'RK': 'Home_Rank'}
        ),
        on=['Week_for_merge', 'Home_Team_Clean'],
        how='left'
    )

    # Merge away ranks
    df = df.merge(
        ranks[['Week_for_merge', 'Team_Clean', 'RK']].rename(
            columns={'Team_Clean': 'Away_Team_Clean', 'RK': 'Away_Rank'}
        ),
        on=['Week_for_merge', 'Away_Team_Clean'],
        how='left'
    )

    # Deduplicate columns, keeping only the final Home_Rank and Away_Rank
    for base in ['Home_Rank', 'Away_Rank']:
        cols = [c for c in df.columns if c.startswith(base)]
        if cols:
            df[base] = df[cols[-1]]
            for col in cols:
                if col != base:
                    df = df.drop(columns=[col])
    return df

# Usage:
df5_4gametest = df5_4gametest.loc[:, ~df5_4gametest.columns.duplicated()]
df6_4gametest = attach_rankings(df5_4gametest, all_rankings)

In [15]:
print(df6_4gametest.shape)
df6_4gametest.head()

(4, 105)


Unnamed: 0,Game_ID,Week,Start_Date,Start_Time,Venue,Completed,Neutral_Site,Conference_Game,Home_Team,Home_Conference,Home_Pts,Home_1Q,Home_2Q,Home_3Q,Home_4Q,Away_Team,Away_Conference,Away_Pts,Away_1Q,Away_2Q,Away_3Q,Away_4Q,Excitement,Home_Record,Home_Conf_Record,Away_Record,Away_Conf_Record,Away_Completion_Attempts,Away_Defensive_TDs,Away_First_Downs,Away_Fourth_Down_Eff,Away_Fumbles_Lost,Away_Fumbles_Recovered,Away_Interception_TDs,Away_Interception_Yards,Away_Interceptions,Away_Kick_Return_TDs,Away_Kick_Return_Yards,Away_Kick_Returns,Away_Kicking_Points,Away_Net_Passing_Yards,Away_Passes_Deflected,Away_Passes_Intercepted,Away_Passing_TDs,Away_Points,Away_Possession_Time,Away_Punt_Return_TDs,Away_Punt_Return_Yards,Away_Punt_Returns,Away_Qb_Hurries,Away_Rushing_Attempts,Away_Rushing_TDs,Away_Rushing_Yards,Away_Sacks,Away_Tackles,Away_Tackles_For_Loss,Away_Team_Id,Away_Third_Down_Eff,Away_Total_Fumbles,Away_Total_Penalties_Yards,Away_Total_Yards,Away_Turnovers,Away_Yards_Per_Pass,Away_Yards_Per_Rush_Attempt,Home_Completion_Attempts,Home_Defensive_TDs,Home_First_Downs,Home_Fourth_Down_Eff,Home_Fumbles_Lost,Home_Fumbles_Recovered,Home_Interception_TDs,Home_Interception_Yards,Home_Interceptions,Home_Kick_Return_TDs,Home_Kick_Return_Yards,Home_Kick_Returns,Home_Kicking_Points,Home_Net_Passing_Yards,Home_Passes_Deflected,Home_Passes_Intercepted,Home_Passing_TDs,Home_Points,Home_Possession_Time,Home_Punt_Return_TDs,Home_Punt_Return_Yards,Home_Punt_Returns,Home_Qb_Hurries,Home_Rushing_Attempts,Home_Rushing_TDs,Home_Rushing_Yards,Home_Sacks,Home_Tackles,Home_Tackles_For_Loss,Home_Third_Down_Eff,Home_Total_Fumbles,Home_Total_Penalties_Yards,Home_Total_Yards,Home_Turnovers,Home_Yards_Per_Pass,Home_Yards_Per_Rush_Attempt,Week_for_merge,Home_Team_Clean,Away_Team_Clean,Home_Rank,Away_Rank
0,401635525,0,"Sat, Aug. 24, 2024",12:00,Aviva Stadium,Yes,Yes,ACC,Georgia Tech,ACC,24,7,7,0,10,Florida State,ACC,21,8,6,0,7,7.8,1-0,1-0,0-1,0-1,19-27,0,20,2-3,0,0,,,0,0,16,1,7,193,1,,0,21.0,30:39,0.0,3.0,1.0,1,31,2,98,0,35,3,52,5-12,,1-10,291,0,7.1,3.2,11-16,0,18,0-0,0,0,,,0,0.0,20.0,1.0,6,146,1,,0,24.0,29:21,0.0,0.0,1.0,1,36,3,190,1,35,7,5-9,2.0,3-35,336,0,9.1,5.3,1,Georgia Tech,Florida State,,10.0
1,401643697,0,"Sat, Aug. 24, 2024",16:00,University Stadium (NM),Yes,No,Non-Conf,New Mexico,Mountain West,31,10,14,7,0,Montana State,Big Sky,35,0,14,0,21,4.9,0-1,0-0,1-0,0-0,21-32,0,27,0-3,2,1,,,0,0,40,3,5,205,2,,2,35.0,34:40,0.0,19.0,2.0,2,47,3,362,0,33,6,147,6-12,2.0,6-72,567,2,6.4,7.7,18-26,2,19,0-0,1,2,,,0,,,,7,172,2,,1,31.0,25:20,,,,6,28,1,152,3,39,4,1-8,1.0,4-30,324,1,6.6,5.4,1,New Mexico,Montana State,,
2,401643696,0,"Sat, Aug. 24, 2024",20:00,Mackay Stadium,Yes,No,Non-Conf,Nevada,Mountain West,24,7,10,7,0,SMU,ACC,29,0,10,3,16,7.7,0-1,0-0,1-0,0-0,21-35,0,22,0-1,0,0,,,1,0,97,4,7,308,2,,1,29.0,23:43,,,,2,34,2,100,1,41,5,2567,5-12,3.0,11-125,408,1,8.8,2.9,15-28,0,16,0-2,0,0,0.0,0.0,0,0.0,37.0,3.0,6,150,8,1.0,2,24.0,36:17,0.0,31.0,2.0,0,43,1,148,0,39,5,8-18,2.0,11-81,298,0,5.4,3.4,1,Nevada,SMU,,
3,401643858,0,"Sat, Aug. 24, 2024",23:59,Clarence T.C. Ching Athletics Complex,Yes,No,Non-Conf,Hawai'i,Mountain West,35,14,0,7,14,Delaware State,MEAC,14,0,7,7,0,5.2,1-0,0-0,0-1,0-0,17-29,0,11,0-0,0,0,,,0,0,16,2,2,156,4,,0,14.0,33:05,0.0,11.0,2.0,1,34,2,104,1,30,2,2169,5-18,2.0,9-91,260,0,5.4,3.1,17-34,1,19,0-0,0,0,,,0,0.0,38.0,3.0,5,203,3,,2,35.0,26:55,1.0,61.0,3.0,3,27,2,128,5,40,11,5-13,,5-48,331,0,6.0,4.7,1,Hawai'i,Delaware State,,


In [96]:
df8.to_csv("cfb_allgames_2024.csv", index=False)


In [35]:
all_rankings

Unnamed: 0,RK,Team,REC,PTS,TREND,Poll,Week,Season,CONF,Seed
0,1,UGA Georgia (46),0-0,1532,NR,,1,2024,,
1,2,OSU Ohio State (15),0-0,1490,NR,,1,2024,,
2,3,ORE Oregon (1),0-0,1403,NR,,1,2024,,
3,4,TEX Texas,0-0,1386,NR,,1,2024,,
4,5,ALA Alabama,0-0,1260,NR,,1,2024,,
...,...,...,...,...,...,...,...,...,...,...
395,21,SYR Syracuse,9-3,,1,,16,2024,ACC,
396,22,ARMY Army,11-1,,2,,16,2024,American,
397,23,COLO Colorado,9-3,,-,,16,2024,Big 12,
398,24,UNLV UNLV,10-3,,4,,16,2024,Mountain West,
