In [1]:
import pandas as pd

In [2]:
games_full = pd.read_csv("/workspaces/cfb_predictor/src/cfb_predictor/data/files/games_full.csv")
games_full.head()

Unnamed: 0,id,season,week,seasonType,startDate,startTimeTBD,completed,neutralSite,conferenceGame,attendance,...,away_scoringOpportunities_opportunities,away_scoringOpportunities_points,away_scoringOpportunities_pointsPerOpportunity,home_scoringOpportunities_opportunities,home_scoringOpportunities_points,home_scoringOpportunities_pointsPerOpportunity,away_fieldPosition_averageStart,away_fieldPosition_averageStartingPredictedPoints,home_fieldPosition_averageStart,home_fieldPosition_averageStartingPredictedPoints
0,401282714,2021,1,regular,2021-08-28T17:20:00.000Z,False,True,False,True,41064.0,...,4,17,4.25,5,21,4.2,77.9,0.88,67.4,1.51
1,401286187,2021,1,regular,2021-08-28T18:00:00.000Z,False,True,False,False,26043.0,...,1,0,0.0,6,24,4.0,73.9,1.1,68.7,1.53
2,401309833,2021,1,regular,2021-08-28T19:30:00.000Z,False,True,False,False,32982.0,...,5,10,2.0,6,24,4.0,77.4,0.85,60.0,1.82
3,401282049,2021,1,regular,2021-08-29T01:30:00.000Z,False,True,False,False,19034.0,...,8,23,2.88,5,3,0.6,71.3,1.32,66.2,1.26
4,401310693,2021,1,regular,2021-08-29T02:00:00.000Z,False,True,False,False,16204.0,...,9,14,1.56,8,45,5.63,67.7,1.05,73.0,1.32


In [3]:
list(games_full.columns)

['id',
 'season',
 'week',
 'seasonType',
 'startDate',
 'startTimeTBD',
 'completed',
 'neutralSite',
 'conferenceGame',
 'attendance',
 'venueId',
 'venue',
 'homeId',
 'homeTeam',
 'homeClassification',
 'homeConference',
 'homePoints',
 'homeLineScores',
 'homePostgameWinProbability',
 'homePregameElo',
 'homePostgameElo',
 'awayId',
 'awayTeam',
 'awayClassification',
 'awayConference',
 'awayPoints',
 'awayLineScores',
 'awayPostgameWinProbability',
 'awayPregameElo',
 'awayPostgameElo',
 'excitementIndex',
 'highlights',
 'notes',
 'homePoints_advanced',
 'homeWinProb',
 'awayPoints_advanced',
 'awayWinProb',
 'homeWinner',
 'excitement',
 'away_ppa_plays',
 'away_ppa_overall_total',
 'away_ppa_overall_quarter1',
 'away_ppa_overall_quarter2',
 'away_ppa_overall_quarter3',
 'away_ppa_overall_quarter4',
 'away_ppa_passing_total',
 'away_ppa_passing_quarter1',
 'away_ppa_passing_quarter2',
 'away_ppa_passing_quarter3',
 'away_ppa_passing_quarter4',
 'away_ppa_rushing_total',
 'away

In [4]:
games_full_features = [
    "id", "season", "week", "startDate", "homeId", "homeTeam", "awayId", "awayTeam",
    "homeConference", "awayConference", "homeClassification", "awayClassification",
    "home_ppa_plays", "away_ppa_plays", "home_ppa_overall_total", "away_ppa_overall_total",
    "home_ppa_passing_total", "away_ppa_passing_total", "home_ppa_rushing_total", "away_ppa_rushing_total",
    "home_cumulativePpa_overall_total", "away_cumulativePpa_overall_total", "home_cumulativePpa_passing_total", "away_cumulativePpa_passing_total",
    "home_cumulativePpa_rushing_total", "away_cumulativePpa_rushing_total",
    "home_successRates_overall_total", "home_successRates_standardDowns_total", "home_successRates_passingDowns_total",
    "away_successRates_overall_total", "away_successRates_standardDowns_total", "away_successRates_passingDowns_total",
    "home_rushing_stuffRate", "away_rushing_stuffRate",
    "home_havoc_total", "away_havoc_total", "home_scoringOpportunities_opportunities", "away_scoringOpportunities_opportunities",
    "home_scoringOpportunities_points", "home_scoringOpportunities_pointsPerOpportunity", "away_scoringOpportunities_points", "away_scoringOpportunities_pointsPerOpportunity",
    "home_fieldPosition_averageStart", "away_fieldPosition_averageStart", 'homePoints', 'awayPoints'
]


games_full = games_full[games_full_features]

In [5]:
def fix_week(df: pd.DataFrame):
    df['startDate'] = pd.to_datetime(df['startDate'])
    # Shift so Tuesday (1) becomes the start of week
    # Subtract 1 day to shift Tuesday->Monday, Wednesday->Tuesday, etc.
    shifted_date = df['startDate'] - pd.Timedelta(days=1)
    df['week_of_year'] = shifted_date.dt.isocalendar().week
    return df

games_full = fix_week(games_full)

In [6]:
from cfb_predictor.modeling.features import TeamNormalizer, WeeksOffCalculator, NodeIdAdder, TeamConferenceEncoder, TeamTalentMerger
from sklearn.pipeline import Pipeline

talent_df = pd.read_csv("/workspaces/cfb_predictor/src/cfb_predictor/data/files/talent.csv")

feature_pipeline = Pipeline([
    ('team_normalizer', TeamNormalizer()),
    ('weeks_off_calculator', WeeksOffCalculator()),
    ('team_talent_merger', TeamTalentMerger(talent_df=talent_df)),
    ('node_id_adder', NodeIdAdder()),
    ('team_conference_encoder', TeamConferenceEncoder())
])
nodes_df: pd.DataFrame = feature_pipeline.fit_transform(games_full)
nodes_df.head()

Unnamed: 0,id,season,week,startDate,teamId,team,opponentId,opponent,conference,classification,...,fieldPosition_averageStart,points,opponent_points,week_of_year,is_home,weeks_off,talent,node_id,opponent_node_id,conference_id
0,401331241,2021,1,2022-01-05 02:00:00+00:00,2306,Kansas State,99,LSU,Big 12,1,...,62.3,42.0,20.0,1,1,0,569.65,Kansas State_2306_2021_1,LSU_99_2021_1,0
1,401677106,2024,1,2025-01-03 21:00:00+00:00,326,Texas State,249,North Texas,Sun Belt,1,...,67.3,30.0,28.0,1,1,0,540.17,Texas State_326_2024_1,North Texas_249_2024_1,1
2,401677182,2024,1,2025-01-01 18:00:00+00:00,251,Texas,9,Arizona State,SEC,1,...,75.1,39.0,31.0,1,0,0,953.95,Texas_251_2024_1,Arizona State_9_2024_1,2
3,401677103,2024,1,2024-12-31 20:00:00+00:00,2579,South Carolina,356,Illinois,SEC,1,...,67.2,17.0,21.0,1,0,0,798.18,South Carolina_2579_2024_1,Illinois_356_2024_1,2
4,401677181,2024,1,2025-01-01 00:30:00+00:00,213,Penn State,68,Boise State,Big Ten,1,...,71.9,31.0,14.0,1,0,0,895.31,Penn State_213_2024_1,Boise State_68_2024_1,3


In [7]:
nodes_df.shape

(10901, 35)

In [8]:
nodes_df.columns.tolist()

['id',
 'season',
 'week',
 'startDate',
 'teamId',
 'team',
 'opponentId',
 'opponent',
 'conference',
 'classification',
 'ppa_plays',
 'ppa_overall_total',
 'ppa_passing_total',
 'ppa_rushing_total',
 'cumulativePpa_overall_total',
 'cumulativePpa_passing_total',
 'cumulativePpa_rushing_total',
 'successRates_overall_total',
 'successRates_standardDowns_total',
 'successRates_passingDowns_total',
 'rushing_stuffRate',
 'havoc_total',
 'scoringOpportunities_opportunities',
 'scoringOpportunities_points',
 'scoringOpportunities_pointsPerOpportunity',
 'fieldPosition_averageStart',
 'points',
 'opponent_points',
 'week_of_year',
 'is_home',
 'weeks_off',
 'talent',
 'node_id',
 'opponent_node_id',
 'conference_id']

## Build Graph

In [25]:
import torch
import numpy as np
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler


def build_year_graph(df: pd.DataFrame, year: int) -> Data:
    df = df[df['season'] == year].copy()
    df = df.drop_duplicates(keep='first').reset_index(drop=True)
    node_idx_map = dict(zip(df['node_id'], range(len(df))))

    non_feature_cols = [
        "id", "season", "week", "startDate", "teamId", "team", "opponentId", "opponent", "conference", "classification",
        "week_of_year", 'node_id', 'opponent_node_id','conference_id'
    ]
    target_cols = ['points', 'opponent_points']

    feature_cols = list(set(df.columns) - set(non_feature_cols) - set(target_cols))
    print(f"{feature_cols=}")

    df[feature_cols] = df[feature_cols].fillna(0)
    scaler = StandardScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    X = torch.tensor(df[feature_cols].values, dtype=torch.float32)
    N = X.shape[0]

    # Game edges
    team_node_idxs = df['node_id'].map(node_idx_map).values
    opponent_node_idxs = df['opponent_node_id'].map(node_idx_map).values

    # Get previous node ids for temporal edges
    season_group_cols = ['teamId', 'season']
    week_group_cols = season_group_cols + ['week_of_year']
    df.sort_values(week_group_cols, inplace=True)
    df['previous_node_id'] = df.groupby(season_group_cols)['node_id'].shift(1)
    df['previous_opponent_node_id'] = df.groupby(season_group_cols)['opponent_node_id'].shift(1)

    # Edge construction
    edge_source = []
    edge_target = []
    edge_weight = []

    ## Temporal edges
    has_previous_node_id_map = df['previous_node_id'].notna()
    previous_node_idxs = df.loc[has_previous_node_id_map, 'previous_node_id'].map(node_idx_map).values
    current_node_idxs = df.loc[has_previous_node_id_map, 'node_id'].map(node_idx_map).values

    for prev_idx, curr_idx in zip(previous_node_idxs, current_node_idxs):
        edge_source.append(prev_idx)
        edge_target.append(curr_idx)
        edge_weight.append(1.0)

    ## Game edges
    margin = (df['points'] - df['opponent_points']).values
    win = (df['points'] > df['opponent_points']).values

    for team_idx, opponent_idx, game_margin in zip(team_node_idxs, opponent_node_idxs, margin):
        edge_source.append(team_idx)
        edge_target.append(opponent_idx)
        weight = np.log1p(np.abs(game_margin)) # Limit weight growth
        edge_weight.append(weight)

    # Torch Geometric Data object
    edge_idxs = torch.tensor([edge_source, edge_target], dtype=torch.long)
    edge_weights = torch.tensor(edge_weight, dtype=torch.float32)
    data = Data(
        x=X, 
        edge_index=edge_idxs,
        edge_weight=edge_weights,
        num_nodes=N,
        margin=torch.tensor(margin, dtype=torch.float32),
        win=torch.tensor(win, dtype=torch.long),
        team_node_idxs=torch.tensor(previous_node_idxs, dtype=torch.long),
        opponent_node_idxs=torch.tensor(df['previous_opponent_node_id'].map(node_idx_map).values, dtype=torch.long),
        conference_ids=torch.tensor(df['conference_id'], dtype=torch.long)
    )
    return data

year_data = {year: build_year_graph(nodes_df, year) for year in nodes_df['season'].unique()}

feature_cols=['weeks_off', 'ppa_rushing_total', 'is_home', 'ppa_plays', 'ppa_overall_total', 'talent', 'cumulativePpa_passing_total', 'fieldPosition_averageStart', 'rushing_stuffRate', 'cumulativePpa_overall_total', 'havoc_total', 'successRates_standardDowns_total', 'successRates_overall_total', 'scoringOpportunities_opportunities', 'successRates_passingDowns_total', 'ppa_passing_total', 'scoringOpportunities_pointsPerOpportunity', 'cumulativePpa_rushing_total', 'scoringOpportunities_points']
feature_cols=['weeks_off', 'ppa_rushing_total', 'is_home', 'ppa_plays', 'ppa_overall_total', 'talent', 'cumulativePpa_passing_total', 'fieldPosition_averageStart', 'rushing_stuffRate', 'cumulativePpa_overall_total', 'havoc_total', 'successRates_standardDowns_total', 'successRates_overall_total', 'scoringOpportunities_opportunities', 'successRates_passingDowns_total', 'ppa_passing_total', 'scoringOpportunities_pointsPerOpportunity', 'cumulativePpa_rushing_total', 'scoringOpportunities_points']
feat

In [26]:
year_data[2024]

Data(x=[3214, 19], edge_index=[2, 6129], edge_weight=[6129], num_nodes=3214, margin=[3214], win=[3214], team_node_idxs=[2915], opponent_node_idxs=[3214], conference_ids=[3214])

In [10]:
# import torch_geometric
# import networkx as nx
# import matplotlib.pyplot as plt

# plt.figure(figsize=(20, 20))

# g = torch_geometric.utils.to_networkx(data)
# nx.draw(g, node_size=5, with_labels=True)

In [11]:
nodes_df.columns

Index(['id', 'season', 'week', 'startDate', 'teamId', 'team', 'opponentId',
       'opponent', 'conference', 'classification', 'ppa_plays',
       'ppa_overall_total', 'ppa_passing_total', 'ppa_rushing_total',
       'cumulativePpa_overall_total', 'cumulativePpa_passing_total',
       'cumulativePpa_rushing_total', 'successRates_overall_total',
       'successRates_standardDowns_total', 'successRates_passingDowns_total',
       'rushing_stuffRate', 'havoc_total',
       'scoringOpportunities_opportunities', 'scoringOpportunities_points',
       'scoringOpportunities_pointsPerOpportunity',
       'fieldPosition_averageStart', 'points', 'opponent_points',
       'week_of_year', 'is_home', 'weeks_off', 'talent', 'node_id',
       'opponent_node_id', 'conference_id', 'previous_node_id',
       'previous_opponent_node_id'],
      dtype='object')

In [12]:
data

Data(x=[10882, 19], edge_index=[2, 20675], edge_weight=[20675], num_nodes=10882)

## Training?

In [13]:
#TODO - figure out only letting it see previous weeks and an initial embedding for first week
# from sklearn.model_selection import GroupShuffleSplit

# gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# nodes_df['groups'] = nodes_df['season'].astype(str) + "_" + nodes_df['week_of_year'].astype(str)
# splits = gss.split(nodes_df, groups=nodes_df['groups'])
# train_idxs, test_idxs = next(splits)
# train_df = nodes_df.iloc[train_idxs]
# test_df = nodes_df.iloc[test_idxs]

train_df = nodes_df[nodes_df['week'] <= 10]
test_df = nodes_df[nodes_df['week'] > 10]

print(f"{len(train_df)=}")
print(f"{len(test_df)=}")

len(train_df)=8256
len(test_df)=2626


In [14]:
nodes_df[feature_cols].isna().sum().sum()

0

In [15]:
train_df.head()

Unnamed: 0,id,season,week,startDate,teamId,team,opponentId,opponent,conference,classification,...,opponent_points,week_of_year,is_home,weeks_off,talent,node_id,opponent_node_id,conference_id,previous_node_id,previous_opponent_node_id
392,401520151,2023,1,2023-09-02 19:30:00+00:00,2,Auburn,113,Massachusetts,SEC,1,...,14.0,35,1.0,-0.162297,1.281267,Auburn_2_2023_35,Massachusetts_113_2023_35,3,,
689,401520186,2023,2,2023-09-10 02:30:00+00:00,2,Auburn,25,California,SEC,1,...,10.0,36,-1.0,-0.162297,1.281267,Auburn_2_2023_36,California_25_2023_36,3,Auburn_2_2023_35,Massachusetts_113_2023_35
1045,401520221,2023,3,2023-09-16 23:00:00+00:00,2,Auburn,2535,Samford,SEC,1,...,13.0,37,1.0,-0.162297,1.281267,Auburn_2_2023_37,Samford_2535_2023_37,3,Auburn_2_2023_36,California_25_2023_36
1573,401520270,2023,4,2023-09-23 16:00:00+00:00,2,Auburn,245,Texas A&M,SEC,1,...,27.0,38,-1.0,-0.162297,1.281267,Auburn_2_2023_38,Texas A&M_245_2023_38,3,Auburn_2_2023_37,Samford_2535_2023_37
2283,401520280,2023,5,2023-09-30 19:30:00+00:00,2,Auburn,61,Georgia,SEC,1,...,27.0,39,1.0,-0.162297,1.281267,Auburn_2_2023_39,Georgia_61_2023_39,3,Auburn_2_2023_38,Texas A&M_245_2023_38


In [None]:
from typing import Dict
from tqdm import tqdm 

from cfb_predictor.modeling.models import GCNPredictor, GamePredictionOutput
from cfb_predictor.modeling.data import CFBMatchupDataset, CFBMatchupSample, collate_cfb_matchup_samples

from torch.utils.data import DataLoader
from torch_geometric.loader import NeighborLoader

def train(
    data: Data,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    node_idx_map: Dict[str, int],
    conf_embedding_dim: int = 4,
    hidden_dim: int = 64,
    lr: float = 1e-4,
    batch_size: int = 64,
    num_epochs: int = 10,
    seed: int = 42,
):
    assert torch.cuda.is_available(), "CUDA is not available"
    device = torch.device('cuda')
    torch.manual_seed(seed)

    
    train_df = train_df[(train_df['previous_node_id'].notna()) & (train_df['previous_opponent_node_id'].notna())]
    test_df = test_df[(test_df['previous_node_id'].notna()) & (test_df['previous_opponent_node_id'].notna())]

    train_dataset = CFBMatchupDataset(train_df, node_idx_map)
    test_dataset = CFBMatchupDataset(test_df, node_idx_map)

    conference_ids = torch.tensor(nodes_df['conference_id'].tolist(), dtype=torch.long).to(device)

    # train_loader = DataLoader(
    #     train_dataset, 
    #     batch_size=batch_size, 
    #     shuffle=True, 
    #     collate_fn=collate_cfb_matchup_samples, 
    #     pin_memory=True
    # )
    train_loader = NeighborLoader(
        train_dataset,
        num_neighbors=[10, 10],
        batch_size=batch_size,
        shuffle=True,
        input_nodes=None,
        pin_memory=True,
        collate_fn=collate_cfb_matchup_samples
    )
    test_loader = DataLoader(
        test_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        collate_fn=collate_cfb_matchup_samples, 
        pin_memory=True
    )

    model = GCNPredictor(
        in_dim=data.num_node_features,
        num_confs = len(nodes_df['conference_id'].unique()),
        conf_emb_dim=conf_embedding_dim,
        hidden_dim=hidden_dim
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    win_loss_fn = torch.nn.CrossEntropyLoss()
    margin_fn = torch.nn.SmoothL1Loss()

    # TODO - neeeds to be batched with dataloader?
    x = data.x.to(device)
    edge_index = data.edge_index.to(device)
    edge_weight = None
    if hasattr(data, 'edge_weight'):
        edge_weight = data.edge_weight.to(device)
    
    total_steps = num_epochs * len(train_loader)
    pbar = tqdm(total=total_steps, desc="Training", position=0, leave=True)
    
    def single_batch(matchup_samples: CFBMatchupSample):
        team_node_idxs = matchup_samples.team_node_idxs.to(device)
        opponent_node_idxs = matchup_samples.opponent_node_idxs.to(device)
        # conference_ids = matchup_samples.conference_ids.to(device)
        win = matchup_samples.win.to(device)
        margin = matchup_samples.margin.to(device)

        out: GamePredictionOutput = model(
            x, 
            conf_ids = conference_ids,
            team_idxs = team_node_idxs,
            opponent_idxs = opponent_node_idxs,
            edge_index = edge_index,
            edge_weight = edge_weight
        )

        # print(f"{out.win_logits=}, {win=}")
        # print(f"{out.margin_logits=}, {margin=}")

        win_loss = win_loss_fn(out.win_logits, win)
        margin_loss = margin_fn(out.margin_logits, margin)

        loss = win_loss + margin_loss
        return win_loss, margin_loss, loss
    
    for epoch in range(num_epochs):
        pbar.set_description(f"Epoch {epoch+1}/{num_epochs}")
        model.train()
        for matchup_samples in train_loader:
            matchup_samples: CFBMatchupSample # Get type hints :^)
            optimizer.zero_grad()

            win_loss, margin_loss, loss = single_batch(matchup_samples)

            # print(f"Train [{pbar.n+1}/{total_steps}] Win Loss: {win_loss.item():.4f}, Margin Loss: {margin_loss.item():.4f}")

            loss.backward()

            optimizer.step()
            pbar.update(1)
        
        model.eval()
        with torch.no_grad():
            total_win_loss = 0.0
            total_margin_loss = 0.0
            total_loss = 0.0
            for matchup_samples in test_loader:
                matchup_samples: CFBMatchupSample
                win_loss, margin_loss, loss = single_batch(matchup_samples)
                total_win_loss += win_loss.item()
                total_margin_loss += margin_loss.item()
                total_loss += loss.item()
            print(f"Test [{epoch}/{num_epochs}] Win Loss: {total_win_loss/len(test_loader):.4f}, Margin Loss: {total_margin_loss/len(test_loader):.4f}")

    return model

model = train(data=data, train_df=train_df, test_df=test_df, node_idx_map=node_idx_map, num_epochs=5, lr=0.01)

Epoch 2/5:  22%|██▎       | 72/320 [00:01<00:06, 37.10it/s]

Test [0/5] Win Loss: 0.6952, Margin Loss: 16.1277


Epoch 3/5:  42%|████▎     | 136/320 [00:02<00:04, 40.50it/s]

Test [1/5] Win Loss: 0.6936, Margin Loss: 16.1274


Epoch 4/5:  63%|██████▎   | 202/320 [00:04<00:03, 36.15it/s]

Test [2/5] Win Loss: 0.6933, Margin Loss: 16.1279


Epoch 5/5:  82%|████████▎ | 264/320 [00:05<00:01, 34.70it/s]

Test [3/5] Win Loss: 0.6932, Margin Loss: 16.1282


Epoch 5/5: 100%|██████████| 320/320 [00:07<00:00, 45.46it/s]

Test [4/5] Win Loss: 0.6935, Margin Loss: 16.1286





In [19]:
import torch

t1 = torch.rand((3, 4))
t2 = torch.tensor([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]])

torch.cat([t1, t2], dim=-1)

tensor([[0.7294, 0.6296, 0.6873, 0.6097, 0.1000, 0.1000],
        [0.2547, 0.2341, 0.2459, 0.1990, 0.2000, 0.2000],
        [0.7278, 0.5372, 0.4949, 0.6339, 0.3000, 0.3000]])

In [22]:
conference_ids = torch.tensor(nodes_df['conference_id'].tolist(), dtype=torch.long)
print(f"{conference_ids.shape=}")
print(f"{X.data.shape=}")

conference_ids.shape=torch.Size([6194])
X.data.shape=torch.Size([6194, 19])


Features to make
- Is Home
- Team havoc (how much havoc the team generated defensively)