# Predicting 2024 Results

- Train same kind of model but train on ALL available training data through 2023
- Gather as many of the features as we can find for 2024 and make list of predictions

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping
import time
from bs4 import BeautifulSoup
import requests
import re

In [63]:
# Training frame for the final model: the regression notebook's working data,
# used in full (no seasons held out).
# Rows are ordered by season and re-indexed 0..n-1 after the sort.
train_df = (
    pd.read_csv('data/working_df.csv')
      .sort_values(by='year')
      .reset_index(drop=True)
)
train_df.head()

Unnamed: 0,team,year,conference,expected_wins,win_pct,recent_win_pct,prev_win_pct,elo,fpi,conference_rating,...,off_explode,def_explode,off_ppa,def_ppa,off_success_rate,def_success_rate,coach,career_win_pct,sor,sos
0,Air Force,2014,Mountain West,9.7,0.769231,0.531792,0.166667,1071.0,-16.682,-5.908333,...,0.860561,0.902638,0.096601,0.214295,0.439644,0.518913,Troy Calhoun,0.544444,115.0,113
1,San Diego State,2014,Mountain West,7.3,0.538462,0.423529,0.615385,1476.0,-3.15,-5.908333,...,1.080973,1.037095,0.049882,0.06901,0.385635,0.378771,Rocky Long,0.553333,62.0,104
2,Clemson,2014,ACC,9.8,0.769231,0.653631,0.846154,1928.0,19.981,10.264286,...,0.913386,1.05914,0.134379,0.01678,0.469538,0.343823,Dabo Swinney,0.675,11.0,50
3,SMU,2014,American Athletic,1.2,0.083333,0.358824,0.416667,1420.0,-3.933,-1.67,...,0.950207,0.861696,0.105657,0.095768,0.435374,0.46634,June Jones,0.569061,87.0,60
4,Colorado,2014,Pac-12,2.8,0.166667,0.408046,0.333333,1109.0,-5.744,14.0,...,0.814447,0.974594,-0.011197,0.09382,0.399249,0.434332,Mike MacIntyre,0.42,72.0,14


## Constructing a "test" set
- The above will be our training set. For our test set we need: talent_level and sos. Everything else should be available in our various data sets.
- Usages will not be calculable because rosters won't be finalized until August.

#### Scraping 2024 Strength of Schedule

In [16]:
# Scrape 2024 strength of schedule from https://collegefootballnetwork.com/2024-college-football-strength-of-schedule/
url = 'https://collegefootballnetwork.com/2024-college-football-strength-of-schedule/'

# Browser-style User-Agent header sent along with the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Without a timeout a stalled connection would block this cell indefinitely
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page into `soup`
response.raise_for_status()

# Parsed page, consumed by the rank/team extraction cell below
soup = BeautifulSoup(response.content, 'html.parser')

In [17]:
# Collect the <h3> tags that carry no class attribute; each one is treated as
# a single "rank + team" entry.
name_ranks = soup.find_all('h3', class_=False)

# First run of digits in the entry text is the rank (an optional leading "T" is skipped)
rank_pattern = re.compile(r'(?:T?)(\d+)')
# Everything after ") " up to the end of the text is the team name
team_pattern = re.compile(r'\)\s(.+)$')

entry_texts = [entry.text for entry in name_ranks]
ranks = [int(rank_pattern.findall(text)[0]) for text in entry_texts]
teams = [team_pattern.findall(text)[0] for text in entry_texts]

print("Ranks:", ranks)
print("Team Names:", teams)

# Persist the scraped table so the ranks can be joined to the other data later
test = pd.DataFrame({'team': teams, 'rank': ranks})
test.to_csv('data/new_sos_ranks.csv', index=False)

Ranks: [1, 2, 3, 4, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 18, 19, 20, 21, 21, 23, 24, 25, 25, 27, 28, 29, 30, 31, 32, 32, 34, 34, 34, 37, 38, 38, 40, 41, 42, 43, 43, 45, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 115, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134]
Team Names: ['Florida Gators', 'Georgia Tech Yellow Jackets', 'Oklahoma Sooners', 'USC Trojans', 'Houston Cougars', 'Alabama Crimson Tide', 'Vanderbilt Commodores', 'Colorado Buffaloes', 'Arkansas Razorbacks', 'Purdue Boilermakers', 'Mississippi State Bulldogs', 'UCLA Bruins', 'Georgia Bulldogs', 'Wisconsin Badgers', 'Arizona State Sun Devils', 'Kentucky Wildcats', 'South Carolina Gamecocks', 'Michigan Wolverines', 'West Virgi

Note: We went through and added these by hand to avoid dealing with naming issues.

#### Getting 2024 Features
- Get 'elo', 'recent_win_pct','talent_level', 'turnover_margin', 'usages', 'off_success_rate', 'sos', 'career_win_pct'
- Has 2024 Data: talent_df

In [52]:
# Load every source table that feeds the 2024 feature set.
# The names on the left line up one-to-one, in order, with the paths below.
(
    records_df,
    talent_df,
    coach_df,
    trad_stats_df,
    ratings_df,
    adv_stats_df,
    sos_df,
) = map(
    pd.read_csv,
    [
        'data/records_by_year_calcs.csv',              # records_df
        'data/team_recruiting_w_blue_chip_ratios.csv', # talent_df
        'data/coach_career_win_pct_last_year.csv',     # coach_df
        'data/season_stats_w_totals.csv',              # trad_stats_df
        'data/team_conference_ratings.csv',            # ratings_df
        'data/advanced_stats_seasons.csv',             # adv_stats_df
        'data/schedule_strength.csv',                  # sos_df
    ],
)

Next we need to make a dataframe with what we need from 2023 and what we need from 2024

In [53]:
# Talent ratings are already labelled with the 2024 season, so the rows are
# only filtered to 2024 and trimmed to the needed columns -- no year shift.
is_2024 = talent_df['year'] == 2024
talent_df = talent_df.loc[is_2024, ['team', 'year', 'talent_level']]
talent_df.head()

Unnamed: 0,team,year,talent_level
1839,Georgia,2024,311.405
1840,Alabama,2024,322.2275
1841,Oregon,2024,275.6025
1842,Miami,2024,268.5825
1843,Ohio State,2024,297.1375


In [54]:
# 2023 records serve as 2024 features: keep the 2023 rows and relabel them as
# 2024 so they join against the other 2024 frames on ['year', 'team'].
# Re-running this cell by itself yields an empty frame, because after the first
# run records_df no longer contains any 2023 rows.
played_2023 = records_df['year'] == 2023
records_df = (
    records_df.loc[played_2023, ['year', 'team', 'win_pct', 'recent_win_pct']]
              .assign(year=lambda frame: frame['year'] + 1)
)
records_df.head()

Unnamed: 0,year,team,win_pct,recent_win_pct
2816,2024,Air Force,0.692308,0.586441
2817,2024,Akron,0.166667,0.335689
2818,2024,Alabama,0.857143,0.785047
2819,2024,Appalachian State,0.642857,0.730769
2820,2024,Arizona,0.769231,0.445993


In [55]:
# Coach career win percentage, relabelled one season forward so it joins on 2024.
# No season filter is applied here: every row's year is advanced by one.
# NOTE(review): presumably the file only holds the 2023 season -- confirm.
# Re-running this cell by itself advances the year a second time.
coach_df = (
    coach_df[['year', 'team', 'coach', 'career_win_pct']]
        .assign(year=lambda frame: frame['year'] + 1)
)
coach_df.head()

Unnamed: 0,year,team,coach,career_win_pct
0,2024,Tulsa,Kevin Wilson,0.391892
1,2024,South Alabama,Kane Wommack,0.578947
2,2024,Miami,Mario Cristobal,0.503401
3,2024,Houston,Dana Holgorsen,0.571429
4,2024,Ohio State,Ryan Day,0.88


In [56]:
# Traditional season stats from 2023, relabelled as 2024 so they join with the
# other 2024 frames. Only turnover margin and total touchdowns are kept.
# Re-running this cell by itself yields an empty frame (no 2023 rows remain).
season_2023 = trad_stats_df['year'] == 2023
trad_stats_df = (
    trad_stats_df.loc[season_2023, ['team', 'year', 'turnover_margin', 'total_tds']]
                 .assign(year=lambda frame: frame['year'] + 1)
)
trad_stats_df.head()

Unnamed: 0,team,year,turnover_margin,total_tds
16,Air Force,2024,-3.0,43.0
33,Akron,2024,-10.0,22.0
50,Alabama,2024,8.0,55.0
60,Appalachian State,2024,3.0,57.0
77,Arizona,2024,4.0,53.0


In [57]:
# Elo ratings recorded for 2023 (end-of-season values, per the original note),
# relabelled as 2024 so they act as the rating going into the new season.
# Re-running this cell by itself yields an empty frame (no 2023 rows remain).
rated_2023 = ratings_df['year'] == 2023
ratings_df = (
    ratings_df.loc[rated_2023, ['team', 'year', 'elo']]
              .assign(year=lambda frame: frame['year'] + 1)
)
ratings_df.head()

Unnamed: 0,team,year,elo
2062,Air Force,2024,1608.0
2063,Akron,2024,1068.0
2064,Alabama,2024,2032.0
2065,Appalachian State,2024,1617.0
2066,Arizona,2024,1822.0


In [58]:
# Advanced stats from 2023, relabelled as 2024 so they join with the other
# 2024 frames. Only the offensive success rate is kept.
# The source column is called 'season'; rename it to 'year' to match the
# merge keys used by the other frames.
adv_stats_df = adv_stats_df.rename(columns={'season': 'year'})

# Re-running this cell by itself yields an empty frame (no 2023 rows remain).
season_2023 = adv_stats_df['year'] == 2023
adv_stats_df = (
    adv_stats_df.loc[season_2023, ['year', 'team', 'off_success_rate']]
                .assign(year=lambda frame: frame['year'] + 1)
)
adv_stats_df.head()

Unnamed: 0,year,team,off_success_rate
22,2024,Air Force,0.458591
44,2024,Akron,0.36921
66,2024,Alabama,0.446591
85,2024,Appalachian State,0.456033
107,2024,Arizona,0.501763


In [59]:
# Strength-of-schedule rows already labelled 2024; no year shift is applied.
# Per the original author's note, the sor column comes from the previous season.
for_2024 = sos_df['year'] == 2024
sos_df = sos_df.loc[for_2024]
sos_df.head()

Unnamed: 0,year,team,sor,sos
2382,2024,Michigan,1,18
2383,2024,Washington,2,28
2384,2024,Georgia,3,13
2385,2024,Alabama,4,6
2386,2024,Florida State,5,21


Now we'll merge them into a test dataframe. We'll be missing
1. Usages -- these aren't reported yet as of May, 2024

In [62]:
# Build the 2024 test set: start from the talent frame and fold in each
# feature frame in turn, joining on season and team.
# merge() defaults to an inner join, so a team absent from any one frame is
# dropped from the result.
merge_keys = ['year', 'team']

test_df = talent_df
for feature_frame in (records_df, coach_df, trad_stats_df,
                      ratings_df, adv_stats_df, sos_df):
    test_df = test_df.merge(feature_frame, on=merge_keys)

test_df.head()

Unnamed: 0,team,year,talent_level,win_pct,recent_win_pct,coach,career_win_pct,turnover_margin,total_tds,elo,off_success_rate,sor,sos
0,Georgia,2024,311.405,0.928571,0.777429,Kirby Smart,0.854545,3.0,69.0,2273.0,0.526814,3,13
1,Alabama,2024,322.2275,0.857143,0.785047,Nick Saban,0.849498,8.0,55.0,2032.0,0.446591,4,6
2,Oregon,2024,275.6025,0.857143,0.732026,Dan Lanning,0.814815,11.0,82.0,2106.0,0.563559,10,30
3,Miami,2024,268.5825,0.538462,0.644518,Mario Cristobal,0.503401,-4.0,47.0,1580.0,0.434531,49,45
4,Ohio State,2024,297.1375,0.846154,0.838188,Ryan Day,0.88,-2.0,46.0,2065.0,0.459394,7,34


We will be missing the following for 2024: usages. So we need to retrain the model a bit.

In [65]:
# Predictors that can be filled in for 2024. This is a subset of what the
# training data offers; usages is left out because it is not available yet.
X_cols = [
    'elo',
    'recent_win_pct',
    'talent_level',
    'turnover_margin',
    'off_success_rate',
    'sos',
    'career_win_pct',
]
# Target column. Kept as a single name (not a list) so that selecting it
# returns a 1-D pandas Series rather than a one-column DataFrame.
y_cols = 'win_pct'

# Training split: every row of train_df
X_train = train_df[X_cols]
y_train = train_df[y_cols]

# 2024 split
X_test = test_df[X_cols]
# NOTE(review): test_df's win_pct was built from the 2023 records relabelled as
# 2024, so y_test looks like last season's result rather than a true 2024
# outcome -- confirm before scoring predictions against it.
y_test = test_df[y_cols]

In [66]:
# Standardize the features while keeping the DataFrame layout (column names
# and row index) of the inputs. The scaler is fit on TRAINING data only, so no
# information from the 2024 rows leaks into the scaling parameters.
scaler = StandardScaler()

# Fit and transform on TRAINING data
X_train_scl = pd.DataFrame(scaler.fit_transform(X_train.copy()),
                           columns=X_train.columns,
                           index=X_train.index)

# Transform the TEST data with the statistics learned from the training data
X_test_scl = pd.DataFrame(scaler.transform(X_test.copy()),
                          columns=X_test.columns,
                          index=X_test.index)

# Preview the scaled training features. The original cell referenced the
# undefined name X_t_scl, which raises NameError on a fresh kernel.
X_train_scl.head()

Unnamed: 0,elo,recent_win_pct,talent_level,turnover_margin,off_success_rate,sos,career_win_pct
0,-1.52008,0.083599,-1.343475,-0.689479,0.22404,1.311724,0.287626
1,-0.122434,-0.703563,-0.647415,-1.588218,-1.011064,1.07333,0.324599
2,1.437407,0.969477,1.374947,0.594435,0.907663,-0.357033,0.830666
3,-0.315689,-1.17403,-0.583141,-1.074653,0.126389,-0.092151,0.390017
4,-1.388943,-0.816141,-0.221497,-0.561088,-0.699739,-1.310608,-0.229995


In [2]:
# Create model with available stats

In [None]:
# Make pro