This script creates a CSV that will be used to train the model. For each game, a team's offensive output will be concatenated with the average offensive output allowed by their opponent in the preceding games. This will help to quantify the relationship between offensive output and quality of opponent. These values are normalized and saved as a CSV.

In [1]:
import pandas as pd
import psycopg2
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn

In [2]:
# Date window (both ends inclusive in the BETWEEN query) for the games to load.
# Candidate start dates:
#   2018 Week 4: '2018-09-27'
#   2018 Week 5: '2018-10-04'
#   2018 Week 6: '2018-10-11'
start_date, end_date = '2018-09-27', '2023-02-12'

In [3]:
# Size of the look-back window: the maximum number of an opponent's most
# recent prior games that are averaged together.
num_games_average: int = 3

In [4]:
# Establish a connection to the PostgreSQL database.
# The password is taken from the PGPASSWORD environment variable so that the
# credential is never stored in the notebook itself.
import os

try:
    connection = psycopg2.connect(
        host="localhost",
        dbname="nfl",
        user="postgres",
        password=os.environ.get("PGPASSWORD"),
        port=5432,
    )
except (Exception, psycopg2.Error) as error:
    print("Error connecting to PostgreSQL database: ", error)
    # Nothing below can run without a connection; re-raise instead of
    # continuing into a confusing NameError on `connection`.
    raise
else:
    print("Connected to PostgreSQL database.")

# Create a cursor object to interact with the database
cursor = connection.cursor()

Connected to PostgreSQL database.


In [5]:
# Look up the column names of the nfl_data table, in table order
query = """
    SELECT column_name 
    FROM information_schema.columns 
    WHERE table_name = 'nfl_data'
    ORDER BY ordinal_position;
"""

cursor.execute(query)

# Each fetched row is a 1-tuple holding a single column name
column_names = [name for (name,) in cursor.fetchall()]

# Start from an empty DataFrame that already carries every nfl_data column
nfl_df = pd.DataFrame(columns=column_names)

In [6]:
# Fetch the games played within the configured date range.
# The dates are bound as query parameters rather than formatted into the SQL
# text, so quoting is handled by the database driver.
game_query = """
    SELECT gameid, tm, home, won, season, date, tm_opp, score
    FROM nfl_data
    WHERE Date BETWEEN %s AND %s;
"""

cursor.execute(game_query, (start_date, end_date))

rows = cursor.fetchall()

# Column names of the result set, in SELECT order
column_names = [desc[0] for desc in cursor.description]

# Convert the fetched data to a Pandas DataFrame
df_filtered = pd.DataFrame(rows, columns=column_names)

# Append the filtered data to the existing nfl_df DataFrame; columns that were
# not selected above remain present but unfilled
nfl_df = pd.concat([nfl_df, df_filtered], ignore_index=True)

In [7]:
# Label-based column slices: 'score' through 'fga' and 'score_opp' through
# 'fga_opp' (both endpoints included)
offensive_stats = nfl_df.loc[:, 'score':'fga']
defensive_stats = nfl_df.loc[:, 'score_opp':'fga_opp']

# Empty frame whose columns are the offensive names followed by the defensive names
col_names = offensive_stats.columns.tolist() + defensive_stats.columns.tolist()
testing_df = pd.DataFrame(columns=col_names)

The function below concatenates a team's offensive performance with the average offensive output allowed by their opponent over previous games. The resulting dataframe will be used to train the model. The average offensive output of the team we are trying to predict will be used only as an input when actually making predictions.

In [8]:
# Function to calculate the rolling average for a team
def calculate_rolling_average(offensive_team, defensive_opponent, current_date, num_games_avg):
    """Build one row: a team's offensive stats plus its opponent's recent averages.

    Relies on the module-level ``cursor``, ``offensive_stats`` and
    ``defensive_stats`` objects. The team arguments are inserted into the SQL
    text as table names (identifiers cannot be bound as query parameters), so
    they must come from a trusted source.

    Args:
        offensive_team: Name of the table holding the offensive team's games.
        defensive_opponent: Name of the table holding the opponent's games.
        current_date: Date of the game being processed.
        num_games_avg: Maximum number of the opponent's most recent games
            before ``current_date`` to average.

    Returns:
        A tuple with the offensive stats of the game on ``current_date``
        followed by the per-column mean (rounded to 2 decimals) of the
        opponent's stats over its previous games. If the opponent has no
        earlier games, the averaged part is filled with NaN so the tuple keeps
        its full width.

    Raises:
        LookupError: If ``offensive_team`` has no game on ``current_date``.
    """
    # SQL query to fetch the offensive stats from the specified date
    offense_query = f"""
    SELECT {', '.join(offensive_stats)}
    FROM {offensive_team}
    WHERE date = %s;
    """

    cursor.execute(offense_query, (current_date,))
    offense_game = cursor.fetchone()
    if offense_game is None:
        raise LookupError(
            f"No game found for {offensive_team} on {current_date}"
        )

    # SQL query to fetch the previous games for the defensive opponent (up to num_games_avg)
    defense_query = f"""
    SELECT {', '.join(defensive_stats)}
    FROM {defensive_opponent}
    WHERE date < %s
    ORDER BY date DESC
    LIMIT %s;
    """
    cursor.execute(defense_query, (current_date, num_games_avg))
    defense_games = cursor.fetchall()

    if defense_games:
        # Transpose rows into columns and average each stat (rounded to 2 decimal pts)
        avg_defense_game = tuple(
            round(sum(col) / len(col), 2) for col in zip(*defense_games)
        )
    else:
        # No history yet: keep the row width by emitting one NaN per selected
        # column (cursor.description describes the columns even for zero rows)
        avg_defense_game = (float("nan"),) * len(cursor.description)

    # Concatenate the offensive values with the averaged defensive values
    game_avgs = offense_game + avg_defense_game

    return game_avgs

In [9]:
# Fill testing_df one game at a time, reusing each game's index label from nfl_df
game_fields = zip(nfl_df.index, nfl_df['tm'], nfl_df['tm_opp'], nfl_df['date'])
for index, team, opponent, current_date in game_fields:
    avg_values = calculate_rolling_average(
        team, opponent, current_date, num_games_average
    )
    testing_df.loc[index] = avg_values

In [10]:
# Release the database handles: the cursor first, then the connection
for db_resource in (cursor, connection):
    db_resource.close()

In [11]:
# Keep only the game-identifying columns and place the generated stat columns beside them
game_info_columns = ['gameid', 'tm', 'home', 'won', 'season', 'date', 'tm_opp']
nfl_df = pd.concat([nfl_df.loc[:, game_info_columns], testing_df], axis=1)

In [12]:
# Columns left untouched by the scaler; every other column is normalized
unscaled_columns = ['gameid', 'tm', 'won', 'season', 'date', 'tm_opp', 'score']
numerical_columns = nfl_df.columns.difference(unscaled_columns)

# Fit a Min-Max scaler on the selected columns and write the scaled values back
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(nfl_df[numerical_columns])
nfl_df[numerical_columns] = scaled_values

In [13]:
# Show the assembled DataFrame as this cell's output
nfl_df

Unnamed: 0,gameid,tm,home,won,season,date,tm_opp,score,frdwns,totyds,...,qb_prss_opp,rush_att_opp,rush_yds_opp,rush_td_opp,rush_yac_opp,rush_ypc_opp,xpm_opp,xpa_opp,fgm_opp,fga_opp
0,96,MIN,0.0,False,2018.0,2018-09-27,LAR,31.0,0.65625,0.669463,...,0.456316,0.222121,0.281963,0.182561,0.409268,0.526978,0.166667,0.199400,0.230947,0.230947
1,97,LAR,1.0,True,2018.0,2018-09-27,MIN,38.0,0.65625,0.854027,...,0.491053,0.515152,0.322430,0.182561,0.337166,0.311151,0.333333,0.299850,0.769053,0.847575
2,98,LVR,1.0,True,2018.0,2018-09-30,CLE,45.0,0.71875,0.869128,...,0.491053,0.525152,0.341121,0.362398,0.443849,0.293165,0.333333,0.299850,0.230947,0.385681
3,99,NYJ,0.0,False,2018.0,2018-09-30,JAX,12.0,0.18750,0.219799,...,0.491053,0.464545,0.369159,0.089918,0.570589,0.406475,0.111667,0.100450,0.616628,0.769053
4,100,BAL,0.0,True,2018.0,2018-09-30,PIT,26.0,0.62500,0.677852,...,0.806842,0.434242,0.401869,0.182561,0.288234,0.467626,0.666667,0.599700,0.154734,0.230947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2643,2739,CIN,0.0,False,2022.0,2023-01-29,KAN,20.0,0.43750,0.439597,...,0.614211,0.323333,0.390981,0.272480,0.112389,0.634892,0.333333,0.299850,0.385681,0.385681
2644,2740,SFO,0.0,False,2022.0,2023-01-29,PHI,7.0,0.21875,0.196309,...,0.526316,0.403939,0.415888,0.272480,0.308377,0.586331,0.221667,0.199400,0.230947,0.230947
2645,2741,PHI,1.0,True,2022.0,2023-01-29,SFO,31.0,0.65625,0.372483,...,0.175263,0.313030,0.205607,0.182561,0.109536,0.287770,0.166667,0.250375,0.230947,0.230947
2646,2742,PHI,1.0,False,2022.0,2023-02-12,KAN,35.0,0.65625,0.620805,...,0.596316,0.252424,0.319346,0.182561,0.155615,0.593525,0.278333,0.250375,0.461894,0.461894


In [14]:
# Save the training data to disk (the DataFrame index is written as the first column)
output_csv = "training_nfl.csv"
nfl_df.to_csv(output_csv)