In [1]:
import pandas as pd
import numpy as np
import json
import psycopg2
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Lift pandas' display limits so frames are shown without row/column truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the PostgreSQL connection settings kept in a local JSON file.
with open('nfl_project_postgres_info.json', 'r') as file:
    # Raw text of the credentials file
    postgres_json_data = file.read()

# Decode the JSON text into a dict of connection settings
postgres_info = json.loads(postgres_json_data)

In [4]:
# Load the full dbo.nfl_game_data table from PostgreSQL into the DataFrame `game_table`.
# Connection settings come from `postgres_info`; the server is assumed to be on localhost.
conn = psycopg2.connect(
    host="localhost",
    database=postgres_info['database_name'],
    user=postgres_info['nfl_project_username'],
    password=postgres_info['nfl_project_password'],
)

# Read-only query: fetch every row and column of the game table
sql = "SELECT * FROM dbo.nfl_game_data"

# try/finally guarantees the cursor and the connection are released even when
# the query or the fetch raises, so a failed run does not leak a DB session.
try:
    cursor = conn.cursor()
    try:
        # Run the SELECT and pull all rows into memory
        cursor.execute(sql)
        results = cursor.fetchall()

        # Column names are the first field of each entry in cursor.description
        columns = [desc[0] for desc in cursor.description]
    finally:
        cursor.close()
finally:
    conn.close()

# Build the DataFrame from the fetched rows and their column names
game_table = pd.DataFrame(results, columns=columns)

In [5]:
# Order the rows by game_date, breaking ties with game_time
game_table = game_table.sort_values(by=['game_date', 'game_time'])

In [6]:
# Work on an independent copy so `game_table` itself stays untouched
game_table_working_copy = game_table.copy(deep=True)

In [7]:
# Keep only the rows that have a value in every one of these columns
columns_to_check = ['game_location', 'game_stadium', 'away_rush_tds']
has_required_values = game_table_working_copy[columns_to_check].notna().all(axis=1)
game_table_working_copy = game_table_working_copy[has_required_values]

In [8]:
# Rolling 5-row mean of every away-side statistic, computed per away team.
# Rows are grouped by 'away_team' only, so each window spans that team's rows as
# the away side; min_periods=1 lets the earliest rows average over fewer than 5.
# NOTE(review): the window includes the current row, so a game's own stats feed
# its own feature — confirm this is intended before using these as predictors.
away_categories = [
    'away_team_total_win_percent', 'away_team_away_win_percent', 'away_team_final',
    'away_pass_yards', 'away_pass_attempts', 'away_pass_tds', 'away_pass_ints',
    'away_sacks_allowed', 'away_sack_yardage_allowed', 'away_pass_rating',
    'away_rush_attempts', 'away_rush_yards', 'away_rush_yards_per_attempt', 'away_rush_tds',
    'away_receptions', 'away_rec_yards', 'away_yards_per_reception', 'away_rec_tds',
]

# One grouped rolling pass over all categories; dropping the group level restores
# the original row index so the results align back onto the working frame.
away_rolling_means = (
    game_table_working_copy
    .groupby('away_team')[away_categories]
    .rolling(window=5, min_periods=1)
    .mean()
    .droplevel(0)
)

for category in away_categories:
    game_table_working_copy[f'avg_{category}_last_5'] = away_rolling_means[category]

In [9]:
# Rolling 5-row mean of every home-side statistic, computed per home team.
# Rows are grouped by 'home_team' only, so each window spans that team's rows as
# the home side; min_periods=1 lets the earliest rows average over fewer than 5.
# NOTE(review): the window includes the current row, so a game's own stats feed
# its own feature — confirm this is intended before using these as predictors.
home_categories = [
    'home_team_total_win_percent', 'home_team_home_win_percent', 'home_team_final',
    'home_pass_yards', 'home_pass_attempts', 'home_pass_tds', 'home_pass_ints',
    'home_sacks_allowed', 'home_sack_yardage_allowed', 'home_pass_rating',
    'home_rush_attempts', 'home_rush_yards', 'home_rush_yards_per_attempt', 'home_rush_tds',
    'home_receptions', 'home_rec_yards', 'home_yards_per_reception', 'home_rec_tds',
]

# One grouped rolling pass over all categories; dropping the group level restores
# the original row index so the results align back onto the working frame.
home_rolling_means = (
    game_table_working_copy
    .groupby('home_team')[home_categories]
    .rolling(window=5, min_periods=1)
    .mean()
    .droplevel(0)
)

for category in home_categories:
    game_table_working_copy[f'avg_{category}_last_5'] = home_rolling_means[category]

In [10]:
# Columns kept for modelling: the two team-name columns followed by the
# 'avg_<side>_<stat>_last_5' columns, away side first, then home side.
# '{side}' inside a template is filled with the side being generated.
per_side_stat_templates = [
    'team_total_win_percent',
    'team_{side}_win_percent',
    'team_final',
    'pass_yards',
    'pass_attempts',
    'pass_tds',
    'pass_ints',
    'sacks_allowed',
    'sack_yardage_allowed',
    'pass_rating',
    'rush_attempts',
    'rush_yards',
    'rush_yards_per_attempt',
    'rush_tds',
    'receptions',
    'rec_yards',
    'yards_per_reception',
    'rec_tds',
]

selected_columns = ['away_team', 'home_team'] + [
    f"avg_{side}_{template.format(side=side)}_last_5"
    for side in ('away', 'home')
    for template in per_side_stat_templates
]

In [11]:
# Feature matrix: the selected columns. Target vector: the 'game_result' column.
X = game_table_working_copy.loc[:, selected_columns]
y = game_table_working_copy.loc[:, 'game_result']

In [12]:
from sklearn.preprocessing import LabelEncoder

# Columns holding team names that must be turned into integer codes
team_names = ['away_team', 'home_team']

# Fit a single encoder on the team names from BOTH columns. Re-fitting it per
# column (as before) could give the same team a different code in 'away_team'
# than in 'home_team' whenever the two columns do not contain the same set of
# teams, and left `label_encoder` valid for the last column only.
label_encoder = LabelEncoder()
label_encoder.fit(X[team_names].to_numpy().ravel())

# Encode into a copy so the un-encoded feature frame `X` is preserved
X_encoded = X.copy()
for col in team_names:
    X_encoded[col] = label_encoder.transform(X[col])

In [13]:
# Rescale every column from position 4 onward to the [0, 1] range.
# NOTE(review): the offset of 4 leaves the first four columns unscaled —
# presumably the two team codes plus two win-percent averages that are already
# on a 0-1 scale; TODO confirm against the column order of X_encoded.
feature_columns = X_encoded.columns[4:].tolist()

# NOTE(review): the scaler is fitted on all rows, before the train/test split.
scaler = MinMaxScaler()
scaled_feature_values = scaler.fit_transform(X_encoded[feature_columns])
X_encoded[feature_columns] = scaled_feature_values

In [14]:
# Index labels of the rows whose target value is 2
result_is_2 = (y == 2).to_numpy()
indices_to_drop = y.index[result_is_2]

In [15]:
# Remove those rows from both the target and the encoded features so they stay aligned
y_filtered = y.drop(index=indices_to_drop)
X_encoded_filtered = X_encoded.drop(index=indices_to_drop)

In [16]:
# Hold out 20% of the filtered rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded_filtered,
    y_filtered,
    random_state=42,
    test_size=0.2,
)

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier using the lbfgs solver, with a high iteration
# cap (10000) and a stopping tolerance of 0.001
model = LogisticRegression(
    solver='lbfgs',
    tol=0.001,
    max_iter=10000,
)

In [18]:
# Fit the classifier on the training split
model.fit(X=X_train, y=y_train)

In [19]:
# Predicted class labels for the held-out test split
y_pred = model.predict(X=X_test)

In [20]:
# Score every row of X_encoded (not only the test split):
# per-class probabilities first, then the predicted class label.
y_pred_game_proba = model.predict_proba(X=X_encoded)
y_pred_game = model.predict(X=X_encoded)

In [21]:
# Split the probability matrix into one vector per class column:
# column 1 is stored as the away-win probability, column 0 as the home-win probability.
# NOTE(review): predict_proba columns follow the order of model.classes_ — confirm
# that the second class in 'game_result' really means "away team wins".
away_win_probabilities, home_win_probabilities = (
    y_pred_game_proba[:, 1],
    y_pred_game_proba[:, 0],
)

In [22]:
# Drop the engineered model features that are no longer needed, keeping exactly
# the columns that came from the database table. The column list is taken from
# `game_table` rather than a hard-coded count (previously the first 87 columns),
# so the result stays correct if the table's schema gains or loses columns.
original_columns = list(game_table.columns)
game_table_working_copy = game_table_working_copy.filter(items=original_columns)

In [23]:
# Append the model outputs to the working table as three new columns
prediction_columns = {
    'predicted_outcome': y_pred_game,
    'away_team_win_probability': away_win_probabilities,
    'home_team_win_probability': home_win_probabilities,
}
for prediction_column, prediction_values in prediction_columns.items():
    game_table_working_copy[prediction_column] = prediction_values

In [24]:
from joblib import dump

# Persist the fitted `model` to disk so it can be reloaded without retraining
dump(model, filename='logistic_regression_model.joblib')

['logistic_regression_model.joblib']