In [13]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the data
df_game_data = pd.read_csv('nfl_team_stats_2002-2023.csv')

# Convert possession time to minutes for both home and away teams
def convert_possession_to_minutes(time_str):
    if pd.isna(time_str):  # Handle any missing values
        return 0
    minutes, seconds = map(int, time_str.split(':'))
    return minutes + seconds / 60  # Convert to total minutes

df_game_data['possession_home'] = df_game_data['possession_home'].apply(convert_possession_to_minutes)
df_game_data['possession_away'] = df_game_data['possession_away'].apply(convert_possession_to_minutes)

# Define home and away columns
home_columns = ['season', 'week', 'date', 'time_et', 'neutral', 'home', 
                'score_home', 'first_downs_home', 'first_downs_from_passing_home',
                'first_downs_from_rushing_home', 'first_downs_from_penalty_home',
                'third_down_comp_home', 'third_down_att_home',
                'fourth_down_comp_home', 'fourth_down_att_home', 'plays_home',
                'drives_home', 'yards_home', 'pass_comp_home', 'pass_att_home',
                'pass_yards_home', 'sacks_num_home', 'sacks_yards_home',
                'rush_att_home', 'rush_yards_home', 'pen_num_home',
                'pen_yards_home', 'redzone_comp_home', 'redzone_att_home',
                'fumbles_home', 'interceptions_home', 'def_st_td_home',
                'possession_home']

away_columns = ['season', 'week', 'date', 'time_et', 'neutral', 'away', 
                'score_away', 'first_downs_away', 'first_downs_from_passing_away',
                'first_downs_from_rushing_away', 'first_downs_from_penalty_away',
                'third_down_comp_away', 'third_down_att_away',
                'fourth_down_comp_away', 'fourth_down_att_away', 'plays_away',
                'drives_away', 'yards_away', 'pass_comp_away', 'pass_att_away',
                'pass_yards_away', 'sacks_num_away', 'sacks_yards_away',
                'rush_att_away', 'rush_yards_away', 'pen_num_away',
                'pen_yards_away', 'redzone_comp_away', 'redzone_att_away',
                'fumbles_away', 'interceptions_away', 'def_st_td_away',
                'possession_away']

# Split into home and away DataFrames
df_home = df_game_data[home_columns].copy()
df_away = df_game_data[away_columns].copy()

# Define PCA function
def perform_pca(df, target_column, n_components):
    team_name = 'home' if target_column == 'score_home' else 'away'
    # Drop non-numeric and target columns
    features = df.drop(columns=['season', 'week', 'date', 'time_et', 'neutral', team_name, target_column]).columns.tolist()
    X = df[features].dropna()  # Drop any rows with missing values
    y = df[target_column].loc[X.index]  # Align target with X
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Apply PCA
    pca = PCA(n_components=n_components)
    data_pca = pca.fit_transform(X_scaled)
    
    # Create DataFrame of principal components
    pca_df = pd.DataFrame(data_pca, columns=[f'PC{i+1}' for i in range(n_components)])
    explained_variance = pca.explained_variance_ratio_
    total_variance_explained = explained_variance.sum()
    
    # Concatenate with target and other columns
    result = pd.concat([pca_df, y.reset_index(drop=True)], axis=1)
    result = pd.concat([result, df[[f'season', 'week', 'date', team_name, target_column]].loc[X.index].reset_index(drop=True)], axis=1)
    
    return result, total_variance_explained

In [15]:
# Perform PCA on home and away DataFrames, Example with 4 components
n_components = 4
pca_home_df,  total_variance_explained_home = perform_pca(df_home, 'score_home', n_components)
pca_away_df, total_variance_explained_away = perform_pca(df_away, 'score_away', n_components)

pca_home_df.head(), total_variance_explained_home, pca_away_df.head(), total_variance_explained_away

(        PC1       PC2       PC3       PC4  score_home  season week  \
 0  1.849064 -3.307116  1.235584  2.179119          13    2002    1   
 1  1.457641  0.281261  2.254430 -1.673613          25    2002    1   
 2  3.133892 -0.135937  1.494209  0.479861          31    2002    1   
 3  2.070722  3.194792  0.003991  0.135340          49    2002    1   
 4  0.439634 -1.173896 -1.259517  1.276462          27    2002    1   
 
          date        home  score_home  
 0  2002-09-05      Giants          13  
 1  2002-09-08     Jaguars          25  
 2  2002-09-08  Washington          31  
 3  2002-09-08    Dolphins          49  
 4  2002-09-08      Titans          27  ,
 0.5575736151538119,
         PC1       PC2       PC3       PC4  score_away  season week  \
 0 -2.081086  1.554730 -1.685784 -0.075116          16    2002    1   
 1 -0.282844  0.962157 -0.526669 -0.612036          28    2002    1   
 2 -2.263646  0.265218 -1.070104 -1.319061          23    2002    1   
 3 -1.595550 -1.0712