# 02_feature_selection

This notebook contains all the code required to select the features we will use to build our models.

In [None]:
import pandas as pd
from itertools import combinations
import warnings

In [2]:
# Source of the master NFL dataset: a CSV hosted in the project's GitHub repository.
MASTER_DATA_URL = "https://raw.githubusercontent.com/srmcdevitt03/DS-440-Capstone/refs/heads/main/datasets/master_nfl_data.csv"

# Load the raw table and preview the first rows.
master_data = pd.read_csv(MASTER_DATA_URL)
master_data.head()

Unnamed: 0,Rk_conversions,Tm,3DAtt,3DConv,3D%,4DAtt,4DConv,4D%,RZAtt,RZTD,...,Rushing_Yds,RushingTDs,Rushing_Y/A,Rushing_1stD,Pen_offense,Pen_Yds_offense,1stPy_offense,Sc%_y,TO%_allowed,EXP
0,3.0,Arizona Cardinals,96.0,43.0,44.8%,7.0,4.0,57.1%,27.0,15.0,...,773.0,5.0,4.4,38.0,54.0,393.0,15.0,40.6,10.1,29.75
1,16.0,Atlanta Falcons,78.0,33.0,42.3%,11.0,4.0,36.4%,18.0,8.0,...,818.0,6.0,4.6,43.0,33.0,230.0,12.0,34.9,9.5,17.8
2,26.0,Baltimore Ravens,69.0,28.0,40.6%,12.0,4.0,33.3%,18.0,8.0,...,757.0,6.0,5.3,38.0,38.0,274.0,10.0,40.3,16.1,4.43
3,27.0,Buffalo Bills,68.0,28.0,41.2%,5.0,3.0,60.0%,25.0,16.0,...,906.0,9.0,4.9,50.0,44.0,351.0,16.0,46.9,9.4,65.09
4,12.0,Carolina Panthers,91.0,34.0,37.4%,18.0,12.0,66.7%,22.0,13.0,...,981.0,3.0,4.7,53.0,42.0,350.0,16.0,38.6,11.4,15.95


## Build dataframe containing all possible matchups

In [17]:
def _coerce_numeric_text(column):
    """Parse a text column as numbers when every value allows it.

    Percent signs and thousands separators are stripped before parsing.
    If any value still cannot be parsed (e.g. team names), the ORIGINAL
    column is returned untouched, so text columns are never altered by the
    stripping step.

    Parameters
    ----------
    column : pd.Series
        A text (object/string dtype) column.

    Returns
    -------
    pd.Series
        The numeric version of ``column``, or ``column`` itself when it
        cannot be converted.
    """
    cleaned = (
        column
        .astype(str)
        .str.replace('%', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    try:
        return pd.to_numeric(cleaned)
    except ValueError:
        # Column contains text that cannot be converted (e.g., team names)
        return column


# Work on a copy so the raw `master_data` frame stays intact on re-runs.
df = master_data.copy()

# Clean up non-numeric strings and convert safely.
# Both the legacy `object` dtype and pandas' dedicated string dtypes are
# checked, so values such as "44.8%" are converted either way.
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = _coerce_numeric_text(df[col])

# Identify numeric columns once
numeric_cols = df.select_dtypes(include='number').columns

# Build the per-team lookup once instead of filtering the whole frame twice
# for every pair. `drop_duplicates` keeps the first row per team, which is
# the row the pairwise loop has always used for a given team name.
team_stats = (
    df.drop_duplicates(subset='Tm')
    .set_index('Tm')[numeric_cols]
    .to_dict(orient='index')
)
diff_names = [f"{c}_diff" for c in numeric_cols]

# One record per unordered pair of teams: Team_A's stats minus Team_B's.
matchups = []
for team1, team2 in combinations(df['Tm'], 2):
    team1_stats = team_stats[team1]
    team2_stats = team_stats[team2]

    matchup_row = {'Team_A': team1, 'Team_B': team2}
    for stat, diff_name in zip(numeric_cols, diff_names):
        matchup_row[diff_name] = team1_stats[stat] - team2_stats[stat]
    matchups.append(matchup_row)

matchups_df = pd.DataFrame(matchups)
matchups_df

Unnamed: 0,Team_A,Team_B,Rk_conversions_diff,3DAtt_diff,3DConv_diff,3D%_diff,4DAtt_diff,4DConv_diff,4D%_diff,RZAtt_diff,...,Rushing_Yds_diff,RushingTDs_diff,Rushing_Y/A_diff,Rushing_1stD_diff,Pen_offense_diff,Pen_Yds_offense_diff,1stPy_offense_diff,Sc%_y_diff,TO%_allowed_diff,EXP_diff
0,Arizona Cardinals,Atlanta Falcons,-13.0,18.0,10.0,2.5,-4.0,0.0,20.7,9.0,...,-45.0,-1.0,-0.2,-5.0,21.0,163.0,3.0,5.7,0.6,11.95
1,Arizona Cardinals,Baltimore Ravens,-23.0,27.0,15.0,4.2,-5.0,0.0,23.8,9.0,...,16.0,-1.0,-0.9,0.0,16.0,119.0,5.0,0.3,-6.0,25.32
2,Arizona Cardinals,Buffalo Bills,-24.0,28.0,15.0,3.6,2.0,1.0,-2.9,2.0,...,-133.0,-4.0,-0.5,-12.0,10.0,42.0,-1.0,-6.3,0.7,-35.34
3,Arizona Cardinals,Carolina Panthers,-9.0,5.0,9.0,7.4,-11.0,-8.0,-9.6,5.0,...,-208.0,2.0,-0.3,-15.0,12.0,43.0,-1.0,2.0,-1.3,13.80
4,Arizona Cardinals,Chicago Bears,-18.0,19.0,12.0,4.5,0.0,1.0,14.2,7.0,...,-3.0,-1.0,0.0,-9.0,1.0,-61.0,8.0,-2.7,2.6,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,Seattle Seahawks,Tennessee Titans,-1.0,-11.0,1.0,5.0,-4.0,0.0,26.7,14.0,...,213.0,6.0,-0.1,11.0,-5.0,9.0,8.0,13.0,-1.3,108.85
492,Seattle Seahawks,Washington Commanders,1.0,5.0,0.0,-2.2,-3.0,-1.0,11.1,7.0,...,-299.0,-1.0,-1.7,-12.0,-7.0,-40.0,-2.0,0.8,3.0,-2.44
493,Tampa Bay Buccaneers,Tennessee Titans,-19.0,-3.0,8.0,9.9,0.0,1.0,10.0,9.0,...,167.0,3.0,0.2,10.0,-5.0,-8.0,10.0,13.0,-10.4,112.23
494,Tampa Bay Buccaneers,Washington Commanders,-17.0,13.0,7.0,2.7,1.0,0.0,-5.6,2.0,...,-345.0,-4.0,-1.4,-13.0,-7.0,-57.0,0.0,0.8,-6.1,0.94
