In [1]:
import pandas as pd

# Read motion_df_encoded.csv into `df` and preview the first rows.
# NOTE(review): the location is an absolute, machine-specific path.
motion_csv_path = '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/datasets/nfl-playing-surface-analytics/motion_df_encoded.csv'
df = pd.read_csv(motion_csv_path)
df.head()

Unnamed: 0,DM_M1,DM_M7,DM_M28,DM_M42,PlayerDay,PlayerGame,Temperature,PlayerGamePlay,time_max,dir_max,...,PositionGroup_DB,PositionGroup_DL,PositionGroup_LB,PositionGroup_OL,PositionGroup_QB,PositionGroup_RB,PositionGroup_SPEC,PositionGroup_TE,PositionGroup_WR,PositionGroup_nan
0,0.0,0.0,0.0,0.0,1.0,1.0,63.0,1.0,29.8,337.87,...,False,False,False,False,True,False,False,False,False,False
1,0.0,0.0,0.0,0.0,1.0,1.0,63.0,10.0,27.5,353.24,...,False,False,False,False,True,False,False,False,False,False
2,0.0,0.0,0.0,0.0,1.0,1.0,63.0,11.0,36.8,357.78,...,False,False,False,False,True,False,False,False,False,False
3,0.0,0.0,0.0,0.0,1.0,1.0,63.0,12.0,35.6,359.97,...,False,False,False,False,True,False,False,False,False,False
4,0.0,0.0,0.0,0.0,1.0,1.0,63.0,13.0,25.5,357.43,...,False,False,False,False,True,False,False,False,False,False


In [2]:
def drop_nfl_feature_groups(df, feature_groups):
    """
    Drop whole groups of NFL feature columns based on a boolean dictionary.

    Columns are assigned to a group by substring match on the column name.
    A group is dropped only when its key maps to a falsy value in
    ``feature_groups``; a key that is missing defaults to True (keep).

    Parameters:
    -----------
    df : pd.DataFrame
        NFL injury dataset
    feature_groups : dict
        Dictionary with boolean values for each feature group
        Format: {'stadium': True/False, 'weather': True/False, ...}
        Recognised keys: 'stadium', 'weather', 'play_type',
        'position_group', 'position', 'roster', 'injury_days'.
        True keeps the group, False drops it.

    Returns:
    --------
    pd.DataFrame
        New dataframe with the columns of every disabled group removed.
        ``df`` itself is not modified.
    """
    all_columns = list(df.columns)

    def columns_containing(token):
        # Every column whose name contains `token` as a substring.
        return [col for col in all_columns if token in col]

    group_columns = {
        'stadium': columns_containing('Stadium'),
        'weather': columns_containing('Weather'),
        'play_type': columns_containing('PlayType'),
        'position_group': columns_containing('PositionGroup'),
        # 'Position' is a substring of 'PositionGroup'; exclude those columns
        # here so that disabling 'position' alone does not also remove the
        # position-group columns.
        'position': [
            col for col in columns_containing('Position')
            if 'PositionGroup' not in col
        ],
        'roster': columns_containing('Roster'),
        'injury_days': columns_containing('DM_'),
    }

    # Collect the columns of every group that was explicitly switched off.
    columns_to_drop = []
    for group_name, group_cols in group_columns.items():
        if not feature_groups.get(group_name, True):
            columns_to_drop.extend(group_cols)

    # A column can match more than one substring; keep each label once,
    # preserving first-seen order.
    columns_to_drop = list(dict.fromkeys(columns_to_drop))

    return df.drop(columns=columns_to_drop, errors='ignore')

In [3]:
# False = drop that feature group, True (or key omitted) = keep it.
feature_groups = {
    'stadium': False,
    'weather': False,
    'play_type': False,
    'position_group': False,
    'position': False,
    'roster': False,
    'injury_days': False  # Drop the DM_* (injury_days) columns
}
filtered_df = drop_nfl_feature_groups(df, feature_groups)

# Cast boolean dummy columns to 0/1 integers; other dtypes pass through.
filtered_df = filtered_df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# Binarise the target: any value > 0 becomes 1, everything else 0.
filtered_df['Injury'] = filtered_df['Injury'].gt(0).astype(int)

# Every column except the target is treated as a feature downstream.
feature_cols = [col for col in filtered_df.columns if col != 'Injury']

# Discard rows in which every feature equals 0.0.
all_zero_rows = filtered_df[feature_cols].eq(0.0).all(axis=1)
filtered_df = filtered_df[~all_zero_rows].reset_index(drop=True)

# Impute placeholder temperatures with the mean of the real readings only.
# -999 is the missing-value sentinel. -35.04415537718783 is presumably a mean
# that was computed upstream while -999 was still present -- TODO confirm.
# Both placeholders are excluded BEFORE averaging so they cannot bias the
# imputed value; the float placeholder is matched with a tolerance instead of
# exact equality. Genuine NaN values are left untouched.
# NOTE(review): the mean is taken over the full dataset, i.e. before the
# train/test split.
temperature = filtered_df['Temperature']
is_placeholder = temperature.eq(-999) | temperature.sub(-35.04415537718783).abs().lt(1e-6)
valid_temperature_mean = temperature[~is_placeholder].mean()
filtered_df['Temperature'] = temperature.mask(is_placeholder, valid_temperature_mean)
filtered_df.head()

Unnamed: 0,PlayerDay,PlayerGame,Temperature,PlayerGamePlay,time_max,dir_max,dis_max,o_max,s_max,angle_max,time_avg,dir_avg,dis_avg,o_avg,s_avg,angle_avg,Injury,FieldType_Natural,FieldType_Synthetic,FieldType_nan
0,1.0,1.0,63.0,1.0,29.8,337.87,0.48,267.03,2.94,265.19,14.9,186.148361,0.056288,182.479766,0.535753,-3.668595,0,0,1,0
1,1.0,1.0,63.0,10.0,27.5,353.24,0.45,315.08,3.18,260.69,13.75,199.524167,0.061848,200.858225,0.576304,1.334058,0,0,1,0
2,1.0,1.0,63.0,11.0,36.8,357.78,0.35,358.2,2.94,314.68,18.4,201.65252,0.084553,188.591789,0.795772,-13.060732,0,0,1,0
3,1.0,1.0,63.0,12.0,35.6,359.97,0.46,302.26,1.83,299.54,17.8,174.236134,0.041176,215.746975,0.349692,41.51084,0,0,1,0
4,1.0,1.0,63.0,13.0,25.5,357.43,0.49,356.69,1.49,295.07,12.75,176.954727,0.051797,189.170742,0.427109,12.216016,0,0,1,0


In [5]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the feature matrix.
y = filtered_df['Injury']
X = filtered_df[feature_cols]

# 80/20 split with a fixed seed, stratified on the target so both partitions
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Write each split to its CSV file without the index column.
# NOTE(review): destinations are absolute, machine-specific paths.
split_exports = [
    (X_test, '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/scripts/play_surface/X_test.csv'),
    (X_train, '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/scripts/play_surface/X_train.csv'),
    (y_test, '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/scripts/play_surface/y_test.csv'),
    (y_train, '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/scripts/play_surface/y_train.csv'),
]
for split, csv_path in split_exports:
    split.to_csv(csv_path, index=False)