In [1]:
import os
import zipfile
from tqdm import tqdm
import itertools
import scipy.stats as ss

# Load the cuDF magic (ensures pandas operations are accelerated when possible)
%load_ext cudf.pandas

# Use cuDF for GPU-accelerated DataFrame operations
import cudf

# --- Read in Data using cuDF ---
play_df = cudf.read_csv('nfl-playing-surface-analytics/PlayList.csv')
player_df = cudf.read_csv('nfl-playing-surface-analytics/PlayerTrackData.csv')
injury_df = cudf.read_csv('nfl-playing-surface-analytics/InjuryRecord.csv')

# Check if GPU is available
!nvidia-smi

# --- Create Initial Game DataFrame ---
game_df = play_df[['GameID', 'StadiumType', 'FieldType', 'Weather', 'Temperature']]
game_df = game_df.drop_duplicates()
game_df = game_df.reset_index(drop=True)

# --- Define Vectorized Cleaning Functions ---

def clean_weather_vectorized(df):
    """
    Standardize the 'Weather' column of ``df`` in place and return ``df``.

    Each raw weather string that appears in one of the category sets below is
    replaced by that category's label ('Cloudy', 'Indoor', 'Clear', 'Rain',
    'Snow'). Strings listed as unusable are replaced by a null value. Any
    string that is in none of the sets is left unchanged.

    All membership tests are evaluated against the column as it was on entry,
    so a raw string must belong to at most one set; otherwise the mask applied
    last would silently win. For that reason 'Cloudy.' is only listed as a
    cloudy condition and is not repeated in the unusable list (previously it
    was in both, which turned every 'Cloudy.' row into a null).

    Parameters
    ----------
    df : DataFrame
        Frame with a string 'Weather' column. The column is overwritten.

    Returns
    -------
    DataFrame
        The same ``df`` object, with 'Weather' standardized.
    """
    # Raw spellings grouped by the category they are mapped to
    cloudy_conditions = {
        'Cloudy 50% change of rain', 'Hazy', 'Cloudy.', 'Overcast', 'Mostly Cloudy',
        'Cloudy, fog started developing in 2nd quarter', 'Partly Cloudy',
        'Mostly cloudy', 'Rain Chance 40%', ' Partly cloudy', 'Party Cloudy',
        'Rain likely, temps in low 40s', 'Partly Clouidy', 'Cloudy, 50% change of rain',
        'Mostly Coudy', '10% Chance of Rain', 'Cloudy, chance of rain',
        '30% Chance of Rain', 'Cloudy, light snow accumulating 1-3"', 'cloudy',
        'Coudy', 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
        'Cloudy fog started developing in 2nd quarter', 'Cloudy light snow accumulating 1-3"',
        'Cloudywith periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
        'Cloudy and cold', 'Cloudy and Cool', 'Partly cloudy'
    }
    clear_conditions = {
        'Clear, Windy', 'Clear to Cloudy', 'Clear, highs to upper 80s', 'Clear and clear',
        'Partly sunny', 'Clear skies', 'Sunny', 'Partly Sunny', 'Mostly Sunny',
        'Clear Skies', 'Sunny Skies', 'Partly clear', 'Fair', 'Sunny, highs to upper 80s',
        'Sun & clouds', 'Mostly sunny', 'Sunny, Windy', 'Mostly Sunny Skies',
        'Clear and Sunny', 'Clear and sunny', 'Clear to Partly Cloudy',
        'Clear and cold', 'Clear and warm', 'Clear and Cool', 'Sunny and cold',
        'Sunny and warm', 'Sunny and clear'
    }
    rainy_conditions = {
        'Rainy', 'Scattered Showers', 'Showers', 'Cloudy Rain', 'Light Rain',
        'Rain shower', 'Rain likely, temps in low 40s.', 'Cloudy, Rain'
    }
    snow_conditions = {'Heavy lake effect snow'}
    indoor_conditions = {'Controlled Climate', 'Indoors', 'N/A Indoor', 'N/A (Indoors)'}
    # Values that do not describe a weather category; they become null
    unusable_conditions = ['Heat Index 95', 'Cold']

    # Capture the raw column once so every membership test sees the same input
    raw_weather = df['Weather']
    df['Weather'] = raw_weather\
        .mask(raw_weather.isin(cloudy_conditions), 'Cloudy')\
        .mask(raw_weather.isin(indoor_conditions), 'Indoor')\
        .mask(raw_weather.isin(clear_conditions), 'Clear')\
        .mask(raw_weather.isin(rainy_conditions), 'Rain')\
        .mask(raw_weather.isin(snow_conditions), 'Snow')\
        .mask(raw_weather.isin(unusable_conditions), None)
    return df

def clean_stadiumtype_vectorized(df):
    """
    Normalize the 'StadiumType' column of ``df`` in place and return ``df``.

    A fixed sequence of substring replacements collapses spelling and format
    variants into 'Outdoor', 'Indoor', 'Dome' and 'Retractable Roof'. After
    that, values equal to 'Bowl', 'Heinz Field' or 'Cloudy' are replaced by a
    null value.
    """
    # (pattern, replacement, is_regex) rules, applied strictly in this order:
    # later rules operate on the output of earlier ones (e.g. the plain 'Open'
    # rule only sees values the roof/dome rules have not already rewritten).
    rewrite_rules = (
        (r'Oudoor|Outdoors|Ourdoor|Outddors|Outdor|Outside', 'Outdoor', True),
        (r'Indoors|Indoor, Roof Closed|Indoor, Open Roof', 'Indoor', True),
        (r'Closed Dome|Domed, closed|Domed, Open|Domed, open|Dome, closed|Domed', 'Dome', True),
        (
            r'Retr. Roof-Closed|Outdoor Retr Roof-Open|Retr. Roof - Closed|Retr. Roof-Open|Retr. Roof - Open|Retr. Roof Closed',
            'Retractable Roof',
            True,
        ),
        ('Open', 'Outdoor', False),
    )
    for pattern, replacement, is_regex in rewrite_rules:
        df['StadiumType'] = df['StadiumType'].str.replace(pattern, replacement, regex=is_regex)

    # Values that are not stadium types are nulled out
    anomalies = ['Bowl', 'Heinz Field', 'Cloudy']
    df['StadiumType'] = df['StadiumType'].mask(df['StadiumType'].isin(anomalies), None)
    return df

def clean_play_df(df):
    """
    Return a cleaned copy of the play-level DataFrame.

    The input is copied first, then 'StadiumType' and 'Weather' are
    standardized (in that order) on the copy, so the caller's frame is not
    modified.
    """
    working_copy = df.copy()
    return clean_weather_vectorized(clean_stadiumtype_vectorized(working_copy))

# --- Clean the Play DataFrame ---
play_df_cleaned = clean_play_df(play_df)

# Create a cleaned game-level DataFrame
game_df_cleaned = play_df_cleaned[['GameID', 'StadiumType', 'FieldType', 'Weather', 'Temperature']]
game_df_cleaned = game_df_cleaned.drop_duplicates().reset_index(drop=True)


def _add_injury_columns(df):
    """
    Fill, cast and difference the DM_M* columns of ``df`` and add 'Injury'.

    Shared by the game-level and play-level frames so both are processed by
    the same code. ``df`` is modified in place and also returned.

    Steps:
      1. Missing DM_M1 / DM_M7 / DM_M28 / DM_M42 values (rows that came only
         from the play/game side of the outer join) become 0, cast to int32.
      2. Each column has the next-longer column subtracted from it. The order
         matters: every subtraction must read the not-yet-modified column to
         its right.
         (Presumably the raw flags are cumulative thresholds, making the
         result mutually exclusive buckets; confirm against the data
         dictionary.)
      3. 'Injury' is the sum of the four differenced columns, which is
         algebraically the filled DM_M1 value from step 1.
    """
    for col in ['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']:
        df[col] = df[col].fillna(0).astype('int32')

    df['DM_M1'] = df['DM_M1'] - df['DM_M7']
    df['DM_M7'] = df['DM_M7'] - df['DM_M28']
    df['DM_M28'] = df['DM_M28'] - df['DM_M42']

    df['Injury'] = df['DM_M1'] + df['DM_M7'] + df['DM_M28'] + df['DM_M42']
    return df


# --- Join Game Data with Injury Data ---
# Outer join keeps games without any injury record as well
game_injury_df = injury_df.set_index('GameID').join(
    game_df_cleaned.set_index('GameID'), how='outer'
)

# --- Fill Missing Injury Columns and Adjust Metrics ---
game_injury_df = _add_injury_columns(game_injury_df)

# Drop unnecessary columns
game_injury_df = game_injury_df.drop(columns=['Surface', 'PlayerKey', 'PlayKey'])

# --- Create Dummy Variables for the Game-Injury Data ---
game_injury_df_dummies = cudf.get_dummies(game_injury_df, dummy_na=True, drop_first=True)
if 'FieldType_nan' in game_injury_df_dummies.columns:
    game_injury_df_dummies = game_injury_df_dummies.drop(columns=['FieldType_nan'])

# --- Merge Play Data and Injury Data on 'PlayKey' ---
# Injury records without a PlayKey cannot be matched to a play and are dropped first
play_injury_df = injury_df.dropna(subset=['PlayKey']).set_index('PlayKey').join(
    play_df_cleaned.set_index('PlayKey'), how='outer', lsuffix='_left', rsuffix='_right'
)

play_injury_df = _add_injury_columns(play_injury_df)

play_injury_df = play_injury_df.drop(columns=['Surface'])

# --- Create Dummy Variables for Play-Level Data ---
play_injury_df_dummies = cudf.get_dummies(
    play_injury_df,
    columns=['PlayType', 'PositionGroup'],
    dummy_na=True,
    drop_first=True
)

# --- Create Motion-Aggregated DataFrame ---
def create_motion_data_df(injury_df, play_df, player_df):
    """
    Build one frame combining injury records, play attributes and per-play
    motion summaries.

    A derived 'angle' column (``o - dir``) is added to a copy of the tracking
    data, then the per-PlayKey maximum and mean of the motion columns are
    attached to the plays as '<name>_max' / '<name>_avg'. Plays without
    tracking rows are dropped by the inner merges. The injury records (minus
    'PlayerKey', 'GameID', 'BodyPart', 'Surface') are outer-merged on
    'PlayKey' and every remaining null is filled with 0.

    None of the three input frames is modified.
    """
    motion_cols = ['PlayKey', 'time', 'dir', 'dis', 'o', 's', 'angle']

    # Work on a private copy so the caller's tracking table keeps its columns
    tracking = player_df.copy()
    tracking['angle'] = tracking['o'] - tracking['dir']

    # One grouping, two aggregations
    per_play = tracking[motion_cols].groupby('PlayKey')
    play_max = per_play.max().reset_index()
    play_avg = per_play.mean().reset_index()

    # The second merge sees the motion column names on both sides, so the
    # suffixes label the first set as maxima and the second as averages
    plays_with_max = play_df.merge(play_max, on='PlayKey')
    plays_with_motion = plays_with_max.merge(
        play_avg, on='PlayKey', suffixes=('_max', '_avg')
    )

    injuries = injury_df.drop(columns=['PlayerKey', 'GameID', 'BodyPart', 'Surface'])
    return injuries.merge(plays_with_motion, on='PlayKey', how='outer').fillna(0)

# NOTE(review): the raw play_df is passed here rather than play_df_cleaned, so the
# StadiumType/Weather standardization is not reflected in motion_df; confirm this is intended.
motion_df = create_motion_data_df(injury_df, play_df, player_df)

# Gather every column whose name starts with 'DM_M' and store their row-wise sum as 'Injury'
injury_cols = [name for name in motion_df.columns if name.startswith('DM_M')]
motion_df['Injury'] = motion_df[injury_cols].sum(axis=1)


Mon Apr 14 22:01:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0  On |                  Off |
|  0%   49C    P0             65W /  450W |   19386MiB /  24564MiB |     10%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# 1 + 2. Build the feature matrix: drop the target, the DM_M* columns it is
#        derived from and the identifier keys, then one-hot encode what is left
X = cudf.get_dummies(
    motion_df.drop(columns=[
        'Injury', 'DM_M1', 'DM_M7', 'DM_M28', 'DM_M42', 'PlayKey', 'PlayerKey', 'GameID'
    ]),
    dummy_na=True,
    drop_first=True
)

# Cast any boolean columns to int32
bool_cols = [name for name in X.columns if X[name].dtype == 'bool']
for name in bool_cols:
    X[name] = X[name].astype('int32')


# 3. Prepare the binary target (0 = no injury, 1 = injury)
y = motion_df['Injury'].copy()
y_binary = y.copy()
# Any positive injury score collapses to 1
y_binary[y > 0] = 1

In [3]:
# Preview the first rows of the encoded feature matrix
X.head()

Unnamed: 0,PlayerDay,PlayerGame,Temperature,PlayerGamePlay,time_max,dir_max,dis_max,o_max,s_max,angle_max,...,Position_WR,PositionGroup_DB,PositionGroup_DL,PositionGroup_LB,PositionGroup_OL,PositionGroup_QB,PositionGroup_RB,PositionGroup_SPEC,PositionGroup_TE,PositionGroup_WR
0,1,1,63,1,29.8,337.87,0.48,267.03,2.94,265.19,...,0,0,0,0,0,1,0,0,0,0
1,1,1,63,10,27.5,353.24,0.45,315.08,3.18,260.69,...,0,0,0,0,0,1,0,0,0,0
2,1,1,63,11,36.8,357.78,0.35,358.2,2.94,314.68,...,0,0,0,0,0,1,0,0,0,0
3,1,1,63,12,35.6,359.97,0.46,302.26,1.83,299.54,...,0,0,0,0,0,1,0,0,0,0
4,1,1,63,13,25.5,357.43,0.49,356.69,1.49,295.07,...,0,0,0,0,0,1,0,0,0,0


In [4]:
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split

# Split features and binary target into train and held-out test partitions.
# NOTE(review): no `stratify` argument is passed, so the share of positive labels
# in each partition is left to the random shuffle; consider stratifying on y_binary.
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y_binary, 
    test_size=0.2,     # 20% of data will be held out for testing
    random_state=42,   # For reproducibility
    shuffle=True       # Data is shuffled by default; can be set to False if needed
)

# Initialize and train a RandomForestClassifier
# (100 trees, depth capped at 10, fixed seed; presumably n_streams=1 is set for
# reproducible results together with random_state; confirm against the cuML docs)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_streams=1,random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate
# NOTE(review): the confusion matrix recorded later in this notebook contains only
# 23 samples in the second class out of 53,397, so plain accuracy is dominated
# by the majority class and says little about how well injuries are detected.
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)

Test Accuracy: 0.9996628761291504


## Model metrics

In [5]:
import cupy as cp
from cuml.metrics import confusion_matrix

# Convert the held-out labels and the model's predictions to CuPy arrays
# before handing them to cuML's confusion_matrix.
y_true = cp.asarray(y_test)       # Assuming y_test is a cuDF Series
y_pred = cp.asarray(rf.predict(X_test))  # Runs prediction again with the trained rf and rebinds y_pred

# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# .get() is called so the matrix is printed as a NumPy array
print("Confusion Matrix:\n", cm.get())

Confusion Matrix:
 [[53374     0]
 [   18     5]]
