In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin
%matplotlib inline

In [2]:
# Load the raw training table. WindSpeed is read as raw strings (dtype object)
# instead of letting pandas infer a type for it.
train = pd.read_csv(
    filepath_or_buffer='train.csv',
    dtype=dict(WindSpeed='object'),
)
train.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [3]:
# Columns treated as categorical when inspecting the data set.
CATEGORICAL_VARS = [
    'DisplayName',
    'PlayerCollegeName',
    'GameWeather',
    'Location',
    'OffensePersonnel',
    'Stadium',
    'WindDirection',
    'DefensePersonnel',
    'HomeTeamAbbr',
    'VisitorTeamAbbr',
    'FieldPosition',
    'PossessionTeam',
    'StadiumType',
    'Position',
    'Turf',
    'PlayerHeight',
    'OffenseFormation',
]

In [4]:
# Number of distinct values in each categorical column, largest first.
cardinalities = (
    train.loc[:, CATEGORICAL_VARS]
    .nunique()
    .sort_values(ascending=False)
)
cardinalities

DisplayName          2230
PlayerCollegeName     301
GameWeather            61
Location               60
OffensePersonnel       56
Stadium                55
WindDirection          53
DefensePersonnel       38
HomeTeamAbbr           32
VisitorTeamAbbr        32
FieldPosition          32
PossessionTeam         32
StadiumType            29
Position               25
Turf                   20
PlayerHeight           16
OffenseFormation        8
dtype: int64

In [9]:
# Peek at the first five raw OffensePersonnel strings.
train['OffensePersonnel'].head(5)

0          1 RB, 1 TE, 3 WR
1          1 RB, 1 TE, 3 WR
2          1 RB, 1 TE, 3 WR
3    6 OL, 2 RB, 2 TE, 0 WR
4          1 RB, 3 TE, 1 WR
Name: OffensePersonnel, dtype: object

In [5]:
class Preprocessor(TransformerMixin):
    """Clean the raw tracking table and reshape it to one row per play.

    The input is assumed to hold one row per player per play, with the
    ``PLAYERS_PER_PLAY`` rows belonging to a play stored contiguously.
    ``transform`` harmonises team abbreviations, parses heights, timestamps
    and wind readings, derives a few features, and finally spreads every
    player-level column over ``PLAYERS_PER_PLAY`` suffixed columns so that
    each play becomes a single row.
    """

    # Number of consecutive rows that make up one play.
    PLAYERS_PER_PLAY = 22
    # Mean length of a Gregorian year (365.2425 days) expressed in seconds.
    SECONDS_PER_YEAR = 31556952
    # Spellings rewritten so the three team columns can be compared directly.
    TEAM_ABBR_FIXES = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
    # Compass points kept in WindDirection; anything else becomes NaN.
    WIND_DIRECTIONS = (
        'NE', 'SW', 'S', 'NW', 'WSW', 'SE', 'W', 'N', 'NNE', 'WNW', 'SSW',
        'NNW', 'SSE', 'E', 'ENE', 'ESE',
    )

    def __init__(self):
        super().__init__()
        self.target = 'Yards'

    def fit(self, X, y=None):
        """Nothing is learned from the data; present for the transformer API."""
        return self

    def transform(self, X):
        """Return a cleaned, one-row-per-play version of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw table with one row per player per play. It is not modified;
            all work happens on a copy.

        Returns
        -------
        pandas.DataFrame
            Play-level columns followed by the player-level columns repeated
            once per player (``<column>_<player index>``).

        Raises
        ------
        ValueError
            If the rows cannot be grouped into complete plays (see
            ``_flatten_player_vars``).
        """
        # The helpers below assign columns in place, so shield the caller's
        # frame from those side effects.
        X = X.copy()
        X = self._correct_team_abbreviations(X)
        X = self._encode_player_height(X)
        X = self._process_time_variables(X)
        X = self._fix_wind_variables(X)
        X = self._misc_engineering(X)
        X = self._flatten_player_vars(X)
        return X

    def _correct_team_abbreviations(self, X):
        """Unify team abbreviations and flag plays where the home team has the ball.

        Only the spellings listed in ``TEAM_ABBR_FIXES`` are rewritten; every
        other value (including missing ones) is left as it is. The mapping no
        longer depends on a notebook-level ``train`` variable.
        """
        for col in ('PossessionTeam', 'HomeTeamAbbr', 'VisitorTeamAbbr'):
            # map() yields NaN for values without a fix; fall back to the original.
            X[col] = X[col].map(self.TEAM_ABBR_FIXES).fillna(X[col])

        # Column name (including its spelling) is kept for compatibility.
        X['HomePossesion'] = X['PossessionTeam'] == X['HomeTeamAbbr']
        return X

    def _encode_player_height(self, X):
        """Convert ``PlayerHeight`` from ``'<feet>-<inches>'`` text to inches.

        Values that are missing or do not follow that pattern become NaN
        instead of raising.
        """
        parts = X['PlayerHeight'].str.extract(r'^\s*(\d+)-(\d+)\s*$')
        X['PlayerHeight'] = 12 * pd.to_numeric(parts[0]) + pd.to_numeric(parts[1])
        return X

    def _process_time_variables(self, X):
        """Derive numeric time features and drop the raw timestamp columns.

        Adds ``TimeUntilHandoff`` (seconds) and ``PlayerAge`` (years), turns
        ``GameClock`` into a single number, and removes ``TimeHandoff``,
        ``TimeSnap`` and ``PlayerBirthDate``.
        """
        time_cols = ['TimeHandoff', 'TimeSnap', 'PlayerBirthDate']
        for col in time_cols:
            X[col] = pd.to_datetime(X[col], utc=True, infer_datetime_format=True)

        # Handoff minus snap, so the value is non-negative whenever the handoff
        # follows the snap (the original subtracted in the opposite order).
        X['TimeUntilHandoff'] = (X['TimeHandoff'] - X['TimeSnap']).dt.total_seconds()

        age = X['TimeSnap'] - X['PlayerBirthDate']
        X['PlayerAge'] = age.dt.total_seconds() / self.SECONDS_PER_YEAR

        # The three colon-separated fields are weighted 3600 / 60 / 1. The
        # original used 360 for the first field, which made the encoding
        # non-monotonic (e.g. 14:59 ranked above 15:00).
        # NOTE(review): if GameClock is really MM:SS:00 this is 60x the number
        # of seconds -- confirm the column format.
        clock = X['GameClock']
        X['GameClock'] = (
            3600 * clock.str[:2].astype(int)
            + 60 * clock.str[3:5].astype(int)
            + clock.str[6:8].astype(int)
        )

        return X.drop(columns=time_cols)

    def _fix_wind_variables(self, X):
        """Parse free-text wind readings into clean values.

        ``WindSpeed`` becomes a float: a single number is kept, a range
        (``'a-b'``) or ``'a gusts up to b'`` is replaced by the mean of the two
        numbers, and anything else becomes NaN. Previously the averaged values
        were formatted as ``'14.0'`` and then rejected by ``int()``, so every
        range silently turned into NaN.

        ``WindDirection`` is upper-cased, stripped of ``'FROM '`` and ``'-'``,
        and set to NaN unless it is one of ``WIND_DIRECTIONS``.
        """
        number = r'(\d+(?:\.\d+)?)'
        speed_text = (
            X['WindSpeed']
            .astype(str)  # tolerate a column that was read as numeric
            .str.lower()
            .str.replace('mph', '', regex=False)
            .str.strip()
        )
        # Column 0 is the (lower) reading, column 1 the optional upper reading.
        readings = speed_text.str.extract(
            '^' + number + r'(?:\s*(?:-|gusts up to)\s*' + number + ')?$'
        )
        # Row-wise mean skips the missing upper reading and is NaN when
        # nothing matched.
        X['WindSpeed'] = readings.astype(float).mean(axis=1)

        direction = (
            X['WindDirection']
            .str.upper()
            .str.replace('FROM ', '', regex=False)
            .str.replace('-', '', regex=False)
        )
        X['WindDirection'] = direction.where(direction.isin(self.WIND_DIRECTIONS), np.nan)
        return X

    def _misc_engineering(self, X):
        """Binarise ``Team`` / ``PlayDirection`` and flag the rusher row."""
        # Set binary variables
        X['Team'] = X['Team'] == 'away'
        # NOTE(review): the original compared PlayDirection against 'away',
        # which is a Team value; 'left'/'right' is assumed here -- confirm
        # against the data.
        X['PlayDirection'] = X['PlayDirection'] == 'left'
        # If a player is the rusher
        X['IsRusher'] = X['NflId'] == X['NflIdRusher']
        return X.drop(columns=['NflId', 'NflIdRusher'])

    def _flatten_player_vars(self, X):
        """Collapse each block of ``PLAYERS_PER_PLAY`` rows into one row.

        Player-level columns are spread over ``<column>_<player index>``
        columns; the remaining columns are de-duplicated to one row per play.

        Raises
        ------
        ValueError
            If the row count is not a multiple of ``PLAYERS_PER_PLAY`` or if
            de-duplicating the play-level columns does not leave exactly one
            row per play (which would misalign the two halves).
        """
        n_players = self.PLAYERS_PER_PLAY
        if len(X) % n_players != 0:
            raise ValueError(
                'Expected a multiple of %d rows, got %d.' % (n_players, len(X))
            )
        n_plays = len(X) // n_players

        player_cols = self._get_player_cols(X)
        # Row-major reshape: all columns of player 0, then player 1, ...
        wide_values = X[player_cols].to_numpy().reshape(
            n_plays, len(player_cols) * n_players
        )
        wide_names = [
            col + '_' + str(player_num)
            for player_num in range(n_players)
            for col in player_cols
        ]
        # The mixed-type array is object dtype; restore proper column dtypes.
        player_data = pd.DataFrame(data=wide_values, columns=wide_names).infer_objects()

        play_data = X.drop(columns=player_cols).drop_duplicates().reset_index(drop=True)
        if len(play_data) != n_plays:
            raise ValueError(
                'Expected %d play-level rows after de-duplication, got %d; '
                'the player-level columns were probably detected incorrectly.'
                % (n_plays, len(play_data))
            )
        return pd.concat([play_data, player_data], axis=1)

    def _get_player_cols(self, X):
        """Return the columns whose value changes within the first play.

        A column counts as player-level when the first ``PLAYERS_PER_PLAY``
        rows hold more than one distinct value (NaN counted as a value), so a
        column that is entirely missing in that play is treated as play-level.
        """
        first_play = X.iloc[:self.PLAYERS_PER_PLAY]
        distinct = first_play.nunique(dropna=False)
        return distinct.index[distinct > 1].tolist()

In [6]:
# Build the preprocessor and run it. `train` is rebound to the transformed
# frame here, so the raw table is no longer reachable under this name.
processor = Preprocessor()
train = processor.fit_transform(train)

In [7]:
# (rows, columns) of the frame currently bound to `train`.
train.shape

(23171, 384)