In [1]:
import numpy as np
import pandas as pd
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 125)

In [2]:
data = pd.read_csv("data/Sample_PFF_Data.csv")

In [3]:
dataset = data.copy(deep=True)

In [4]:
# should Shotgun and Pistol be NaN for run plays and 0 for passes where no shotgun? Or just zero fine?
dataset.SHOTGUN.where(dataset.SHOTGUN.isna(),1, inplace=True)
dataset.SHOTGUN.fillna(0, inplace=True)

dataset.PISTOL.where(dataset.PISTOL.isna(),1, inplace=True)
dataset.PISTOL.fillna(0, inplace=True)

dataset['MOFO_PLAYED'] = dataset.MOFOCPLAYED.replace({'O': 1, 'C': 0})
dataset['MOFO_SHOWN'] = dataset.MOFOCPLAYED.replace({'O': 1, 'C': 0})


In [5]:
# convert string time (2:00) into seconds as int (120)
def convert_time(time_str):
    return int(time_str[0:2])*60 + int(time_str[3:])
# remove * and +Q to simplify features
# decided to group all 3-RB sets together since had similar Run vs Pass rates (and all had low sample sizes)
def convert_off_personnel(personnel_str):
    new_alignment = personnel_str[0:2]
    if new_alignment[0] == '3':
        new_alignment = '3+'
    if new_alignment == 'Un':
        new_alignment = np.nan
    return new_alignment
# just return nans for values like (10 men, X-X-X)
def convert_def_personnel(personnel_str):
    try:
        num_lineman = int(personnel_str[0])
        num_linebackers = int(personnel_str[2])
        num_defensivebacks = int(personnel_str[4])
    except:
        num_lineman = np.nan 
        num_linebackers = np.nan 
        num_defensivebacks = np.nan
    return pd.Series([num_lineman, num_linebackers, num_defensivebacks])

In [6]:
dataset['CLOCK_INT'] = dataset['CLOCK'].apply(convert_time)

In [7]:
dataset.QUARTER = dataset.QUARTER.astype("category")
dataset.DOWN = dataset.DOWN.astype("category")
dataset.OFFTIMEOUTSREMAINING = dataset.OFFTIMEOUTSREMAINING.astype("category")
dataset.DEFTIMEOUTSREMAINING = dataset.DEFTIMEOUTSREMAINING.astype("category")
dataset.HASH = dataset.HASH.astype("category")

In [8]:
dataset = dataset.loc[(dataset.RUNPASS == 'P') | (dataset.RUNPASS == 'R') ]
dataset = dataset.query("DOWN != 0").reset_index(drop=True)
dataset.RUNPASS = dataset.RUNPASS.astype("category")

In [9]:
dataset['OFFPERSONNEL_SIMPLIFIED'] = dataset['OFFPERSONNELBASIC'].apply(convert_off_personnel)
dataset[['DEFPERSONNEL_num_linemen', 'DEFPERSONNEL_num_linebackers', 'DEFPERSONNEL_num_defensivebacks']] = dataset['DEFPERSONNEL'].apply(convert_def_personnel)

In [10]:
dataset.head()

Unnamed: 0,2MINUTE,CENTERPASSBLOCKDIRECTION,CLOCK,DEFENSIVELINESHIFT,DEFPERSONNEL,DEFSCORE,DEFSUBSTITUTIONS,DefTeam,DEFTIMEOUTSREMAINING,DISTANCE,DOWN,DRIVE,DRIVEENDEVENT,DRIVEENDFIELDPOSITION,DRIVEENDPLAYNUMBER,DRIVEPLAY,DRIVESTARTEVENT,DRIVESTARTFIELDPOSITION,DROPBACKDEPTH,DROPBACKTYPE,FIELDPOSITION,FORCEDFUMBLE,FUMBLE,FUMBLELOST,GAINLOSS,GAINLOSSNET,GAMEID,GARBAGETIME,HASH,HASHDEF,HIT,HURRY,INTERCEPTION,KICKYARDS,MOFOCPLAYED,MOFOCSHOWN,NEXTPLAYID,NOHUDDLE,NOPLAY,OFFFORMATIONUNBALANCED,OFFPERSONNELBASIC,OFFSCORE,OFFSUBSTITUTIONS,OffTeam,OFFTIMEOUTSREMAINING,OPERATIONTIME,OPTION,PASSBREAKUP,PASSDEPTH,PASSDIRECTION,PASSRECEIVERPOSITIONTARGET,PASSRESULT,PASSRUSHRESULT,PASSWIDTH,PENALTY,PENALTYYARDS,PISTOL,PLAYACTION,PLAYACTIONFAKE,PLAYCLOCK,PLAYENDFIELDPOSITION,PLAYID,POAACTUAL,POAINTENDED,PREVIOUSPFFPLAYID,PUMPFAKE,QBMOVEDOFFSPOT,QBPRESSURE,QBRESET,QUARTER,RBDIRECTION,RBSINBACKFIELD,RETURNYARDS,RUNPASS,RUNPASSOPTION,SACK,SCORE,SCOREDIFFERENTIAL,SCREEN,SHIFTMOTION,SHOTGUN,SNAPTIME,SORTORDER,SPOTLEFT,STUNT,TACKLE,TEALIGNMENT,TEMPO,TIMETOPRESSURE,TIMETOTHROW,TOUCHDOWN,TRICKLOOK,TRICKPLAY,WEEK,YARDSAFTERCATCH,YARDSAFTERCONTACT,MOFO_PLAYED,MOFO_SHOWN,CLOCK_INT,OFFPERSONNEL_SIMPLIFIED,DEFPERSONNEL_num_linemen,DEFPERSONNEL_num_linebackers,DEFPERSONNEL_num_defensivebacks
0,0,L,14:54,0,4-2-5,0,1,Team_3,3,10,1,1.0,FIELD GOAL,20.0,14.0,1.0,KICKOFF - RETURN,-39.0,8.0,SD,-39,0,0,0,,0,18548,0,R,L,0,0,0,,C,C,3528156.0,0,0,0,11,0,1,Team_23,3,,0,0,33.0,L,LWR,INCOMPLETE,,1.0,0,,0,1,1,13.0,-39,3528152,,,3528149.0,0,0,0,0,1,,1.0,,P,0,0,0.0,0,0,1,1,,2,29,0,0,R,0,,2.7,0,0,0,2,,,0.0,0.0,894,11,4.0,2.0,5.0
1,0,L,14:49,0,4-2-5,0,0,Team_3,3,10,2,1.0,FIELD GOAL,20.0,14.0,2.0,KICKOFF - RETURN,-39.0,7.0,SD,-39,0,0,0,10.0,10,18548,0,R,L,0,0,0,,C,C,3528157.0,0,0,0,11,0,1,Team_23,3,,0,0,5.0,M,SRWR,COMPLETE,,32.0,0,,0,0,0,22.0,-49,3528156,,,3528152.0,0,0,0,0,1,,,,P,0,0,0.0,0,0,0,1,,3,29,1,1,,0,,1.7,0,0,0,2,5.0,4.0,0.0,0.0,889,11,4.0,2.0,5.0
2,0,,14:09,0,4-2-5,0,0,Team_3,3,10,1,1.0,FIELD GOAL,20.0,14.0,3.0,KICKOFF - RETURN,-39.0,,,-49,0,0,0,2.0,2,18548,0,C,C,0,0,0,,C,O,3528160.0,0,0,0,11,0,1,Team_23,3,,0,0,,,,,,,0,,0,0,0,6.0,49,3528157,ML,ML,3528156.0,0,0,0,0,1,L,1.0,,R,0,0,0.0,0,0,0,1,,4,28,0,1,R,0,,,0,0,0,2,,2.0,0.0,0.0,849,11,4.0,2.0,5.0
3,0,R,13:37,0,4-2-5,0,0,Team_3,3,8,2,1.0,FIELD GOAL,20.0,14.0,4.0,KICKOFF - RETURN,-39.0,8.0,SD,49,0,0,0,3.0,3,18548,0,L,R,0,1,0,,O,O,3528162.0,0,0,0,12,0,1,Team_23,3,,0,0,3.0,M,RWR,COMPLETE,HURRY,33.0,0,,0,0,0,,46,3528160,,,3528157.0,0,0,1,0,1,,1.0,,P,0,0,0.0,0,0,0,1,,5,24,0,1,L;L,0,2.6,2.6,0,0,0,2,0.0,0.0,1.0,1.0,817,12,4.0,2.0,5.0
4,0,C,12:56,0,4-2-5,0,1,Team_3,3,5,3,1.0,FIELD GOAL,20.0,14.0,5.0,KICKOFF - RETURN,-39.0,8.0,SD,46,0,0,0,7.0,7,18548,0,R,L,0,1,0,,C,C,3528164.0,0,0,0,11,0,1,Team_23,3,,0,0,0.0,X,,RUN,HURRY,,0,,0,0,0,5.0,39,3528162,QB SCRAMBLE,QB SCRAMBLE,3528160.0,0,1,1,0,1,L,1.0,,P,0,0,0.0,0,0,1,1,,6,29,0,1,,0,2.2,4.0,0,0,0,2,,2.0,0.0,0.0,776,11,4.0,2.0,5.0


In [11]:
misc_col_names = [
    'historical_yards_per_carry',
    'historical_yards_per_pass_attempt', 
    'historical_yards_allowed_per_carry', 
    'historical_yards_allowed_per_pass_attempt'
]

col_names_for_numeric_previous = [
'FORCEDFUMBLE',
'HIT',
'HURRY',
'GAINLOSSNET',
'INTERCEPTION',
'NOHUDDLE',
'PENALTY',
'PASSDEPTH',
'PASSBREAKUP',

'DROPBACKDEPTH',

'MOFO_PLAYED',
'MOFO_SHOWN',
'DEFPERSONNEL_num_linemen',
'DEFPERSONNEL_num_linebackers',
'DEFPERSONNEL_num_defensivebacks',

'PISTOL',
'PLAYACTION',
'SACK',
'SCREEN',
'SHIFTMOTION',
'SHOTGUN',

'QBMOVEDOFFSPOT',
'QBPRESSURE',
'TIMETOPRESSURE',
'TIMETOTHROW',
'YARDSAFTERCATCH',
'YARDSAFTERCONTACT',
]

In [12]:
prev_names = ['prev_' + x for x in col_names_for_numeric_previous]
game_prev_names = ['game_prev_' + x for x in col_names_for_numeric_previous] # instead of just previous, get cumulative mean from all previous plays in game
historical_prev_names = ['historical_prev_' + x for x in col_names_for_numeric_previous] # get cumulative mean from all previous plays in all previous games

In [13]:
def rolling_mean_func(g, K=10):
    return g.expanding(min_periods=K).mean().shift(1)

In [14]:
# need to sort for adding calculated columns back to original dataframe
dataset = dataset.sort_values(["OffTeam", "GAMEID", "PLAYID"]).reset_index(drop=True)
dataset[prev_names] = dataset.groupby(["OffTeam", "GAMEID", "DRIVE"], sort=False)[col_names_for_numeric_previous].shift(1)  # don't need to sort since dataset already sorted
# shift(1) to exlcude current row from mean calculations
dataset[game_prev_names] = dataset.groupby(["OffTeam", "GAMEID"], sort=False)[col_names_for_numeric_previous].apply(rolling_mean_func).reset_index(drop=True)
dataset[historical_prev_names] = dataset.groupby(["OffTeam"], sort=False)[col_names_for_numeric_previous].apply(rolling_mean_func, K=100).reset_index(drop=True)


In [15]:
dataset['running_gain'] = dataset.RUNPASS.map({"R":1, "P":np.nan})*dataset.GAINLOSSNET
dataset['passing_gain'] = dataset.RUNPASS.map({"P":1, "R":np.nan})*dataset.GAINLOSSNET
dataset['historical_yards_per_carry'] = dataset.groupby(["OffTeam"])['running_gain'].apply(rolling_mean_func, K=100).reset_index(drop=True)
dataset['historical_yards_per_pass_attempt'] = dataset.groupby(["OffTeam"])['passing_gain'].apply(rolling_mean_func, K=100).reset_index(drop=True)

In [16]:
dataset = dataset.sort_values(["DefTeam", "GAMEID", "PLAYID"]).reset_index(drop=True)
dataset['historical_yards_allowed_per_carry'] = dataset.groupby(["DefTeam"])['running_gain'].apply(rolling_mean_func, K=100).reset_index(drop=True)
dataset['historical_yards_allowed_per_pass_attempt'] = dataset.groupby(["DefTeam"])['passing_gain'].apply(rolling_mean_func, K=100).reset_index(drop=True)

In [17]:
col_names_for_categorical_previous = [
    'OFFPERSONNEL_SIMPLIFIED', 
    'CENTERPASSBLOCKDIRECTION'
]
for col_name in col_names_for_categorical_previous:
    dataset[col_name] = dataset[col_name].astype("category")

prev_categorical_names = ['prev_' + x for x in col_names_for_categorical_previous]
# create previous play categorical
dataset[prev_categorical_names] = dataset.groupby(["OffTeam","GAMEID", "DRIVE"])[col_names_for_categorical_previous].shift(1)

In [18]:
# get a list of all the names of the columns that are dummy variables for the categorical features used
dummy_categorical_names = []
for col_name in col_names_for_categorical_previous:
    unique_vals = dataset[col_name].unique()
    for val in unique_vals:
        dummy_column_name = f"{col_name}_{val}"
        dummy_categorical_names.append(dummy_column_name)
game_prev_dummy_categorical_names = ['game_prev_' + x for x in dummy_categorical_names]

dummy_dataset = pd.get_dummies(dataset, prefix_sep="_", dummy_na=True, columns=col_names_for_categorical_previous, drop_first=False)
dummy_dataset = dummy_dataset.sort_values(["OffTeam", "GAMEID", "PLAYID"]).reset_index(drop=True)


In [19]:
dummy_dataset[game_prev_dummy_categorical_names] = dummy_dataset.groupby(["OffTeam", "GAMEID"])[dummy_categorical_names].apply(rolling_mean_func, K=10).reset_index(drop=True)

In [20]:
historical_prev_dummy_categorical_names = ['historical_prev_' + x for x in dummy_categorical_names]
dummy_dataset[historical_prev_dummy_categorical_names] = dummy_dataset.groupby(["OffTeam"])[dummy_categorical_names].apply(rolling_mean_func, K=100).reset_index(drop=True)

In [21]:
base_feature_names = [
    # 'DefTeam',
    # 'OffTeam',
    'WEEK',
    'QUARTER',
    'SCOREDIFFERENTIAL',
    'SCORE',
    'DISTANCE',
    'DOWN',
    'FIELDPOSITION',
    'DRIVE',
    'DRIVEPLAY',
    'OFFTIMEOUTSREMAINING',
    'DEFTIMEOUTSREMAINING',
    'HASH',
    'SPOTLEFT',
    '2MINUTE', 
    'CLOCK_INT']
engineered_features_names = prev_names + game_prev_names + historical_prev_names + \
                            prev_categorical_names + game_prev_dummy_categorical_names + historical_prev_dummy_categorical_names + \
                            misc_col_names
target_names = ['RUNPASS']

In [23]:
dummy_dataset = dummy_dataset.sort_values(["GAMEID", "PLAYID"]).reset_index(drop=True)    # so aligns with indices used in model fitting 

In [24]:
base_dataset = dummy_dataset[['GAMEID', 'PLAYID'] + base_feature_names + target_names]
#base_dataset.to_pickle("datasets/base_dataset.pkl")   # use pickle to keep data dtypes

In [25]:
non_categorical_dataset = dummy_dataset[['GAMEID', 'PLAYID'] + base_feature_names + prev_names + game_prev_names + historical_prev_names + \
                                  misc_col_names + target_names]
#non_categorical_dataset.to_pickle("datasets/non_categorical_dataset.pkl")   # use pickle to keep data dtypes

In [26]:
dummy_dataset_filtered = dummy_dataset[['GAMEID', 'PLAYID'] + base_feature_names + engineered_features_names + target_names]
#dummy_dataset_filtered.to_pickle("datasets/dummy_dataset.pkl")   # use pickle to keep data dtypes

CategoricalDtype(categories=[0, 1, 2, 3], ordered=False)

In [32]:
correlation_dataset = dummy_dataset_filtered.copy(deep=True)
correlation_dataset.OFFTIMEOUTSREMAINING = correlation_dataset.OFFTIMEOUTSREMAINING.astype(float)
correlation_dataset.DEFTIMEOUTSREMAINING = correlation_dataset.DEFTIMEOUTSREMAINING.astype(float)
correlation_dataset.QUARTER = correlation_dataset.QUARTER.astype(float)
corr_df = dummy_dataset_filtered.corr(numeric_only=True)
corr_df.style.background_gradient(cmap='coolwarm')

Unnamed: 0,GAMEID,PLAYID,WEEK,SCOREDIFFERENTIAL,SCORE,DISTANCE,FIELDPOSITION,DRIVE,DRIVEPLAY,SPOTLEFT,2MINUTE,CLOCK_INT,prev_FORCEDFUMBLE,prev_HIT,prev_HURRY,prev_GAINLOSSNET,prev_INTERCEPTION,prev_NOHUDDLE,prev_PENALTY,prev_PASSDEPTH,prev_PASSBREAKUP,prev_DROPBACKDEPTH,prev_MOFO_PLAYED,prev_MOFO_SHOWN,prev_DEFPERSONNEL_num_linemen,prev_DEFPERSONNEL_num_linebackers,prev_DEFPERSONNEL_num_defensivebacks,prev_PISTOL,prev_PLAYACTION,prev_SACK,prev_SCREEN,prev_SHIFTMOTION,prev_SHOTGUN,prev_QBMOVEDOFFSPOT,prev_QBPRESSURE,prev_TIMETOPRESSURE,prev_TIMETOTHROW,prev_YARDSAFTERCATCH,prev_YARDSAFTERCONTACT,game_prev_FORCEDFUMBLE,game_prev_HIT,game_prev_HURRY,game_prev_GAINLOSSNET,game_prev_INTERCEPTION,game_prev_NOHUDDLE,game_prev_PENALTY,game_prev_PASSDEPTH,game_prev_PASSBREAKUP,game_prev_DROPBACKDEPTH,game_prev_MOFO_PLAYED,game_prev_MOFO_SHOWN,game_prev_DEFPERSONNEL_num_linemen,game_prev_DEFPERSONNEL_num_linebackers,game_prev_DEFPERSONNEL_num_defensivebacks,game_prev_PISTOL,game_prev_PLAYACTION,game_prev_SACK,game_prev_SCREEN,game_prev_SHIFTMOTION,game_prev_SHOTGUN,game_prev_QBMOVEDOFFSPOT,game_prev_QBPRESSURE,game_prev_TIMETOPRESSURE,game_prev_TIMETOTHROW,game_prev_YARDSAFTERCATCH,game_prev_YARDSAFTERCONTACT,historical_prev_FORCEDFUMBLE,historical_prev_HIT,historical_prev_HURRY,historical_prev_GAINLOSSNET,historical_prev_INTERCEPTION,historical_prev_NOHUDDLE,historical_prev_PENALTY,historical_prev_PASSDEPTH,historical_prev_PASSBREAKUP,historical_prev_DROPBACKDEPTH,historical_prev_MOFO_PLAYED,historical_prev_MOFO_SHOWN,historical_prev_DEFPERSONNEL_num_linemen,historical_prev_DEFPERSONNEL_num_linebackers,historical_prev_DEFPERSONNEL_num_defensivebacks,historical_prev_PISTOL,historical_prev_PLAYACTION,historical_prev_SACK,historical_prev_SCREEN,historical_prev_SHIFTMOTION,historical_prev_SHOTGUN,historical_prev_QBMOVEDOFFSPOT,historical_prev_QBPRESSURE,historical_prev_TIMETOPRESSURE,historical_prev_TIMETOTHROW,historical_prev_YARDSAFTERCATCH,historical_prev_YARDSAFTERCONTACT,game_prev_OFFPERSONNEL_SIMPLIFIED_11,game_prev_OFFPERSONNEL_SIMPLIFIED_21,game_prev_OFFPERSONNEL_SIMPLIFIED_12,game_prev_OFFPERSONNEL_SIMPLIFIED_22,game_prev_OFFPERSONNEL_SIMPLIFIED_01,game_prev_OFFPERSONNEL_SIMPLIFIED_02,game_prev_OFFPERSONNEL_SIMPLIFIED_13,game_prev_OFFPERSONNEL_SIMPLIFIED_10,game_prev_OFFPERSONNEL_SIMPLIFIED_20,game_prev_OFFPERSONNEL_SIMPLIFIED_23,game_prev_OFFPERSONNEL_SIMPLIFIED_3+,game_prev_OFFPERSONNEL_SIMPLIFIED_03,game_prev_OFFPERSONNEL_SIMPLIFIED_nan,game_prev_OFFPERSONNEL_SIMPLIFIED_00,game_prev_OFFPERSONNEL_SIMPLIFIED_14,game_prev_CENTERPASSBLOCKDIRECTION_L,game_prev_CENTERPASSBLOCKDIRECTION_C,game_prev_CENTERPASSBLOCKDIRECTION_R,game_prev_CENTERPASSBLOCKDIRECTION_nan,historical_prev_OFFPERSONNEL_SIMPLIFIED_11,historical_prev_OFFPERSONNEL_SIMPLIFIED_21,historical_prev_OFFPERSONNEL_SIMPLIFIED_12,historical_prev_OFFPERSONNEL_SIMPLIFIED_22,historical_prev_OFFPERSONNEL_SIMPLIFIED_01,historical_prev_OFFPERSONNEL_SIMPLIFIED_02,historical_prev_OFFPERSONNEL_SIMPLIFIED_13,historical_prev_OFFPERSONNEL_SIMPLIFIED_10,historical_prev_OFFPERSONNEL_SIMPLIFIED_20,historical_prev_OFFPERSONNEL_SIMPLIFIED_23,historical_prev_OFFPERSONNEL_SIMPLIFIED_3+,historical_prev_OFFPERSONNEL_SIMPLIFIED_03,historical_prev_OFFPERSONNEL_SIMPLIFIED_nan,historical_prev_OFFPERSONNEL_SIMPLIFIED_00,historical_prev_OFFPERSONNEL_SIMPLIFIED_14,historical_prev_CENTERPASSBLOCKDIRECTION_L,historical_prev_CENTERPASSBLOCKDIRECTION_C,historical_prev_CENTERPASSBLOCKDIRECTION_R,historical_prev_CENTERPASSBLOCKDIRECTION_nan,historical_yards_per_carry,historical_yards_per_pass_attempt,historical_yards_allowed_per_carry,historical_yards_allowed_per_pass_attempt
GAMEID,1.0,0.976736,0.088143,0.004198,-0.086537,0.009849,-0.026556,0.008861,-0.01892,0.004633,-0.049773,0.003623,0.015362,-0.016246,0.015905,-0.002934,0.006625,0.018598,0.009029,-0.010021,-0.003347,0.009573,0.012565,0.012565,-0.108781,0.113732,-0.012975,-0.036771,0.020606,0.001378,0.025746,-0.0005,-0.009472,-0.002216,0.008686,0.009382,-0.006998,0.023769,0.010799,-0.041611,-0.035067,0.103792,-0.04884,0.051087,0.059589,0.082885,-0.073187,-0.015595,0.081474,0.0528,0.0528,-0.124464,0.12948,-0.020639,-0.066789,0.031216,0.0385,0.119044,-0.054719,-0.001787,0.055699,0.09161,0.019897,0.010119,0.053095,0.038695,0.043277,-0.033772,0.119998,-0.098881,0.006586,0.058416,0.022895,-0.078565,-0.053647,0.02059,0.022968,0.022968,-0.129321,0.087762,0.065203,-0.034308,-0.02969,0.036947,0.098413,-0.042282,0.034648,0.072089,0.090677,0.043233,0.024721,0.093837,0.025934,-0.027833,-0.053571,0.089791,-0.003622,0.058711,-0.101526,-0.015654,-0.014826,0.055761,0.021278,-0.051018,-0.00867,-0.042891,-0.022257,0.021939,0.061796,-0.036863,0.066888,-0.092411,0.016808,-0.050732,0.027404,-0.04332,0.078536,-0.032535,0.018869,-0.003542,0.100511,-0.049338,-0.00509,0.029604,-0.007404,-0.039605,0.009677,0.023754,0.040116,0.002313,-0.047045,-0.055975,-0.054576,-0.046971,-0.058677
PLAYID,0.976736,1.0,0.284066,0.004476,-0.082131,0.011752,-0.024562,0.012798,-0.019099,0.002859,-0.049496,0.003485,0.013809,-0.015456,0.014475,-0.006353,0.009313,0.0162,0.008189,-0.015305,-0.002384,0.006977,0.006502,0.006502,-0.106562,0.109541,-0.010062,-0.037975,0.015659,0.002548,0.026159,0.000125,-0.006628,-0.00646,0.008189,0.005805,-0.014389,0.028198,0.015825,-0.047432,-0.043612,0.096508,-0.069255,0.044734,0.050332,0.065828,-0.096191,-0.003961,0.074262,0.039499,0.039499,-0.123855,0.12514,-0.010931,-0.06496,0.005537,0.024779,0.129555,-0.058495,0.008958,0.035318,0.081505,0.024592,-0.016721,0.056557,0.056258,0.051407,-0.034752,0.119859,-0.103763,0.010013,0.0517,0.02472,-0.08021,-0.054589,0.032317,0.021197,0.021197,-0.13811,0.097375,0.06286,-0.038215,-0.025871,0.03093,0.115923,-0.03829,0.03079,0.064223,0.091265,0.038028,0.019936,0.112686,0.037142,-0.033122,-0.047674,0.081163,0.004611,0.076235,-0.08634,-0.025505,-0.001056,0.056624,0.016096,-0.060665,-0.003728,-0.043679,-0.019923,0.035709,0.056082,-0.039306,0.067227,-0.085514,0.014792,-0.049336,0.027964,-0.042399,0.09088,-0.03701,0.011846,-0.001407,0.107472,-0.049656,0.000875,0.024084,-0.01201,-0.044716,0.01874,0.021857,0.034799,0.009308,-0.047786,-0.058452,-0.061028,-0.066617,-0.069998
WEEK,0.088143,0.284066,1.0,0.001981,-0.010582,0.005362,-0.002196,0.009161,-0.012576,-0.003896,-0.008866,0.001791,-0.003131,-0.002272,-6.4e-05,-0.016679,0.007247,-0.001919,-0.005848,-0.028338,-0.000634,-0.01609,-0.025356,-0.025356,-0.016867,0.002692,0.019141,-0.009049,-0.017121,0.004661,0.008321,0.005932,0.016491,-0.017117,-5e-06,-0.026596,-0.036111,0.023135,0.020871,-0.044932,-0.040479,-0.004174,-0.106383,-0.027431,-0.015179,-0.080972,-0.123003,0.009225,-0.03904,-0.048378,-0.048378,-0.028106,0.00531,0.057351,0.005501,-0.105471,-0.055041,0.074341,-0.016169,0.0601,-0.078042,-0.016998,-0.016354,-0.13207,0.022613,0.067052,0.027847,-0.001261,-0.012464,-0.027869,0.02107,-0.028447,-0.020325,0.00827,-0.003139,0.036832,-0.019762,-0.019762,-0.037345,0.03896,-0.006393,-0.006674,0.000664,-0.026872,0.056187,0.019984,-0.011927,-0.042504,-0.00705,0.044028,-0.024537,0.04085,0.043292,-0.019755,0.006472,-0.027603,0.031326,0.091888,0.042056,-0.031672,0.048138,0.028915,-0.046694,-0.043694,0.040219,-0.037458,0.030034,0.053989,-0.007178,-0.017783,4.9e-05,0.020865,-0.015154,0.003183,0.006156,0.015459,0.040808,-0.00108,-0.018156,0.009055,0.017355,-0.009824,0.026994,-0.02305,-0.034046,-0.019325,0.040647,-0.03492,-0.009078,0.012769,0.021846,0.005022,-0.010499,-0.045888,-0.020417
SCOREDIFFERENTIAL,0.004198,0.004476,0.001981,1.0,-0.047486,0.010195,0.031457,-0.058384,-0.056195,-0.014531,0.053516,0.013658,0.002441,-0.046381,-0.090743,-0.011941,-0.018129,-0.158402,-0.003184,0.013546,-0.042806,-0.079997,-0.082416,-0.082416,0.06988,0.062153,-0.183069,0.052779,0.026019,-0.033214,-0.010012,0.102001,-0.148036,-0.058131,-0.104552,0.026396,-0.014775,0.037735,0.01617,-0.191585,-0.135579,-0.240252,0.388928,-0.30269,-0.081559,0.009727,0.106847,-0.174122,-0.04601,0.036993,0.036993,0.025906,0.027193,-0.136214,0.078208,0.05598,-0.229842,0.018245,0.124844,-0.101113,-0.119597,-0.297491,0.092994,-0.016586,0.10881,0.086283,-0.038441,-0.025486,-0.111978,0.176193,-0.091234,-0.065931,0.02281,0.015202,-0.097599,0.001962,0.132226,0.132226,0.001465,-0.012386,0.020439,0.077723,0.029246,-0.173686,0.103291,0.10877,0.039924,-0.021883,-0.123542,0.142008,0.006224,0.108307,0.013129,-0.08504,0.05695,0.019389,0.090536,-0.086954,-0.025352,0.005141,0.033417,0.002508,-0.002016,0.037205,-0.051401,0.017289,0.084971,0.006785,-0.070781,-0.07008,-0.167736,0.284931,-0.016448,0.028733,-0.066863,0.049729,-0.028835,-0.063217,-0.038564,0.09933,0.024293,-0.033474,0.050635,0.060948,0.027041,0.062141,0.001331,-0.028996,-0.013579,-0.094859,0.109442,0.074031,0.212032,0.073082,0.213796
SCORE,-0.086537,-0.082131,-0.010582,-0.047486,1.0,0.025206,-0.007641,0.694411,-0.021131,0.007636,0.260009,-0.201052,0.004062,0.022358,0.010578,-0.009154,0.008371,0.041099,0.008921,0.02356,0.024939,0.075709,0.026958,0.026958,-0.042008,-0.014865,0.078204,-0.036265,-0.070964,0.010033,-0.027438,-0.110004,0.039582,0.010892,0.019039,-0.008737,-0.004219,-0.03661,-0.011229,-0.001404,0.074617,-0.017383,0.122543,0.02275,0.028605,0.051103,0.100585,0.042541,0.07338,0.066789,0.066789,-0.035209,0.013559,0.053985,-0.036574,-0.113819,-0.030669,-0.077594,-0.105499,0.045681,-0.019351,0.003718,0.003224,-0.077389,0.041149,0.055744,-0.037793,-0.011006,-0.083375,0.130772,-0.084643,-0.023792,0.014264,0.002786,-0.013305,0.023174,-0.05078,-0.05078,0.017642,0.004616,-0.040024,-0.027582,-0.017515,-0.071337,-0.000554,0.034503,-0.045326,-0.057522,-0.073906,-0.017127,-0.056747,0.062126,0.086339,0.092265,-0.033565,-0.061023,0.015983,0.003914,-0.062945,-0.044618,-0.027566,-0.010537,0.026277,0.057592,-0.012619,0.044597,-0.01356,0.001114,0.047339,0.023642,0.010225,-0.072566,0.008178,0.007341,0.001648,0.001379,0.038657,0.027709,-0.012397,-0.04198,-0.025868,-0.046136,0.039172,0.059302,0.022242,-0.004115,-0.033714,0.00816,0.012553,-0.032076,0.011851,0.053331,0.110689,-0.034627,0.038037
DISTANCE,0.009849,0.011752,0.005362,0.010195,0.025206,1.0,-0.030558,0.025708,-0.125732,-0.010787,0.006118,-0.014874,0.037166,0.044612,0.090175,-0.136948,0.005918,-0.017286,0.238738,0.075284,0.041787,0.103891,-0.032979,-0.032979,-0.039437,-0.010566,0.06862,-0.001118,0.073843,0.272911,0.029313,-0.017293,0.030271,0.101513,0.166212,0.027643,0.106947,0.073569,0.003427,-0.000304,0.027124,0.029656,-0.057072,0.024052,-0.01006,0.071606,0.016551,0.010798,0.009213,-0.008148,-0.008148,-0.012659,0.008443,0.010188,0.006696,0.010011,0.031044,-0.007139,-0.004234,0.001837,0.015379,0.042598,0.023899,0.013503,-0.006509,0.001155,-0.01049,0.019341,0.000593,-0.014722,0.031314,-0.002928,0.02308,0.01619,-0.006726,-0.001266,-0.006406,-0.006406,-0.008214,0.012031,-0.007918,0.015794,-0.00282,0.021179,-0.022201,0.001038,0.003264,0.000152,0.01405,-0.018409,0.003751,-0.029675,-0.00488,-0.00764,0.015962,-0.004284,0.005241,0.003967,-0.002502,-0.005635,-0.00652,0.01486,-0.003446,-0.007324,0.000707,-0.00958,0.004632,-0.01761,0.018206,-0.006119,0.00743,-0.01908,-0.015018,0.012455,0.001793,0.022567,0.005579,-0.002118,-0.012468,-0.005111,0.012361,-0.00742,0.009006,-0.007312,0.007514,-0.002435,0.008391,0.000793,-0.004822,-0.013864,0.014148,-0.022379,-0.013882,0.001521,-0.028457
FIELDPOSITION,-0.026556,-0.024562,-0.002196,0.031457,-0.007641,-0.030558,1.0,0.012341,0.41594,0.000643,0.039143,-0.090761,-0.002334,0.012976,0.005039,0.117509,0.004409,0.046478,0.019942,0.079822,-0.007225,0.032206,-0.004513,-0.004513,-0.006795,-0.005538,0.017085,-0.000429,-0.010065,-0.020497,0.00959,-0.0397,0.033629,0.010189,0.003893,0.001999,0.024915,0.071522,0.049372,-0.020754,-0.028627,-0.024116,0.093392,-0.030804,0.036437,0.001883,0.021705,-0.023256,0.003266,-0.012679,-0.012679,0.007114,-0.001804,-0.013301,-0.004102,0.011973,-0.028714,-0.008829,-0.031095,0.000393,-0.015863,-0.03705,0.001631,0.007265,-0.016904,0.027633,0.013062,-0.019589,-0.015501,0.030109,-0.025123,0.02213,0.000309,-0.016952,-0.021741,-0.006682,0.025898,0.025898,-0.020237,0.012233,0.013116,-0.007829,0.019358,-0.003315,0.015751,-0.006203,0.007684,-0.008686,-0.019868,0.032352,0.00541,0.045849,0.00229,0.027519,-0.011781,-0.015424,-0.025169,0.00535,0.00323,-0.004841,0.006138,0.007495,-0.022857,0.012707,-0.02975,0.003049,0.006574,0.004087,0.003139,-0.018552,-0.007679,0.019511,0.020238,-0.007609,-0.005422,-0.021291,-0.018522,-0.005713,-0.01491,0.009483,0.008011,-0.001317,0.00108,-0.018693,0.002531,-0.003437,-0.012647,0.009021,0.005966,0.002715,-0.013079,0.031626,0.025046,0.005946,0.034965
DRIVE,0.008861,0.012798,0.009161,-0.058384,0.694411,0.025708,0.012341,1.0,-0.023579,0.005683,0.310761,-0.234992,0.016399,0.038232,0.033408,-0.003898,0.013773,0.083536,0.017189,0.028596,0.037705,0.081066,0.031266,0.031266,-0.035171,-0.027994,0.087487,-0.007447,-0.088078,0.013559,-0.032042,-0.159436,0.080863,0.020257,0.046596,-0.02386,0.000867,-0.042704,-0.010041,0.13273,0.165341,0.127981,-0.120944,0.105837,0.063452,0.080407,0.053795,0.152752,0.10605,0.037665,0.037665,-0.0407,-0.00704,0.121196,0.024169,-0.157858,0.109415,-0.033905,-0.224221,0.134384,0.085049,0.199695,-0.011475,-0.022356,-0.039905,-0.013807,0.02715,0.033465,0.033441,-0.030448,0.039177,0.005845,0.026096,0.026385,0.050101,0.005665,-0.053376,-0.053376,-0.015003,0.012826,0.002648,0.035706,-0.014764,0.017392,-0.015554,-0.013854,0.005653,-0.000967,0.048649,-0.041119,-0.028596,-0.005862,-0.015198,0.162127,-0.083984,-0.093751,-0.000832,0.021679,-0.0368,-0.097185,0.002268,0.021216,-0.017048,-0.035132,-0.047115,0.047619,-0.019691,0.028473,0.082101,0.041751,0.083699,-0.190646,0.012515,-0.017207,0.001579,0.012678,0.042433,0.029864,-0.046082,-0.00102,0.018976,-0.010381,-0.022042,-0.00467,0.023585,0.00265,-0.017213,0.029957,-0.007049,0.001943,-0.019292,-0.010342,-0.035364,0.01191,-0.056828
DRIVEPLAY,-0.01892,-0.019099,-0.012576,-0.056195,-0.021131,-0.125732,0.41594,-0.023579,1.0,-0.008393,0.016116,-0.124448,-0.008949,0.016968,0.033403,-0.075858,0.008913,0.050795,0.0482,-0.016939,0.025576,0.039,0.059774,0.059774,-0.003578,-0.040707,0.062494,-0.032733,-0.062645,0.0122,0.002389,-0.025512,0.088597,0.050324,0.036266,-0.041528,0.025972,-0.051254,-0.041218,-0.041014,-0.031062,-0.008626,0.045042,-0.037627,0.045228,0.015586,-0.009748,-0.032837,0.009958,0.019438,0.019438,0.020254,-0.014453,-0.013729,-0.022158,0.004873,-0.066563,-0.008505,0.015491,-0.003063,-0.010015,-0.042028,-0.007118,0.021334,-0.021051,0.001364,0.020989,-0.008691,-0.014445,0.017268,-0.030653,0.025023,0.00744,-0.039581,-0.023863,-0.005721,0.018042,0.018042,-0.003954,0.008823,-0.009476,-0.015695,0.020399,0.008034,0.024944,0.032042,0.006921,-0.007164,-0.014212,0.06015,0.01691,0.047439,0.023875,0.015762,0.012951,-0.019073,-0.004117,-0.015993,-0.010448,0.003465,0.004331,-0.027269,-0.019415,0.007248,-0.018408,-0.015768,-0.009195,-0.008283,-0.020588,0.000308,-0.005216,0.023776,0.009921,0.012729,-0.001442,-0.016557,-0.015618,-0.02385,-0.005546,-0.011641,-0.017871,0.005868,-0.010116,-0.008153,-0.009487,-0.013417,-0.004778,-0.008697,0.008596,0.026583,-0.021271,0.041065,-0.001269,-0.048293,0.007265
SPOTLEFT,0.004633,0.002859,-0.003896,-0.014531,0.007636,-0.010787,0.000643,0.005683,-0.008393,1.0,0.005104,-0.003981,0.000914,0.000143,0.012749,0.008388,-0.007705,0.002337,-0.007261,0.001621,-0.016862,-0.044965,0.007324,0.007324,-0.000779,0.000475,0.000388,0.008068,0.012628,0.000708,-0.013298,-0.012652,-0.006477,0.026529,0.010016,0.018438,0.028873,-0.018455,4e-06,-0.013649,0.014692,-0.007157,0.011058,0.009893,-0.012319,-0.002527,0.019429,0.013957,-0.011593,0.013521,0.013521,0.006964,-0.002207,-0.011965,0.013539,0.01118,-0.006971,-0.019019,-0.000883,-0.011998,-0.011727,-0.002176,0.005195,0.004801,0.000194,-0.000541,-0.008144,-0.000982,0.00943,-0.006409,0.016831,-0.003105,-0.00375,0.019558,0.027071,0.002316,-0.001445,-0.001445,-0.004113,0.010225,-0.011829,0.019182,-0.017982,0.016801,-0.037163,-0.002971,0.01328,0.005454,0.010192,-0.01981,0.001133,-0.023825,-0.001394,-0.028215,0.008338,0.018757,0.017793,0.010078,-0.00253,0.016275,-0.015515,-0.003291,0.019879,-0.005644,0.00032,-0.000769,0.003369,-0.009525,0.014191,0.002569,-0.020494,0.00483,-0.016296,0.001114,0.025286,0.01449,0.008072,0.00258,-0.009518,-0.014404,0.001365,-0.011261,-0.009587,0.007782,0.002866,0.005271,-0.009867,0.008832,-0.007806,0.00367,-0.004357,0.002398,-0.018305,-0.006495,0.016817


In [26]:
''' 
Tests for filtering 
'''
test_df = dummy_dataset.copy(deep=True)
test_df = test_df.sort_values(["OffTeam", "GAMEID", "PLAYID"]).reset_index(drop=True)


In [27]:
''' 
Want to make sure the indices are aligned, so check that rows match (e.g. gameid switches at 150)
'''
#test_col_names_for_numeric_previous = ['HIT','HURRY', 'GAINLOSSNET']

' \nWant to make sure the indices are aligned, so check that rows match (e.g. gameid switches at 150)\n'

In [28]:
# tests that we are calculating the rolling means correctly for a given team in a given game
# ASSUMES df is sorted how it should be sorted
def test_game_prev_calculation(df, game_id, off_team, col_name = 'HURRY'):
    slice = df.query("GAMEID == @game_id and OffTeam == @off_team")

    #### manually calculate some means #####
    hurry_first_10_play_avg = slice[col_name][0:10].mean()
    hurry_first_11_play_avg = slice[col_name][0:11].mean()
    hurry_first_12_play_avg = slice[col_name][0:12].mean()

    assert np.isnan(slice[f'game_prev_{col_name}'].iloc[0])
    assert np.isnan(slice[f'game_prev_{col_name}'].iloc[9])
    assert hurry_first_10_play_avg == slice[f'game_prev_{col_name}'].iloc[10]
    assert hurry_first_11_play_avg == slice[f'game_prev_{col_name}'].iloc[11]
    assert hurry_first_12_play_avg == slice[f'game_prev_{col_name}'].iloc[12]
    assert slice[col_name][0:-1].mean() == slice[f'game_prev_{col_name}'].iloc[-1]
    print(f"Successful")

In [29]:
test_game_prev_calculation(df=test_df, game_id=19752, off_team='Team_25')
test_game_prev_calculation(df=test_df, game_id=19752, off_team='Team_25', col_name='GAINLOSSNET')
test_game_prev_calculation(df=test_df, game_id=19752, off_team='Team_25', col_name='HIT')
test_game_prev_calculation(df=test_df, game_id=19746, off_team='Team_28')
test_game_prev_calculation(df=test_df, game_id=19663, off_team='Team_22')
# check categorical features
test_game_prev_calculation(df=test_df, game_id=18548, off_team='Team_3', col_name='CENTERPASSBLOCKDIRECTION_C')
test_game_prev_calculation(df=test_df, game_id=18548, off_team='Team_3', col_name='CENTERPASSBLOCKDIRECTION_L')
test_game_prev_calculation(df=test_df, game_id=18548, off_team='Team_3', col_name='CENTERPASSBLOCKDIRECTION_R')
test_game_prev_calculation(df=test_df, game_id=18548, off_team='Team_3', col_name='CENTERPASSBLOCKDIRECTION_nan')

Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful


In [40]:
# tests that we are calculating the rolling means correctly for a given team
# ASSUMES df is sorted how it should be sorted
def test_historical_prev_calculation(df, off_team, col_name = 'CENTERPASSBLOCKDIRECTION_C'):
    slice = df.query("OffTeam == @off_team")
    max_index = slice.shape[0]

    assert np.isnan(slice[f'historical_prev_{col_name}'].iloc[0])
    assert np.isnan(slice[f'historical_prev_{col_name}'].iloc[50])
    assert np.isnan(slice[f'historical_prev_{col_name}'].iloc[100-1])
    assert slice[col_name][0:100].mean() == slice[f'historical_prev_{col_name}'].iloc[100]
    assert slice[col_name][0:(max_index//2)].mean()  == slice[f'historical_prev_{col_name}'].iloc[(max_index//2)]
    assert slice[col_name][0:-1].mean() == slice[f'historical_prev_{col_name}'].iloc[-1]
    print(f"Successful")


In [41]:
test_historical_prev_calculation(df=test_df, off_team='Team_21', col_name='CENTERPASSBLOCKDIRECTION_C')
test_historical_prev_calculation(df=test_df, off_team='Team_21', col_name='CENTERPASSBLOCKDIRECTION_L')
test_historical_prev_calculation(df=test_df, off_team='Team_21', col_name='CENTERPASSBLOCKDIRECTION_R')
test_historical_prev_calculation(df=test_df, off_team='Team_21', col_name='CENTERPASSBLOCKDIRECTION_nan')

Successful
Successful
Successful
Successful


In [54]:
pd.get_dummies(dataset.groupby(["GAMEID", "OffTeam"]).get_group((18548, "Team_3"))[['OFFPERSONNEL_SIMPLIFIED']]).expanding(min_periods=10).mean()

Unnamed: 0,OFFPERSONNEL_SIMPLIFIED_11,OFFPERSONNEL_SIMPLIFIED_12,OFFPERSONNEL_SIMPLIFIED_13,OFFPERSONNEL_SIMPLIFIED_21,OFFPERSONNEL_SIMPLIFIED_22
92,,,,,
93,,,,,
94,,,,,
95,,,,,
96,,,,,
97,,,,,
98,,,,,
99,,,,,
100,,,,,
101,0.4,0.2,0.0,0.2,0.2


In [10]:
# for tendencies:
# DROPBACKTYPE, CENTERPASSBLOCKDIRECTION, DEFPERSONNEL, OFFPERSONNELBASIC, TEALIGNMENT, PASSRESULT

In [32]:
dataset.DROPBACKTYPE.unique()

array(['SD', nan, 'RR', 'SR', 'SL', 'RL', 'RSR', 'STP', 'RSL', 'FF',
       'RRL', 'WRP', 'RLR', 'BFP', 'RBP'], dtype=object)

In [30]:
dataset.PASSRESULT.unique()

array(['INCOMPLETE', 'COMPLETE', nan, 'RUN', 'SACK', 'THROWN AWAY',
       'SPIKE', 'HIT AS THREW', 'INTERCEPTION', 'BATTED PASS', 'LATERAL'],
      dtype=object)

In [28]:
''' 
Want to make sure the indices are aligned, so check that rows match (e.g. gameid switches at 150)
'''
dataset.groupby(["GAMEID", "OffTeam"])[col_names_for_numeric_previous].expanding(min_periods=10).mean().iloc[145:155]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FORCEDFUMBLE,HIT,HURRY,GAINLOSSNET,INTERCEPTION,NOHUDDLE,PENALTY,PASSDEPTH,PASSBREAKUP,DROPBACKDEPTH,MOFO_PLAYED,MOFO_SHOWN,PISTOL,PLAYACTION,SACK,SCREEN,SHIFTMOTION,SHOTGUN,QBMOVEDOFFSPOT,QBPRESSURE,TIMETOPRESSURE,TIMETOTHROW,YARDSAFTERCATCH,YARDSAFTERCONTACT
GAMEID,OffTeam,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
18548,Team_3,145,0.0,0.0,0.148148,7.314815,0.018519,0.018519,0.074074,8.958333,0.0,6.708333,0.377358,0.377358,0.0,0.185185,0.0,0.018519,0.703704,0.333333,0.166667,0.148148,2.836364,3.383333,4.75,3.659574
18548,Team_3,146,0.0,0.0,0.145455,7.2,0.018182,0.018182,0.072727,8.958333,0.0,6.708333,0.37037,0.37037,0.0,0.181818,0.0,0.018182,0.709091,0.327273,0.163636,0.145455,2.836364,3.383333,4.75,3.583333
18548,Team_3,147,0.0,0.0,0.142857,7.25,0.017857,0.017857,0.071429,8.958333,0.0,6.708333,0.363636,0.363636,0.0,0.178571,0.0,0.017857,0.696429,0.321429,0.160714,0.142857,2.836364,3.383333,4.75,3.714286
18548,Team_3,148,0.0,0.0,0.140351,7.140351,0.017544,0.017544,0.070175,8.958333,0.0,6.708333,0.375,0.375,0.0,0.175439,0.0,0.017544,0.684211,0.315789,0.157895,0.140351,2.836364,3.383333,4.75,3.66
18548,Team_3,149,0.0,0.0,0.137931,6.982759,0.017241,0.017241,0.068966,8.958333,0.0,6.708333,0.375,0.375,0.0,0.172414,0.0,0.017241,0.672414,0.310345,0.155172,0.137931,2.836364,3.383333,4.75,3.588235
18549,Team_12,150,,,,,,,,,,,,,,,,,,,,,,,,
18549,Team_12,151,,,,,,,,,,,,,,,,,,,,,,,,
18549,Team_12,152,,,,,,,,,,,,,,,,,,,,,,,,
18549,Team_12,153,,,,,,,,,,,,,,,,,,,,,,,,
18549,Team_12,154,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
dataset.iloc[145:155]

Unnamed: 0,2MINUTE,CENTERPASSBLOCKDIRECTION,CLOCK,DEFENSIVELINESHIFT,DEFPERSONNEL,DEFSCORE,DEFSUBSTITUTIONS,DefTeam,DEFTIMEOUTSREMAINING,DISTANCE,DOWN,DRIVE,DRIVEENDEVENT,DRIVEENDFIELDPOSITION,DRIVEENDPLAYNUMBER,DRIVEPLAY,DRIVESTARTEVENT,DRIVESTARTFIELDPOSITION,DROPBACKDEPTH,DROPBACKTYPE,FIELDPOSITION,FORCEDFUMBLE,FUMBLE,FUMBLELOST,GAINLOSS,GAINLOSSNET,GAMEID,GARBAGETIME,HASH,HASHDEF,HIT,HURRY,INTERCEPTION,KICKYARDS,MOFOCPLAYED,MOFOCSHOWN,NEXTPLAYID,NOHUDDLE,NOPLAY,OFFFORMATIONUNBALANCED,OFFPERSONNELBASIC,OFFSCORE,OFFSUBSTITUTIONS,OffTeam,OFFTIMEOUTSREMAINING,OPERATIONTIME,OPTION,PASSBREAKUP,PASSDEPTH,PASSDIRECTION,PASSRECEIVERPOSITIONTARGET,PASSRESULT,PASSRUSHRESULT,PASSWIDTH,PENALTY,PENALTYYARDS,PISTOL,PLAYACTION,PLAYACTIONFAKE,PLAYCLOCK,PLAYENDFIELDPOSITION,PLAYID,POAACTUAL,POAINTENDED,PREVIOUSPFFPLAYID,PUMPFAKE,QBMOVEDOFFSPOT,QBPRESSURE,QBRESET,QUARTER,RBDIRECTION,RBSINBACKFIELD,RETURNYARDS,RUNPASS,RUNPASSOPTION,SACK,SCORE,SCOREDIFFERENTIAL,SCREEN,SHIFTMOTION,SHOTGUN,SNAPTIME,SORTORDER,SPOTLEFT,STUNT,TACKLE,TEALIGNMENT,TEMPO,TIMETOPRESSURE,TIMETOTHROW,TOUCHDOWN,TRICKLOOK,TRICKPLAY,WEEK,YARDSAFTERCATCH,YARDSAFTERCONTACT,MOFO_PLAYED,MOFO_SHOWN,CLOCK_INT,prev_FORCEDFUMBLE,prev_HIT,prev_HURRY,prev_GAINLOSSNET,prev_INTERCEPTION,prev_NOHUDDLE,prev_PENALTY,prev_PASSDEPTH,prev_PASSBREAKUP,prev_DROPBACKDEPTH,prev_MOFO_PLAYED,prev_MOFO_SHOWN,prev_PISTOL,prev_PLAYACTION,prev_SACK,prev_SCREEN,prev_SHIFTMOTION,prev_SHOTGUN,prev_QBMOVEDOFFSPOT,prev_QBPRESSURE,prev_TIMETOPRESSURE,prev_TIMETOTHROW,prev_YARDSAFTERCATCH,prev_YARDSAFTERCONTACT,game_prev_FORCEDFUMBLE,game_prev_HIT,game_prev_HURRY,game_prev_GAINLOSSNET,game_prev_INTERCEPTION,game_prev_NOHUDDLE,game_prev_PENALTY,game_prev_PASSDEPTH,game_prev_PASSBREAKUP,game_prev_DROPBACKDEPTH,game_prev_MOFO_PLAYED,game_prev_MOFO_SHOWN,game_prev_PISTOL,game_prev_PLAYACTION,game_prev_SACK,game_prev_SCREEN,game_prev_SHIFTMOTION,game_prev_SHOTGUN,game_prev_QBMOVEDOFFSPOT,game_prev_QBPRESSURE,game_prev_TIMETOPRESSURE,game_prev_TIMETOTHROW,game_prev_YARDSAFTERCATCH,game_prev_YARDSAFTERCONTACT
145,4,,04:23,0,6-2-3,23,1,Team_23,3,10,1,9.0,TOUCHDOWN,0.0,6.0,3.0,KICKOFF -,-25.0,,,45,0,0,0,33.0,33,18548,0,L,R,0,0,0,,C,C,3528452.0,0,0,0,22,28,1,Team_3,2,,0,0,,,,,,,0,,0,0,0,1.0,12,3528449,RT,RT,3528448.0,0,0,0,0,4,R,2.0,,R,0,0,28.23,5,0,1,0,,155,24,0,1,L;R,0,,,0,0,0,2,,28.0,0.0,0.0,263,0.0,0.0,0.0,26.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,15.0,0.0,0.0,0.148148,7.314815,0.018519,0.018519,0.074074,8.958333,0.0,6.708333,0.377358,0.377358,0.0,0.185185,0.0,0.018519,0.703704,0.333333,0.166667,0.148148,2.836364,3.383333,4.75,3.659574
146,4,,04:10,0,5-2-4,23,1,Team_23,2,10,1,9.0,TOUCHDOWN,0.0,6.0,4.0,KICKOFF -,-25.0,,,12,0,0,0,1.0,1,18548,0,R,L,0,0,0,,C,C,3528454.0,0,0,0,12,28,1,Team_3,2,,0,0,,,,,,,0,,0,0,0,15.0,11,3528452,LT,LT,3528449.0,0,0,0,0,4,L,1.0,,R,0,0,28.23,5,0,1,0,,156,29,0,1,L;L;L,0,,,0,0,0,2,,0.0,0.0,0.0,250,0.0,0.0,0.0,33.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,28.0,0.0,0.0,0.145455,7.2,0.018182,0.018182,0.072727,8.958333,0.0,6.708333,0.37037,0.37037,0.0,0.181818,0.0,0.018182,0.709091,0.327273,0.163636,0.145455,2.836364,3.383333,4.75,3.583333
147,4,,04:06,0,4-2-5,23,1,Team_23,1,9,2,9.0,TOUCHDOWN,0.0,6.0,5.0,KICKOFF -,-25.0,,,11,0,0,0,10.0,10,18548,0,R,L,0,0,0,,C,C,3528458.0,0,0,0,11,28,1,Team_3,2,,0,0,,,,,,,0,,0,0,0,,1,3528454,RT,RT,3528452.0,0,0,0,0,4,R,1.0,,R,0,0,28.23,5,0,0,0,,157,29,0,1,R,0,,,0,0,0,2,,10.0,0.0,0.0,246,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.142857,7.25,0.017857,0.017857,0.071429,8.958333,0.0,6.708333,0.363636,0.363636,0.0,0.178571,0.0,0.017857,0.696429,0.321429,0.160714,0.142857,2.836364,3.383333,4.75,3.714286
148,4,,03:59,0,4-2-5,23,0,Team_23,0,1,1,9.0,TOUCHDOWN,0.0,6.0,6.0,KICKOFF -,-25.0,,,1,0,0,0,1.0,1,18548,0,R,L,0,0,0,,O,O,3528459.0,0,0,0,11,28,0,Team_3,2,,0,0,,,,,,,0,,0,0,0,,0,3528458,RE,RE,3528454.0,0,0,0,0,4,R,1.0,,R,0,0,28.23,5,0,0,0,,158,29,0,0,R,0,,,1,0,0,2,,1.0,1.0,1.0,239,0.0,0.0,0.0,10.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,10.0,0.0,0.0,0.140351,7.140351,0.017544,0.017544,0.070175,8.958333,0.0,6.708333,0.375,0.375,0.0,0.175439,0.0,0.017544,0.684211,0.315789,0.157895,0.140351,2.836364,3.383333,4.75,3.66
149,4,,00:42,0,5-2-4,30,1,Team_23,0,10,1,10.0,END OF GAME,48.0,1.0,1.0,KICKOFF - ONSIDE,46.0,,,46,0,0,0,-2.0,-2,18548,0,R,L,0,0,0,,,,,0,0,0,22,35,1,Team_3,1,,0,0,,,,,,,0,,0,0,0,13.0,48,3528495,QB KNEEL,QB KNEEL,3528494.0,0,0,0,0,4,U,3.0,,R,0,0,35.3,5,0,0,0,,179,29,0,0,L;R,0,,,0,0,0,2,,0.0,,,42,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.137931,6.982759,0.017241,0.017241,0.068966,8.958333,0.0,6.708333,0.375,0.375,0.0,0.172414,0.0,0.017241,0.672414,0.310345,0.155172,0.137931,2.836364,3.383333,4.75,3.588235
150,0,,14:53,0,3-3-5,0,1,Team_29,3,10,1,1.0,TOUCHDOWN,0.0,13.0,1.0,KICKOFF - RETURN,-18.0,,,-18,0,0,0,3.0,3,18549,0,L,R,0,0,0,,C,C,3535741.0,0,0,0,12,0,1,Team_12,3,,0,0,,,,,,,0,,0,0,0,12.0,-21,3535738,ML,ML,3535723.0,0,0,0,0,1,R,1.0,,R,0,0,0.0,0,0,1,0,,2,24,0,1,L;R,0,,,0,1,0,2,,2.0,0.0,0.0,893,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
151,0,,14:16,0,3-3-5,0,0,Team_29,3,7,2,1.0,TOUCHDOWN,0.0,13.0,2.0,KICKOFF - RETURN,-18.0,,,-21,0,0,0,2.0,2,18549,0,C,C,0,0,0,,O,O,3535744.0,0,0,0,11,0,1,Team_12,3,,0,0,,,,,,,0,,0,0,0,6.0,-23,3535741,ML,ML,3535738.0,0,0,0,0,1,L,1.0,,R,0,0,0.0,0,0,1,0,,3,28,0,1,,0,,,0,0,0,2,,2.0,1.0,1.0,856,0.0,0.0,0.0,3.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,
152,0,C,13:35,0,0-3-8,0,1,Team_29,3,5,3,1.0,TOUCHDOWN,0.0,13.0,3.0,KICKOFF - RETURN,-18.0,8.0,SD,-23,0,0,0,17.0,17,18549,0,L,R,0,0,0,,O,C,3535747.0,0,0,0,11,0,1,Team_12,3,,0,0,16.0,M,RWR,COMPLETE,,31.0,0,,0,0,0,9.0,-40,3535744,,,3535741.0,0,0,0,0,1,L,1.0,,P,0,0,0.0,0,0,0,1,,4,24,1,1,,0,,3.2,0,0,0,2,1.0,0.0,1.0,1.0,815,0.0,0.0,0.0,2.0,0.0,0.0,0.0,,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,
153,0,R,12:55,0,3-3-5,0,1,Team_29,3,10,1,1.0,TOUCHDOWN,0.0,13.0,4.0,KICKOFF - RETURN,-18.0,9.0,SD,-40,0,0,0,-6.0,-6,18549,0,R,L,0,1,0,,C,C,3535749.0,0,0,0,21,0,1,Team_12,3,,0,0,0.0,X,,SACK,SACK,,0,,0,1,1,9.0,-34,3535747,,,3535744.0,0,0,1,0,1,L,1.0,,P,0,1,0.0,0,0,1,0,,5,29,0,0,R,0,3.1,4.0,0,1,0,2,,,0.0,0.0,775,0.0,0.0,0.0,17.0,0.0,0.0,0.0,16.0,0.0,8.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,3.2,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,
154,0,L,12:10,0,3-3-5,0,0,Team_29,3,16,2,1.0,TOUCHDOWN,0.0,13.0,5.0,KICKOFF - RETURN,-18.0,6.0,SD,-34,0,0,0,15.0,15,18549,0,R,L,0,0,0,,C,C,3535753.0,0,0,0,11,0,1,Team_12,3,,0,0,-4.0,M,SLiWR,COMPLETE,,15.0,0,,0,0,0,3.0,-49,3535749,,,3535747.0,0,0,0,0,1,,,,P,0,0,0.0,0,1,1,1,,6,29,0,1,,0,,1.2,0,0,0,2,19.0,0.0,0.0,0.0,730,0.0,0.0,1.0,-6.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,3.1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
#dataset[game_prev_names] = dataset.groupby(["OffTeam"])[col_names_for_previous].expanding(min_periods=10).mean().reset_index(drop=True)

In [18]:
#groups = dataset.groupby(["GAMEID", "OffTeam", "DRIVE"])
#ex_group = groups.get_group((18548, 'Team_23', 1))

# checking that shifts and expanding mean do what we want

In [19]:
dataset.groupby(["GAMEID", "OffTeam"])[['FORCEDFUMBLE', 'PASSDEPTH', 'DRIVEPLAY']].expanding().mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FORCEDFUMBLE,PASSDEPTH,DRIVEPLAY
GAMEID,OffTeam,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18548,Team_23,0,0.000000,33.000000,1.000000
18548,Team_23,1,0.000000,19.000000,1.500000
18548,Team_23,2,0.000000,19.000000,2.000000
18548,Team_23,3,0.000000,13.666667,2.500000
18548,Team_23,4,0.000000,10.250000,3.000000
...,...,...,...,...,...
19807,Team_7,19829,0.020000,6.564103,3.604167
19807,Team_7,19830,0.019608,6.325000,3.795918
19807,Team_7,19831,0.019231,6.170732,4.000000
19807,Team_7,19832,0.018868,6.523810,4.000000


In [20]:
dataset.query("GAMEID == 18548 and OffTeam == 'Team_23'").head(20)[['GAMEID', 'PLAYID', 'OffTeam', 'DOWN', 'DRIVE', 'DRIVEPLAY', 'FORCEDFUMBLE', 'PASSDEPTH', 'game_prev_PASSDEPTH']]

Unnamed: 0,GAMEID,PLAYID,OffTeam,DOWN,DRIVE,DRIVEPLAY,FORCEDFUMBLE,PASSDEPTH,game_prev_PASSDEPTH
0,18548,3528152,Team_23,1,1.0,1.0,0,33.0,
1,18548,3528156,Team_23,2,1.0,2.0,0,5.0,
2,18548,3528157,Team_23,1,1.0,3.0,0,,
3,18548,3528160,Team_23,2,1.0,4.0,0,3.0,
4,18548,3528162,Team_23,3,1.0,5.0,0,0.0,
5,18548,3528164,Team_23,1,1.0,6.0,0,1.0,
6,18548,3528165,Team_23,2,1.0,7.0,0,,
7,18548,3528167,Team_23,3,1.0,8.0,0,,
8,18548,3528169,Team_23,4,1.0,9.0,0,6.0,
9,18548,3528171,Team_23,1,1.0,10.0,0,,


In [21]:
''' 
Inspect section to make sure shifting was done properly
'''
dataset.loc[147:170, ['GAMEID', 'PLAYID', 'OffTeam', 'DOWN', 'DRIVE', 'DRIVEPLAY', 'FORCEDFUMBLE', 'prev_FORCEDFUMBLE', 'game_prev_FORCEDFUMBLE', 'SHOTGUN', 'prev_SHOTGUN', 'game_prev_SHOTGUN']]

Unnamed: 0,GAMEID,PLAYID,OffTeam,DOWN,DRIVE,DRIVEPLAY,FORCEDFUMBLE,prev_FORCEDFUMBLE,game_prev_FORCEDFUMBLE,SHOTGUN,prev_SHOTGUN,game_prev_SHOTGUN
147,18548,3528454,Team_3,2,9.0,5.0,0,0.0,0.0,0,0.0,0.321429
148,18548,3528458,Team_3,1,9.0,6.0,0,0.0,0.0,0,0.0,0.315789
149,18548,3528495,Team_3,1,10.0,1.0,0,,0.0,0,,0.310345
150,18549,3535738,Team_12,1,1.0,1.0,0,,,0,,
151,18549,3535741,Team_12,2,1.0,2.0,0,0.0,,0,0.0,
152,18549,3535744,Team_12,3,1.0,3.0,0,0.0,,1,0.0,
153,18549,3535747,Team_12,1,1.0,4.0,0,0.0,,0,1.0,
154,18549,3535749,Team_12,2,1.0,5.0,0,0.0,,1,0.0,
155,18549,3535753,Team_12,3,1.0,6.0,0,0.0,,1,1.0,
156,18549,3535760,Team_12,1,1.0,8.0,0,0.0,,1,1.0,


In [22]:
dataset.loc[250:291, ['GAMEID', 'PLAYID', 'OffTeam', 'DOWN', 'DRIVE', 'DRIVEPLAY', 'FORCEDFUMBLE', 'prev_FORCEDFUMBLE', 'game_prev_FORCEDFUMBLE', 'SHOTGUN', 'prev_SHOTGUN', 'game_prev_SHOTGUN']]

Unnamed: 0,GAMEID,PLAYID,OffTeam,DOWN,DRIVE,DRIVEPLAY,FORCEDFUMBLE,prev_FORCEDFUMBLE,game_prev_FORCEDFUMBLE,SHOTGUN,prev_SHOTGUN,game_prev_SHOTGUN
250,18549,3537611,Team_29,1,8.0,7.0,0,0.0,0.027778,1,1.0,0.527778
251,18549,3537632,Team_29,1,8.0,8.0,0,0.0,0.027027,1,1.0,0.540541
252,18549,3537645,Team_29,1,8.0,9.0,0,0.0,0.026316,1,1.0,0.552632
253,18549,3537659,Team_29,2,8.0,10.0,0,0.0,0.025641,1,1.0,0.564103
254,18549,3537676,Team_29,3,8.0,11.0,0,0.0,0.025,0,1.0,0.55
255,18549,3537688,Team_29,4,8.0,12.0,0,0.0,0.02439,0,0.0,0.536585
256,18549,3537861,Team_29,1,9.0,1.0,0,,0.02381,0,,0.52381
257,18549,3537875,Team_29,1,9.0,2.0,0,0.0,0.023256,0,0.0,0.511628
258,18549,3537878,Team_29,2,9.0,3.0,0,0.0,0.022727,1,0.0,0.522727
259,18549,3537886,Team_29,3,9.0,4.0,0,0.0,0.022222,1,1.0,0.533333


In [55]:
pd.get_dummies(dataset.groupby(["GAMEID", "OffTeam"]).get_group((18548, "Team_3"))[['OFFPERSONNEL_SIMPLIFIED']]).expanding(min_periods=10).mean()

Unnamed: 0,OFFPERSONNEL_SIMPLIFIED_11,OFFPERSONNEL_SIMPLIFIED_12,OFFPERSONNEL_SIMPLIFIED_13,OFFPERSONNEL_SIMPLIFIED_21,OFFPERSONNEL_SIMPLIFIED_22
92,,,,,
93,,,,,
94,,,,,
95,,,,,
96,,,,,
97,,,,,
98,,,,,
99,,,,,
100,,,,,
101,0.4,0.2,0.0,0.2,0.2


In [42]:
df = pd.DataFrame({
    'total_goals': [2, 3, 1, 4, 2, 3, 1, 1, 7, 3, 1],
    'home_team_id': [1277, 1245, 1242, 1261, 1259, 2981, 1244, 1254, 1247, 5681, 1249],
    'away_team_id': [1241, 1249, 1246, 1248, 1240, 1268, 1255, 1276, 12140, 1270, 5681],
    'home_goals': [1, 2, 0, 1, 2, 1, 1, 1, 5, 2, 0],
    'away_goals': [1, 1, 1, 3, 0, 2, 0, 0, 2, 1, 1]
})
df['home_rolling_avg'] = np.nan
df['away_rolling_avg'] = np.nan

In [56]:
df

Unnamed: 0,total_goals,home_team_id,away_team_id,home_goals,away_goals,home_rolling_avg,away_rolling_avg
0,2,1277,1241,1,1,,
1,3,1245,1249,2,1,,
2,1,1242,1246,0,1,,
3,4,1261,1248,1,3,,
4,2,1259,1240,2,0,,
5,3,2981,1268,1,2,,
6,1,1244,1255,1,0,,
7,1,1254,1276,1,0,,
8,7,1247,12140,5,2,,
9,3,5681,1270,2,1,,


In [54]:
hw = ['home_team_id', 'away_team_id']
hw_vals = df[hw].values.ravel()
idx_rep = df.index.values.repeat(2)  # repeat index [0, 0, 1, 1, ...
tot_rep = df.total_goals.values.repeat(2)  # repeat totals [2, 2, 3, 3, ...

s = pd.Series(tot_rep, [idx_rep, hw_vals])


In [55]:
s.groupby(level=1).apply(lambda x: x.expanding().mean().shift()).dropna()

1249  10  1249    3.0
5681  10  5681    3.0
dtype: float64

In [43]:
# easy tracking of long column names
hw = ['home_team_id', 'away_team_id'] # long column names

# I found it easier to melt myself with some numpy help
hw_vals = df[hw].values.ravel()  # flatten 2 columns
idx_rep = df.index.values.repeat(2)  # repeat index [0, 0, 1, 1, ...
tot_rep = df.total_goals.values.repeat(2)  # repeat totals [2, 2, 3, 3, ...

# This is the reshaped series of team ids with total_goals
s = pd.Series(tot_rep, [idx_rep, hw_vals])

# groupby with a combination of expanding().mean() and shift()
e = s.groupby(level=1).apply(lambda x: x.expanding().mean().shift()).dropna()

# style preference of mine to do assignments using index values
# and to get it done in one line
df.set_index(hw[0], append=1).assign(home_rolling_avg=e).reset_index(hw[0]) \
  .set_index(hw[1], append=1).assign(away_rolling_avg=e).reset_index(hw[1])

TypeError: incompatible index of inserted column with frame index