In [319]:
import pandas as pd
import numpy as np

In [320]:
# Play-by-play table, restricted to rows whose passResult is 'C', 'I' or 'IN'
# (presumably complete / incomplete / intercepted -- confirm against the data dictionary).
plays_df = pd.read_csv("./data2023/plays.csv").loc[
    lambda plays: plays['passResult'].isin(['C', 'I', 'IN'])
]
# Primary per-player stats table; its 'Unnamed: 0' column is discarded.
player_stats_df = pd.read_csv("merged_df.csv").drop(columns='Unnamed: 0')
# Secondary player table, passed to stats_from_name as the fallback lookup.
backup_player_stats_df = pd.read_csv('data2023/players.csv')
# Per-position table; its 'Unnamed: 0' column is relabelled 'Position'.
medians_df = pd.read_csv('all_pos_df.csv').rename({'Unnamed: 0': 'Position'}, axis='columns')

In [321]:
# Sanity check: list the distinct passResult values left after filtering.
plays_df.passResult.unique()

array(['I', 'C', 'IN'], dtype=object)

In [322]:
# Build a "<first character>.<remaining name parts>" key (e.g. "T.Brady") for each player.
# Every space-separated part after the first is concatenated with no separator.
player_stats_df["abrev_name"] = player_stats_df["displayName"].apply(
    lambda full_name: ".".join((str(full_name)[0], "".join(str(full_name).split(" ")[1:])))
)
player_stats_df["abrev_name"]

0                T.Brady
1       B.Roethlisberger
2              A.Rodgers
3          R.Fitzpatrick
4                M.Lewis
              ...       
1409         J.Jefferson
1410             D.Milne
1411            G.Stuard
1412            J.Jacobs
1413            J.Heflin
Name: abrev_name, Length: 1414, dtype: object

In [323]:
# Same "<first character>.<remaining name parts>" key as on the primary stats table,
# so that both tables can be searched with the same abbreviated name.
backup_player_stats_df["abrev_name"] = backup_player_stats_df["displayName"].apply(
    lambda full_name: ".".join((str(full_name)[0], "".join(str(full_name).split(" ")[1:])))
)
backup_player_stats_df["abrev_name"]

0                T.Brady
1       B.Roethlisberger
2               J.Peters
3              A.Rodgers
4          R.Fitzpatrick
              ...       
1674           F.Merrill
1675          R.McCollum
1676            J.Heflin
1677            J.Curhan
1678            F.Franks
Name: abrev_name, Length: 1679, dtype: object

In [324]:
# Remove time from play description: the first whitespace-separated token is
# dropped and the remaining tokens are re-joined with single spaces.
playDescText = plays_df.playDescription.apply(lambda desc: " ".join(str(desc).split()[1:]))
# Remove formation from play description. When the text opens with "(", only
# the part up to and including the FIRST ")" is removed. Splitting on every ")"
# and re-joining (the previous approach) also deleted all later closing
# parentheses from the description. A text that opens with "(" but contains no
# ")" becomes "", exactly as before.
playDescText = playDescText.apply(lambda desc: desc.partition(")")[2] if desc.startswith("(") else desc)
playDescText

0         T.Brady pass incomplete deep right to C.Godwin.
1        D.Prescott pass deep left to A.Cooper pushed ...
2        D.Prescott pass short middle to D.Schultz to ...
3         D.Prescott pass incomplete deep left to C.Lamb.
4        D.Prescott pass incomplete short left to C.La...
                              ...                        
8549     P.Mahomes pass incomplete short right [O.Ximi...
8550     P.Mahomes pass short right intended for B.Pri...
8551     P.Mahomes pass short right to T.Kelce to KC 3...
8553     D.Jones pass short right to E.Engram pushed o...
8555     D.Jones pass incomplete short right to E.Engram.
Name: playDescription, Length: 7565, dtype: object

In [325]:
# Create columns of names of players who carried out play


def _passer_from_description(description):
    """Return the lower-cased token immediately before the first "pass" token.

    Returns NaN when the description has no "pass" token, or when "pass" is the
    very first token (there is nothing before it).
    """
    tokens = str(description).lower().split(" ")
    if "pass" not in tokens:
        return np.nan
    pass_position = tokens.index("pass")
    if pass_position == 0:
        # Guard: 0 - 1 == -1 would silently wrap around to the LAST token.
        return np.nan
    return tokens[pass_position - 1]


def _receiver_from_description(description):
    """Return the lower-cased token immediately after the first "to" token.

    Returns NaN unless the description contains both a "pass" and a "to" token
    and does not mention "intercept". One trailing full stop is stripped from
    the candidate, and candidates without a "." (e.g. the stray "no" tokens)
    are rejected because they cannot be an abbreviated player name.
    """
    text = str(description).lower()
    tokens = text.split(" ")
    if "pass" not in tokens or "to" not in tokens or "intercept" in text:
        return np.nan
    target_position = tokens.index("to") + 1
    if target_position >= len(tokens):
        # "to" is the final token: nothing follows it.
        return np.nan
    candidate = tokens[target_position]
    # Remove full stop from some names at end (sentence-final names).
    if candidate.endswith("."):
        candidate = candidate[:-1]
    if "." not in candidate:
        return np.nan
    return candidate


# People who pass the ball
passers = playDescText.map(_passer_from_description)

# People who received the ball
receivers = playDescText.map(_receiver_from_description)

In [353]:
# One row per play for which both a passer and a receiver were extracted.
key_players_df = pd.DataFrame({'gameId': plays_df['gameId'], 'playId': plays_df['playId'], 'passer': passers, 'receiver': receivers})
key_players_df = key_players_df.dropna()
# A usable name has its first "." at position 1 ("t.brady"); anything else
# ("ty.taylor", "dj.moore", ...) cannot be matched against abrev_name.
# str.find is used instead of str.index: it yields -1 rather than raising
# ValueError when a name contains no "." at all, so such names are simply
# classified as bad instead of aborting the cell.
# 'a.st' is excluded by hand -- presumably a surname cut off at a space
# (TODO confirm which player this is).
bad_players = np.concatenate((key_players_df.loc[key_players_df['passer'].str.find('.') != 1, 'passer'].unique(),
                              key_players_df.loc[key_players_df['receiver'].str.find('.') != 1, 'receiver'].unique(),
                              ['a.st']))
print(bad_players)
# Discard every play that involves a bad name on either side, then renumber rows.
is_bad_row = key_players_df['passer'].isin(bad_players) | key_players_df['receiver'].isin(bad_players)
key_players_df = key_players_df[~is_bad_row].reset_index(drop=True)
key_players_df

['ty.taylor' 'aa.rodgers' 'jos.allen' 'dj.moore' 'mi.carter' 'am.rodgers'
 'da.williams' 'a.st']


Unnamed: 0,gameId,playId,passer,receiver
0,2021090900,97,t.brady,c.godwin
1,2021090900,137,d.prescott,a.cooper
2,2021090900,187,d.prescott,d.schultz
3,2021090900,282,d.prescott,c.lamb
4,2021090900,349,d.prescott,c.lamb
...,...,...,...,...
6685,2021110100,3955,d.jones,d.booker
6686,2021110100,4016,p.mahomes,t.kelce
6687,2021110100,4113,p.mahomes,t.kelce
6688,2021110100,4363,d.jones,e.engram


In [327]:
def stats_from_name(name, stats, columns, backup_stats, medians=None):
    """Return a one-row DataFrame of feature values for an abbreviated player name.

    Parameters
    ----------
    name : str
        Lower-cased abbreviated name, compared against ``abrev_name.str.lower()``.
    stats : DataFrame
        Primary stats table; must contain ``abrev_name``.
    columns : list-like of str
        Feature columns to keep. Only those present in ``stats`` are returned.
    backup_stats : DataFrame
        Fallback table with ``abrev_name`` and ``officialPosition`` columns.
    medians : DataFrame, optional
        Per-position table with a ``Position`` column plus the feature columns.
        Defaults to the notebook-level ``medians_df``.

    Returns
    -------
    DataFrame
        At most one row. If several players share the abbreviated name, the
        first match is used. When the player is missing from ``stats``, the row
        for the player's position in ``medians`` is returned instead (empty if
        the position has no row there).

    Raises
    ------
    IndexError
        If the player is found in neither ``stats`` nor ``backup_stats``.
    """
    if medians is None:
        # Backward-compatible default for existing four-argument callers.
        medians = medians_df
    # Use ONE column order for both code paths (the order of ``stats``).
    # Previously the fallback row was sorted alphabetically while the primary
    # row kept the table's order, which could misalign feature vectors.
    feature_order = stats.columns[stats.columns.isin(columns)]
    player_stats = stats.loc[stats['abrev_name'].str.lower() == name, feature_order].iloc[:1]
    if player_stats.empty:
        positions = backup_stats.loc[backup_stats['abrev_name'].str.lower() == name, 'officialPosition']
        if positions.empty:
            raise IndexError(f"player {name!r} not found in stats or backup_stats")
        position = positions.iloc[0]
        # Keep a single row so the result has the same shape as the primary path.
        player_stats = medians.loc[medians['Position'] == position, feature_order].iloc[:1]
    return player_stats


def yards_from_play(gameId, playId, plays):
    """Return ``prePenaltyPlayResult`` of the first row in ``plays`` matching both ids.

    Raises IndexError when no row has the given ``gameId`` and ``playId``.
    """
    same_game = plays['gameId'] == gameId
    same_play = plays['playId'] == playId
    matching_results = plays.loc[same_game & same_play, 'prePenaltyPlayResult']
    return matching_results.values[0]

In [328]:
# Names of the per-player columns used as model features. The same list is
# applied to both the passer and the receiver of each play, so the order here
# must not be changed casually.
feature_columns = [
    'Age', 'Height', 'Weight', 'Overall',
    'Speed', 'Acceleration', 'Agility', 'Change of Dir', 'Strength', 'Jumping',
    'Awareness',
    'Carrying', 'Break Tackle', 'Juke Move', 'Spin Move', 'Trucking', 'Stiff Arm', 'BC Vision',
    'Catching', 'Catch In Traffic', 'Spec Catch', 'Release', 'Short RR', 'Medium RR', 'Deep RR',
    'Throw Power', 'Throw Acc Short', 'Throw Acc Mid', 'Throw Acc Deep',
    'Throw Under Pressure', 'Throw On The Run', 'Play Action', 'Break Sack',
    'Run Block', 'Run Block Power', 'Run Block Finesse',
    'Pass Block', 'Pass Block Power', 'Pass Block Finesse',
    'Impact Blocking', 'Lead Blocking',
    'Tackle', 'Hit Power', 'Pursuit', 'Man Coverage', 'Zone Coverage', 'Press',
    'Play Recognition', 'Power Moves', 'Finesse Moves', 'Block Shedding',
    'Kick Power', 'Kick Accuracy', 'Kick Return',
    'Stamina', 'Injury', 'Toughness', 'Years Pro',
]

In [354]:
# Look every distinct player up ONCE. stats_from_name scans the whole stats
# table on each call and its result depends only on the name, so calling it
# twice per play repeated the same work thousands of times.
player_names = set(key_players_df['passer']) | set(key_players_df['receiver'])
stats_by_player = {
    player: stats_from_name(player, player_stats_df, feature_columns, backup_player_stats_df)
    for player in player_names
}
# One row per play: the passer's feature values followed by the receiver's.
# Each lookup is a one-row frame, so concatenating along axis=1 and taking
# row 0 yields a flat vector. np.stack raises ValueError if there are no plays.
X = np.stack([
    np.concatenate((stats_by_player[passer], stats_by_player[receiver]), axis=1)[0]
    for passer, receiver in zip(key_players_df['passer'], key_players_df['receiver'])
])
X.shape

(6690, 116)