# Data Cleanup

## Necessary Steps

1. Understand the data structure: Look through the database, understand the main structure, i.e., what each row represents, and what columns and types of information are available
2. Merge game file to add in week identifier to the new dataset (games.csv is in '1. Additional Data File')
3. Create column categorizations to filter dataset for relevant purposes
4. Break down receiver into its individual row
5. Create playmaker column, and check id uniqueness - check that one ID corresponds to one name
6. Flag non-relevant plays - add a binary column that flags 1 for run, pass, reception, FG/XP, 0 for all others
7. Add any additional stat needed - reception (for plays that fall under 'reception'), target ('reception', 'pass')
8. Add position based off of the highest stat of a player. Position will be refined later with web scraping
9. Ensure that stat are correctly represented for a given position
10. Verify top 50 stats against the reported ones


## Setup Environment

In [3]:
import pandas as pd
import numpy as np
from pandas import ExcelWriter
pd.set_option('display.max_columns', None)

## 1. Upload Data

In [None]:
'''
Not sure why your folder names were changes. The working directory of this file is where its source code is located
so you can specify directories relatively
'''

In [68]:
# Store your files in the same folder as the source code, so you don't have to specify the directory
df_file_2019 = "../../1. Raw-Data/data2019.csv"
df_file_game = "../1. Additional-Data/games.csv"

# Use the convention df for dataframes
df1 = pd.read_csv(df_file_2019)
df_games = pd.read_csv(df_file_game)

  interactivity=interactivity, compiler=compiler, result=result)


## 2. Add in Weekly Identifier

In [None]:
'''
In each play row the week of that game will be displayed as a new column.
This will be done through a merge along the game_id column from the game.csv file. 
'''

In [69]:
df = df1.merge(df_games[["game_id", "week"]], on = 'game_id')

df

Unnamed: 0.1,Unnamed: 0,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,drive,sp,qtr,down,goal_to_go,time,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,shotgun,no_huddle,qb_dropback,qb_kneel,qb_spike,qb_scramble,pass_length,pass_location,air_yards,yards_after_catch,run_location,run_gap,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,home_timeouts_remaining,away_timeouts_remaining,timeout,timeout_team,td_team,posteam_timeouts_remaining,defteam_timeouts_remaining,total_home_score,total_away_score,posteam_score,defteam_score,score_differential,posteam_score_post,defteam_score_post,score_differential_post,no_score_prob,opp_fg_prob,opp_safety_prob,opp_td_prob,fg_prob,safety_prob,td_prob,extra_point_prob,two_point_conversion_prob,ep,epa,total_home_epa,total_away_epa,total_home_rush_epa,total_away_rush_epa,total_home_pass_epa,total_away_pass_epa,air_epa,yac_epa,comp_air_epa,comp_yac_epa,total_home_comp_air_epa,total_away_comp_air_epa,total_home_comp_yac_epa,total_away_comp_yac_epa,total_home_raw_air_epa,total_away_raw_air_epa,total_home_raw_yac_epa,total_away_raw_yac_epa,wp,def_wp,home_wp,away_wp,wpa,home_wp_post,away_wp_post,total_home_rush_wpa,total_away_rush_wpa,total_home_pass_wpa,total_away_pass_wpa,air_wpa,yac_wpa,comp_air_wpa,comp_yac_wpa,total_home_comp_air_wpa,total_away_comp_air_wpa,total_home_comp_yac_wpa,total_away_comp_yac_wpa,total_home_raw_air_wpa,total_away_raw_air_wpa,total_home_raw_yac_wpa,total_away_raw_yac_wpa,punt_blocked,first_down_rush,first_down_pass,first_down_penalty,third_down_converted,third_down_failed,fourth_down_converted,fourth_down_failed,incomplete_pass,touchback,interception,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,kickoff_inside_twenty,kickoff_in_endzone,kickoff_out_of_bounds,kickoff_downed,kickoff_fair_catch,fumble_forced,fumble_not_forced,fumble_out_of_bounds,solo_tackle,safety,penalty,tackled_for_loss,fumble_lost,own_kickoff_recovery,own_kickoff_recovery_td,qb_hit,rush_attempt,pass_attempt,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,kickoff_attempt,punt_attempt,fumble,complete_pass,assist_tackle,lateral_reception,lateral_rush,lateral_return,lateral_recovery,passer_player_id,passer_player_name,receiver_player_id,receiver_player_name,rusher_player_id,rusher_player_name,lateral_receiver_player_id,lateral_receiver_player_name,lateral_rusher_player_id,lateral_rusher_player_name,lateral_sack_player_id,lateral_sack_player_name,interception_player_id,interception_player_name,lateral_interception_player_id,lateral_interception_player_name,punt_returner_player_id,punt_returner_player_name,lateral_punt_returner_player_id,lateral_punt_returner_player_name,kickoff_returner_player_name,kickoff_returner_player_id,lateral_kickoff_returner_player_id,lateral_kickoff_returner_player_name,punter_player_id,punter_player_name,kicker_player_name,kicker_player_id,own_kickoff_recovery_player_id,own_kickoff_recovery_player_name,blocked_player_id,blocked_player_name,tackle_for_loss_1_player_id,tackle_for_loss_1_player_name,tackle_for_loss_2_player_id,tackle_for_loss_2_player_name,qb_hit_1_player_id,qb_hit_1_player_name,qb_hit_2_player_id,qb_hit_2_player_name,forced_fumble_player_1_team,forced_fumble_player_1_player_id,forced_fumble_player_1_player_name,forced_fumble_player_2_team,forced_fumble_player_2_player_id,forced_fumble_player_2_player_name,solo_tackle_1_team,solo_tackle_2_team,solo_tackle_1_player_id,solo_tackle_2_player_id,solo_tackle_1_player_name,solo_tackle_2_player_name,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,assist_tackle_2_team,assist_tackle_3_player_id,assist_tackle_3_player_name,assist_tackle_3_team,assist_tackle_4_player_id,assist_tackle_4_player_name,assist_tackle_4_team,pass_defense_1_player_id,pass_defense_1_player_name,pass_defense_2_player_id,pass_defense_2_player_name,fumbled_1_team,fumbled_1_player_id,fumbled_1_player_name,fumbled_2_player_id,fumbled_2_player_name,fumbled_2_team,fumble_recovery_1_team,fumble_recovery_1_yards,fumble_recovery_1_player_id,fumble_recovery_1_player_name,fumble_recovery_2_team,fumble_recovery_2_yards,fumble_recovery_2_player_id,fumble_recovery_2_player_name,return_team,return_yards,penalty_team,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv,year,week
0,1,35,2019090500,CHI,GB,GB,away,CHI,CHI,35.0,2019-09-05,900.0,1800.0,3600.0,Half1,0,1,0,1,,0.0,15:00,CHI 35,0,-10,E.Pineiro kicks 65 yards from CHI 35 to end zo...,kickoff,0,0,0,0.0,0,0,0,,,,,,,,,,,3,3,0.0,,,3.0,3.0,0,0,,,,0.0,0.0,0.0,0.001374,0.162632,0.004441,0.254179,0.233081,0.003656,0.340639,0.0,0.0,0.814998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,,,,0.000000,0.000000,0.000000,0.000000,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,E.Pineiro,00-0034173,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,1
1,2,50,2019090500,CHI,GB,GB,away,CHI,GB,75.0,2019-09-05,900.0,1800.0,3600.0,Half1,0,1,0,1,1.0,0.0,15:00,GB 25,10,-10,(15:00) A.Jones left tackle to GB 25 for no ga...,run,0,0,0,0.0,0,0,0,,,,,left,tackle,,,,,3,3,0.0,,,3.0,3.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.001374,0.162632,0.004441,0.254179,0.233081,0.003656,0.340639,0.0,0.0,0.814998,-0.764363,0.764363,-0.764363,0.764363,-0.764363,0.000000,0.000000,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500007,0.499993,0.499993,0.500007,-0.020660,0.520654,0.479346,0.020660,-0.020660,0.000000,0.000000,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,00-0033293,A.Jones,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CHI,,00-0034874,,R.Smith,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,1
2,3,71,2019090500,CHI,GB,GB,away,CHI,GB,75.0,2019-09-05,873.0,1773.0,3573.0,Half1,0,1,0,1,2.0,0.0,14:33,GB 25,10,-10,(14:33) A.Rodgers pass short left to A.Jones t...,pass,0,0,0,1.0,0,0,0,short,left,-1.0,1.0,,,,,,,3,3,0.0,,,3.0,3.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.001569,0.188484,0.005696,0.295010,0.213057,0.003982,0.292202,0.0,0.0,0.050636,-0.987734,1.752097,-1.752097,0.764363,-0.764363,0.987734,-0.987734,-1.095212,0.107477,-1.095212,0.107477,1.095212,-1.095212,-0.107477,0.107477,1.095212,-1.095212,-0.107477,0.107477,0.479346,0.520654,0.520654,0.479346,-0.026088,0.546742,0.453258,0.020660,-0.020660,0.026088,-0.026088,-0.031647,0.005559,-0.031647,0.005559,0.031647,-0.031647,-0.005559,0.005559,0.031647,-0.031647,-0.005559,0.005559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,00-0023459,A.Rodgers,00-0033293,A.Jones,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CHI,,00-0034874,,R.Smith,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,1
3,4,95,2019090500,CHI,GB,GB,away,CHI,GB,75.0,2019-09-05,825.0,1725.0,3525.0,Half1,0,1,0,1,3.0,0.0,13:45,GB 25,10,-10,(13:45) (Shotgun) A.Rodgers sacked at GB 15 fo...,pass,-10,1,0,1.0,0,0,0,,,,,,,,,,,3,3,0.0,,,3.0,3.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.001863,0.226273,0.006642,0.349574,0.171216,0.004530,0.239902,0.0,0.0,-0.937099,-2.221273,3.973370,-3.973370,0.764363,-0.764363,3.209007,-3.209007,,,0.000000,0.000000,1.095212,-1.095212,-0.107477,0.107477,1.095212,-1.095212,-0.107477,0.107477,0.453258,0.546742,0.546742,0.453258,-0.066931,0.613673,0.386327,0.020660,-0.020660,0.093020,-0.093020,,,0.000000,0.000000,0.031647,-0.031647,-0.005559,0.005559,0.031647,-0.031647,-0.005559,0.005559,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00-0023459,A.Rodgers,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,00-0032667,R.Robertson-Harris,,,00-0032667,R.Robertson-Harris,,,,,,,,,CHI,,00-0032667,,R.Robertson-Harris,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,1
4,5,125,2019090500,CHI,GB,GB,away,CHI,GB,85.0,2019-09-05,795.0,1695.0,3495.0,Half1,0,1,0,1,4.0,0.0,13:15,GB 15,20,-10,(13:15) (Punt formation) J.Scott punts 53 yard...,punt,0,0,0,0.0,0,0,0,,,,,,,,53.0,,,3,3,0.0,,,3.0,3.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.002114,0.327285,0.010957,0.466627,0.054465,0.004293,0.134258,0.0,0.0,-3.158372,0.714739,3.258631,-3.258631,0.764363,-0.764363,3.209007,-3.209007,,,0.000000,0.000000,1.095212,-1.095212,-0.107477,0.107477,1.095212,-1.095212,-0.107477,0.107477,0.386327,0.613673,0.613673,0.386327,0.057563,0.556110,0.443890,0.020660,-0.020660,0.093020,-0.093020,,,0.000000,0.000000,0.031647,-0.031647,-0.005559,0.005559,0.031647,-0.031647,-0.005559,0.005559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,00-0033556,T.Cohen,,,,,,,00-0034162,J.Scott,,,,,,,,,,,,,,,,,,,,,GB,,00-0031584,,A.Amos,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CHI,11,,,,,0,,,0.0,0.0,0.0,0.0,2019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45541,45542,3955,2019122915,SEA,SF,SEA,home,SF,SF,5.0,2019-12-29,22.0,22.0,22.0,Half2,0,16,0,4,2.0,1.0,00:22,SF 5,5,72,(:22) (Shotgun) R.Wilson pass incomplete short...,pass,0,1,0,1.0,0,0,0,short,left,5.0,,,,,,,,0,1,0.0,,,0.0,1.0,21,26,21.0,26.0,-5.0,21.0,26.0,-5.0,0.174034,0.008484,0.000037,0.004921,0.378260,0.000657,0.433607,0.0,0.0,4.111364,-0.661444,-3.064260,3.064260,-6.785425,6.785425,0.644493,-0.644493,2.888636,-3.550080,0.000000,0.000000,5.247535,-5.247535,0.817163,-0.817163,23.272013,-23.272013,-23.801664,23.801664,0.235896,0.764104,0.235896,0.764104,-0.167196,0.068701,0.931299,-0.263874,0.263874,0.041199,-0.041199,0.702366,-0.869562,0.000000,0.000000,0.708619,-0.708619,-0.053939,0.053939,3.265681,-3.265681,-3.277452,3.277452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00-0029263,R.Wilson,00-0032211,T.Lockett,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,00-0032378,D.Buckner,,,,,,,,,,,,,,,,,,,,,,,,,,,00-0034730,M.Harris,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,17
45542,45543,3977,2019122915,SEA,SF,SEA,home,SF,SF,5.0,2019-12-29,15.0,15.0,15.0,Half2,0,16,0,4,3.0,1.0,00:15,SF 5,5,72,(:15) (Shotgun) R.Wilson pass incomplete short...,pass,0,1,0,1.0,0,0,0,short,middle,5.0,,,,,,,,0,1,0.0,,,0.0,1.0,21,26,21.0,26.0,-5.0,21.0,26.0,-5.0,0.197189,0.009679,0.000068,0.005978,0.496176,0.000792,0.290118,0.0,0.0,3.449920,-0.912051,-3.976311,3.976311,-6.785425,6.785425,-0.267558,0.267558,3.550080,-4.462131,0.000000,0.000000,5.247535,-5.247535,0.817163,-0.817163,26.822093,-26.822093,-28.263795,28.263795,0.068701,0.931299,0.068701,0.931299,-0.042559,0.026141,0.973859,-0.263874,0.263874,-0.001360,0.001360,0.899287,-0.941846,0.000000,0.000000,0.708619,-0.708619,-0.053939,0.053939,4.164968,-4.164968,-4.219298,4.219298,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00-0029263,R.Wilson,00-0033387,J.Hollister,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,17
45543,45544,3999,2019122915,SEA,SF,SEA,home,SF,SF,5.0,2019-12-29,12.0,12.0,12.0,Half2,0,16,0,4,4.0,1.0,00:12,SF 5,5,72,(:12) (Shotgun) R.Wilson pass short middle to ...,pass,4,1,0,1.0,0,0,0,short,middle,4.0,0.0,,,,,,,0,1,0.0,,,0.0,1.0,21,26,21.0,26.0,-5.0,21.0,26.0,-5.0,0.206216,0.008679,0.000180,0.005227,0.713258,0.000800,0.065640,0.0,0.0,2.537869,-2.636491,-6.612802,6.612802,-6.785425,6.785425,-2.904049,2.904049,-2.636491,0.000000,-2.636491,0.000000,2.611044,-2.611044,0.817163,-0.817163,24.185602,-24.185602,-28.263795,28.263795,0.026141,0.973859,0.026141,0.973859,0.010686,0.036827,0.963173,-0.263874,0.263874,0.009326,-0.009326,0.010686,0.000000,0.010686,0.000000,0.719304,-0.719304,-0.053939,0.053939,4.175654,-4.175654,-4.219298,4.219298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,00-0029263,R.Wilson,00-0033387,J.Hollister,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SF,,00-0034982,,D.Greenlaw,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,1,upheld,,0.0,0.0,0.0,0.0,2019,17
45544,45545,4080,2019122915,SEA,SF,SF,away,SEA,SF,99.0,2019-12-29,9.0,9.0,9.0,Half2,0,17,0,4,1.0,0.0,00:09,SF 1,10,2,(:09) J.Garoppolo up the middle to SF 3 for 2 ...,run,2,0,0,0.0,0,0,0,,,,,middle,,,,,,0,1,0.0,,,1.0,0.0,21,26,26.0,21.0,5.0,26.0,21.0,5.0,0.814185,0.044958,0.009874,0.032654,0.051277,0.000275,0.046777,0.0,0.0,0.098622,,-6.612802,6.612802,-6.785425,6.785425,-2.904049,2.904049,,,0.000000,0.000000,2.611044,-2.611044,0.817163,-0.817163,24.185602,-24.185602,-28.263795,28.263795,0.963173,0.036827,0.036827,0.963173,0.036827,0.000000,1.000000,-0.300701,0.300701,0.009326,-0.009326,,,0.000000,0.000000,0.719304,-0.719304,-0.053939,0.053939,4.175654,-4.175654,-4.219298,4.219298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,00-0031345,J.Garoppolo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SEA,,00-0034831,,R.Green,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,0,,,0.0,0.0,0.0,0.0,2019,17


## 3. Create Categorization Lists

In [None]:
'''
The columns in the 2019 data file are arranged into categorical lists.
Many of the lists can be automated due to the presence of key words in their title.
'''

### 3.1 Main Lists

In [56]:
# main lists encompass aspects of the game that lead to fantasy points


key = ['Flag', 'Playmaker_id', 'Playmaker_name', 'play_id', 'game_id', 'home_team', 'away_team', 'week', 'game_date', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'play_type']


pass_play = ['pass_length','pass_location','air_yards']
run_play = ['run_location', 'run_gap']
yard_info = ['yrdln','ydstogo','ydsnet','yards_gained', "fumble_recovery_1_yards", "fumble_recovery_2_yards", "return_yards"]

receiver_info = [col for col in df.columns if 'receiver' in col]
receiver_info += ['yards_after_catch', 'incomplete_pass', 'complete_pass', 'touchdown', 'pass_touchdown', 'fumble_lost', 'Playmaker_id', 'Playmaker_name']


passer_info = [col for col in df.columns if 'passer' in col]

# passer_info += [col for col in df.columns if 'interception' in col]
passer_info += ['yards_after_catch', 'incomplete_pass', 'complete_pass', 'touchdown', 'pass_touchdown', 'fumble_lost', 'Playmaker_id', 'Playmaker_name', 'interception']


rusher_info = [col for col in df.columns if 'rusher' in col]
rusher_info += ['touchdown', 'fumble_lost', 'Playmaker_id', 'Playmaker_name']

two_pt_con


xp = ['field_goal_result', 'kick_distance', 'extra_point_result', 'kicker_']



### 3.2 Other Lists

In [57]:

# catagories are created manually and through list comprehension

game_time_info = ['quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'time']

gen_play_info = ['drive', 'sp', 'down', 'goal_to_go','desc','play_type','shotgun','no_huddle','qb_dropback','qb_kneel','qb_spike','qb_scramble']

team_info = ["return_team", 'td_team', 'posteam_time', 'defteam_time', 'total_home_score','total_away_score', 'posteam_score_post','defteam_score_post', 'score_differential', "forced_fumble_player_1_team", "forced_fumble_player_2_team", "solo_tackle_1_team", "solo_tackle_2_team", "assist_tackle_1_team", "assist_tackle_2_team", "assist_tackle_3_team", "assist_tackle_4_team", "fumbled_1_team", "fumbled_2_team", "fumble_recovery_1_team", "fumble_recovery_2_team"]

game_info = ['play_id','game_id','home_team','away_team','posteam','posteam_type', 'defteam', 'side_of_field', 'yardline_100','game_date', "year"]


prob_cols = [col for col in df.columns if 'prob' in col]

penalty_info = [col for col in df.columns if 'penalty' in col]


# ask how to add or statement for punts
kickoff_punt_info = [col for col in df.columns if 'kickoff' in col]

kickoff_punt_info += [col for col in df.columns if 'kicker' in col]
kickoff_punt_info += [col for col in df.columns if 'punt' in col]


defensive_info = [col for col in df.columns if 'defensive' in col]

defensive_info += [col for col in df.columns if 'fumble' in col]
defensive_info += [col for col in df.columns if 'sack' in col]
defensive_info += [col for col in df.columns if 'interception' in col]
defensive_info += [col for col in df.columns if 'defense' in col]

epa_info = [col for col in df.columns if 'epa' in col]

wpa_info = [col for col in df.columns if 'wpa' in col]

touchdown_info = [col for col in df.columns if 'touchdown' in col]

timeout_info = [col for col in df.columns if 'timeout' in col]

wp_info = [col for col in df.columns if 'wp' in col]

score_info = [col for col in df.columns if 'score' in col]

tackle_info = [col for col in df.columns if 'tackle' in col]

all_lists = [game_info, team_info, game_time_info ,xp, wpa_info, epa_info, touchdown_info, yard_info, run_play, pass_play, key, prob_cols, penalty_info, kickoff_punt_info, defensive_info, timeout_info, passer_info, receiver_info, rusher_info, score_info, tackle_info]

big_list = [item for elem in all_lists for item in elem]

remainder_list = [col for col in df.columns if col not in big_list]

['passer_player_id',
 'passer_player_name',
 'interception',
 'interception_player_id',
 'interception_player_name',
 'lateral_interception_player_id',
 'lateral_interception_player_name',
 'yards_after_catch',
 'incomplete_pass',
 'complete_pass',
 'touchdown',
 'pass_touchdown',
 'fumble']

## 4. Receiver row creation

In [None]:
'''
For each pass play, creat an additional row for the recepetion
so that there can be two play makers (the passer and the receiver).
'''

In [70]:
new_df = pd.DataFrame()
new_df = df[(df['play_type']=="pass")]

new_df['play_type'].replace({'pass': 'reception'}, inplace=True)

df = pd.concat([df, new_df], sort= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


## 5. Playmaker Column

In [None]:
'''
Step 5 creates a playmaker column, and also checks if each player 
id matchs only one name and corrects those that do not.

Steven's approach starts by establishing a playmaker id and name column and uses these columns to identify 
which ids have more than one name.

Using grouby you can cross check the playmake_id column and the playmaker_name column and isolate that id that have 
more than one name.

'''

#### 5.1 Define Play Maker

In [87]:
# Create play maker column
# you will need to add a receiver segments as well
df['play_maker'] = np.where(df['play_type']=='rush',df['rusher_player_id'], df['play_maker'] )
df['play_maker'] = np.where(df['play_type']=='pass', df['passer_player_id'], df['play_maker'])
df['play_maker'] = np.where(df['play_type']=='reception',df['receiver_player_id'],df['play_maker'])
df['play_maker'] = np.where((df['play_type']=='extra_point')|(df['play_type']=='field_goal'),df['kicker_player_id'],df['play_maker'])



In [88]:
# Add in corresponding play maker ID
df['play_maker_id'] = np.where(df['play_type']=='rush',df['rusher_player_name'], df['play_maker_id'])
df['play_maker_id'] = np.where(df['play_type']=='pass',df['passer_player_name'],df['play_maker_id'])
df['play_maker_id'] = np.where((df['play_type']=='extra_point')|(df['play_type']=='field_goal'),df['kicker_player_name'],df['play_maker_id'])
# Now that we have a single column to identify play makers, it is a lot easier to check for ID uniqueness

#### 5.2 Identify Non Unique Player Names

In [75]:
# Create a data frame that contains the unique count of each player name under a given ID
# Filter on the IDs that correspond to more than one name

nunique_id = df[df.groupby(['play_maker_id'])['play_maker'].transform('nunique') > 1]['play_maker_id'].unique()


In [76]:
# Identify all the duplicate names

df[df['play_maker_id'].isin(nunique_id)]['play_maker'].unique()

array(['00-0030098', '00-0029665', '00-0030564', '00-0029857'],
      dtype=object)

In [74]:
# Create a dictionary of what the corrected names should be

name_corrections = {'D.Chark Jr.': 'D.Chark',
'Jos.Allen':'J.Allen',
'M.Ingram II': 'M.Ingram',
'A.Levine Sr.': 'A.Levine',
'R.Griffin III': 'R.Griffin',
'G.Minshew II':'G.Minshew',
'B.Snell Jr.':'B.Snell', 
'Tr.Edmunds':'T.Edmunds',
'R.James Jr.': 'R.James',
'J.Ross III':'J.Ross',
'W.Snead IV':'W.Snead', 
'M.Jones Jr.': 'M.Jones', 
'M.Sanu Sr.':'M.Sanu', 
'O.Beckham Jr.':'O.Beckham', 
'P.Dorsett II':'P.Dorsett'}

#### S5.3 Correct Name Uniqueness

In [75]:
# Create a function to correct the typos

def typo_correction(name):
    if name in name_corrections.keys():
        return name_corrections[name]
    else:
        return name

In [76]:
# apply the function to the dataframe

df['play_maker'] = df['play_maker'].apply(typo_correction)

#### S5.4 New Attempt

In [77]:
test = df[df['passer_player_id'].isin(nunique_id)].groupby(['passer_player_id','passer_player_name']).size().reset_index()

# creates a dataframe with the ids that are in the nunique_id array (names that are not unique) and groups them by 'passer_player_id' and 'passer_player_name'
# what is the purpose of the .size() and .reset_index()
# .reset_index() adds indicies to the side of the data frame

test.set_index('passer_player_id', inplace=True)

test




Unnamed: 0_level_0,passer_player_name,0
passer_player_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [146]:
# test.head()


array([], dtype=object)

In [65]:

# change to play_maker
d = {player_id:(test.loc[player_id,'passer_player_name'][0],test.loc[player_id,'passer_player_name'][1]) for player_id in nunique_id}


NameError: name 'nunique_id' is not defined

In [79]:
test2 = df.copy()

# creates a copy of df (why is this necessary)

In [80]:
test2.set_index('passer_player_id', inplace=True)

# sets the index on the side as 'passer_player_id'

In [81]:
for dup in d.keys():
    test2.loc[dup,'passer_player_name'] = d[dup][0]

## 6. Relevant Plays

In [82]:
'''
Using a where statement use a binary indicator to Flag relevant plays 
such as passes, receptions, runs, field goals, extra points, and qb_kneels. 
'''

'\nUsing a where statement use a binary indicator to Flag relevant plays \nsuch as passes, receptions, runs, field goals, extra points, and qb_kneels. \n'

In [78]:
df['Flag'] = np.where((df['play_type'] == 'pass') | (df['play_type'] == 'run') | (df['play_type'] == 'field_goal') | (df['play_type'] == 'qb_kneel') | (df['play_type'] == 'extra_point') , 1, 0)


## 7. Extra Columns

In [79]:

# when the pass is dropped there will be no yards after the catch, which indicates that the pass was dropped
df['Target'] = np.where(((df['play_type'] == 'pass') | (df['play_type'] == 'reception')) & (df['yards_after_catch'] != 'nan'), 1, 0)


# if the pass was caught the yards after the catch will be greater than or equal to zero
df['reception'] = np.where(((df['play_type'] == 'pass') | (df['play_type'] == 'reception')) & (df['yards_after_catch'] >= 0), 1, 0)


  result = method(y)


## 8. Player Positions

In [86]:

# use the completion column to filter data in both the pass and reception
pass_play = df[df.play_type == 'pass'][['play_maker', 'yards_gained']]

pass_yards = pass_play.groupby('play_maker', as_index = False).sum()

pass_yards = pass_yards.rename(columns = {'yards_gained': 'pass_yards'})

# pass_yards

reception_play = df[df.play_type == 'reception'][['play_maker', 'yards_gained']]
# change the yardage info that is used 

reception_yards = reception_play.groupby('play_maker', as_index = False).yards_after_catch.sum()

reception_yards = reception_yards.rename(columns = {'yards_after_catch': 'reception_yards'})


# reception_yards

# run_play = df[df.play_type == 'run'][['play_maker', 'yards_gained']]

# run_yards = run_play.groupby('play_maker', as_index = False).yards_gained.sum()

# # run_yards

# do this through their kick attempts

# kick_play = df[df.play_type == 'punt'][['play_maker', 'kick_distance']]

# kick_yards = kick_play.groupby('play_maker', as_index = False).kick_distance.sum()

# kick_yards

# change the nulls to zeros
merged = pass_yards.merge(reception_yards[["play_maker", "reception_yards"]], on = 'play_maker', how = 'outer')

merged['position'] = np.where((merged['pass_yards'] > merged['reception_yards']), 'QB', 'WR/TE')

# merged['pass_yards'][0] > merged['reception_yards'][0]

# merged['position'] = np.where((merged['rush_yards'] > merged['reception_yards']), 'RB', merged['position'])

# merged


AttributeError: 'DataFrameGroupBy' object has no attribute 'yards_after_catch'

In [30]:
# pass_play.columns
# for i in pass_play.columns:
#     if "maker" in i:
#         print(i)

In [153]:
df['play_maker']

KeyError: 'play_maker'

In [154]:

pass_play = df[df.play_type == 'pass']

pass_yards = pass_play.groupby('passer_player_name').yards_gained.sum()

In [161]:
df = df.reset_index()
df["play_maker"]

KeyError: 'play_maker'

In [155]:
pass_yards

passer_player_name
A.Beck           6
A.Dalton      3247
A.Erickson      26
A.Kamara        13
A.Lee           26
              ... 
T.Hill          46
T.Siemian      -14
T.Taylor        33
W.Grier        184
Z.Pascal         0
Name: yards_gained, Length: 110, dtype: int64

In [156]:
run_play = df[df.play_type == 'run']

run_yards = run_play.groupby('rusher_player_name').yards_gained.sum()

run_yards

rusher_player_name
A.Abdullah    115
A.Armah        11
A.Beck          3
A.Brown        65
A.Cooper        6
             ... 
W.Snead IV      0
Z.Jones         3
Z.Line         20
Z.Pascal       16
Z.Zenner       -2
Name: yards_gained, Length: 322, dtype: int64

In [81]:

run_play = df[df.play_type == 'run'][['play_maker', 'yards_gained']]

run_yards = run_play.groupby('play_maker', as_index = False).yards_gained.sum()

run_yards


Unnamed: 0,play_maker,yards_gained


In [90]:
df[df['play_type'] == 'run'][['play_type', 'yards_gained', 'play_maker']].head()
# df['play_type'].unique()

Unnamed: 0,play_type,yards_gained,play_maker
1,run,0,
6,run,5,
8,run,7,
10,run,1,
14,run,0,


In [33]:
# df_file_game = "../1. Additional-Data/2019 stats.xlsx"

xls = pd.ExcelFile("../1. Additional-Data/2019 stats.xlsx")

df_WR = pd.read_excel(xls, 'WR')

df_QB = pd.read_excel(xls, 'QB')

df_RB = pd.read_excel(xls, 'RB')


In [41]:
# df_WR['Player'].apply(lambda x: x.str.slice(0, 20))
# df_WR['Player'] = df_WR['Player'].str.slice(end = 5)

WR_ls = [col[:col.find('*')] for col in df_WR['Player']]
WR_ls = [col[:col.find('\\')] for col in WR_ls]
WR_ls
# df_WR['Player'] = df_WR['Player'][]
# df_WR['Player']

SyntaxError: EOL while scanning string literal (<ipython-input-41-1495cbc8d6a3>, line 5)