In [26]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two raw inputs produced by the data-collection step:
# the injury reports and the team rosters. Paths are relative to this notebook.
injuries_df = pd.read_csv('../1_DataCollection/src/injuries.csv')
rosters_df = pd.read_csv('../1_DataCollection/src/team_rosters.csv')

In [27]:
from datetime import datetime

# Attach roster attributes to every injury report with a left join.
# The join key is the player's display name (`full_name` in the injuries data,
# `player_name` in the rosters data) -- not an id column.
# NOTE(review): the preview printed for this cell shows the same injury report
# repeated with different roster `age` values, which suggests several roster rows
# match each name. If so, the duplicates inflate `previous_injuries_count` and
# produce 0-day gaps below; consider de-duplicating the rosters or joining on a
# player id plus season once the roster schema is confirmed.
merged_df = injuries_df.merge(rosters_df, left_on='full_name', right_on='player_name', how='left')

# Parse the date columns so the .dt accessors below are available
merged_df['date_modified'] = pd.to_datetime(merged_df['date_modified'])
merged_df['birth_date'] = pd.to_datetime(merged_df['birth_date'])

# Age in completed years on the report date (`date_modified`).
# A plain difference of calendar years over-counts by one whenever the report
# precedes that year's birthday, so subtract one for exactly those rows.
# Comparisons against a missing date are False, and the year difference is NaN
# for such rows, so rows without a birth date stay NaN.
report_dates = merged_df['date_modified']
birth_dates = merged_df['birth_date']
birthday_pending = (
    (report_dates.dt.month < birth_dates.dt.month)
    | (
        (report_dates.dt.month == birth_dates.dt.month)
        & (report_dates.dt.day < birth_dates.dt.day)
    )
)
merged_df['age_at_injury'] = (
    report_dates.dt.year - birth_dates.dt.year - birthday_pending.astype(int)
)

# Coarse body-region label for the primary injury.
# This is a placeholder mapping: every label that is not listed here
# (including missing values) is categorised as 'Other'.
body_region_by_injury = {
    'Shoulder': 'Upper Body',
    'Arm': 'Upper Body',
    'Leg': 'Lower Body',
    'Knee': 'Lower Body',
}
merged_df['injury_category'] = (
    merged_df['report_primary_injury'].map(body_region_by_injury).fillna('Other')
)

# Order each player's rows chronologically so the per-player diff and counter
# below run in time order
merged_df = merged_df.sort_values(by=['full_name', 'date_modified'])

# Whole days since the same player's previous row. A player's first row has no
# predecessor; its missing value is filled with 0, so 0 means either
# "first row for this player" or "less than a day since the previous row".
merged_df['days_since_last_injury'] = (
    merged_df.groupby('full_name')['date_modified'].diff().dt.days.fillna(0)
)

# Number of earlier rows for the same player (0 for the player's first row)
merged_df['previous_injuries_count'] = merged_df.groupby('full_name').cumcount()

# Displaying the first few rows of the merged and modified dataset
merged_df.head()

Unnamed: 0,season_x,game_type_x,team_x,week_x,gsis_id,position_x,full_name,first_name_x,last_name_x,report_primary_injury,...,smart_id,entry_year,rookie_year,draft_club,draft_number,age,age_at_injury,injury_category,days_since_last_injury,previous_injuries_count
55246,2022,REG,LA,11,00-0032889,DT,A'Shawn Robinson,A'Shawn,Robinson,Illness,...,3200524f-4236-7960-bf20-bc060ac0f49c,2016.0,2016.0,DET,46.0,27.688,27.0,Other,0.0,0
55247,2022,REG,LA,11,00-0032889,DT,A'Shawn Robinson,A'Shawn,Robinson,Illness,...,3200524f-4236-7960-bf20-bc060ac0f49c,2016.0,2016.0,DET,46.0,27.65,27.0,Other,0.0,1
55248,2022,REG,LA,11,00-0032889,DT,A'Shawn Robinson,A'Shawn,Robinson,Illness,...,3200524f-4236-7960-bf20-bc060ac0f49c,2016.0,2016.0,DET,46.0,27.765,27.0,Other,0.0,2
55249,2022,REG,LA,11,00-0032889,DT,A'Shawn Robinson,A'Shawn,Robinson,Illness,...,3200524f-4236-7960-bf20-bc060ac0f49c,2016.0,2016.0,DET,46.0,27.803,27.0,Other,0.0,3
55250,2022,REG,LA,11,00-0032889,DT,A'Shawn Robinson,A'Shawn,Robinson,Illness,...,3200524f-4236-7960-bf20-bc060ac0f49c,2016.0,2016.0,DET,46.0,27.748,27.0,Other,0.0,4


In [28]:
# Narrow the merged frame to the identifying, physical and injury-related
# columns, in the order they should appear.
feature_columns = [
    # player / context
    'full_name', 'gsis_id', 'season_x', 'team_x', 'position_x', 'status',
    # physical attributes and experience
    'height', 'weight', 'birth_date', 'age_at_injury', 'years_exp',
    # injury description
    'injury_category',
    'report_primary_injury', 'report_secondary_injury',
    'practice_primary_injury', 'practice_secondary_injury',
    'report_status', 'practice_status',
    # engineered history features
    'days_since_last_injury', 'previous_injuries_count',
]
merged_df = merged_df[feature_columns]

merged_df.head()

Unnamed: 0,full_name,gsis_id,season_x,team_x,position_x,status,height,weight,birth_date,age_at_injury,years_exp,injury_category,report_primary_injury,report_secondary_injury,practice_primary_injury,practice_secondary_injury,report_status,practice_status,days_since_last_injury,previous_injuries_count
55246,A'Shawn Robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,0
55247,A'Shawn Robinson,00-0032889,2022,LA,DT,ACT,76.0,330.0,1995-03-21,27.0,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,1
55248,A'Shawn Robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,2
55249,A'Shawn Robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,3
55250,A'Shawn Robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,4


In [29]:

# Load the scraped historical injury reports and align their column names with
# the ones used in `merged_df`. Built as a single chain so no step mutates a
# filtered slice in place (the in-place rename / column assignment on a filtered
# frame can trigger SettingWithCopyWarning on pandas versions without
# copy-on-write).
past_injuries = (
    # The first CSV column is used as the index
    pd.read_csv('../data/cleaned_scraped_data.csv', index_col=0)
    # Discard rows that have no recorded injury
    .dropna(subset=['Injury'])
    .rename(columns={
        'Player': 'full_name',
        'Position': 'position_x',
        'Injury': 'report_primary_injury',
        'Game Status': 'practice_status',
        'Game Type': 'report_status',
    })
    # Non-numeric season values become NaN instead of raising
    .assign(season=lambda frame: pd.to_numeric(frame['season'], errors='coerce'))
)
past_injuries.head()

Unnamed: 0,full_name,position_x,report_primary_injury,practice_status,report_status,game_type,season
1,Brandin Cooks,WR,Quadricep,Limited Participation in Practice,Questionable,REG,2020
3,Cullen Gillaspia,RB,Hamstring,Limited Participation in Practice,Questionable,REG,2020
4,Jonathan Greenard,LB,Ankle,Limited Participation in Practice,Questionable,REG,2020
15,Derek Barnett,DE,Hamstring,Limited Participation in Practice,Out,REG,2020
17,Javon Hargrave,DT,"Pectoral, Hamstring",Did Not Participate In Practice,Out,REG,2020


In [30]:
# Replace the single `season` column with two columns:
#   season_21 -> 2021 on rows whose season is 2021, NaN otherwise
#   season_20 -> the original season on rows whose season is NOT 2021, NaN otherwise
#                (i.e. every season other than 2021, not only 2020)
past_injuries = (
    past_injuries
    .assign(
        season_21=lambda frame: frame['season'].where(frame['season'] == 2021),
        season_20=lambda frame: frame['season'].where(frame['season'] != 2021),
    )
    .drop(columns=['season'])
)

past_injuries.head()

Unnamed: 0,full_name,position_x,report_primary_injury,practice_status,report_status,game_type,season_21,season_20
1,Brandin Cooks,WR,Quadricep,Limited Participation in Practice,Questionable,REG,,2020.0
3,Cullen Gillaspia,RB,Hamstring,Limited Participation in Practice,Questionable,REG,,2020.0
4,Jonathan Greenard,LB,Ankle,Limited Participation in Practice,Questionable,REG,,2020.0
15,Derek Barnett,DE,Hamstring,Limited Participation in Practice,Out,REG,,2020.0
17,Javon Hargrave,DT,"Pectoral, Hamstring",Did Not Participate In Practice,Out,REG,,2020.0


In [31]:
# Inspect the column dtypes after the season split
past_injuries.dtypes

full_name                 object
position_x                object
report_primary_injury     object
practice_status           object
report_status             object
game_type                 object
season_21                float64
season_20                float64
dtype: object

In [32]:
def _normalize_names(names):
    """Trim surrounding whitespace and lower-case a Series of player names."""
    return names.str.strip().str.lower()


# Bring the join key into the same format on both sides
past_injuries['full_name'] = _normalize_names(past_injuries['full_name'])
merged_df['full_name'] = _normalize_names(merged_df['full_name'])

# Left-join the historical reports onto the current data by player name only
# (season is not part of the key). Columns present on both sides receive the
# default _x / _y suffixes.
# NOTE(review): both frames can hold several rows per player, so this join may
# pair every current row with every historical row of that player -- confirm
# that this fan-out is intended.
final_merged_df = merged_df.merge(past_injuries, on='full_name', how='left')

final_merged_df.head()

Unnamed: 0,full_name,gsis_id,season_x,team_x,position_x_x,status,height,weight,birth_date,age_at_injury,...,practice_status_x,days_since_last_injury,previous_injuries_count,position_x_y,report_primary_injury_y,practice_status_y,report_status_y,game_type,season_21,season_20
0,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Did Not Participate In Practice,0.0,0,,,,,,,
1,a'shawn robinson,00-0032889,2022,LA,DT,ACT,76.0,330.0,1995-03-21,27.0,...,Did Not Participate In Practice,0.0,1,,,,,,,
2,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Did Not Participate In Practice,0.0,2,,,,,,,
3,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Did Not Participate In Practice,0.0,3,,,,,,,
4,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Did Not Participate In Practice,0.0,4,,,,,,,


In [33]:

def clean_data(final_merged_df):
    """Return the merged frame without its redundant merge columns.

    Drops 'game_type', 'report_status_y', 'practice_status_y',
    'report_primary_injury_y' and 'position_x_y'. The input frame is left
    unmodified; a new DataFrame is returned.

    Raises:
        KeyError: if any of the columns to drop is absent from the frame.
    """
    redundant_columns = [
        'game_type',
        'report_status_y',
        'practice_status_y',
        'report_primary_injury_y',
        'position_x_y',
    ]
    return final_merged_df.drop(columns=redundant_columns)

# clean_data only calls DataFrame.drop without inplace, which returns a new
# frame and leaves its input untouched, so no defensive copy is needed here.
# NOTE(review): `merged_clean` is only inspected in the cells visible here; the
# later column selection and CSV export read from `final_merged_df` instead.
merged_clean = clean_data(final_merged_df)
merged_clean.head()

Unnamed: 0,full_name,gsis_id,season_x,team_x,position_x_x,status,height,weight,birth_date,age_at_injury,...,report_primary_injury_x,report_secondary_injury,practice_primary_injury,practice_secondary_injury,report_status_x,practice_status_x,days_since_last_injury,previous_injuries_count,season_21,season_20
0,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,0,,
1,a'shawn robinson,00-0032889,2022,LA,DT,ACT,76.0,330.0,1995-03-21,27.0,...,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,1,,
2,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,2,,
3,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,3,,
4,a'shawn robinson,00-0032889,2022,LA,DT,RES,76.0,330.0,1995-03-21,27.0,...,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,4,,


In [34]:
# List the columns that remain after clean_data
merged_clean.columns


Index(['full_name', 'gsis_id', 'season_x', 'team_x', 'position_x_x', 'status',
       'height', 'weight', 'birth_date', 'age_at_injury', 'years_exp',
       'injury_category', 'report_primary_injury_x', 'report_secondary_injury',
       'practice_primary_injury', 'practice_secondary_injury',
       'report_status_x', 'practice_status_x', 'days_since_last_injury',
       'previous_injuries_count', 'season_21', 'season_20'],
      dtype='object')

In [35]:
# Keep and reorder the columns for the exported dataset. Selecting by explicit
# name also leaves out every column that is not listed here.
output_columns = [
    # player and season context
    'full_name', 'gsis_id', 'season_20', 'season_21', 'season_x', 'team_x',
    'position_x_x', 'status',
    # physical attributes and experience
    'height', 'weight', 'birth_date', 'age_at_injury', 'years_exp',
    # injury description
    'injury_category', 'report_primary_injury_x', 'report_secondary_injury',
    'practice_primary_injury', 'practice_secondary_injury',
    'report_status_x', 'practice_status_x',
    # engineered history features
    'days_since_last_injury', 'previous_injuries_count',
]
final_merged_df = final_merged_df[output_columns]

In [36]:
# Write the merged dataset to disk. No `index` argument is passed, so the
# DataFrame index is written as the first (unnamed) CSV column.
# NOTE(review): assumes ./cleaned_data/ already exists -- nothing in the cells
# visible here creates it.
final_merged_df.to_csv('./cleaned_data/clean_merged_data.csv') 
final_merged_df.head()


Unnamed: 0,full_name,gsis_id,season_20,season_21,season_x,team_x,position_x_x,status,height,weight,...,years_exp,injury_category,report_primary_injury_x,report_secondary_injury,practice_primary_injury,practice_secondary_injury,report_status_x,practice_status_x,days_since_last_injury,previous_injuries_count
0,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,0
1,a'shawn robinson,00-0032889,,,2022,LA,DT,ACT,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,1
2,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,2
3,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,3
4,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,4


In [37]:
# 1. Creating a Column for Each Injury Type
# Build a player x injury-label table: one row per `full_name`, one column per
# distinct `report_primary_injury_x` value. aggfunc='size' counts the rows of
# `final_merged_df` for each (player, label) pair; pairs that never occur get 0.
# REF: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html
# NOTE(review): labels are used verbatim, so side-specific labels such as
# 'right Knee' and combined labels such as 'toe, pec, knee, hip' become their own
# columns (see the printed header). The counts are row counts of
# `final_merged_df`, so any row duplication introduced by the earlier joins is
# included -- confirm before treating them as numbers of distinct injuries.
injury_counts = final_merged_df.pivot_table(index='full_name', columns='report_primary_injury_x', aggfunc='size', fill_value=0)
# Report (number of players, number of injury labels)
print(injury_counts.shape)
# Persist the count table; the `full_name` index is written as the first column
injury_counts.to_csv('./cleaned_data/injury_counts.csv')
injury_counts.head()

(1031, 65)


report_primary_injury_x,Abdomen,Achilles,Ankle,Appendicitis,Appendix,Back,Biceps,Calf,Chest,Collarbone,...,right Elbow,right Finger,right Groin,right Hamstring,right Hip,right Knee,right Quadricep,right Shoulder,right Thumb,"toe, pec, knee, hip"
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a'shawn robinson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a.j. cann,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a.j. epenesa,0,0,38,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a.j. green,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a.j. parker,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:

# Preview the exported frame again
final_merged_df.head()

Unnamed: 0,full_name,gsis_id,season_20,season_21,season_x,team_x,position_x_x,status,height,weight,...,years_exp,injury_category,report_primary_injury_x,report_secondary_injury,practice_primary_injury,practice_secondary_injury,report_status_x,practice_status_x,days_since_last_injury,previous_injuries_count
0,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,0
1,a'shawn robinson,00-0032889,,,2022,LA,DT,ACT,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,1
2,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,2
3,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,3
4,a'shawn robinson,00-0032889,,,2022,LA,DT,RES,76.0,330.0,...,6.0,Other,Illness,,Illness,,Questionable,Did Not Participate In Practice,0.0,4
