### Read the README.md before running this notebook

In [1]:
import json 
import pandas as pd 
import datetime 

# Import the Files

In [2]:
# Season tables: injury-list entries/exits and the per-game player and team logs.
df_relinquished, df_acquired, df_players, df_teams = map(
    pd.read_csv,
    (
        'data/season_2018_relinquished.csv',
        'data/season_2018_acquired.csv',
        'data/season_2018_players.csv',
        'data/season_2018_teams.csv',
    ),
)

# Keyword lists used further down to classify the free-text notes.
with open('injury_types.json') as injury_types_file:
    injury_types = json.load(injury_types_file)

# Lookup applied to the injury tables' team value before matching it against the team game log.
with open('teams_dict.json') as teams_dict_file:
    teams_dict = json.load(teams_dict_file)


***Relinquished***: When a player is placed on the injury list or misses a game. 
    
    - date of the statement, team, name of the player, notes 

***Acquired***: When a player is no longer on the injury list 

    - Same columns 

***Players***: Each game where a player participated

    - Date, Player, Minutes played 

***Teams***: Each game played by a team 

    - Date, team

***Injury Types***: JSON file with keywords describing the reason for the absence. 
     
     - 3 reasons for absence: **injuries**, illness, personal_reasons
     - For each reason: different detailed keywords 
     - For Injuries: different types are also detailed 

***Teams Dict***: Mapping between team names and their abbreviations



In [3]:
# Parse every date column into Timestamps. The injury tables store dashed
# YYYY-MM-DD dates, while the game logs store compact YYYYMMDD dates.
for frame, column, date_format in (
    (df_relinquished, 'date', '%Y-%m-%d'),
    (df_acquired, 'date', '%Y-%m-%d'),
    (df_players, 'Date', '%Y%m%d'),
    (df_teams, 'Date', '%Y%m%d'),
):
    frame[column] = pd.to_datetime(frame[column], format=date_format)

# Extract Details from Relinquished 

In [4]:
# Work on an independent copy so the columns added later do not touch df_relinquished.
df_injuries = df_relinquished.copy(deep=True)
df_injuries

Unnamed: 0,key,date,team,name,notes
0,WilsonD20181017rel,2018-10-17,Bucks,D.J. Wilson,placed on IL with strained right hamstring
1,DuvalT20181017rel,2018-10-17,Bucks,Trevon Duval,left eye injury (DTD)
2,SmithJ20181017rel,2018-10-17,Cavaliers,J.R. Smith,sore right elbow (DTD)
3,NanceL20181017rel,2018-10-17,Cavaliers,Larry Nance Jr.,placed on IL with sprained right ankle
4,WatanabeY20181017rel,2018-10-17,Grizzlies,Yuta Watanabe,placed on IL
...,...,...,...,...,...
1494,IguodalaA20190520rel,2019-05-20,Warriors,Andre Iguodala,placed on IL with left calf injury
1495,LooneyK20190603rel,2019-06-03,Warriors,Kevon Looney,fractured collarbone (out indefinitely)
1496,LooneyK20190605rel,2019-06-05,Warriors,Kevon Looney,placed on IL with fractured collarbone
1497,ThompsonK20190605rel,2019-06-05,Warriors,Klay Thompson,strained left hamstring (DTD)


Instead of doing basic NLP with word recognition (as some people on Kaggle have already done), I prefer using a list of body parts and injury types. I believe this method can lead to a more precise analysis. 

In [5]:
def _matching_keywords(note, keywords):
    """Return the keywords, in their original order, that occur in `note`.

    Matching is plain substring containment, so a keyword also matches when it
    appears inside a longer word.
    """
    return [keyword for keyword in keywords if keyword in note]


def _classify_note(note, injury_types):
    """Derive ``(reason, type, injury_location)`` from one free-text note.

    Parameters
    ----------
    note : str
        The note to classify. Any non-string value (e.g. a missing cell) is
        treated as an empty note.
    injury_types : mapping
        Must provide the keyword collections 'injuries', 'types',
        'personal_reasons' and 'illness'.

    Returns
    -------
    tuple
        ``reason`` is 'injury', 'personal_reasons' or 'illness'. ``type`` and
        ``injury_location`` are the first matching keyword, or None when
        nothing matched (``injury_location`` is only set for injuries).
    """
    if not isinstance(note, str):
        note = ''

    # A body-part keyword marks the note as an injury.
    locations = _matching_keywords(note, injury_types['injuries'])
    if locations:
        kinds = _matching_keywords(note, injury_types['types'])
        return 'injury', (kinds[0] if kinds else None), locations[0]

    personal = _matching_keywords(note, injury_types['personal_reasons'])
    if personal:
        return 'personal_reasons', personal[0], None

    # NOTE(review): every note that matches neither list above is labelled
    # 'illness', including notes with no reason at all or with a body part
    # missing from the keyword list - confirm this fallback is intended.
    illnesses = _matching_keywords(note, injury_types['illness'])
    return 'illness', (illnesses[0] if illnesses else None), None


reasons = []
types = []
injury_locations = []
for note in df_injuries['notes']:
    reason, type_, injury_location = _classify_note(note, injury_types)
    reasons.append(reason)
    types.append(type_)
    injury_locations.append(injury_location)


In [6]:
# Attach the per-note classification results as new columns, in display order.
for column, values in (
    ('reason', reasons),
    ('type', types),
    ('injury_location', injury_locations),
):
    df_injuries[column] = values
df_injuries

Unnamed: 0,key,date,team,name,notes,reason,type,injury_location
0,WilsonD20181017rel,2018-10-17,Bucks,D.J. Wilson,placed on IL with strained right hamstring,injury,strained,hamstring
1,DuvalT20181017rel,2018-10-17,Bucks,Trevon Duval,left eye injury (DTD),injury,injury,eye
2,SmithJ20181017rel,2018-10-17,Cavaliers,J.R. Smith,sore right elbow (DTD),injury,sore,elbow
3,NanceL20181017rel,2018-10-17,Cavaliers,Larry Nance Jr.,placed on IL with sprained right ankle,injury,sprained,ankle
4,WatanabeY20181017rel,2018-10-17,Grizzlies,Yuta Watanabe,placed on IL,illness,,
...,...,...,...,...,...,...,...,...
1494,IguodalaA20190520rel,2019-05-20,Warriors,Andre Iguodala,placed on IL with left calf injury,injury,injury,calf
1495,LooneyK20190603rel,2019-06-03,Warriors,Kevon Looney,fractured collarbone (out indefinitely),illness,,
1496,LooneyK20190605rel,2019-06-05,Warriors,Kevon Looney,placed on IL with fractured collarbone,illness,,
1497,ThompsonK20190605rel,2019-06-05,Warriors,Klay Thompson,strained left hamstring (DTD),injury,strained,hamstring


# Merge with Acquired 

In [7]:
# For every injury statement, estimate when the player came back and how many
# days / team games were missed. The three lists are aligned with df_injuries rows.
missed_games = []
missed_days = []
return_dates = []

for name, team, statement_date in zip(
    df_injuries['name'], df_injuries['team'], df_injuries['date']
):
    # Dates on which this player shows up in the acquired table after the statement.
    acquired_dates = df_acquired.loc[
        (df_acquired['name'] == name) & (df_acquired['date'] > statement_date),
        'date',
    ]

    # Dates of the games logged for this player after the statement.
    # NOTE(review): the player's name is compared with the 'Team' column of
    # df_players - confirm that this column really holds player names.
    played_dates = df_players.loc[
        (df_players['Team'] == name) & (df_players['Date'] > statement_date),
        'Date',
    ]

    if played_dates.empty:
        # Temporary: some names differ between the injury data and the game
        # data, so no later game can be found for them.
        # TODO: tell apart players who are still absent or retired.
        missed_games.append(None)
        missed_days.append(None)
        return_dates.append(None)
        continue

    # The return date is the earlier of the first acquired entry (when there
    # is one) and the next game actually played. min() is used instead of the
    # first row so the result does not depend on the tables being date-sorted.
    return_date = played_dates.min()
    if not acquired_dates.empty:
        return_date = min(return_date, acquired_dates.min())

    # Translate the injury-table team value into the one used by df_teams.
    team_code = teams_dict[team]
    # Team games strictly between the statement and the return are the missed ones.
    games_between = (
        (df_teams['Team'] == team_code)
        & (df_teams['Date'] > statement_date)
        & (df_teams['Date'] < return_date)
    )

    return_dates.append(return_date)
    missed_days.append((return_date - statement_date).days)
    missed_games.append(int(games_between.sum()))



In [8]:
# Attach the absence metrics computed per statement as new columns, in display order.
for column, values in (
    ('missed_days', missed_days),
    ('missed_games', missed_games),
    ('return_date', return_dates),
):
    df_injuries[column] = values
df_injuries

Unnamed: 0,key,date,team,name,notes,reason,type,injury_location,missed_days,missed_games,return_date
0,WilsonD20181017rel,2018-10-17,Bucks,D.J. Wilson,placed on IL with strained right hamstring,injury,strained,hamstring,49.0,21.0,2018-12-05
1,DuvalT20181017rel,2018-10-17,Bucks,Trevon Duval,left eye injury (DTD),injury,injury,eye,114.0,52.0,2019-02-08
2,SmithJ20181017rel,2018-10-17,Cavaliers,J.R. Smith,sore right elbow (DTD),injury,sore,elbow,2.0,0.0,2018-10-19
3,NanceL20181017rel,2018-10-17,Cavaliers,Larry Nance Jr.,placed on IL with sprained right ankle,injury,sprained,ankle,,,NaT
4,WatanabeY20181017rel,2018-10-17,Grizzlies,Yuta Watanabe,placed on IL,illness,,,10.0,3.0,2018-10-27
...,...,...,...,...,...,...,...,...,...,...,...
1494,IguodalaA20190520rel,2019-05-20,Warriors,Andre Iguodala,placed on IL with left calf injury,injury,injury,calf,10.0,0.0,2019-05-30
1495,LooneyK20190603rel,2019-06-03,Warriors,Kevon Looney,fractured collarbone (out indefinitely),illness,,,4.0,1.0,2019-06-07
1496,LooneyK20190605rel,2019-06-05,Warriors,Kevon Looney,placed on IL with fractured collarbone,illness,,,2.0,0.0,2019-06-07
1497,ThompsonK20190605rel,2019-06-05,Warriors,Klay Thompson,strained left hamstring (DTD),injury,strained,hamstring,2.0,0.0,2019-06-07


# Analysis
NOTE: Some duplicates can be found in the dataset. I'll check these in a later version.

In [9]:
# Keep only the absences whose end could be determined (missed_days is set).
has_known_end = df_injuries['missed_days'].notna()
df_analysis = df_injuries[has_known_end]
df_analysis

Unnamed: 0,key,date,team,name,notes,reason,type,injury_location,missed_days,missed_games,return_date
0,WilsonD20181017rel,2018-10-17,Bucks,D.J. Wilson,placed on IL with strained right hamstring,injury,strained,hamstring,49.0,21.0,2018-12-05
1,DuvalT20181017rel,2018-10-17,Bucks,Trevon Duval,left eye injury (DTD),injury,injury,eye,114.0,52.0,2019-02-08
2,SmithJ20181017rel,2018-10-17,Cavaliers,J.R. Smith,sore right elbow (DTD),injury,sore,elbow,2.0,0.0,2018-10-19
4,WatanabeY20181017rel,2018-10-17,Grizzlies,Yuta Watanabe,placed on IL,illness,,,10.0,3.0,2018-10-27
5,HamiltonD20181017rel,2018-10-17,Hawks,Daniel Hamilton,torn rotator cuff in shoulder (out indefinitely),injury,torn,shoulder,14.0,6.0,2018-10-31
...,...,...,...,...,...,...,...,...,...,...,...
1493,WilsonD20190519rel,2019-05-19,Bucks,D.J. Wilson,sore left ankle (DTD),injury,sore,ankle,2.0,0.0,2019-05-21
1494,IguodalaA20190520rel,2019-05-20,Warriors,Andre Iguodala,placed on IL with left calf injury,injury,injury,calf,10.0,0.0,2019-05-30
1495,LooneyK20190603rel,2019-06-03,Warriors,Kevon Looney,fractured collarbone (out indefinitely),illness,,,4.0,1.0,2019-06-07
1496,LooneyK20190605rel,2019-06-05,Warriors,Kevon Looney,placed on IL with fractured collarbone,illness,,,2.0,0.0,2019-06-07


In [10]:
# Number of absences per reason, most frequent first (missing values are not counted).
df_analysis['reason'].value_counts(dropna=True)

injury              786
illness             314
personal_reasons     70
Name: reason, dtype: int64

The large majority of absences are due to injuries.

In [11]:
# Restrict the analysis to the absences classified as injuries.
is_injury = df_analysis['reason'].eq('injury')
injury_analysis = df_analysis.loc[is_injury]
injury_analysis

Unnamed: 0,key,date,team,name,notes,reason,type,injury_location,missed_days,missed_games,return_date
0,WilsonD20181017rel,2018-10-17,Bucks,D.J. Wilson,placed on IL with strained right hamstring,injury,strained,hamstring,49.0,21.0,2018-12-05
1,DuvalT20181017rel,2018-10-17,Bucks,Trevon Duval,left eye injury (DTD),injury,injury,eye,114.0,52.0,2019-02-08
2,SmithJ20181017rel,2018-10-17,Cavaliers,J.R. Smith,sore right elbow (DTD),injury,sore,elbow,2.0,0.0,2018-10-19
5,HamiltonD20181017rel,2018-10-17,Hawks,Daniel Hamilton,torn rotator cuff in shoulder (out indefinitely),injury,torn,shoulder,14.0,6.0,2018-10-31
6,DedmonD20181017rel,2018-10-17,Hawks,Dewayne Dedmon,placed on IL with left ankle injury,injury,injury,ankle,7.0,2.0,2018-10-24
...,...,...,...,...,...,...,...,...,...,...,...
1489,DurantK20190509rel,2019-05-09,Warriors,Kevin Durant,strained right calf (DTD),injury,strained,calf,32.0,9.0,2019-06-10
1491,DurantK20190510rel,2019-05-10,Warriors,Kevin Durant,placed on IL with strained right calf,injury,strained,calf,31.0,8.0,2019-06-10
1493,WilsonD20190519rel,2019-05-19,Bucks,D.J. Wilson,sore left ankle (DTD),injury,sore,ankle,2.0,0.0,2019-05-21
1494,IguodalaA20190520rel,2019-05-20,Warriors,Andre Iguodala,placed on IL with left calf injury,injury,injury,calf,10.0,0.0,2019-05-30


In [12]:
# Number of injuries per body part, most frequent first (missing values are not counted).
injury_analysis['injury_location'].value_counts(dropna=True)

ankle         172
knee          165
back           64
hamstring      51
foot           42
hip            39
quadricep      27
concussion     23
toe            22
shoulder       21
groin          17
calf           15
thumb          12
neck           12
Achilles       11
wrist          11
finger         10
adductor       10
heel           10
hand           10
thigh           8
elbow           8
head            6
leg             4
eye             3
nose            3
abdominal       2
oblique         2
pelvis          2
shin            1
thoracic        1
mouth           1
tailbone        1
Name: injury_location, dtype: int64

As expected, the ankle and knee are responsible for a large part of all injuries. 

## Ankle Analysis 

In [13]:
# Narrow the injuries down to those located at the ankle.
is_ankle = injury_analysis['injury_location'].eq('ankle')
ankle_analysis = injury_analysis.loc[is_ankle]
ankle_analysis

Unnamed: 0,key,date,team,name,notes,reason,type,injury_location,missed_days,missed_games,return_date
6,DedmonD20181017rel,2018-10-17,Hawks,Dewayne Dedmon,placed on IL with left ankle injury,injury,injury,ankle,7.0,2.0,2018-10-24
8,WaitersD20181017rel,2018-10-17,Heat,Dion Waiters,placed on IL with left ankle injury,injury,injury,ankle,77.0,34.0,2019-01-02
11,EllingtonW20181017rel,2018-10-17,Heat,Wayne Ellington,sore left ankle (DTD),injury,sore,ankle,10.0,3.0,2018-10-27
17,MudiayE20181017rel,2018-10-17,Knicks,Emmanuel Mudiay,placed on IL with sprained right ankle,injury,sprained,ankle,12.0,5.0,2018-10-29
18,HicksI20181017rel,2018-10-17,Knicks,Isaiah Hicks,placed on IL with sprained left ankle,injury,sprained,ankle,107.0,49.0,2019-02-01
...,...,...,...,...,...,...,...,...,...,...,...
1444,GobertR20190410rel,2019-04-10,Jazz,Rudy Gobert,placed on IL with sore left ankle (out for se...,injury,sore,ankle,4.0,0.0,2019-04-14
1452,TurnerM20190410rel,2019-04-10,Pacers,Myles Turner,placed on IL with sore right ankle,injury,sore,ankle,3.0,0.0,2019-04-13
1462,CurryS20190410rel,2019-04-10,Warriors,Stephen Curry,placed on IL with sprained right ankle,injury,sprained,ankle,2.0,0.0,2019-04-12
1487,WilsonD20190508rel,2019-05-08,Bucks,D.J. Wilson,sprained left ankle (DTD),injury,sprained,ankle,9.0,1.0,2019-05-17


In [14]:
# Number of ankle injuries per injury type, most frequent first (missing values are not counted).
ankle_analysis['type'].value_counts(dropna=True)

sprained    116
sore         33
injury       21
surgery       2
Name: type, dtype: int64

In [15]:
# Distinct ankle injury types, in order of first appearance.
ankle_injury = ankle_analysis['type'].unique().tolist()

In [16]:
# Print the mean number of missed days for each ankle injury type.
# The result is stored under its own name so that it neither mislabels days as
# games nor overwrites the per-row `missed_games` list built earlier.
for injury in ankle_injury:
    mean_missed_days = ankle_analysis.loc[
        ankle_analysis['type'] == injury, 'missed_days'
    ].mean()
    print(injury, "%.2f" % mean_missed_days)

injury 22.29
sore 4.36
sprained 10.70
surgery 40.00


The average number of missed days for each ankle injury type.