In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os      
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read both CSV (comma-separated values) files into pandas DataFrames:
#   df_i - injury entries
#   df_p - player rows (first CSV column is used as the index)
INJURIES_CSV = '/kaggle/input/nba-injuries-2010-2018/injuries_2010-2020.csv'
PLAYERS_CSV = '/kaggle/input/nba-players-data/all_seasons.csv'

df_i = pd.read_csv(INJURIES_CSV)
df_p = pd.read_csv(PLAYERS_CSV, index_col=0)


In [None]:
# Trim the injury table down to the rows and columns this analysis uses.
# - 'Acquired' is not needed, so it is dropped; errors='ignore' lets the cell be
#   re-run after the column is already gone instead of raising KeyError.
# - Rows with a missing 'Relinquished' value are removed ('Relinquished' is the
#   column later matched against player_name).
# - .copy() makes df_i an independent frame, so the column assignments in later
#   cells do not operate on a filtered slice (avoids SettingWithCopyWarning).
df_i = df_i.drop(columns='Acquired', errors='ignore')
df_i = df_i.loc[df_i['Relinquished'].notna()].copy()

In [None]:
# Parse the injury dates once, then record the calendar year and month of each
# entry. Both columns are used to work out which season an injury belongs to.
injury_dates = pd.DatetimeIndex(df_i['Date'])
df_i['year'] = injury_dates.year
df_i['month'] = injury_dates.month

# Preview only: drop() returns a new frame that is displayed as the cell output;
# df_i itself is left unchanged by this line.
df_i.drop(columns='adj_season', errors='ignore')

In [None]:
# adj_season is the starting year of the season an injury belongs to, stored as
# a string. Entries dated before July (month < 7) are assigned to the season
# that started in the previous calendar year; all others keep their own year.
#
# Computed column-wise instead of row by row with iterrows(): the values are the
# same, but it is much faster, the column is overwritten in full on every run
# (so the cell can be re-run safely), and the column exists even if df_i is empty.
before_july = df_i['month'] < 7
season_start_year = df_i['year'].astype(int) - before_july.astype(int)
df_i['adj_season'] = season_start_year.astype(str)
        

In [None]:
# Build the season key on the player table: split the season label on '-' and
# keep only the first piece (i.e. discard the "-<end season>" suffix).
# Both adj_season columns are then cast to str so the merge keys share a type.
season_start = df_p['season'].str.split('-').str[0]
df_p['adj_season'] = season_start.astype(str)
df_i['adj_season'] = df_i['adj_season'].astype(str)


In [None]:
# Attach each player's attributes (height, weight, ...) to the injury entries by
# matching the player name and the season start year. The inner join keeps only
# injury rows that have a matching row in the player table.
df_comb = pd.merge(
    df_i,
    df_p,
    left_on=['Relinquished', 'adj_season'],
    right_on=['player_name', 'adj_season'],
    how='inner',
)

# is_knee_injury holds the position of the substring 'knee' inside Notes, or -1
# when it is absent (the match is case-sensitive).
# str.find() yields NaN for a missing note, and NaN compares as "not equal" to
# -1, so a "!= -1" test would treat rows without any note as knee injuries.
# Map those to -1 so that a missing note never counts as a knee injury.
df_comb['is_knee_injury'] = df_comb['Notes'].str.find('knee').fillna(-1).astype(int)

In [None]:
# Keep only the rows whose notes mention 'knee'. Boolean-mask indexing returns a
# new frame rather than a view of df_comb.
# The test is ">= 0" instead of "!= -1": is_knee_injury can be NaN when Notes is
# missing (str.find propagates missing values), and NaN != -1 evaluates to True,
# which would let rows with no note at all through as knee injuries.
df_final = df_comb[df_comb['is_knee_injury'] >= 0]

In [None]:
# Narrow the knee-injury rows down to the fields of interest. .copy() yields an
# independent frame, so later edits to df_injuries do not touch df_final.
injury_fields = [
    'Date', 'Team', 'player_name', 'Notes', 'age',
    'player_height', 'player_weight', 'gp', 'pts',
]
df_injuries = df_final.loc[:, injury_fields].copy()

In [None]:
# Tag every knee-injury row with a constant injury_count of 1
# (each row stands for one injury entry).
df_injuries = df_injuries.assign(injury_count=1)

In [None]:
# Independent copy holding just the name, height and weight of each
# knee-injury row.
export_fields = ['player_name', 'player_height', 'player_weight']
df_export = df_injuries.loc[:, export_fields].copy()

In [None]:
# Name, height and weight for every player row whose season started in 2010 or
# later; these rows supply the players that have no knee-injury entry.
#
# The season start year is used only as a row filter and is not stored on the
# result. Previously it was added as a helper column and "dropped" with a
# drop() call whose return value was discarded, so adj_season stayed on the
# frame and leaked into the concatenated table and the exported file.
season_start_year = pd.to_numeric(df_p['adj_season'], downcast='integer')
df_final_players = df_p.loc[
    season_start_year >= 2010,
    ['player_name', 'player_height', 'player_weight'],
].copy()

# Preview the first rows as the cell output.
df_final_players.head()

In [None]:
# Stack the 2010-and-later player rows on top of the knee-injury rows so both
# groups live in one frame (original row labels are kept as-is).
export_csv = pd.concat([df_final_players, df_export])

In [None]:
# injury_count = number of knee-injury rows recorded for each player
# (0 for players that have none).
#
# The count is taken from df_export, which holds only knee-injury rows. Counting
# every row of the concatenated frame per name and subtracting 1 is wrong
# whenever a player has more than one non-injury row (e.g. several seasons):
# those extra rows are counted as injuries, while a player who appears only in
# the injury rows is under-counted by one.
knee_rows_per_player = df_export.groupby('player_name').size()

# One row per player (the first occurrence is kept), then look up the count.
# Players missing from the lookup get 0. Re-running the cell gives the same result.
export_csv = export_csv.drop_duplicates(subset='player_name').copy()
export_csv['injury_count'] = (
    export_csv['player_name'].map(knee_rows_per_player).fillna(0).astype(int)
)

In [None]:
# Flag players with at least one counted knee injury.
export_csv['had_KneeInjury'] = export_csv['injury_count'].gt(0)

# Height thresholds: quantile() is called with the list [0.5, 0.75] and returns
# a Series indexed by those levels (0.5 = median, 0.75 = 75th percentile).
quartiles = export_csv['player_height'].quantile([0.5, 0.75])

# Create both height-flag columns with a default of False.
export_csv = export_csv.assign(above_averageHeight=False, above_3quarters=False)


In [None]:
# Fill in the height flags: True when a player's height is strictly greater than
# the threshold. quartiles is a Series keyed by quantile level, so the "average"
# used here is the median (0.5) and the second cut-off is the 75th percentile.
median_height = quartiles.loc[0.5]
upper_quartile_height = quartiles.loc[0.75]
export_csv['above_averageHeight'] = export_csv['player_height'].gt(median_height)
export_csv['above_3quarters'] = export_csv['player_height'].gt(upper_quartile_height)

In [None]:
# Write the per-player table as CSV text into the current working directory.
# Note: the file name carries no '.csv' extension and the row index is written too.
OUTPUT_PATH = 'NBA Injuries Analysis'
export_csv.to_csv(OUTPUT_PATH)