<a href="https://colab.research.google.com/github/spyrosviz/Injury_Prediction_MidLong_Distance_Runners/blob/main/Preprocessing/Runners_Injury_Prediction_Preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. PREPROCESSING**
 
 ## We will start by importing the daily excel file, which contains variables from 74 athletes on a daily basis. We will use some of that data to construct 4 new features related to load. In the literature, training load is usually defined as training duration x perceived exertion and has arbitrary units. However, since this dataset contains no information about training duration, we will use total running distance instead to construct the following new features:
* **Total weekly running distance**
* **Acute Load (Seven day average of training loads)**
* **Monotony (Seven day average of training loads / SD of the training loads)**
* **Strain (Square of the seven day average of training loads / SD of the training loads)**

In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Load the daily records; the first spreadsheet column is used as the index
daily_path = r'/content/drive/MyDrive/Runners_Injury_MLproject/MidLongDistanceRunnersInjuryDaily.xlsx'
df_daily = pd.read_excel(daily_path, index_col=0)

# Some cells hold -0.01 where 0 is meant (e.g. perceived exertion on a
# non-training day), so map -0.01 to 0, then order rows by athlete id and date
df_daily = df_daily.replace(-0.01, 0).sort_values(by=['Athlete ID', 'Date'])

# Number of recorded rows per athlete
df_counts = df_daily.groupby('Athlete ID').count()
no_days_per_athl = df_counts['Date'].values

# Show the dates of athlete id = 0. We observe that not all days are sequential
is_athlete_zero = df_daily['Athlete ID'] == 0
print(df_daily.loc[is_athlete_zero, 'Date'].values)

In [None]:
# Write the cleaned, sorted daily data (all columns) to an Excel file
df_daily.to_excel(
    'Daily_Injury_Clean.xlsx',
    columns=df_daily.columns,
)

In [None]:
def loads(idx, no_date, df=None):
  """Compute the weekly load features for one athlete on one date.

  The row matching (`Athlete ID` == idx, `Date` == no_date) holds seven
  'total km' columns and seven 'perceived exertion' columns (the base column
  plus suffixes .1 to .6). Training load is taken as distance x perceived
  exertion for each of the seven entries.

  Args:
    idx: value to match in the 'Athlete ID' column.
    no_date: value to match in the 'Date' column.
    df: dataframe to read from. Defaults to the module-level `df_daily`.

  Returns:
    A list `[total_dist, act_load, monotn, strain]` where
      total_dist = sum of the seven distances,
      act_load   = mean of the seven training loads,
      monotn     = mean / standard deviation of the training loads,
      strain     = mean ** 2 / standard deviation of the training loads.
    `monotn` and `strain` are NaN when the standard deviation is 0.

  Raises:
    IndexError: if no row matches the given athlete id and date.
  """
  if df is None:
    df = df_daily

  dist_cols = ['total km'] + [f'total km.{i}' for i in range(1, 7)]
  exert_cols = ['perceived exertion'] + [f'perceived exertion.{i}' for i in range(1, 7)]

  # Select the matching row(s) once instead of re-filtering for every column
  rows = df[(df['Athlete ID'] == idx) & (df['Date'] == no_date)]
  if rows.empty:
    raise IndexError(f'No row for Athlete ID {idx!r} and Date {no_date!r}')

  # If several rows match, the first one is used
  array_dist = rows[dist_cols].to_numpy(dtype=float)[0]
  array_perc_exert = rows[exert_cols].to_numpy(dtype=float)[0]

  array_train_loads = array_dist * array_perc_exert

  total_dist = np.sum(array_dist)

  act_load = np.mean(array_train_loads)
  load_std = np.std(array_train_loads)

  # A zero standard deviation would divide by zero, so report NaN instead
  if load_std != 0:
    monotn = act_load / load_std
    strain = (act_load ** 2) / load_std
  else:
    monotn = np.nan
    strain = np.nan

  lista = [total_dist, act_load, monotn, strain]

  return lista

In [None]:
# Per-row feature values, collected in the same order as the rows of df_daily
# (the loop walks athlete ids in ascending order and, per athlete, the dates in
# the order they appear in df_daily)
acute_load = []
total_weekly_dist = []
monotony = []
strain = []

num_of_athletes = 74
# NOTE: the loop variables are deliberately kept as `id` and `day`; they are
# notebook-level globals and other cells may read them by these names.
for id in range(num_of_athletes):
  print(f'Athlete ID is {id}.')
  for day in df_daily[df_daily['Athlete ID']==id]['Date'].values:

    # loads() returns [total distance, acute load, monotony, strain];
    # call it once per row and unpack instead of recomputing it four times
    tot_dist, act_load, monotn, strn = loads(id,day)

    total_weekly_dist.append(tot_dist)
    acute_load.append(act_load)
    monotony.append(monotn)
    strain.append(strn)

# Attach the new features positionally as columns
df_daily['Acute Load'] = np.array(acute_load)
df_daily['Total Weekly Distance'] = np.array(total_weekly_dist)
df_daily['Monotony'] = np.array(monotony)
df_daily['Strain'] = np.array(strain)

# Call head() so the first rows are printed rather than the bound method
print(df_daily.head())

Athlete ID is 0.
Athlete ID is 1.
Athlete ID is 2.
Athlete ID is 3.
Athlete ID is 4.
Athlete ID is 5.
Athlete ID is 6.
Athlete ID is 7.
Athlete ID is 8.
Athlete ID is 9.
Athlete ID is 10.
Athlete ID is 11.
Athlete ID is 12.
Athlete ID is 13.
Athlete ID is 14.
Athlete ID is 15.
Athlete ID is 16.
Athlete ID is 17.
Athlete ID is 18.
Athlete ID is 19.
Athlete ID is 20.
Athlete ID is 21.
Athlete ID is 22.
Athlete ID is 23.
Athlete ID is 24.
Athlete ID is 25.
Athlete ID is 26.
Athlete ID is 27.
Athlete ID is 28.
Athlete ID is 29.
Athlete ID is 30.
Athlete ID is 31.
Athlete ID is 32.
Athlete ID is 33.
Athlete ID is 34.
Athlete ID is 35.
Athlete ID is 36.
Athlete ID is 37.
Athlete ID is 38.
Athlete ID is 39.
Athlete ID is 40.
Athlete ID is 41.
Athlete ID is 42.
Athlete ID is 43.
Athlete ID is 44.
Athlete ID is 45.
Athlete ID is 46.
Athlete ID is 47.
Athlete ID is 48.
Athlete ID is 49.
Athlete ID is 50.
Athlete ID is 51.
Athlete ID is 52.
Athlete ID is 53.
Athlete ID is 54.
Athlete ID is 55.
At

In [None]:
# Report the dataframe's shape before any NaN handling
print(f'Original df shape is {df_daily.shape}')

# Count the rows that contain at least one NaN value
rows_with_na = df_daily.isnull().any(axis=1)
print(f'Number of instances with na values is {int(rows_with_na.sum())}')

# Replace every NaN with 0. For monotony and strain a NaN means there were no
# training days in the week prior to the event
df_daily = df_daily.replace(np.nan, 0)

# Report the number of non-injury rows, then the number of injury rows
injury_flag = df_daily['injury']
print(int((injury_flag == 0).sum()))
print(int((injury_flag == 1).sum()))

Original df shape is (42766, 77)
Number of instances with na values is 5164
42183
583


In [None]:
# Save the dataframe, including the new load features, to Excel and display it
output_columns = df_daily.columns
df_daily.to_excel('run_injur_with_acuteloads.xlsx', columns=output_columns)
print(df_daily)

       nr. sessions  total km  ...  Monotony    Strain
0                 1       5.8  ...  0.550653  0.192099
1                 0       0.0  ...  0.529376  0.175753
2                 1       0.0  ...  0.529376  0.175753
3                 0       0.0  ...  0.529376  0.175753
4                 1       0.0  ...  0.744251  0.452930
...             ...       ...  ...       ...       ...
42178             0       0.0  ...  0.832279  2.389117
42179             1       8.0  ...  0.900571  2.713936
42180             1      14.7  ...  0.943715  3.588140
42181             1      12.2  ...  0.845264  3.000806
42182             1       9.3  ...  1.034092  5.335174

[42766 rows x 77 columns]
