In [78]:
!pip install imbalanced-learn scikit-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [79]:
# importing libraries here to better keep track of them

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Reading csv files into memory

injury_history_raw = pd.read_csv("data/injury_history(injury_history).csv", sep = ",", encoding = 'ISO-8859-1')
muscle_imbalance_raw = pd.read_csv("data/injury_history(muscle_imbalance_data).csv", sep = ",", encoding = 'ISO-8859-1')
player_sessions_raw = pd.read_csv("data/injury_history(player_sessions).csv", sep = ",", encoding = 'ISO-8859-1')

In [112]:
injury_history_raw['Injury Date'] = pd.to_datetime(injury_history_raw['Injury Date'], errors = 'coerce')
injury_history_raw['Month'] = injury_history_raw['Injury Date'].dt.month
injury_history_raw['Year'] = injury_history_raw['Injury Date'].dt.year

player_sessions_raw['Month'] = pd.to_datetime(player_sessions_raw['Session_Date']).dt.month
player_sessions_raw['Year'] = pd.to_datetime(player_sessions_raw['Session_Date']).dt.year

muscle_imbalance_raw['Month'] = pd.to_datetime(muscle_imbalance_raw['Date Recorded']).dt.month
muscle_imbalance_raw['Year'] = pd.to_datetime(muscle_imbalance_raw['Date Recorded']).dt.year

In [113]:
injury_history_raw[['Player.ID', 'Name', 'Injury Date', 'Month','Year']].value_counts()

Player.ID  Name              Injury Date  Month  Year
101        Jordan Matthews   2023-07-22   7      2023    1
                             2023-10-25   10     2023    1
                             2023-12-05   12     2023    1
103        Malik Robinson    2023-02-14   2      2023    1
                             2023-06-28   6      2023    1
                             2023-09-27   9      2023    1
105        Noah Bradley      2023-01-13   1      2023    1
                             2023-09-20   9      2023    1
                             2023-12-19   12     2023    1
106        Lennon Van        2024-01-01   1      2024    1
107        Cameron Howard    2023-12-21   12     2023    1
109        Miles Richardson  2023-07-05   7      2023    1
110        Kyle Saunders     2023-10-09   10     2023    1
112        Anthony Lopez     2023-01-26   1      2023    1
                             2023-07-15   7      2023    1
                             2023-11-18   11     2023    1
11

In [9]:
print(f"Raw Injury Data: \n {injury_history_raw.head()} \n")
print(f"Raw Muscle Imbalance Data: \n {muscle_imbalance_raw.head()} \n")
print(f"Raw Player Session Data: \n {player_sessions_raw.head()} \n")

Raw Injury Data: 
    Player.ID             Name  Group.Id    Injury Type   Body Part   Side  \
0        101  Jordan Matthews       201  Muscle Strain  Quadriceps  Right   
1        101  Jordan Matthews       201     Tendonitis       Wrist   Left   
2        101  Jordan Matthews       201     Tendonitis    Shoulder  Right   
3        103   Malik Robinson       203         Strain       Groin  Right   
4        103   Malik Robinson       203       Fracture       Wrist   Left   

  Injury Date Severity  Recovery Time (days)  \
0   12/5/2023  Grade 2                    51   
1  10/25/2023      NaN                    11   
2   7/22/2023      NaN                    12   
3   6/28/2023  Grade 1                    20   
4   2/14/2023      NaN                    68   

                                    Additional Notes  Month  
0  Grade 2 quadriceps strain with partial tearing...     12  
1  De Quervain's tenosynovitis. Swelling and pain...     10  
2  Rotator cuff tendonitis due to overuse. 

In [10]:
print(f'Columns present in injury data: \n {injury_history_raw.columns} \n')
print(f'Columns present in muscle imbalance data: \n {muscle_imbalance_raw.columns} \n')
print(f'Columns present in player session data: \n {player_sessions_raw.columns} \n')

Columns present in injury data: 
 Index(['Player.ID', 'Name', 'Group.Id', 'Injury Type', 'Body Part', 'Side',
       'Injury Date', 'Severity', 'Recovery Time (days)', 'Additional Notes',
       'Month'],
      dtype='object') 

Columns present in muscle imbalance data: 
 Index(['Player.ID', 'Session ID', 'Player Name', 'Date Recorded',
       'Hamstring To Quad Ratio', 'Quad Imbalance Percent',
       'HamstringImbalance Percent', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Month'],
      dtype='object') 

Columns present in player session data: 
 Index(['Name', 'Player.ID', 'Group.Id', 'Group.name', 'League.ID',
       'Session.ID', 'Session_Date', 'Position', 'Distance..mi.',
       'Distance...min..mi.', 'Duration..s.', 'Steps', 'Speed....of.max......',
       'Speed..max....mph.', 'Speed..?ò...mph.', 'Time..s.',
       'Accumulated.Acceleration.Load', 'Anaerobic.Activity..distance...mi.',
       'Jump.Load..J.', 'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',


## Player Session Data column research

#### TRIMP - A way to measure Training Impulse (Aerobic Training Load). It is a product of Training volume (minutes) and Training intensity (beats per minutes). 

TRIMP = Training VOlume * Training Intensity. For example if a workout is 50 minutes and the average heart rate (bpm) is 140 bpm, TRIMP score is calculated out to 7000. TRIMP is affected by a number of factors but we can look for correlation on a correlation matrix as well. 
    - Physical Settings
    - Resting and Maximal Heart Rate
    - Gender (since its mbb there's not going to be any variance with this feature)

TRIMP can be used to compare sessions of different lengths, or to compare high-intensity sessions with longer game data. For example if a session with TRIMP score of 108 might be considered moderate, the TRIMP/min score could indicate an intense session if Training Volume is lower. Training Stress Score (TSS) is a proprietary variant based on a score of 100 for a 1-hour maximum sustained effort. 
Positive correlation between Heart Rate and Effort

---

In [11]:
print(f'Size of injury history data: \n {injury_history_raw.shape}\n')
print(f'Size of muscle imbalance data: \n {muscle_imbalance_raw.shape}\n')
print(f'Size of player session data: \n {player_sessions_raw.shape}\n')

Size of injury history data: 
 (21, 11)

Size of muscle imbalance data: 
 (182, 10)

Size of player session data: 
 (2604, 31)



## Null Values

In [12]:
print(f'Null Values in injury history data: \n {injury_history_raw.isnull().sum()}')
print(f'Null Values in muscle imbalance data: \n {muscle_imbalance_raw.isnull().sum()}')
print(f'Null Values in player session data: \n {player_sessions_raw.isnull().sum()}')

Null Values in injury history data: 
 Player.ID                0
Name                     0
Group.Id                 0
Injury Type              0
Body Part                0
Side                     5
Injury Date              0
Severity                10
Recovery Time (days)     0
Additional Notes         0
Month                    0
dtype: int64
Null Values in muscle imbalance data: 
 Player.ID                     0
Session ID                    0
Player Name                   0
Date Recorded                 0
Hamstring To Quad Ratio       0
Quad Imbalance Percent        0
HamstringImbalance Percent    0
Calf Imbalance Percent        0
Groin Imbalance Percent       0
Month                         0
dtype: int64
Null Values in player session data: 
 Name                                  0
Player.ID                             0
Group.Id                              0
Group.name                            0
League.ID                             0
Session.ID                            0
S

Okay so with null values, injury history data is the only dataset with null values. There are 2 columns with null values with a total of 15 null values. Less than 10% of the data. We should delete the data since we are working with health data instead of imputing the values. I think it will introduce bias through assumption. Our train test split is going to be pretty weak since we only will have 167 instances. 

In [114]:
twothird_data = pd.merge(muscle_imbalance_raw, player_sessions_raw, on = ['Player.ID', 'Month', 'Year'], how = 'right')
twothird_data.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Heart.Rate..min....bpm.,Heart.Rate..max....bpm.,Human.Core.Temperature..?ò....F.,Human.Core.Temperature..max.....F.,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Changes.of.Orientation,Exertions,Disk.Usage....
0,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,74,198,99.47,101.24,261,5,2.31,229,307,58.56
1,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,62,179,99.56,99.33,270,6,2.44,427,180,44.93
2,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,78,172,100.06,102.31,149,4,3.04,383,440,15.32
3,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,64,186,100.45,101.1,180,10,3.17,462,450,21.46
4,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,62,146,98.73,100.91,152,4,1.28,118,416,20.51


In [115]:
obt = pd.merge(twothird_data, injury_history_raw, on = ['Player.ID', 'Month', 'Year'], how = 'left')
obt.tail()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Disk.Usage....,Name_y,Group.Id_y,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
2599,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,2.65,,,,,,NaT,,,
2600,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,87.83,,,,,,NaT,,,
2601,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,36.56,,,,,,NaT,,,
2602,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,53.61,,,,,,NaT,,,
2603,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,16.67,,,,,,NaT,,,


In [116]:
obt.isnull().sum()

Player.ID                                0
Session ID                               0
Player Name                              0
Date Recorded                            0
Hamstring To Quad Ratio                  0
Quad Imbalance Percent                   0
HamstringImbalance Percent               0
Calf Imbalance Percent                   0
Groin Imbalance Percent                  0
Month                                    0
Year                                     0
Name_x                                   0
Group.Id_x                               0
Group.name                               0
League.ID                                0
Session.ID                               0
Session_Date                             0
Position                                 0
Distance..mi.                            0
Distance...min..mi.                      0
Duration..s.                             0
Steps                                    0
Speed....of.max......                    0
Speed..max.

In [117]:
obt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2604 entries, 0 to 2603
Data columns (total 49 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   Player.ID                           2604 non-null   int64         
 1   Session ID                          2604 non-null   int64         
 2   Player Name                         2604 non-null   object        
 3   Date Recorded                       2604 non-null   object        
 4   Hamstring To Quad Ratio             2604 non-null   float64       
 5   Quad Imbalance Percent              2604 non-null   float64       
 6   HamstringImbalance Percent          2604 non-null   float64       
 7   Calf Imbalance Percent              2604 non-null   float64       
 8   Groin Imbalance Percent             2604 non-null   float64       
 9   Month                               2604 non-null   int32         
 10  Year                    

In [118]:
# Dropping columns that are repeated and redundant (no variance among values in column). 
distinct_obt = obt.drop(columns = ['Group.name', 'League.ID', 'Name_y', 'Group.Id_y', 'Name_x'])

In [119]:
distinct_obt.isna().sum()

Player.ID                                0
Session ID                               0
Player Name                              0
Date Recorded                            0
Hamstring To Quad Ratio                  0
Quad Imbalance Percent                   0
HamstringImbalance Percent               0
Calf Imbalance Percent                   0
Groin Imbalance Percent                  0
Month                                    0
Year                                     0
Group.Id_x                               0
Session.ID                               0
Session_Date                             0
Position                                 0
Distance..mi.                            0
Distance...min..mi.                      0
Duration..s.                             0
Steps                                    0
Speed....of.max......                    0
Speed..max....mph.                       0
Speed..?ò...mph.                         0
Time..s.                                 0
Accumulated

In [120]:
# Injury type, Body Part, Injury Date, Recovery Time, and Additional Notes all have the same number of null values indicating that these players may not have injuries. 
distinct_obt['Injury Type'].fillna("Not Injured", inplace=True)
distinct_obt['Body Part'].fillna("None", inplace = True)
distinct_obt['Injury Date'].fillna("1/1/1990", inplace = True)
distinct_obt['Recovery Time (days)'].fillna(0, inplace = True) 
distinct_obt['Additional Notes'].fillna("Not Injured", inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  distinct_obt['Injury Type'].fillna("Not Injured", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  distinct_obt['Body Part'].fillna("None", inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [121]:
distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Severity'] = distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Severity'].fillna("Grade 0")
distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Side'] = distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Side'].fillna("No Injury")

In [122]:
clean_obt = distinct_obt.dropna()
clean_obt.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Changes.of.Orientation,Exertions,Disk.Usage....,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
0,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,229,307,58.56,Strain,Knee,Left,2023-01-26,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
1,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,427,180,44.93,Strain,Knee,Left,2023-01-26,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
2,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,383,440,15.32,Strain,Knee,Left,2023-01-26,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
3,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,462,450,21.46,Strain,Knee,Left,2023-01-26,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
4,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,118,416,20.51,Strain,Knee,Left,2023-01-26,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...


In [123]:
clean_sorted_obt = clean_obt.sort_values(['Player.ID', 'Session ID', 'Date Recorded', 'Session.ID', 'Session_Date', 'Injury Date'])
clean_sorted_obt.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Changes.of.Orientation,Exertions,Disk.Usage....,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
930,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,308,132,23.12,Not Injured,,No Injury,1990-01-01,Grade 0,0.0,Not Injured
931,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,188,194,74.68,Not Injured,,No Injury,1990-01-01,Grade 0,0.0,Not Injured
932,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,181,413,52.18,Not Injured,,No Injury,1990-01-01,Grade 0,0.0,Not Injured
933,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,115,359,37.75,Not Injured,,No Injury,1990-01-01,Grade 0,0.0,Not Injured
934,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,156,405,62.07,Not Injured,,No Injury,1990-01-01,Grade 0,0.0,Not Injured


In [124]:
clean_sorted_obt['Injury_status'] = 1  
clean_sorted_obt['Injury_status'] = clean_sorted_obt['Injury_status'].where(clean_sorted_obt['Injury Type'] == 'Not Injured', 0)

In [125]:
clean_sorted_obt.columns

Index(['Player.ID', 'Session ID', 'Player Name', 'Date Recorded',
       'Hamstring To Quad Ratio', 'Quad Imbalance Percent',
       'HamstringImbalance Percent', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Month', 'Year', 'Group.Id_x', 'Session.ID',
       'Session_Date', 'Position', 'Distance..mi.', 'Distance...min..mi.',
       'Duration..s.', 'Steps', 'Speed....of.max......', 'Speed..max....mph.',
       'Speed..?ò...mph.', 'Time..s.', 'Accumulated.Acceleration.Load',
       'Anaerobic.Activity..distance...mi.', 'Jump.Load..J.',
       'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',
       'Heart.Rate..max....bpm.', 'Human.Core.Temperature..?ò....F.',
       'Human.Core.Temperature..max.....F.', 'TRIMP', 'Heart.Rate.Recoveries',
       'Jump.Height..max....ft.', 'Changes.of.Orientation', 'Exertions',
       'Disk.Usage....', 'Injury Type', 'Body Part', 'Side', 'Injury Date',
       'Severity', 'Recovery Time (days)', 'Additional Notes',
       'Injury_status'],

In [126]:
clean_sorted_obt['Date Recorded'] = pd.to_datetime(clean_sorted_obt['Date Recorded'])
clean_sorted_obt['Session_Date'] = pd.to_datetime(clean_sorted_obt['Session_Date'])
clean_sorted_obt['Injury Date'] = pd.to_datetime(clean_sorted_obt['Injury Date'])

In [127]:
columns_to_drop = [
    'Quad Imbalance Percent', 
    'HamstringImbalance Percent', 
    'Group.Id_x', 
    'Session ID', 
    'Session.ID', 
    'Changes.of.Orientation', 
    'Player Name', 
    'Additional Notes']
for column in columns_to_drop:
    if column in clean_sorted_obt.columns:
        clean_sorted_obt.drop(columns = column, inplace = True)

In [128]:

num_obt = clean_sorted_obt.select_dtypes(include=['number'])
correlation_matrix = num_obt.corr()
round(correlation_matrix['Recovery Time (days)'], 4).sort_values()

Injury_status                        -0.6755
Time..s.                             -0.0300
Jump.Height..max....ft.              -0.0297
Human.Core.Temperature..max.....F.   -0.0251
Speed..max....mph.                   -0.0229
Speed....of.max......                -0.0051
Speed..?ò...mph.                     -0.0027
Heart.Rate.Recoveries                -0.0019
TRIMP                                 0.0008
Month                                 0.0010
Heart.Rate..?ò...bpm.                 0.0030
Heart.Rate..max....bpm.               0.0055
Disk.Usage....                        0.0067
Calf Imbalance Percent                0.0086
Groin Imbalance Percent               0.0089
Accumulated.Acceleration.Load         0.0106
Duration..s.                          0.0126
Steps                                 0.0153
Anaerobic.Activity..distance...mi.    0.0283
Jump.Load..J.                         0.0296
Distance...min..mi.                   0.0297
Heart.Rate..min....bpm.               0.0311
Distance..

In [129]:
non_cat_data = clean_sorted_obt.select_dtypes(include=['number', 'datetime'])
non_cat_data

Unnamed: 0,Player.ID,Date Recorded,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Month,Year,Session_Date,Distance..mi.,Distance...min..mi.,...,Human.Core.Temperature..?ò....F.,Human.Core.Temperature..max.....F.,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Exertions,Disk.Usage....,Injury Date,Recovery Time (days),Injury_status
930,101,2023-01-01,0.610768,5.409497,5.616962,1,2023,2023-01-01,5.27,0.13,...,99.38,101.96,114,10,2.37,132,23.12,1990-01-01,0.0,1
931,101,2023-01-01,0.610768,5.409497,5.616962,1,2023,2023-01-03,5.79,0.12,...,101.00,102.23,105,4,3.52,194,74.68,1990-01-01,0.0,1
932,101,2023-01-01,0.610768,5.409497,5.616962,1,2023,2023-01-04,3.94,0.11,...,99.31,98.16,268,5,2.41,413,52.18,1990-01-01,0.0,1
933,101,2023-01-01,0.610768,5.409497,5.616962,1,2023,2023-01-06,3.11,0.10,...,100.78,100.33,245,8,2.12,359,37.75,1990-01-01,0.0,1
934,101,2023-01-01,0.610768,5.409497,5.616962,1,2023,2023-01-07,3.79,0.08,...,99.16,101.59,183,5,2.10,405,62.07,1990-01-01,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,115,2023-12-01,1.173804,3.361765,3.485792,12,2023,2023-12-24,3.09,0.14,...,100.97,102.51,131,7,1.93,296,15.41,1990-01-01,0.0,1
272,115,2023-12-01,1.173804,3.361765,3.485792,12,2023,2023-12-25,2.98,0.06,...,100.72,101.75,196,4,2.00,395,24.67,1990-01-01,0.0,1
273,115,2023-12-01,1.173804,3.361765,3.485792,12,2023,2023-12-26,1.99,0.07,...,98.92,98.65,276,1,2.34,326,48.27,1990-01-01,0.0,1
274,115,2023-12-01,1.173804,3.361765,3.485792,12,2023,2023-12-28,3.02,0.08,...,100.85,101.76,178,6,1.89,444,50.06,1990-01-01,0.0,1


In [130]:
clean_sorted_obt = clean_sorted_obt.reset_index()

In [131]:
clean_sorted_obt.columns

Index(['index', 'Player.ID', 'Date Recorded', 'Hamstring To Quad Ratio',
       'Calf Imbalance Percent', 'Groin Imbalance Percent', 'Month', 'Year',
       'Session_Date', 'Position', 'Distance..mi.', 'Distance...min..mi.',
       'Duration..s.', 'Steps', 'Speed....of.max......', 'Speed..max....mph.',
       'Speed..?ò...mph.', 'Time..s.', 'Accumulated.Acceleration.Load',
       'Anaerobic.Activity..distance...mi.', 'Jump.Load..J.',
       'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',
       'Heart.Rate..max....bpm.', 'Human.Core.Temperature..?ò....F.',
       'Human.Core.Temperature..max.....F.', 'TRIMP', 'Heart.Rate.Recoveries',
       'Jump.Height..max....ft.', 'Exertions', 'Disk.Usage....', 'Injury Type',
       'Body Part', 'Side', 'Injury Date', 'Severity', 'Recovery Time (days)',
       'Injury_status'],
      dtype='object')

In [132]:
ohe = OneHotEncoder(sparse_output=False, drop = 'first')

non_cat_data = clean_sorted_obt.select_dtypes(include=['number', 'datetime'])
non_cat_data_columns = non_cat_data.columns.tolist()
cat_columns = set(clean_sorted_obt.columns) - set(non_cat_data.columns)
cat_data = clean_sorted_obt[list(cat_columns)]

multi_colinear_columns = ['Injury Type',
       'Body Part', 'Side', 'Injury Date', 'Severity', 'Recovery Time (days)', 'Player.ID', 'Date Recorded', 'Session_Date', 'Month', 'Year' 
       ]
for col in multi_colinear_columns:
    if col in non_cat_data.columns:
        non_cat_data.drop(columns = col, inplace = True)

ohe_cat_data = ohe.fit_transform(cat_data)

ohe_cat_df = pd.DataFrame(ohe_cat_data, columns=list(ohe.get_feature_names_out()))
ohe_injury_data = pd.concat([non_cat_data, ohe_cat_df], axis = 1)
ohe_cat_columns = list(ohe.get_feature_names_out())

col_to_drop = ['level_0', 'index']
for col in col_to_drop :
    if col in ohe_injury_data.columns:
        ohe_injury_data.drop(columns = col, inplace = True)

ohe_injury_data

Unnamed: 0,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Distance..mi.,Distance...min..mi.,Duration..s.,Steps,Speed....of.max......,Speed..max....mph.,Speed..?ò...mph.,...,Body Part_Knee,Body Part_None,Body Part_Quadriceps,Side_No Injury,Side_Right,Position_Forward,Position_Guard,Severity_Grade 1,Severity_Grade 2,Severity_Grade 3
0,0.610768,5.409497,5.616962,5.27,0.13,6246,6552,73.10,13.41,5.93,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.610768,5.409497,5.616962,5.79,0.12,3758,6701,56.88,14.78,7.31,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.610768,5.409497,5.616962,3.94,0.11,4895,10088,57.30,17.84,7.43,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.610768,5.409497,5.616962,3.11,0.10,5319,6355,66.00,16.64,8.64,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.610768,5.409497,5.616962,3.79,0.08,4327,5828,95.48,15.90,9.73,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,1.173804,3.361765,3.485792,3.09,0.14,4832,5730,72.69,12.70,6.98,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2461,1.173804,3.361765,3.485792,2.98,0.06,5762,5950,89.08,19.82,5.85,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2462,1.173804,3.361765,3.485792,1.99,0.07,6521,5456,78.26,12.02,9.72,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2463,1.173804,3.361765,3.485792,3.02,0.08,4320,7776,82.90,18.98,9.15,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [133]:
scaler = StandardScaler(with_std = True, with_mean= True)

data_to_scale = non_cat_data.copy()
columns = ['level_0', 'index', 'Injury_status', 'Severity_Grade 1',
       'Severity_Grade 2', 'Severity_Grade 3', 'Side_No Injury', 'Injury Type_Not Injured', 'Injury Type_Sprain',
       'Injury Type_Strain', 'Body Part_Hamstring', 'Body Part_Knee',
       'Body Part_None', 'Body Part_Quadriceps']

for col in columns:
    if col in data_to_scale.columns:
        data_to_scale.drop(columns = col, inplace = True)
        
unscaled_data = data_to_scale.select_dtypes(include = ['number'])

columns_to_scale = list(unscaled_data.columns)

scaled_ohe_injury_data = ohe_injury_data.copy()
scaled_ohe_injury_data[columns_to_scale] = scaler.fit_transform(ohe_injury_data[columns_to_scale])

In [134]:
scaled_ohe_injury_data

Unnamed: 0,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Distance..mi.,Distance...min..mi.,Duration..s.,Steps,Speed....of.max......,Speed..max....mph.,Speed..?ò...mph.,...,Body Part_Knee,Body Part_None,Body Part_Quadriceps,Side_No Injury,Side_Right,Position_Forward,Position_Guard,Severity_Grade 1,Severity_Grade 2,Severity_Grade 3
0,-0.391441,0.343330,0.362208,1.243962,1.085821,0.851841,-1.033717,-0.208076,-0.544690,-1.183300,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.391441,0.343330,0.362208,1.614493,0.723489,-1.648744,-0.955161,-1.447217,-0.050556,-0.150164,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.391441,0.343330,0.362208,0.296260,0.361157,-0.505993,0.830528,-1.415131,1.053133,-0.060326,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.391441,0.343330,0.362208,-0.295164,-0.001176,-0.079848,-1.137579,-0.750487,0.620314,0.845539,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.391441,0.343330,0.362208,0.189376,-0.725841,-1.076866,-1.415423,1.501664,0.353409,1.661566,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,2.146932,0.111649,0.122505,-0.309415,1.448154,-0.569311,-1.467090,-0.239398,-0.800775,-0.397218,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2461,2.146932,0.111649,0.122505,-0.387797,-1.450506,0.365393,-1.351102,1.012730,1.767284,-1.243192,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2462,2.146932,0.111649,0.122505,-1.093229,-1.088173,1.128232,-1.611548,0.186127,-1.046039,1.654080,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2463,2.146932,0.111649,0.122505,-0.359294,-0.725841,-1.083901,-0.388401,0.540604,1.464311,1.227350,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [135]:
scaled_ohe_injury_data.columns

Index(['Hamstring To Quad Ratio', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Distance..mi.', 'Distance...min..mi.',
       'Duration..s.', 'Steps', 'Speed....of.max......', 'Speed..max....mph.',
       'Speed..?ò...mph.', 'Time..s.', 'Accumulated.Acceleration.Load',
       'Anaerobic.Activity..distance...mi.', 'Jump.Load..J.',
       'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',
       'Heart.Rate..max....bpm.', 'Human.Core.Temperature..?ò....F.',
       'Human.Core.Temperature..max.....F.', 'TRIMP', 'Heart.Rate.Recoveries',
       'Jump.Height..max....ft.', 'Exertions', 'Disk.Usage....',
       'Injury_status', 'Injury Type_Not Injured', 'Injury Type_Sprain',
       'Injury Type_Strain', 'Body Part_Hamstring', 'Body Part_Knee',
       'Body Part_None', 'Body Part_Quadriceps', 'Side_No Injury',
       'Side_Right', 'Position_Forward', 'Position_Guard', 'Severity_Grade 1',
       'Severity_Grade 2', 'Severity_Grade 3'],
      dtype='object')

In [136]:
smote = SMOTE(random_state=42)
log_reg = LogisticRegression(solver = 'lbfgs')

predictors = scaled_ohe_injury_data.drop(columns = ["Injury_status", 'Severity_Grade 1',
       'Severity_Grade 2', 'Severity_Grade 3', 'Side_No Injury', 'Injury Type_Not Injured', 'Injury Type_Sprain',
       'Injury Type_Strain', 'Body Part_Hamstring', 'Body Part_Knee',
       'Body Part_None', 'Body Part_Quadriceps', 'Position_Forward', 'Position_Guard', 'Side_Right']) # Dropping target variable and multi-colinear features as well
predictors = predictors.select_dtypes(exclude = ['datetime']) # Removing date time as it should not be needed. 
target = scaled_ohe_injury_data['Injury_status']

x_res, y_res = smote.fit_resample(predictors, target)


In [137]:
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, train_size=0.7, random_state = 999)

In [138]:
x_train.shape

(3329, 24)

In [139]:
y_train

4105    0
1553    1
4599    0
670     1
2216    1
       ..
217     1
2523    0
225     1
481     1
4444    0
Name: Injury_status, Length: 3329, dtype: int64

In [140]:
scores = cross_val_score(log_reg, x_train, y_train, cv = 5, scoring = 'f1')
cv_score = np.mean(scores)
cv_score

np.float64(0.7566028372921345)

In [141]:
log_reg_model = log_reg.fit(x_train, y_train)
prediction = log_reg_model.predict(x_test)

accuracy = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.78      0.78       734
           1       0.77      0.76      0.77       693

    accuracy                           0.77      1427
   macro avg       0.77      0.77      0.77      1427
weighted avg       0.77      0.77      0.77      1427



In [142]:
accuracy

0.7736510161177295

In [143]:
prediction = log_reg_model.predict_proba(x_test)
prediction[:,0]

array([0.70286123, 0.01809986, 0.39857142, ..., 0.58781191, 0.25982325,
       0.19840128])

In [144]:
# Get the coefficients for each feature
coefficients = log_reg.coef_

# Print the coefficients
print("Coefficients:", coefficients)

Coefficients: [[-1.76486369 -1.48491494  0.96421054  0.08627874 -0.47514455 -0.15750041
   0.09016081  0.26818735  0.03406354 -0.17323989  0.22691326 -0.05967891
   0.01338482  0.04148389  0.28281148  0.00727509  0.03599163  0.06775359
   0.04312118 -0.00756855 -0.17790073  0.33445418 -0.23183909  0.03901298]]


In [147]:
columns = x_train.columns.tolist()
coefficients_with_intercept = coefficients[0].tolist()  

coeff_df = pd.DataFrame({
    'Features': columns,
    'Coefficients': coefficients_with_intercept
})

coeff_df.sort_values('Coefficients')


Unnamed: 0,Features,Coefficients
0,Hamstring To Quad Ratio,-1.764864
1,Calf Imbalance Percent,-1.484915
4,Distance...min..mi.,-0.475145
22,Exertions,-0.231839
20,Heart.Rate.Recoveries,-0.177901
9,Speed..?ò...mph.,-0.17324
5,Duration..s.,-0.1575
11,Accumulated.Acceleration.Load,-0.059679
19,TRIMP,-0.007569
15,Heart.Rate..min....bpm.,0.007275
