In [151]:
# importing libraries here to better keep track of them

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Reading csv files into memory

injury_history_raw = pd.read_csv("data/injury_history(injury_history).csv", sep = ",", encoding = 'ISO-8859-1')
muscle_imbalance_raw = pd.read_csv("data/injury_history(muscle_imbalance_data).csv", sep = ",", encoding = 'ISO-8859-1')
player_sessions_raw = pd.read_csv("data/injury_history(player_sessions).csv", sep = ",", encoding = 'ISO-8859-1')

In [3]:
injury_history_raw['Month'] = pd.to_datetime(injury_history_raw['Injury Date']).dt.month
player_sessions_raw['Month'] = pd.to_datetime(player_sessions_raw['Session_Date']).dt.month
muscle_imbalance_raw['Month'] = pd.to_datetime(muscle_imbalance_raw['Date Recorded']).dt.month

In [11]:
injury_history_raw[['Player.ID', 'Name', 'Injury Date', 'Month']].value_counts()

Player.ID  Name              Injury Date  Month
101        Jordan Matthews   10/25/2023   10       1
                             12/5/2023    12       1
                             7/22/2023    7        1
103        Malik Robinson    2/14/2023    2        1
                             6/28/2023    6        1
                             9/27/2023    9        1
105        Noah Bradley      1/13/2023    1        1
                             12/19/2023   12       1
                             9/20/2023    9        1
106        Lennon Van        1/1/2024     1        1
107        Cameron Howard    12/21/2023   12       1
109        Miles Richardson  7/5/2023     7        1
110        Kyle Saunders     10/9/2023    10       1
112        Anthony Lopez     1/26/2023    1        1
                             11/18/2023   11       1
                             7/15/2023    7        1
114        Julian Simmons    10/1/2023    10       1
                             2/25/2023    2        

In [5]:
print(f"Raw Injury Data: \n {injury_history_raw.head()} \n")
print(f"Raw Muscle Imbalance Data: \n {muscle_imbalance_raw.head()} \n")
print(f"Raw Player Session Data: \n {player_sessions_raw.head()} \n")

Raw Injury Data: 
    Player.ID             Name  Group.Id    Injury Type   Body Part   Side  \
0        101  Jordan Matthews       201  Muscle Strain  Quadriceps  Right   
1        101  Jordan Matthews       201     Tendonitis       Wrist   Left   
2        101  Jordan Matthews       201     Tendonitis    Shoulder  Right   
3        103   Malik Robinson       203         Strain       Groin  Right   
4        103   Malik Robinson       203       Fracture       Wrist   Left   

  Injury Date Severity  Recovery Time (days)  \
0   12/5/2023  Grade 2                    51   
1  10/25/2023      NaN                    11   
2   7/22/2023      NaN                    12   
3   6/28/2023  Grade 1                    20   
4   2/14/2023      NaN                    68   

                                    Additional Notes  Month  
0  Grade 2 quadriceps strain with partial tearing...     12  
1  De Quervain's tenosynovitis. Swelling and pain...     10  
2  Rotator cuff tendonitis due to overuse. 

In [6]:
print(f'Columns present in injury data: \n {injury_history_raw.columns} \n')
print(f'Columns present in muscle imbalance data: \n {muscle_imbalance_raw.columns} \n')
print(f'Columns present in player session data: \n {player_sessions_raw.columns} \n')

Columns present in injury data: 
 Index(['Player.ID', 'Name', 'Group.Id', 'Injury Type', 'Body Part', 'Side',
       'Injury Date', 'Severity', 'Recovery Time (days)', 'Additional Notes',
       'Month'],
      dtype='object') 

Columns present in muscle imbalance data: 
 Index(['Player.ID', 'Session ID', 'Player Name', 'Date Recorded',
       'Hamstring To Quad Ratio', 'Quad Imbalance Percent',
       'HamstringImbalance Percent', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Month'],
      dtype='object') 

Columns present in player session data: 
 Index(['Name', 'Player.ID', 'Group.Id', 'Group.name', 'League.ID',
       'Session.ID', 'Session_Date', 'Position', 'Distance..mi.',
       'Distance...min..mi.', 'Duration..s.', 'Steps', 'Speed....of.max......',
       'Speed..max....mph.', 'Speed..?ò...mph.', 'Time..s.',
       'Accumulated.Acceleration.Load', 'Anaerobic.Activity..distance...mi.',
       'Jump.Load..J.', 'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',


## Player Session Data column research

#### TRIMP - A way to measure Training Impulse (Aerobic Training Load). It is a product of Training volume (minutes) and Training intensity (beats per minutes). 

TRIMP = Training VOlume * Training Intensity. For example if a workout is 50 minutes and the average heart rate (bpm) is 140 bpm, TRIMP score is calculated out to 7000. TRIMP is affected by a number of factors but we can look for correlation on a correlation matrix as well. 
    - Physical Settings
    - Resting and Maximal Heart Rate
    - Gender (since its mbb there's not going to be any variance with this feature)

TRIMP can be used to compare sessions of different lengths, or to compare high-intensity sessions with longer game data. For example if a session with TRIMP score of 108 might be considered moderate, the TRIMP/min score could indicate an intense session if Training Volume is lower. Training Stress Score (TSS) is a proprietary variant based on a score of 100 for a 1-hour maximum sustained effort. 
Positive correlation between Heart Rate and Effort

---

In [7]:
print(f'Size of injury history data: \n {injury_history_raw.shape}\n')
print(f'Size of muscle imbalance data: \n {muscle_imbalance_raw.shape}\n')
print(f'Size of player session data: \n {player_sessions_raw.shape}\n')

Size of injury history data: 
 (21, 11)

Size of muscle imbalance data: 
 (182, 10)

Size of player session data: 
 (2604, 31)



## Null Values

In [8]:
print(f'Null Values in injury history data: \n {injury_history_raw.isnull().sum()}')
print(f'Null Values in muscle imbalance data: \n {muscle_imbalance_raw.isnull().sum()}')
print(f'Null Values in player session data: \n {player_sessions_raw.isnull().sum()}')

Null Values in injury history data: 
 Player.ID                0
Name                     0
Group.Id                 0
Injury Type              0
Body Part                0
Side                     5
Injury Date              0
Severity                10
Recovery Time (days)     0
Additional Notes         0
Month                    0
dtype: int64
Null Values in muscle imbalance data: 
 Player.ID                     0
Session ID                    0
Player Name                   0
Date Recorded                 0
Hamstring To Quad Ratio       0
Quad Imbalance Percent        0
HamstringImbalance Percent    0
Calf Imbalance Percent        0
Groin Imbalance Percent       0
Month                         0
dtype: int64
Null Values in player session data: 
 Name                                  0
Player.ID                             0
Group.Id                              0
Group.name                            0
League.ID                             0
Session.ID                            0
S

Okay so with null values, injury history data is the only dataset with null values. There are 2 columns with null values with a total of 15 null values. Less than 10% of the data. We should delete the data since we are working with health data instead of imputing the values. I think it will introduce bias through assumption. Our train test split is going to be pretty weak since we only will have 167 instances. 

In [9]:
twothird_data = pd.merge(muscle_imbalance_raw, player_sessions_raw, on = ['Player.ID', 'Month'], how = 'right')
twothird_data.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Heart.Rate..min....bpm.,Heart.Rate..max....bpm.,Human.Core.Temperature..?ò....F.,Human.Core.Temperature..max.....F.,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Changes.of.Orientation,Exertions,Disk.Usage....
0,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,74,198,99.47,101.24,261,5,2.31,229,307,58.56
1,112,113,Anthony Lopez,1/1/2024,0.919842,-10.238526,-9.417831,-10.206984,-10.126625,1,...,74,198,99.47,101.24,261,5,2.31,229,307,58.56
2,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,62,179,99.56,99.33,270,6,2.44,427,180,44.93
3,112,113,Anthony Lopez,1/1/2024,0.919842,-10.238526,-9.417831,-10.206984,-10.126625,1,...,62,179,99.56,99.33,270,6,2.44,427,180,44.93
4,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,78,172,100.06,102.31,149,4,3.04,383,440,15.32


In [10]:
obt = pd.merge(twothird_data, injury_history_raw, on = ['Player.ID', 'Month'], how = 'left')
obt.tail()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Disk.Usage....,Name_y,Group.Id_y,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
2847,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,2.65,,,,,,,,,
2848,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,87.83,,,,,,,,,
2849,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,36.56,,,,,,,,,
2850,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,53.61,,,,,,,,,
2851,108,112,Xavier Foster,12/1/2023,1.106029,-8.500787,-9.402117,-8.825422,-8.92167,12,...,16.67,,,,,,,,,


In [12]:
obt.isnull().sum()

Player.ID                                0
Session ID                               0
Player Name                              0
Date Recorded                            0
Hamstring To Quad Ratio                  0
Quad Imbalance Percent                   0
HamstringImbalance Percent               0
Calf Imbalance Percent                   0
Groin Imbalance Percent                  0
Month                                    0
Name_x                                   0
Group.Id_x                               0
Group.name                               0
League.ID                                0
Session.ID                               0
Session_Date                             0
Position                                 0
Distance..mi.                            0
Distance...min..mi.                      0
Duration..s.                             0
Steps                                    0
Speed....of.max......                    0
Speed..max....mph.                       0
Speed..?ò..

In [13]:
obt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2852 entries, 0 to 2851
Data columns (total 48 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Player.ID                           2852 non-null   int64  
 1   Session ID                          2852 non-null   int64  
 2   Player Name                         2852 non-null   object 
 3   Date Recorded                       2852 non-null   object 
 4   Hamstring To Quad Ratio             2852 non-null   float64
 5   Quad Imbalance Percent              2852 non-null   float64
 6   HamstringImbalance Percent          2852 non-null   float64
 7   Calf Imbalance Percent              2852 non-null   float64
 8   Groin Imbalance Percent             2852 non-null   float64
 9   Month                               2852 non-null   int32  
 10  Name_x                              2852 non-null   object 
 11  Group.Id_x                          2852 no

In [14]:
# Dropping columns that are repeated and redundant (no variance among values in column). 
distinct_obt = obt.drop(columns = ['Group.name', 'League.ID', 'Name_y', 'Group.Id_y', 'Name_x'])

In [15]:
distinct_obt.isna().sum()

Player.ID                                0
Session ID                               0
Player Name                              0
Date Recorded                            0
Hamstring To Quad Ratio                  0
Quad Imbalance Percent                   0
HamstringImbalance Percent               0
Calf Imbalance Percent                   0
Groin Imbalance Percent                  0
Month                                    0
Group.Id_x                               0
Session.ID                               0
Session_Date                             0
Position                                 0
Distance..mi.                            0
Distance...min..mi.                      0
Duration..s.                             0
Steps                                    0
Speed....of.max......                    0
Speed..max....mph.                       0
Speed..?ò...mph.                         0
Time..s.                                 0
Accumulated.Acceleration.Load            0
Anaerobic.A

In [16]:
# Injury type, Body Part, Injury Date, Recovery Time, and Additional Notes all have the same number of null values indicating that these players may not have injuries. 
distinct_obt['Injury Type'].fillna("Not Injured", inplace=True)
distinct_obt['Body Part'].fillna("None", inplace = True)
distinct_obt['Injury Date'].fillna("1/1/1990", inplace = True)
distinct_obt['Recovery Time (days)'].fillna(0, inplace = True) 
distinct_obt['Additional Notes'].fillna("Not Injured", inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  distinct_obt['Injury Type'].fillna("Not Injured", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  distinct_obt['Body Part'].fillna("None", inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [17]:
distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Severity'] = distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Severity'].fillna("Grade 0")
distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Side'] = distinct_obt.loc[distinct_obt['Injury Type'] == "Not Injured", 'Side'].fillna("No Injury")

In [18]:
clean_obt = distinct_obt.dropna()
clean_obt.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Changes.of.Orientation,Exertions,Disk.Usage....,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
0,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,229,307,58.56,Strain,Knee,Left,1/26/2023,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
1,112,113,Anthony Lopez,1/1/2024,0.919842,-10.238526,-9.417831,-10.206984,-10.126625,1,...,229,307,58.56,Strain,Knee,Left,1/26/2023,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
2,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,427,180,44.93,Strain,Knee,Left,1/26/2023,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
3,112,113,Anthony Lopez,1/1/2024,0.919842,-10.238526,-9.417831,-10.206984,-10.126625,1,...,427,180,44.93,Strain,Knee,Left,1/26/2023,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...
4,112,101,Anthony Lopez,1/1/2023,0.808741,-10.149294,-8.208145,-10.176416,-10.258755,1,...,383,440,15.32,Strain,Knee,Left,1/26/2023,Grade 1,28.0,Strain of the posterior cruciate ligament (PCL...


In [19]:
clean_sorted_obt = clean_obt.sort_values(['Player.ID', 'Session ID', 'Date Recorded', 'Session.ID', 'Session_Date', 'Injury Date'])
clean_sorted_obt.head()

Unnamed: 0,Player.ID,Session ID,Player Name,Date Recorded,Hamstring To Quad Ratio,Quad Imbalance Percent,HamstringImbalance Percent,Calf Imbalance Percent,Groin Imbalance Percent,Month,...,Changes.of.Orientation,Exertions,Disk.Usage....,Injury Type,Body Part,Side,Injury Date,Severity,Recovery Time (days),Additional Notes
1015,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,308,132,23.12,Not Injured,,No Injury,1/1/1990,Grade 0,0.0,Not Injured
1017,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,188,194,74.68,Not Injured,,No Injury,1/1/1990,Grade 0,0.0,Not Injured
1019,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,181,413,52.18,Not Injured,,No Injury,1/1/1990,Grade 0,0.0,Not Injured
1021,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,115,359,37.75,Not Injured,,No Injury,1/1/1990,Grade 0,0.0,Not Injured
1023,101,101,Jordan Matthews,1/1/2023,0.610768,5.481295,3.347801,5.409497,5.616962,1,...,156,405,62.07,Not Injured,,No Injury,1/1/1990,Grade 0,0.0,Not Injured


In [20]:
clean_sorted_obt['Injury_status'] = 1  
clean_sorted_obt['Injury_status'] = clean_sorted_obt['Injury_status'].where(clean_sorted_obt['Injury Type'] == 'Not Injured', 0)

In [21]:
clean_sorted_obt.columns

Index(['Player.ID', 'Session ID', 'Player Name', 'Date Recorded',
       'Hamstring To Quad Ratio', 'Quad Imbalance Percent',
       'HamstringImbalance Percent', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Month', 'Group.Id_x', 'Session.ID',
       'Session_Date', 'Position', 'Distance..mi.', 'Distance...min..mi.',
       'Duration..s.', 'Steps', 'Speed....of.max......', 'Speed..max....mph.',
       'Speed..?ò...mph.', 'Time..s.', 'Accumulated.Acceleration.Load',
       'Anaerobic.Activity..distance...mi.', 'Jump.Load..J.',
       'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',
       'Heart.Rate..max....bpm.', 'Human.Core.Temperature..?ò....F.',
       'Human.Core.Temperature..max.....F.', 'TRIMP', 'Heart.Rate.Recoveries',
       'Jump.Height..max....ft.', 'Changes.of.Orientation', 'Exertions',
       'Disk.Usage....', 'Injury Type', 'Body Part', 'Side', 'Injury Date',
       'Severity', 'Recovery Time (days)', 'Additional Notes',
       'Injury_status'],
      d

In [22]:
clean_sorted_obt['Date Recorded'] = pd.to_datetime(clean_sorted_obt['Date Recorded'])
clean_sorted_obt['Session_Date'] = pd.to_datetime(clean_sorted_obt['Session_Date'])
clean_sorted_obt['Injury Date'] = pd.to_datetime(clean_sorted_obt['Injury Date'])

In [None]:
columns_to_drop = [
    'Quad Imbalance Percent', 
    'HamstringImbalance Percent', 
    'Group.Id_x', 
    'Session ID', 
    'Session.ID', 
    'Changes.of.Orientation', 
    'Player Name', 
    'Additional Notes']
for column in columns_to_drop:
    if column in clean_sorted_obt.columns:
        clean_sorted_obt.drop(columns = column, inplace = True)

In [34]:

num_obt = clean_sorted_obt.select_dtypes(include=['number'])
correlation_matrix = num_obt.corr()
round(correlation_matrix['Recovery Time (days)'], 4).sort_values()

Injury_status                        -0.6811
Jump.Height..max....ft.              -0.0274
Time..s.                             -0.0260
Human.Core.Temperature..max.....F.   -0.0229
Speed..max....mph.                   -0.0227
Calf Imbalance Percent               -0.0121
Groin Imbalance Percent              -0.0115
Month                                -0.0088
Heart.Rate..?ò...bpm.                -0.0079
Speed..?ò...mph.                     -0.0065
Speed....of.max......                -0.0063
TRIMP                                 0.0026
Heart.Rate.Recoveries                 0.0030
Disk.Usage....                        0.0036
Heart.Rate..max....bpm.               0.0089
Steps                                 0.0112
Accumulated.Acceleration.Load         0.0117
Duration..s.                          0.0117
Anaerobic.Activity..distance...mi.    0.0223
Heart.Rate..min....bpm.               0.0238
Jump.Load..J.                         0.0312
Distance..mi.                         0.0316
Distance..

In [35]:
non_cat_data = clean_sorted_obt.select_dtypes(include=['number', 'datetime'])
non_cat_data

Unnamed: 0,Player.ID,Date Recorded,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Month,Session_Date,Distance..mi.,Distance...min..mi.,Duration..s.,...,Human.Core.Temperature..?ò....F.,Human.Core.Temperature..max.....F.,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Exertions,Disk.Usage....,Injury Date,Recovery Time (days),Injury_status
1015,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-01,5.27,0.13,6246,...,99.38,101.96,114,10,2.37,132,23.12,1990-01-01,0.0,1
1017,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-03,5.79,0.12,3758,...,101.00,102.23,105,4,3.52,194,74.68,1990-01-01,0.0,1
1019,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-04,3.94,0.11,4895,...,99.31,98.16,268,5,2.41,413,52.18,1990-01-01,0.0,1
1021,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-06,3.11,0.10,5319,...,100.78,100.33,245,8,2.12,359,37.75,1990-01-01,0.0,1
1023,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-07,3.79,0.08,4327,...,99.16,101.59,183,5,2.10,405,62.07,1990-01-01,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-25,2.98,0.06,5762,...,100.72,101.75,196,4,2.00,395,24.67,1990-01-01,0.0,1
292,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-26,1.99,0.07,6521,...,98.92,98.65,276,1,2.34,326,48.27,1990-01-01,0.0,1
293,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-28,3.02,0.08,4320,...,100.85,101.76,178,6,1.89,444,50.06,1990-01-01,0.0,1
294,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-30,3.79,0.06,3882,...,100.29,99.15,103,7,2.14,153,98.33,1990-01-01,0.0,1


In [None]:
clean_sorted_obt = clean_sorted_obt.reset_index()

In [127]:
clean_sorted_obt.columns

Index(['level_0', 'index', 'Player.ID', 'Date Recorded',
       'Hamstring To Quad Ratio', 'Calf Imbalance Percent',
       'Groin Imbalance Percent', 'Month', 'Session_Date', 'Position',
       'Distance..mi.', 'Distance...min..mi.', 'Duration..s.', 'Steps',
       'Speed....of.max......', 'Speed..max....mph.', 'Speed..?ò...mph.',
       'Time..s.', 'Accumulated.Acceleration.Load',
       'Anaerobic.Activity..distance...mi.', 'Jump.Load..J.',
       'Heart.Rate..?ò...bpm.', 'Heart.Rate..min....bpm.',
       'Heart.Rate..max....bpm.', 'Human.Core.Temperature..?ò....F.',
       'Human.Core.Temperature..max.....F.', 'TRIMP', 'Heart.Rate.Recoveries',
       'Jump.Height..max....ft.', 'Exertions', 'Disk.Usage....', 'Injury Type',
       'Body Part', 'Side', 'Injury Date', 'Severity', 'Recovery Time (days)',
       'Injury_status'],
      dtype='object')

In [137]:
ohe = OneHotEncoder(sparse_output=False, drop = 'first')

non_cat_data = clean_sorted_obt.select_dtypes(include=['number', 'datetime'])
non_cat_data_columns = non_cat_data.columns.tolist()
cat_columns = set(clean_sorted_obt.columns) - set(non_cat_data.columns)
cat_data = clean_sorted_obt[list(cat_columns)]

multi_colinear_columns = ['Injury Type',
       'Body Part', 'Side', 'Injury Date', 'Severity', 'Recovery Time (days)', 'Player.ID', 'Date Recorded', 'Session_Date']
for col in multi_colinear_columns:
    if col in cat_data.columns:
        cat_data.drop(columns = col, inplace = True)

ohe_cat_data = ohe.fit_transform(cat_data)

ohe_cat_df = pd.DataFrame(ohe_cat_data, columns=list(ohe.get_feature_names_out()))
ohe_injury_data = pd.concat([non_cat_data, ohe_cat_df], axis = 1)
ohe_cat_columns = list(ohe.get_feature_names_out())

col_to_drop = ['level_0', 'index']
for col in col_to_drop :
    if col in ohe_injury_data.columns:
        ohe_injury_data.drop(columns = col, inplace = True)

ohe_injury_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_data.drop(columns = col, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_data.drop(columns = col, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_data.drop(columns = col, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_data.drop(columns = col, inpl

Unnamed: 0,Player.ID,Date Recorded,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Month,Session_Date,Distance..mi.,Distance...min..mi.,Duration..s.,...,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Exertions,Disk.Usage....,Injury Date,Recovery Time (days),Injury_status,Position_Forward,Position_Guard
0,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-01,5.27,0.13,6246,...,114,10,2.37,132,23.12,1990-01-01,0.0,1,1.0,0.0
1,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-03,5.79,0.12,3758,...,105,4,3.52,194,74.68,1990-01-01,0.0,1,1.0,0.0
2,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-04,3.94,0.11,4895,...,268,5,2.41,413,52.18,1990-01-01,0.0,1,1.0,0.0
3,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-06,3.11,0.10,5319,...,245,8,2.12,359,37.75,1990-01-01,0.0,1,1.0,0.0
4,101,2023-01-01,0.610768,5.409497,5.616962,1,2023-01-07,3.79,0.08,4327,...,183,5,2.10,405,62.07,1990-01-01,0.0,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-25,2.98,0.06,5762,...,196,4,2.00,395,24.67,1990-01-01,0.0,1,0.0,1.0
2656,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-26,1.99,0.07,6521,...,276,1,2.34,326,48.27,1990-01-01,0.0,1,0.0,1.0
2657,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-28,3.02,0.08,4320,...,178,6,1.89,444,50.06,1990-01-01,0.0,1,0.0,1.0
2658,115,2023-12-01,1.173804,3.361765,3.485792,12,2023-12-30,3.79,0.06,3882,...,103,7,2.14,153,98.33,1990-01-01,0.0,1,0.0,1.0


In [138]:
scaler = StandardScaler(with_std = True, with_mean= True)

data_to_scale = non_cat_data.copy()
columns = ['level_0', 'index', 'Injury_status']

for col in columns:
    if col in data_to_scale.columns:
        data_to_scale.drop(columns = col, inplace = True)
        
unscaled_data = data_to_scale.select_dtypes(include = ['number'])

columns_to_scale = list(unscaled_data.columns)

scaled_ohe_injury_data = ohe_injury_data.copy()
scaled_ohe_injury_data[columns_to_scale] = scaler.fit_transform(ohe_injury_data[columns_to_scale])

In [139]:
scaled_ohe_injury_data

Unnamed: 0,Player.ID,Date Recorded,Hamstring To Quad Ratio,Calf Imbalance Percent,Groin Imbalance Percent,Month,Session_Date,Distance..mi.,Distance...min..mi.,Duration..s.,...,TRIMP,Heart.Rate.Recoveries,Jump.Height..max....ft.,Exertions,Disk.Usage....,Injury Date,Recovery Time (days),Injury_status,Position_Forward,Position_Guard
0,-1.550557,2023-01-01,-0.396333,0.351115,0.370524,-1.361088,2023-01-01,1.233941,1.090811,0.841179,...,-1.532366,1.668778,-0.212530,-1.470094,-0.924379,1990-01-01,-0.138765,1,1.0,0.0
1,-1.550557,2023-01-01,-0.396333,0.351115,0.370524,-1.361088,2023-01-03,1.602858,0.727344,-1.651992,...,-1.695755,-0.542772,1.158326,-0.917393,0.979720,1990-01-01,-0.138765,1,1.0,0.0
2,-1.550557,2023-01-01,-0.396333,0.351115,0.370524,-1.361088,2023-01-04,0.290367,0.363877,-0.512629,...,1.263396,-0.174180,-0.164848,1.034890,0.148800,1990-01-01,-0.138765,1,1.0,0.0
3,-1.550557,2023-01-01,-0.396333,0.351115,0.370524,-1.361088,2023-01-06,-0.298480,0.000410,-0.087747,...,0.845847,0.931595,-0.510542,0.553505,-0.384096,1990-01-01,-0.138765,1,1.0,0.0
4,-1.550557,2023-01-01,-0.396333,0.351115,0.370524,-1.361088,2023-01-07,0.183949,-0.726524,-1.081809,...,-0.279720,-0.174180,-0.534383,0.963574,0.514035,1990-01-01,-0.138765,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,1.941173,2023-12-01,2.135665,0.119518,0.130907,1.811496,2023-12-25,-0.390709,-1.453458,0.356173,...,-0.043714,-0.542772,-0.653587,0.874429,-0.867138,1990-01-01,-0.138765,1,0.0,1.0
2656,1.941173,2023-12-01,2.135665,0.119518,0.130907,1.811496,2023-12-26,-1.093069,-1.089991,1.116751,...,1.408631,-1.648547,-0.248291,0.259326,0.004405,1990-01-01,-0.138765,1,0.0,1.0
2657,1.941173,2023-12-01,2.135665,0.119518,0.130907,1.811496,2023-12-28,-0.362331,-0.726524,-1.088824,...,-0.370491,0.194411,-0.784713,1.311241,0.070509,1990-01-01,-0.138765,1,0.0,1.0
2658,1.941173,2023-12-01,2.135665,0.119518,0.130907,1.811496,2023-12-30,0.183949,-1.453458,-1.527734,...,-1.732064,0.563003,-0.486701,-1.282889,1.853108,1990-01-01,-0.138765,1,0.0,1.0


In [140]:
log_reg = LogisticRegression(solver = 'lbfgs')

predictors = scaled_ohe_injury_data.drop(columns = "Injury_status")
predictors = predictors.select_dtypes(exclude = ['datetime'])
target = scaled_ohe_injury_data['Injury_status']

x_train, x_test, y_train, y_test = train_test_split(predictors, target, train_size=0.7, random_state = 999)

In [141]:
y_train

770     1
2251    1
32      1
2455    1
1845    1
       ..
1736    1
481     0
869     1
348     1
1472    1
Name: Injury_status, Length: 1861, dtype: int64

In [142]:
scores = cross_val_score(log_reg, x_train, y_train, cv = 5, scoring = 'f1')
cv_score = np.mean(scores)
cv_score
# log_reg_model = log_reg.fit(x_train, y_train)

np.float64(0.9988819002226824)

In [153]:
log_reg_model = log_reg.fit(x_train, y_train)
prediction = log_reg_model.predict(x_test)

accuracy = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        32
           1       1.00      1.00      1.00       767

    accuracy                           1.00       799
   macro avg       1.00      0.97      0.98       799
weighted avg       1.00      1.00      1.00       799



In [155]:
prediction = log_reg_model.predict_proba(x_test)
prediction[:,0]

array([1.49367571e-04, 1.59894895e-03, 6.12610219e-03, 1.35310606e-02,
       2.55847509e-03, 5.46798417e-04, 4.67791938e-04, 1.75133679e-04,
       9.88440659e-01, 9.99986381e-01, 1.30403297e-03, 3.20673610e-03,
       3.25484789e-04, 5.97591942e-05, 8.71052540e-04, 1.92324370e-03,
       4.93216130e-05, 5.89635403e-04, 5.68815561e-04, 1.18459472e-03,
       2.16974883e-05, 4.71749890e-03, 9.10781816e-05, 1.93660618e-04,
       1.41835207e-04, 9.57590101e-03, 2.52365658e-04, 2.29833122e-03,
       1.21820131e-04, 2.82176601e-04, 7.12159727e-04, 1.98662922e-03,
       9.98391267e-01, 5.77299680e-03, 6.45386056e-05, 4.63551912e-03,
       1.88861201e-03, 7.31703416e-04, 6.45213248e-04, 9.36114015e-05,
       1.43400923e-03, 4.78339854e-05, 1.78966417e-04, 5.15080397e-04,
       4.66748144e-03, 4.69668783e-05, 2.60000140e-03, 3.70699646e-03,
       1.66446495e-03, 7.76799898e-05, 4.57551884e-03, 2.50007351e-03,
       2.82731636e-03, 3.00399215e-04, 2.65792052e-03, 1.32358395e-03,
      