In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path to injury_ts.csv (passed to unpivot_and_fill_csv_file by merge_data_for_team)
injuryPath = ""

# Path to the session dataset CSV (loaded with pd.read_csv in merge_data_for_team)
sessionPath = ""

In [None]:
def unpivot_and_fill_csv_file(csv_file_path, interpolate=False, date_format='%d.%m.%Y'):
    """Read the wide injury CSV and return it in long form (one row per player and date).

    The file is read without a header. Its first row supplies the labels: the
    first cell belongs to the date column and every other cell is a player
    label whose first six characters are stripped to obtain the player name.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the CSV file.
    interpolate : bool, default False
        When True, the 'Injured' column is cast to int and a row is set to 1
        whenever the previous and the next row *of the same player* are both 1.
    date_format : str, default '%d.%m.%Y'
        Format used to parse the 'Date' strings for ordering only; the
        returned 'Date' column keeps the original strings.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Player_name' and 'Injured', ordered by player and
        then chronologically, with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(csv_file_path, header=None)

    # First row holds the labels; everything after the date cell is a player label.
    player_names = [name[6:] for name in raw.iloc[0, 1:]]

    table = raw.drop(0).reset_index(drop=True)
    table.columns = ['Date'] + player_names

    df_melt = table.melt(id_vars=['Date'], var_name='Player_name', value_name='Injured')

    # Sorting the raw day-first strings would order rows by day-of-month rather
    # than by time, so order on a parsed helper column and drop it afterwards.
    df_melt = (
        df_melt
        .assign(_sort_date=pd.to_datetime(df_melt['Date'], format=date_format))
        .sort_values(by=['Player_name', '_sort_date'])
        .drop(columns='_sort_date')
        .reset_index(drop=True)
    )

    if interpolate:
        df_melt['Injured'] = df_melt['Injured'].astype(int)

        # Neighbours are looked up inside each player's own series so a gap at
        # the start/end of one player is never filled from another player's rows.
        injured_by_player = df_melt.groupby('Player_name')['Injured']
        row_before = injured_by_player.shift(1)
        row_after = injured_by_player.shift(-1)
        df_melt.loc[(row_before == 1) & (row_after == 1), 'Injured'] = 1

    return df_melt



In [None]:
def merge_data_for_team(team_name, interpolate=False):
    """
    Build one dataframe combining the session data and the injury series for a team.

    The session CSV at ``sessionPath`` and the unpivoted injury file at
    ``injuryPath`` are joined on 'Date' and 'Player_name'. Passing "TeamA"
    drops everything belonging to 'TeamB'; any other value drops everything
    belonging to 'TeamA'. Injury rows are limited to the date span of the
    whole session file, rows without an 'Injured' value are discarded, the
    result is sorted by 'Date' and remaining NaNs are replaced with 0.
    """
    session_df = pd.read_csv(sessionPath)
    injury_df = unpivot_and_fill_csv_file(injuryPath, interpolate)
    # Re-express the injury dates as ISO strings so they compare and merge as text.
    injury_df['Date'] = pd.to_datetime(injury_df['Date'], format='%d.%m.%Y').dt.strftime('%Y-%m-%d')

    # Selection works by excluding the *other* team rather than matching this one.
    excluded_team = 'TeamB' if team_name == "TeamA" else 'TeamA'
    team_sessions = session_df[session_df['Team_Name'] != excluded_team]
    team_injuries = injury_df[~injury_df['Player_name'].str.startswith(excluded_team)]

    # Date span is taken from the unfiltered session data.
    session_dates = pd.to_datetime(session_df['Date'])
    first_day = session_dates.min().strftime('%Y-%m-%d')
    last_day = session_dates.max().strftime('%Y-%m-%d')

    within_span = (team_injuries['Date'] >= first_day) & (team_injuries['Date'] <= last_day)
    team_injuries = team_injuries[within_span]

    key_columns = ['Date', 'Player_name']
    merged_df = pd.merge(team_sessions, team_injuries, on=key_columns, how='outer')

    # Append injury rows whose (Date, Player_name) key is absent from the merge.
    # NOTE(review): after an outer merge this selection is expected to be empty -
    # confirm before removing the step.
    injury_keys = team_injuries.set_index(key_columns).index
    merged_keys = merged_df.set_index(key_columns).index
    unmatched_injuries = team_injuries[~injury_keys.isin(merged_keys)]
    combined = pd.concat([merged_df, unmatched_injuries])

    output_columns = ['Date', 'Player_name', 'Duration', 'Total_distance', 'Average_running_speed', 'Top_speed', 'Metabolic_power', 'HIR', 'Injured']
    merged = (
        combined[output_columns]
        .dropna(subset=['Injured'])  # keep only rows that carry an injury label
        .sort_values(by='Date')
        .fillna(0)  # days without session data become all-zero feature rows
    )

    return merged





In [None]:
#Defining class imbalance functions

#Oversampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

def oversample_data(X_train, y_train, sampling_ratio=0.35):
    """Randomly over-sample the training set with imblearn's RandomOverSampler.

    Parameters
    ----------
    X_train : numpy.ndarray
        3-D array; it is flattened to 2-D for the sampler and reshaped back
        using its second and third dimensions.
    y_train : array-like
        Labels for ``X_train``.
    sampling_ratio : float, default 0.35
        Forwarded unchanged as the sampler's ``sampling_strategy``.

    Returns
    -------
    tuple
        ``(X_resampled_3d, y_resampled, zero_percentage)`` where
        ``zero_percentage`` is the share of labels equal to 0 after
        resampling, expressed in percent.
    """
    # The caller's ratio is honoured; it used to be overwritten with 0.35 here.
    ros = RandomOverSampler(sampling_strategy=sampling_ratio, random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X_train.reshape(X_train.shape[0], -1), y_train)

    # Restore the original trailing dimensions.
    X_train_resampled = X_resampled.reshape(X_resampled.shape[0], X_train.shape[1], X_train.shape[2])

    zero_percentage_after = np.sum(y_resampled == 0) / len(y_resampled) * 100

    return X_train_resampled, y_resampled, zero_percentage_after


# Undersampling
def undersample_data(X_train, y_train, sampling_ratio=0.35):
    """Randomly under-sample the training set with imblearn's RandomUnderSampler.

    ``X_train`` must be 3-D: it is flattened to 2-D for the sampler and then
    reshaped back using its second and third dimensions. Returns the resampled
    features, the resampled labels and the percentage of labels equal to 0.
    """
    # NOTE(review): the sampler receives 1 - sampling_ratio, whereas the other
    # resampling helpers in this notebook forward the ratio as-is - confirm intent.
    sampler = RandomUnderSampler(sampling_strategy=1 - sampling_ratio, random_state=42)

    flat_features = X_train.reshape(X_train.shape[0], -1)
    flat_resampled, labels = sampler.fit_resample(flat_features, y_train)

    restored = flat_resampled.reshape(flat_resampled.shape[0], X_train.shape[1], X_train.shape[2])
    share_of_zeros = np.sum(labels == 0) / len(labels) * 100

    return restored, labels, share_of_zeros


#SMOTE
def smote_data(X_train, y_train, sampling_ratio=0.35):
    """Over-sample the training set with imblearn's SMOTE.

    Parameters
    ----------
    X_train : numpy.ndarray
        3-D array; it is flattened to 2-D for SMOTE and reshaped back using
        its second and third dimensions.
    y_train : array-like
        Labels for ``X_train``.
    sampling_ratio : float, default 0.35
        Forwarded unchanged as SMOTE's ``sampling_strategy``. The default
        matches the other resampling helpers in this notebook.

    Returns
    -------
    tuple
        ``(X_resampled_3d, y_resampled, zero_percentage)`` where
        ``zero_percentage`` is the share of labels equal to 0 after
        resampling, expressed in percent.
    """
    # The caller's ratio is honoured; it used to be overwritten with 0.35 here.
    smote = SMOTE(sampling_strategy=sampling_ratio, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train.reshape(X_train.shape[0], -1), y_train)

    # Restore the original trailing dimensions.
    X_train_resampled = X_resampled.reshape(X_resampled.shape[0], X_train.shape[1], X_train.shape[2])

    zero_percentage_after = np.sum(y_resampled == 0) / len(y_resampled) * 100

    return X_train_resampled, y_resampled, zero_percentage_after


# ADASYN
def adasyn_oversample_data(X_train, y_train, sampling_ratio=0.35):
    """Over-sample the training set with imblearn's ADASYN (5 neighbours, seed 42).

    ``X_train`` must be 3-D: it is flattened to 2-D for the sampler and then
    reshaped back using its second and third dimensions. ``sampling_ratio`` is
    forwarded as the sampler's ``sampling_strategy``. Returns the resampled
    features, the resampled labels and the percentage of labels equal to 0.
    """
    sampler = ADASYN(sampling_strategy=sampling_ratio, random_state=42, n_neighbors=5)

    flat_features = X_train.reshape(X_train.shape[0], -1)
    flat_resampled, labels = sampler.fit_resample(flat_features, y_train)

    # Back to the original trailing dimensions.
    restored = flat_resampled.reshape(flat_resampled.shape[0], X_train.shape[1], X_train.shape[2])
    share_of_zeros = np.sum(labels == 0) / len(labels) * 100

    return restored, labels, share_of_zeros


#Data sample mode
def sample_mode(X_train, y_train, sampling_ratio, oversample_mode):
    """Apply the requested class-imbalance strategy to the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    sampling_ratio : float
        Ratio handed to the selected resampling helper.
    oversample_mode : str
        'oversample', 'undersample', 'smote' or 'adasyn'. Any other value
        (for example 'none') leaves the data untouched.

    Returns
    -------
    tuple
        ``(X_train, y_train, zero_percentage)``. ``zero_percentage`` is the
        share of labels equal to 0 in percent (0-100) for every mode; the
        pass-through case used to report a 0-1 fraction instead.
    """
    if oversample_mode == 'oversample':
        return oversample_data(X_train, y_train, sampling_ratio)
    if oversample_mode == 'undersample':
        return undersample_data(X_train, y_train, sampling_ratio)
    if oversample_mode == 'smote':
        return smote_data(X_train, y_train, sampling_ratio)
    if oversample_mode == 'adasyn':
        return adasyn_oversample_data(X_train, y_train, sampling_ratio)

    # No resampling: report the class balance of the data as given, on the same
    # percent scale the resampling helpers use.
    zero_percentage = np.sum(np.asarray(y_train) == 0) / len(y_train) * 100
    return X_train, y_train, zero_percentage

In [None]:
#Confusion matrix that shows injuries as TP
def confusion_matrix_only_injuries(y_test, y_pred):
    """Build a 2x2 confusion matrix with label 1 (injury) as the positive class.

    Pairs are taken positionally from ``y_test`` and ``y_pred``; pairs in which
    either value is neither 0 nor 1 are not counted. Rows of the returned
    DataFrame are the actual classes, columns the predicted classes.
    """
    pairs = list(zip(y_test, y_pred))

    def count(actual_label, predicted_label):
        # Number of pairs matching one (actual, predicted) combination.
        return sum(1 for actual, predicted in pairs
                   if actual == actual_label and predicted == predicted_label)

    true_pos = count(1, 1)
    false_neg = count(1, 0)
    false_pos = count(0, 1)
    true_neg = count(0, 0)

    matrix = pd.DataFrame(
        [[true_pos, false_neg],
         [false_pos, true_neg]],
        index=["Actual Injuries", "Actual Non-Injuries"],
        columns=["Predicted Injuries", "Predicted Non-Injuries"],
    )
    return matrix


In [None]:
#Datapreprocessing

def convert_duration_to_seconds(duration):
    """Convert an 'H:M:S' duration into a number of seconds.

    A value whose string form is exactly "0" is returned as 0. Otherwise the
    string form is split on ':' and the first three parts are read as integer
    hours, minutes and seconds.
    """
    text = str(duration)
    if text == "0":
        return 0

    parts = text.split(':')
    return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])

def create_player_sequences(df, player, sequence_length=7):
    """Cut one player's rows into overlapping windows of ``sequence_length`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date' and 'Player_name'; every other column is used as a
        feature, in the frame's column order and row order.
    player : str
        Value of 'Player_name' to select.
    sequence_length : int, default 7
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_windows, sequence_length, n_features)``. Every full window
        is produced, including the one ending on the player's last row. A
        player with fewer than ``sequence_length`` rows yields an empty array
        of shape ``(0, sequence_length, n_features)`` so results for several
        players can still be concatenated.
    """
    player_rows = df[df['Player_name'] == player]
    player_data = player_rows.drop(['Date', 'Player_name'], axis=1).values

    # Window starts run from 0 to len - sequence_length inclusive.
    n_windows = len(player_data) - sequence_length + 1
    if n_windows <= 0:
        # Keep the array 3-D; a bare empty array would not concatenate with
        # other players' windows.
        return np.empty((0, sequence_length, player_data.shape[1]), dtype=player_data.dtype)

    return np.array([player_data[start:start + sequence_length] for start in range(n_windows)])

def create_team_sequence(df, sequence_length):
    """Build model inputs and targets from the windows of every player in ``df``.

    Windows are generated per player (in order of first appearance in
    'Player_name') and stacked along the first axis. For each window, all rows
    except the last form the input; the target is the last column of the last
    row (presumably the 'Injured' flag - depends on the column order of ``df``).

    Returns ``(X, y)``.
    """
    per_player = [
        create_player_sequences(df, name, sequence_length)
        for name in df['Player_name'].unique()
    ]

    stacked = np.concatenate(per_player, axis=0)
    features = stacked[:, :-1, :]
    targets = stacked[:, -1, -1]
    return features, targets


def preprocess(inputWindow, interpolate=False, team_name="TeamA"):
    """Load one team's merged data and turn it into float32 sequence arrays.

    Parameters
    ----------
    inputWindow : int
        Window length passed to ``create_team_sequence``.
    interpolate : bool, default False
        Forwarded to ``merge_data_for_team``.
    team_name : str, default "TeamA"
        Team forwarded to ``merge_data_for_team``. The default reproduces the
        previously hard-coded choice.

    Returns
    -------
    tuple
        ``(X, y)`` as ``numpy.float32`` arrays.
    """
    team_df = merge_data_for_team(team_name, interpolate=interpolate)

    # Durations must be numeric before the frame is converted to arrays.
    team_df['Duration'] = team_df['Duration'].apply(convert_duration_to_seconds)

    X, y = create_team_sequence(team_df, inputWindow)

    return X.astype(np.float32), y.astype(np.float32)


In [None]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tslearn.preprocessing import TimeSeriesScalerMinMax

def decision_tree_classification(test_size, oversample_mode, sampling_ratio):
    """Train and evaluate a decision tree on the notebook-level ``X`` / ``y`` arrays.

    ``X`` and ``y`` are NOT parameters: they are read from the notebook's
    global namespace and must be assigned (e.g. via ``preprocess``) before
    this function is called.

    Parameters
    ----------
    test_size : float
        Forwarded to ``train_test_split`` (stratified on ``y``, seed 42).
    oversample_mode : str
        Resampling strategy forwarded to ``sample_mode``; applied to the
        training split only.
    sampling_ratio : float
        Ratio forwarded to ``sample_mode``.

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, confusion_dataframe,
        zero_percentage)`` where the last item is whatever ``sample_mode``
        reports for the training labels.
    """
    # Scale, then hold out a stratified test split.
    X_scaled = TimeSeriesScalerMinMax().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=42, stratify=y)

    # Rebalance the training split only; the test split keeps its class balance.
    X_train, y_train, precentageOfZeroesInDataset = sample_mode(X_train, y_train, sampling_ratio, oversample_mode)

    # The tree needs 2-D input: one flat feature vector per sample.
    X_train_2d = X_train.reshape(X_train.shape[0], -1)
    X_test_2d = X_test.reshape(X_test.shape[0], -1)

    # Seeded like the split and the samplers so repeated runs are comparable.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_2d, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_2d)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    confInjuries = confusion_matrix_only_injuries(y_test, y_pred)

    return accuracy, f1, precision, recall, confInjuries, precentageOfZeroesInDataset


In [None]:
#Multiple runs

import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=UserWarning, message="`use_label_encoder` is deprecated in 1.7.0.")

# Create a list to store the results
results = []

# Hyperparameter grid
input_windows = [4, 7, 30]
test_sizes = [0.2, 0.4]
oversample_modes = ["none", "oversample", "undersample", "smote", "adasyn"]
sampling_ratios = [0.2, 0.4]
interploate_injuries = [True, False]


# Iterate through all combinations of hyperparameters
for interplate in interploate_injuries:
    for preprocess_param in input_windows:
        # Rebuild the dataset for this window length AND interpolation setting.
        # The flag must be forwarded, otherwise both interpolation runs train on
        # identical data. X and y are globals read by decision_tree_classification.
        X, y = preprocess(preprocess_param, interpolate=interplate)

        for test_size_param in test_sizes:
            for oversample_mode in oversample_modes:
                for sampling_ratio in sampling_ratios:

                    accuracy, f1, precision, recall, confusion, precentageOfZeroesInDataset = decision_tree_classification(
                        test_size=test_size_param,
                        oversample_mode=oversample_mode,
                        sampling_ratio=sampling_ratio)

                    # Save the results
                    results.append({"input_windows": preprocess_param,
                                    "Interplate": interplate,
                                    "test_size": test_size_param,
                                    "oversample_mode": oversample_mode,
                                    "sampling_ratio": sampling_ratio,
                                    "Prectenage of zeroes in dataset": precentageOfZeroesInDataset,
                                    "accuracy": accuracy,
                                    "f1": f1,
                                    "precision": precision,
                                    "recall": recall,
                                    "confusion": confusion})

# Sort the results based on the F1 score (best first)
results.sort(key=lambda x: x['f1'], reverse=True)

# Print the top 3 performers
for i in range(min(3, len(results))):
    print(f"Top Performer {i + 1}:")
    print(f"Input window: {results[i]['input_windows']}")
    print(f"Interplate: {results[i]['Interplate']}")
    print(f"Test Size: {results[i]['test_size']}")
    print(f"Oversample Mode: {results[i]['oversample_mode']}")
    print(f"Sampling Ratio: {results[i]['sampling_ratio']}")
    print(f"Percentage of zeroes in dataset: {results[i]['Prectenage of zeroes in dataset']}")
    print(f"Accuracy: {results[i]['accuracy']}")
    print(f"F1: {results[i]['f1']}")
    print(f"Precision: {results[i]['precision']}")
    print(f"Recall: {results[i]['recall']}")
    display(results[i]['confusion'])
    print("\n")