# Generating, training, and saving the model

### Order of operations for the code
- The `main` function is called to run the entire file.
    - Loads the data from a CSV file into a pandas DataFrame.
    - Drops unnecessary columns from the DataFrame.
    - Splits the data into training and testing sets based on athlete IDs.
    - Calls the `run_exps` function to run multiple experiments.
        - For each of n experiments:
            - It separately calls the `preparedata` function.
                - The function calls three other functions:
                    - `getMeanStd` gets statistics for normalisation per athlete
                    - `normalize2` normalises the data using the statistics.
                    - `getBalancedSubset` generates a dataset with equal numbers of healthy and unhealthy events based on samples of the original training set.
                - It then sets the values for X_train, y_train, X_test, y_test.
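            - Then, it calls the `train_model` function, which trains a logistic regressor based on some hyperparameters, and calibrates its predicted probabilities with `CalibratedClassifierCV` (the code uses `method='isotonic'`; Platt scaling would be `method='sigmoid'`).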
            - Next, it evaluates the model for key metrics using either the `eval` function or else using the `vis_and_eval` function to produce some visualisations of model performance.
            - Finally, it selects the model with the highest recall among those with an accuracy above 0.65 (this was an important tradeoff during experimentation).
        - Once all experiments have been run it prints out the averages for metrics across all experiments, and returns the best model from across all experiments.
    - The best model is then saved to a file



In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
# from xgboost import XGBClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc

def normalize2(row, mean_df, std_df, athlete_id):
    '''
    Z-score a single row against one athlete's reference statistics.

    The entries of ``mean_df`` and ``std_df`` labelled ``athlete_id`` are
    looked up with ``.loc`` and the function returns ``(row - mean) / std``.
    Alignment of labels between ``row`` and the statistics is left to pandas.
    '''
    athlete_mean = mean_df.loc[athlete_id]
    athlete_std = std_df.loc[athlete_id]
    centred = row - athlete_mean
    return centred / athlete_std

def getMeanStd(data):
    '''
    Compute per-athlete mean and standard deviation from non-injury rows.

    Only rows with ``injury == 0`` are used. They are grouped by
    ``'Athlete ID'`` and the mean and standard deviation of the remaining
    columns are taken per group. Standard deviations that are exactly 0.0
    are replaced with 0.01 (presumably so they can be used as divisors).

    Returns a ``(mean, std)`` tuple of DataFrames indexed by athlete ID.
    '''
    per_athlete = data.loc[data['injury'] == 0].groupby('Athlete ID')
    mean = per_athlete.mean()
    std = per_athlete.std().replace(to_replace=0.0, value=0.01)
    return mean, std

def getBalancedSubset(X_train, samplesPerClass, random_state=None):
    '''
    Create a class-balanced subset by sampling (with replacement) per athlete.

    Only athletes that have at least two distinct ``injury`` values are used.
    Each of them contributes the same number of ``injury == 1`` rows and
    ``injury == 0`` rows, namely ``samplesPerClass`` divided (rounded down)
    by the number of eligible athletes.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain the columns ``'Athlete ID'`` and ``'injury'``.
    samplesPerClass : int or float
        Target number of rows per class across all athletes.
    random_state : int, numpy RandomState/Generator or None, optional
        Seed or generator forwarded to ``DataFrame.sample``. An int seeds one
        shared ``RandomState`` so the whole subset is reproducible. ``None``
        (default) keeps the previous behaviour of unseeded sampling.

    Returns
    -------
    pandas.DataFrame
        All sampled ``injury == 1`` rows followed by all sampled
        ``injury == 0`` rows, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no athlete has rows in both classes, so nothing can be balanced.
    '''
    # An athlete can only feed both classes if it has >= 2 distinct labels.
    classes_per_athlete = X_train.groupby('Athlete ID')['injury'].nunique()
    athleteList = classes_per_athlete[classes_per_athlete >= 2].index
    if len(athleteList) == 0:
        raise ValueError(
            "getBalancedSubset: no athlete has both injury classes; "
            "cannot build a balanced subset."
        )

    samplesPerAthlete = int(np.floor(samplesPerClass) // len(athleteList))

    # One shared generator for an int seed; otherwise pass the value through
    # (None -> pandas falls back to the global numpy random state).
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    healthy_parts = []
    injured_parts = []
    for athlete in athleteList:
        athlete_rows = X_train[X_train['Athlete ID'] == athlete]
        # Sample healthy first, then injured, matching the original draw order.
        healthy_parts.append(
            athlete_rows[athlete_rows['injury'] == 0].sample(
                samplesPerAthlete, replace=True, random_state=rng
            )
        )
        injured_parts.append(
            athlete_rows[athlete_rows['injury'] == 1].sample(
                samplesPerAthlete, replace=True, random_state=rng
            )
        )

    # Single concat at the end instead of growing two frames inside the loop.
    return pd.concat(injured_parts + healthy_parts, ignore_index=True)


def preparedata(df,test_athletes):
    '''
    Build normalised training and test sets from the full DataFrame.

    Rows whose 'Athlete ID' is in ``test_athletes`` form the test set; all
    other rows form the training pool. The training pool is reduced to a
    balanced subset (2048 samples per class requested) and both sets are
    z-scored per athlete before the 'injury', 'Date' and 'Athlete ID'
    columns are dropped.

    Returns ``(y_train, y_test, X_train_norm, X_test_norm)``.
    '''
    def _normalise_rows(frame, means, stds):
        # z-score each row with its own athlete's statistics, then remove
        # the columns that are not model features
        scaled = frame.apply(
            lambda rec: normalize2(rec, means, stds, rec['Athlete ID']),
            axis=1,
        )
        return scaled.drop(columns=['injury', 'Date', 'Athlete ID'], errors='ignore')

    held_out = df['Athlete ID'].isin(test_athletes)
    X_test_original = df[held_out].copy()
    X_train_original = df[~held_out].copy()

    # Per-athlete statistics are computed separately for each split.
    train_means, train_std = getMeanStd(X_train_original)
    test_means, test_std = getMeanStd(X_test_original)
    X_train_balanced = getBalancedSubset(X_train_original.copy(), 2048)

    # Targets are taken before normalisation touches the 'injury' column.
    y_train = X_train_balanced['injury']
    y_test = X_test_original['injury']

    X_train_norm = _normalise_rows(X_train_balanced, train_means, train_std)

    # Note this is a source of data leakage (test rows are normalised with
    # statistics derived from the test set itself), but the alternative is
    # not feasible.
    X_test_norm = _normalise_rows(X_test_original, test_means, test_std)

    return y_train, y_test, X_train_norm, X_test_norm

def train_model(X_train, y_train, **params):
    '''
    Train a calibrated logistic regression classifier.

    A ``LogisticRegression`` is configured from ``params`` (with
    ``max_iter=500`` and ``class_weight='balanced'`` as defaults that the
    caller may override) and wrapped in ``CalibratedClassifierCV`` using
    5-fold cross-validation.

    Note: calibration uses ``method='isotonic'`` (isotonic regression).
    Platt scaling would be ``method='sigmoid'``; the method is left unchanged
    here so that results stay comparable with earlier runs.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    **params
        Extra keyword arguments forwarded to ``LogisticRegression``.

    Returns
    -------
    CalibratedClassifierCV
        The fitted, calibrated model.
    '''
    # Alternative estimators left as commented-out options in the original:
    # XGBClassifier(), GaussianNB()

    # Defaults first so an explicit max_iter / class_weight in params wins
    # instead of raising a duplicate-keyword TypeError.
    lr_params = {'max_iter': 500, 'class_weight': 'balanced', **params}
    model = LogisticRegression(**lr_params)

    # With an integer cv the calibrator clones and refits the estimator on
    # every fold, so fitting `model` separately beforehand is wasted work.
    calibrated_model = CalibratedClassifierCV(model, method='isotonic', cv=5)  # You can adjust cv as needed
    calibrated_model.fit(X_train, y_train)

    return calibrated_model

def vis_and_eval(model, y_true, X, y_pred):
    '''
    Print the key metrics and plot a confusion matrix and ROC curve.

    ``y_pred`` supplies the hard predictions used for accuracy, recall and
    the confusion matrix; ``model.predict_proba(X)[:, 1]`` supplies the
    scores used for the ROC curve.

    Returns the tuple ``(accuracy, recall, roc_auc)``.
    '''
    # Threshold-based metrics from the hard predictions
    conf_matrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    # Ranking metric from the second column of predict_proba
    positive_scores = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    for line in (
        f"ROC AUC: {roc_auc:.4f}",
        f"Accuracy: {accuracy:.4f}",
        f"Recall(most important): {recall:.4f}",
    ):
        print(line)

    # Figure 1: confusion matrix heatmap
    plt.figure(figsize=(3, 2))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        annot_kws={"size": 16},
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # Figure 2: ROC curve with the chance diagonal for reference
    plt.figure(figsize=(3, 2))
    plt.plot(fpr, tpr, label='ROC Curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend()
    plt.show()

    return (accuracy, recall, roc_auc)

def eval(model, y_true, X, y_pred, **params):
    '''
    Compute and print the key metrics without any plots.

    Accuracy and recall come from ``y_pred``; ROC AUC comes from
    ``model.predict_proba(X)[:, 1]``. ``params`` is accepted but not used.

    Returns the tuple ``(accuracy, recall, roc_auc)``.
    '''
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    positive_scores = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    summary = f"ROC AUC: {roc_auc:.3f}, Accuracy: {accuracy:.3f}, Recall(most important): {recall:.3f}"
    print(summary)

    return (accuracy, recall, roc_auc)

def run_exps(df,test_set, n =5, **params):
    '''
    Run ``n`` train/evaluate experiments and return the best model.

    Each experiment calls ``preparedata`` afresh, trains a model with
    ``train_model(**params)``, predicts on the test set and evaluates it
    with ``eval``. Per-run results and the mean of each metric are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset passed to ``preparedata``.
    test_set : sequence
        Athlete IDs held out for testing.
    n : int, optional
        Number of experiments to run (default 5).
    **params
        Hyperparameters forwarded to ``train_model``.

    Returns
    -------
    The model with the highest recall among runs whose accuracy is above
    0.65, or ``None`` (with a warning) if no run satisfies that condition.
    '''
    import warnings

    best_model = None
    best_recall = 0
    all_results = []
    for _ in range(n):
        y_train, y_test, X_train, X_test = preparedata(df, test_set)
        model = train_model(X_train, y_train, **params)
        y_pred = model.predict(X_test)
        results = eval(model, y_test, X_test, y_pred)
        # results = vis_and_eval(model, y_test, X_test, y_pred)

        accuracy, recall = results[0], results[1]
        # Recall is maximised, but only among models that clear the accuracy gate.
        if recall > best_recall and accuracy > 0.65:
            best_model = model
            best_recall = recall
        all_results.append(results)

    for result in all_results:
        print(result)

    # Skip the summary when nothing ran; np.mean of an empty list is NaN.
    if all_results:
        print(f"Mean Accuracy: {np.mean([x[0] for x in all_results]):.3f}")
        print(f"Mean Recall: {np.mean([x[1] for x in all_results]):.3f}")
        print(f"Mean ROC AUC: {np.mean([x[2] for x in all_results]):.3f}")

    if best_model is None:
        warnings.warn(
            "run_exps: no model reached accuracy > 0.65 with non-zero recall; "
            "returning None.",
            RuntimeWarning,
            stacklevel=2,
        )
    return best_model
    
    
def main(data_path='../data/raw/day_approach.csv',
         model_path='../models/logistic_model.pkl'):
    '''
    Main function to run the experiments and save the best model.

    Parameters
    ----------
    data_path : str, optional
        CSV file to load. Defaults to the path used originally.
    model_path : str, optional
        Where the pickled model is written. Defaults to the original path.
        The parent directory is created if it does not exist.

    Raises
    ------
    RuntimeError
        If ``run_exps`` returns no model, so that ``None`` is never pickled
        in place of a model.
    '''
    import os

    # Load, clean, and split the data
    dfday = pd.read_csv(data_path)
    unused_columns = dfday.filter(regex='perceived|sprinting|strength').columns
    dfday = dfday.drop(columns=unused_columns)
    athletes = sorted(dfday['Athlete ID'].unique())
    # Hold out the last 10 athletes (by sorted ID) for testing.
    test_athletes = athletes[-10:]

    # run experiments to determine the best model
    best_model = run_exps(dfday, test_athletes, n = 5, C=0.01, penalty='elasticnet', solver='saga', l1_ratio=0.5)

    if best_model is None:
        raise RuntimeError(
            "No model satisfied the selection criteria; nothing was saved to "
            f"{model_path!r}."
        )

    # Save the model to a file
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(best_model, f)

    # Uncomment the following lines to run additional experiments with different hyperparameters
    # print("-" * 50)  # Separator for better readability
    # C_values = [0.001, 0.01, 0.1, 1, 10]
    # l1_ratio_values = [0.2, 0.5, 0.8]
    # for C in C_values:
    #     for l1_ratio in l1_ratio_values:
    #         print(f"Running experiments with C={C}, l1_ratio={l1_ratio}, penalty='elasticnet', solver='saga', class_weight='balanced'")
    #         run_exps(dfday, test_athletes, n = 3, C=C, penalty='elasticnet', solver='saga', l1_ratio=l1_ratio)
    #         print("-" * 50)  # Separator for better readability


if __name__ == '__main__':
    main()

ROC AUC: 0.680, Accuracy: 0.661, Recall(most important): 0.600
ROC AUC: 0.694, Accuracy: 0.691, Recall(most important): 0.620
ROC AUC: 0.704, Accuracy: 0.646, Recall(most important): 0.680
ROC AUC: 0.680, Accuracy: 0.643, Recall(most important): 0.620
ROC AUC: 0.696, Accuracy: 0.639, Recall(most important): 0.620
(0.6613009198423128, 0.6, 0.6798496993987977)
(0.6908672798948752, 0.62, 0.694188376753507)
(0.6455321944809461, 0.68, 0.703941215764863)
(0.6425755584756899, 0.62, 0.6796259185036739)
(0.6389618922470434, 0.62, 0.6958650634602538)
Mean Accuracy: 0.656
Mean Recall: 0.63
Mean ROC AUC: 0.691


## Next Steps
So, I have code that produces fairly well-functioning models. I would like to evaluate the models on different testing sets, so I'm going to try to create different training and testing splits for the data. The resulting function is below. My results produced quite poor recall, and when I printed the training and testing splits they were found to be incredibly uneven. I could revisit this, but I would have to significantly redesign my method for creating the splits.
```python
def run_exps(df,athletes, n =5):
    '''
    Run ``n`` experiments, each with a freshly drawn set of 10 test athletes.

    For every run the class counts of the training and testing targets are
    printed, a model is trained with default hyperparameters and evaluated
    with ``eval``. Per-run results and the metric means are printed at the
    end. Nothing is returned.
    '''
    all_results = []
    for _ in range(n):
        # Draw 10 distinct athletes at random as this run's test set.
        test_athletes = np.random.choice(athletes, size=10, replace=False)

        y_train, y_test, X_train, X_test = preparedata(df, test_athletes)

        # Class-balance diagnostics: positive count, then the full breakdown.
        print("Training set counts: ", y_train[y_train==1].value_counts())
        print("out of Training set counts: ", y_train.value_counts())
        print("Testing set counts: ", y_test[y_test==1].value_counts())
        print("out of Testing set counts: ", y_test.value_counts())

        model = train_model(X_train, y_train)
        y_pred = model.predict(X_test)
        all_results.append(eval(model, y_test, X_test, y_pred))

    for result in all_results:
        print(result)

    summary = (
        ("Mean Accuracy: ", 0),
        ("Mean Recall: ", 1),
        ("Mean ROC AUC: ", 2),
    )
    for label, position in summary:
        print(label, np.mean([run[position] for run in all_results]))
    
    
def main(data_path='../data/raw/day_approach.csv'):
    '''
    Load the data and run the random-split experiments.

    Parameters
    ----------
    data_path : str, optional
        CSV file to load. Defaults to a path relative to the notebook rather
        than a machine-specific absolute path, so the snippet is portable.
    '''
    dfday = pd.read_csv(data_path)
    unused_columns = dfday.filter(regex='perceived|sprinting|strength').columns
    dfday = dfday.drop(columns=unused_columns)
    athletes = sorted(dfday['Athlete ID'].unique())
    # No fixed hold-out here (previously: athletes[len(athletes) - 10:]);
    # the full athlete list is handed to run_exps instead.
    run_exps(dfday, athletes, n = 5)
```

Then, I can work on updating the current pipeline to transform both user data and pipeline data to a longer timeframe, including volume from one or two weeks prior as well as from the past 7 days.