# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import os
import json
import sqlite3 as sl
from joblib import dump, load

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, confusion_matrix, RocCurveDisplay

from matplotlib import pyplot as plt

## Database Access

In [None]:
def db_read_by_id(id):
    """Load the raw samples of one activity from ``cycling.db``.

    Parameters
    ----------
    id : int or str
        Value matched against ``data_raw.activity_id``. It is bound as a
        query parameter, so integers, strings and NumPy scalars all work
        and the value can never be interpreted as SQL.

    Returns
    -------
    pandas.DataFrame
        Columns ``activity_id, moving, time, velocity_smooth, grade_smooth,
        heartrate, label``; empty when no row matches.
    """
    # sqlite3 cannot bind NumPy scalar types, so unwrap them to plain
    # Python values first.
    activity_id = id.item() if isinstance(id, np.generic) else id
    query = (
        "SELECT activity_id, moving, time, velocity_smooth, grade_smooth, heartrate, label "
        "FROM data_raw "
        "WHERE activity_id = ?"
    )
    con = sl.connect("cycling.db")
    try:
        return pd.read_sql(query, con, params=(activity_id,))
    finally:
        # Release the database handle even when the query fails.
        con.close()

def db_get_ids():
    """Return the distinct ``activity_id`` values stored in ``data_raw``.

    Returns
    -------
    pandas.Series
        The ``activity_id`` column of the query result, one entry per
        distinct activity.
    """
    query = "SELECT DISTINCT activity_id FROM data_raw"
    con = sl.connect("cycling.db")
    try:
        return pd.read_sql(query, con).activity_id
    finally:
        # Release the database handle even when the query fails.
        con.close()

## Segmenting function

In [None]:
def sequenceData(data, seq_size, sequences):
    """Cut one activity into consecutive, non-overlapping windows.

    Every window of ``seq_size`` rows whose ``moving`` values are all truthy
    and which contains no missing value is appended to ``sequences`` as
    ``[activity_id, first_index, last_index, window, label]``.  ``window`` is
    a fresh DataFrame with a 0-based index plus an ``index`` column holding
    the row position inside the window.  Trailing rows that do not fill a
    whole window are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples of a single activity; must provide the columns ``label``,
        ``activity_id`` and ``moving``.  The activity id and label are taken
        from the first row.
    seq_size : int
        Number of rows per window; must be at least 1.
    sequences : list
        Output list, extended in place.

    Raises
    ------
    ValueError
        If ``seq_size`` is smaller than 1 (the previous loop never
        terminated for ``seq_size == 0``).
    """
    if seq_size < 1:
        raise ValueError("seq_size must be a positive integer")
    data_size = data.shape[0]
    if data_size == 0:
        # Nothing to segment, and there is no first row to read the label from.
        return
    label = data.label.iloc[0]
    activity_id = data.activity_id.iloc[0]
    # The "+ 1" keeps the last window when data_size is an exact multiple of
    # seq_size; the former "<" comparison silently discarded it.
    for start in range(0, data_size - seq_size + 1, seq_size):
        stop = start + seq_size
        window = data.iloc[start:stop].reset_index(drop=True).reset_index()
        if bool(window["moving"].all()) and not window.isnull().any().any():
            sequences.append([activity_id, data.index[start], data.index[stop - 1], window, label])

## Segmenting from database

In [None]:
# Build the segment table `df`: one row per accepted 60-row window.
ids = db_get_ids()
sequences = []
for i in ids:
    data = db_read_by_id(i)
    data = data.set_index("time")
    # Insert a row for every missing integer `time` value between the first
    # and the last sample (assumes `time` is an integer column - TODO confirm).
    data = data.reindex(range(data.index[0], data.index[-1]+1))
    # Assign the filled columns back explicitly: `data.<col>.bfill(inplace=True)`
    # acts on an intermediate Series and does not update `data` when pandas
    # Copy-on-Write is active.
    for column in ("moving", "activity_id", "label"):
        data[column] = data[column].bfill()
    for column in ("velocity_smooth", "grade_smooth", "heartrate"):
        data[column] = data[column].interpolate()
    data = data.reset_index()
    sequenceData(data, 60, sequences)
df = pd.DataFrame(sequences, columns = ['activity_id','start_time', 'end_time', 'sequence', 'label'])
# Keep only the feature columns inside each stored window.
for row in df["sequence"]:
    row.drop(["label","moving","activity_id","time","index"], axis=1, inplace=True)

In [None]:
# Show the segment table built above (one row per accepted window).
print(df)

# Leave-One-Activity-Out Cross-Validation

In [None]:
def loo_split(id):
    """Split the module-level ``df`` into train and test data for one activity.

    Rows of ``df`` whose ``activity_id`` equals ``id`` become the test set,
    all other rows the training set.  Each stored ``sequence`` frame is
    converted with ``to_numpy()``; a resulting 3-D stack is flattened to
    2-D (one row per segment), a 2-D stack is returned unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises
    ------
    ValueError
        If a stacked feature array is neither 2-D nor 3-D (this is what
        happens when one side of the split has no rows).
    """
    def _stack_and_flatten(frames, shape_error):
        # Stack the per-segment frames and merge the two trailing axes.
        stacked = np.array([frame.to_numpy() for frame in frames])
        if stacked.ndim == 3:
            return stacked.reshape(stacked.shape[0], stacked.shape[1] * stacked.shape[2])
        if stacked.ndim == 2:
            return stacked
        raise ValueError(shape_error)

    test_df = df[df['activity_id'] == id]
    train_df = df[df['activity_id'] != id]

    # The test side is processed first so that its shape error takes precedence.
    X_test = _stack_and_flatten(test_df["sequence"], "Unexpected shape for X_test")
    y_test = np.array(list(test_df["label"]))

    X_train = _stack_and_flatten(train_df["sequence"], "Unexpected shape for X_train")
    y_train = np.array(list(train_df["label"]))

    return X_train, X_test, y_train, y_test

In [None]:
## Trains the model on every activity except the one with the given id ##
def svm_model_training(id, kernel, c, gamma, poly_degree = None):
    """Fit an SVC on the split produced by ``loo_split(id)``.

    Features are standardised with a ``StandardScaler`` fitted on the
    training part only; the same scaler is then applied to the test part.
    ``poly_degree`` is forwarded to the classifier only when ``kernel`` is
    ``"poly"``.

    Returns
    -------
    tuple
        ``(svc, X_train, X_test, y_train, y_test)`` where both ``X`` arrays
        are the scaled versions.
    """
    X_train, X_test, y_train, y_test = loo_split(id)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svc_options = {
        "kernel": kernel,
        "C": c,
        "gamma": gamma,
        "cache_size": 2000,
        "probability": True,
    }
    if kernel == "poly":
        svc_options["degree"] = poly_degree

    svc = SVC(**svc_options)
    svc.fit(X_train, y_train)
    return svc, X_train, X_test, y_train, y_test

# Training and Testing of SVM model

In [None]:
def train_test_split(kernel, c, gamma, poly_degree = None, return_per_activity = False):
    """Run leave-one-activity-out training and collect the test predictions.

    For every id returned by ``db_get_ids()`` a model is trained via
    ``svm_model_training`` with that activity held out, saved to
    ``model/svm_<id>.joblib`` and evaluated on the held-out segments.
    The file name depends only on the id, so a later call overwrites the
    models written by an earlier one.

    NOTE: this definition rebinds the name ``train_test_split`` imported
    from ``sklearn.model_selection`` at the top of the file; the name is
    kept because other cells call this function by it.

    Parameters
    ----------
    kernel, c, gamma, poly_degree
        Forwarded to ``svm_model_training`` (which uses ``poly_degree`` only
        for the ``"poly"`` kernel).
    return_per_activity : bool, default False
        When true, a fourth element is returned: a dict with the lists
        ``y_predictions``, ``y_probabilities``, ``y_tests`` (one array per
        activity) and ``y_test_activity`` (first test label per activity).

    Returns
    -------
    tuple
        ``(y_predictions_concat, y_probabilities_concat, y_tests_concat)``,
        followed by the per-activity dict when ``return_per_activity`` is set.
    """
    # joblib.dump does not create missing directories.
    os.makedirs("model", exist_ok=True)

    y_predictions = []
    y_probabilities = []
    y_tests = []
    y_test_activity = []
    ids = db_get_ids()

    for iteration, id in enumerate(ids, start=1):
        print("training split: " + str(iteration) + "/" + str(ids.size))
        # svm_model_training ignores poly_degree for non-poly kernels, so a
        # single call covers both cases.
        svc, _, X_test, _, y_test = svm_model_training(id = id, kernel = kernel, c = c, gamma = gamma, poly_degree = poly_degree)
        dump(svc, 'model/svm_' + str(id) + '.joblib')
        y_predictions.append(svc.predict(X_test))
        y_probabilities.append(svc.predict_proba(X_test))
        y_tests.append(y_test)
        y_test_activity.append(y_test[0])

    y_predictions_concat = np.concatenate(y_predictions)
    y_probabilities_concat = np.concatenate(y_probabilities)
    y_tests_concat = np.concatenate(y_tests)
    if return_per_activity:
        per_activity = {
            "y_predictions": y_predictions,
            "y_probabilities": y_probabilities,
            "y_tests": y_tests,
            "y_test_activity": y_test_activity,
        }
        return y_predictions_concat, y_probabilities_concat, y_tests_concat, per_activity
    return y_predictions_concat, y_probabilities_concat, y_tests_concat

In [None]:
#### train_test_split() test ####
# One full leave-one-activity-out run with a degree-3 polynomial kernel; the
# three concatenated arrays are kept at module level for the ROC cell below.
y_predictions_concat, y_probabilities_concat, y_tests_concat = train_test_split(kernel = "poly", poly_degree = 3, c = 0.1, gamma = 0.1)

# ROC Curve Plotting

In [None]:
def plot_roc(y_tests_concat, y_probabilities_concat):
    """Draw the segment-level ROC curve on the current axes and show it.

    Parameters
    ----------
    y_tests_concat : array-like
        True segment labels.
    y_probabilities_concat : numpy.ndarray
        2-D array of class probabilities; column 0 is used as the score.
    """
    # NOTE(review): column 0 is scored against pos_label=1. predict_proba
    # orders its columns like the classifier's classes_, so this is only the
    # positive-class probability if label 1 sorts first - confirm against the
    # label values in the data.
    scores = y_probabilities_concat[:, 0]
    fpr_sequences, tpr_sequences, _ = roc_curve(y_tests_concat, scores, pos_label = 1)
    roc_sequences = RocCurveDisplay(
        fpr=fpr_sequences,
        tpr=tpr_sequences,
        roc_auc=auc(fpr_sequences, tpr_sequences),
        estimator_name = "segment_classifier",
        pos_label=1,
    )
    roc_sequences.plot(ax=plt.gca())
    plt.grid(1)
    plt.show()

In [None]:
plot_roc(y_tests_concat, y_probabilities_concat)

## Functions for activity classification and segment threshold classification

In [None]:
###  takes numpy array y_probability of sequence classification, returns prediction numpy array based on threshold  #######
def classify_sequence(y_probability, threshold):
    """Turn per-segment probabilities into 0/1 predictions.

    An entry is mapped to ``0`` when it is greater than or equal to
    ``threshold`` and to ``1`` otherwise (NaN therefore maps to ``1``).

    Parameters
    ----------
    y_probability : array-like of float
        One probability per segment.
    threshold : float
        Cut-off compared against every entry.

    Returns
    -------
    numpy.ndarray
        Integer array of the same length; empty input yields an empty
        integer array.
    """
    # NOTE(review): a *high* probability yields class 0 here - confirm this
    # matches which class the supplied probability column refers to.
    # Vectorised comparison instead of a Python-level map over every element.
    return np.where(np.asarray(y_probability) >= threshold, 0, 1)

def classify_sequences_by_threshold(y_probabilities, threshold):
    """Apply ``classify_sequence`` to column 1 of every probability array.

    Parameters
    ----------
    y_probabilities : iterable of numpy.ndarray
        One 2-D probability array per activity.
    threshold : float
        Cut-off forwarded to ``classify_sequence``.

    Returns
    -------
    list
        One prediction array per activity, in input order.
    """
    # Extract every second column first so a malformed array fails before
    # any classification is attempted.
    second_columns = [probabilities[:, 1] for probabilities in y_probabilities]
    return [classify_sequence(column, threshold) for column in second_columns]

#### takes the prediction array of one activity from classify_sequences_by_threshold
#### returns the fraction of its segments predicted as class 1
def predict_probability_activity(y_predict):
    """Return the share of entries in ``y_predict`` that equal 1.

    ``y_predict`` must be a non-empty NumPy array; an empty array raises
    ``ZeroDivisionError``.
    """
    positives = np.count_nonzero(y_predict == 1)
    total = y_predict.size
    return positives / total

def predict_probabilities_activity(sequence_predictions):
    """Return ``predict_probability_activity`` for every activity.

    Parameters
    ----------
    sequence_predictions : iterable of numpy.ndarray
        One prediction array per activity.

    Returns
    -------
    list of float
        Fraction of class-1 segments per activity, in input order.
    """
    return [predict_probability_activity(predictions) for predictions in sequence_predictions]

####  takes numpy array of sequence predictions for an activity_id, returns prediction based on threshold  ###########
def classify_activity_by_threshold(y_predict, threshold):
    """Classify a whole activity from its per-segment predictions.

    Returns ``1`` when ``threshold`` is 0 (without looking at ``y_predict``)
    or when the fraction of class-1 segments reaches ``threshold``;
    otherwise ``0``.
    """
    if threshold == 0:
        return 1
    return 1 if predict_probability_activity(y_predict) >= threshold else 0

# Plotting activity classifier ROC curves for different segment classification thresholds

In [None]:
### plot activity_classifier roc's for different thresholds of sequence classifier
def plot_activity_roc(thresholds_test, y_probabilities, y_test_activity):
    """Overlay one activity-level ROC curve per segment threshold.

    For every threshold the segments are classified with
    ``classify_sequences_by_threshold``, the per-activity fraction of class-1
    segments is used as the activity score, and the resulting ROC curve is
    drawn on the current axes.  Curves with AUC above 0.97 are drawn opaque
    in red; all others get a transparency derived from their AUC.

    Parameters
    ----------
    thresholds_test : iterable of float
        Segment thresholds to compare; may be empty.
    y_probabilities : iterable of numpy.ndarray
        One 2-D probability array per activity.
    y_test_activity : array-like
        True label per activity, aligned with ``y_probabilities``.
    """
    # Fetch the axes once, before the loop, so the legend call below also
    # works when thresholds_test is empty.
    ax = plt.gca()
    for t in thresholds_test:
        y_predictions = classify_sequences_by_threshold(y_probabilities, t)
        y_probability_activities = predict_probabilities_activity(y_predictions)
        fpr, tpr, thresh = roc_curve(y_test_activity, y_probability_activities, pos_label = 1)
        roc_auc = auc(fpr, tpr)
        roc = RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc,estimator_name = f"Threshold = {round(t,4)}", pos_label=1)
        if roc_auc > 0.97:
            roc.plot(ax=ax,color = 'red',alpha=1)
        else:
            # Clamp to matplotlib's valid alpha range: the raw value is
            # negative for AUC below 0.4, which matplotlib rejects.
            alpha_value = min(max((roc_auc-0.4)/0.98, 0.0), 1.0)
            roc.plot(ax=ax, alpha=alpha_value)
    ax.legend(loc='lower right', fontsize=7)
    plt.grid(1)
    plt.show()


In [None]:
# NOTE(review): `y_probabilities` and `y_test_activity` are not assigned at
# module level by any cell above this one (train_test_split keeps its lists
# local); the np.load cell further down assigns them - confirm the intended
# execution order.
# Compare activity-level ROC curves for segment thresholds 0.553 .. 0.557.
best_thresholds = np.arange(0.553,0.558,0.001)
plot_activity_roc(best_thresholds, y_probabilities, y_test_activity)

# Plot AUC as function of threshold

In [None]:
# Sweep the segment threshold from 0 to 0.999 in steps of 0.001 and record
# the activity-level AUC obtained for each value.
# NOTE(review): `y_probabilities` and `y_test_activity` are not assigned by
# any cell above this one; the np.load cell further down assigns them.
thresholds = np.arange(0,1,0.001)
auc_values = []
for t in thresholds:
    # The loop variables stay bound at module level after the loop ends,
    # holding the values from the last threshold (this includes
    # `y_predictions`, which the save cell below reads).
    y_predictions = classify_sequences_by_threshold(y_probabilities, t)
    y_probability_activities = predict_probabilities_activity(y_predictions)
    fpr, tpr, thresh = roc_curve(y_test_activity, y_probability_activities, pos_label = 1)
    roc_auc = auc(fpr, tpr)
    auc_values.append(roc_auc)

In [None]:
# Plot AUC against the segment threshold (the label only becomes visible if
# a legend is added; none is drawn here)
plt.plot(thresholds, auc_values, label='Line Plot')
# Axis labels
plt.xlabel('Thresholds')
plt.ylabel('AUC')

# Grid lines
plt.grid(1)
# Display the plot
plt.show()

In [None]:
# Plot AUC against the segment threshold (the label only becomes visible if
# a legend is added; none is drawn here)
plt.plot(thresholds, auc_values, label='Line Plot')
# Axis labels
plt.xlabel('Thresholds')
plt.ylabel('AUC')
# Zoom in on thresholds between 0.5 and 0.6
plt.xlim(0.5,0.6)
# Grid lines
plt.grid(1)
# Display the plot
plt.show()

# Save and load prediction arrays

In [None]:
# NOTE(review): `y_probabilities`, `y_tests` and `y_test_activity` are not
# assigned at module level by any cell above, and `y_predictions` holds the
# value left over from the last threshold-sweep iteration - confirm these are
# the arrays that should be persisted.

# np.save does not create missing directories.
os.makedirs('data/predictions', exist_ok=True)

# Convert lists to numpy arrays (dtype=object allows one array per activity
# with a different number of segments each)
y_probabilities_array = np.array(y_probabilities, dtype=object)
y_predictions_array = np.array(y_predictions, dtype=object)
y_tests_array = np.array(y_tests, dtype=object)
y_test_activity_array = np.array(y_test_activity)

# Save arrays to files
np.save('data/predictions/y_probabilities.npy', y_probabilities_array)
np.save('data/predictions/y_predictions.npy', y_predictions_array)
np.save('data/predictions/y_tests.npy', y_tests_array)
np.save('data/predictions/y_test_activity.npy', y_test_activity_array)

In [None]:
# Reload the per-activity arrays written by the save cell and rebuild the
# concatenated versions.
# SECURITY: allow_pickle=True makes np.load unpickle object arrays, which can
# execute arbitrary code - only load files produced by this notebook.
y_tests = np.load('data/predictions/y_tests.npy', allow_pickle=True)
y_test_activity = np.load('data/predictions/y_test_activity.npy', allow_pickle=True)
y_probabilities = np.load('data/predictions/y_probabilities.npy', allow_pickle=True)
y_tests_concat = np.concatenate(y_tests)
y_probabilities_concat = np.concatenate(y_probabilities)

## Grid-Search

In [None]:
# Manual grid search: every (kernel, C, gamma[, degree]) combination is run
# through a complete leave-one-activity-out pass and its segment-level ROC
# curve is plotted, followed by the parameters that produced it.
# `performance` and `count` are initialised here but not updated by this cell.
performance = pd.DataFrame(columns=["kernel","c","gamma","degree","auc","tpr","fpr"])
kernels = ["rbf","poly"]
param_C = [1,10]
param_gamma = [1,10]
poly_degrees = [2]
count = 1
for kernel in kernels:
    is_poly = kernel == "poly"
    # Only the polynomial kernel has a degree to vary; other kernels run once
    # per (C, gamma) pair with the default poly_degree of None.
    degree_options = poly_degrees if is_poly else [None]
    for c in param_C:
        for gamma in param_gamma:
            for degree in degree_options:
                y_predictions_concat, y_probabilities_concat, y_tests_concat = train_test_split(kernel = kernel, c = c, gamma = gamma, poly_degree = degree)
                plot_roc(y_tests_concat, y_probabilities_concat)
                print("Kernel:" + kernel)
                if is_poly:
                    print("Polynomial Degree:" + str(degree))
                print("C:" + str(c))
                print("gamma:" + str(gamma))