In [140]:
# all imports
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
from sklearn.preprocessing import StandardScaler
import os
import time
import random

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import ConfusionMatrixDisplay

import warnings
warnings.filterwarnings("ignore")

In [375]:
# reads xml-data
def read_data(file):
    with open(file, 'r') as f:
        data = f.read()
    bs_data = bs(data, 'xml') 
    return bs_data


# gets the bname for calculations
def get_bname(path):
    for xml_file in os.listdir(path):
        if xml_file.startswith("timeseries"):
            bs_data = read_data(path + xml_file)
            b_name = bs_data.find_all('name')
            return b_name    

# returns a dataframe given some data
def values_df(bs_data, low, high):
    b_name = bs_data.find_all('name')
    attributes = []
    # finds all values 
    for i in range(len(b_name)):
        attributes.append(b_name[i].get('value'))

    attributes = list(attributes)
    collection = {}
    
    # finds x-, y- and z-values and computes euclidian distance
    for i in range(low, high):
        r = calc(b_name, i)
        collection[attributes[i]] = r

    df = pd.DataFrame(collection)
    return df

def get_differences_values(path):
    i = 0
    ts_dict = {}
    
    for xml_file in os.listdir(path):
        if xml_file.startswith("timeseries"):
            try:
                bs_data = read_data(path + xml_file)
                b_name = bs_data.find_all('name')
                
                pl = [(0, int(len(b_name)/4)),
                       (int(len(b_name)/4), int(len(b_name)/2)),
                       (int(len(b_name)/2), int(3*len(b_name)/4)),
                       (int(3*len(b_name)/4), len(b_name))]
            
                pl_list = [] # power level list
                for low, high in pl:
                    
                    df = values_df(bs_data, low, high)

                    df_max = pd.DataFrame(df.max().to_dict(),index=[df.index.values[-1]])
                    df_min = pd.DataFrame(df.min().to_dict(),index=[df.index.values[-1]])
                    df = df_max.subtract(df_min, fill_value=0)
                    
                    pl_list.append(df)
                
                if "healthy" in path.lower():
                    injurylevel = [0 for i in range(len(pl_list))]
                else: 
                    injurylevel = [1 for i in range(len(pl_list))]
                
                df = pd.concat(pl_list)
                df = df.reset_index(drop=True)
                df.drop(["Left Foot Progression", "Right Foot Progression"], axis = 1, inplace = True)
                df['Injury level'] = injurylevel
                
                ts_dict[xml_file] = df
            except:
                print(xml_file + " failed to compute")
    return ts_dict

# calculates the euclidian values
def calc(b_name, i):
    a1 = b_name[i].find_all('component')[0].get('data')
    b1 = b_name[i].find_all('component')[1].get('data')
    c1 = b_name[i].find_all('component')[2].get('data')
    x = np.asarray([float(x) for x in a1.split(',')])
    y = np.asarray([float(y) for y in b1.split(',')])
    z = np.asarray([float(z) for z in c1.split(',')])
    r = (x**2 + y**2 + z**2)**0.5
    return r

def ts_df(path):
    ts_list = []
    
    for xml_file in os.listdir(path):
        if xml_file.startswith("timeseries"):
            ts_list.append(xml_file)
        
    #df = pd.DataFrame({'timeseries': ts_list})
    return ts_list

def cross_validation(ts_list, n_folds, p): 
    test = []
    train = []
    
    for n in range(n_folds):
        random.shuffle(ts_list)
        
        test.append(ts_list[:int(p*len(ts_list))])
        train.append(ts_list[int(p*len(ts_list)):])
    
    data = {'test': test, 'train': train}
    df = pd.DataFrame(data)
    return df

In [376]:
path = "usethis/Healthy/"
healthy_dict = get_differences_values(path)

path = "usethis/PFPS/"
pfps_dict = get_differences_values(path)

ts_dict = {**healthy_dict, **pfps_dict}

In [462]:
path = "usethis/Healthy/"
healthy = ts_df(path)

path = "usethis/PFPS/"
pfps = ts_df(path)

cv_healthy = cross_validation(healthy, 10, 0.2)
cv_pfps = cross_validation(pfps, 10, 0.2)

cv = cv_healthy + cv_pfps

## Preprocessing

In [463]:
# extract data from different fold list
def extract_data(flod_list):
    X = []
    y = []
    
    for fold in flod_list:
        temp_list = []
        random.shuffle(fold)
        
        for ts in fold:
            temp_list.append(ts_dict.get(ts))
            
        df = pd.concat(temp_list)
        df = df.reset_index(drop=True)
        
        y_df = df['Injury level']
        x_df = df.drop(['Injury level'], axis = 1)
        
        X.append(x_df)
        y.append(y_df)
    
    return X, y


def get_X_data(X_test, X_train):
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))

    imputed_X_train.columns = X_train.columns
    imputed_X_test.columns = X_test.columns

    sc = StandardScaler()
    X_train = sc.fit_transform(imputed_X_train)
    X_test = sc.transform(imputed_X_test)
    
    return X_train, X_test

In [464]:
X_train, y_train = extract_data(cv['train'])
X_test, y_test = extract_data(cv['test'])

## XGB - Boost

In [465]:
def xgb(X_test, X_train, y_train, y_test):
    XGB_classifier = XGBClassifier(use_label_encoder = False,
                              learning_rate=0.1,
                              max_depth=10,
                              scale_pos_weight=1.5,
                              eval_metric='mlogloss')
    XGB_classifier.fit(X_train, y_train)
    
    # MAKE PREDICTION
    y_pred = XGB_classifier.predict(X_test)
    score1 = accuracy_score(y_test, y_pred)
    
    return score1

In [466]:
accuracy_list = []

for i in range(len(X_test)):
    X_train, X_test = get_X_data(X_test[i], X_train[i])
    
    accuracy = xgb(X_test, X_train, y_train[i].values, y_test[i].values)
    accuracy_list.append(accuracy)
    
accuracy_mean = np.mean(accuracy_list)

ValueError: Expected 2D array, got 1D array instead:
array=[-0.67750388 -0.92613942 -0.68649574 -0.08477118  0.82530647  0.49439046
  0.27766573 -1.01244394 -0.61653997 -0.60176469 -0.4413632  -0.39256992
 -0.81347757 -0.16129862 -0.68168498  0.1504516  -0.71881462 -0.50639832
  0.04020297  0.18816557  0.04995187  0.43001897 -0.99609169 -0.57853215
 -0.55560577 -0.78333001 -0.19562115 -0.78687442 -0.38995342 -0.38395706].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [461]:
print(accuracy_mean)
display(X_train)

0.375


array([[-0.49171099,  0.22505398,  0.41441444, ...,  0.29232891,
        -0.82591562, -0.78678933],
       [-0.55694243,  0.53202485,  0.15580859, ...,  0.35642254,
        -0.55564176, -0.70285866],
       [-0.39908419,  0.70132586,  0.04082704, ...,  0.3450834 ,
        -0.46670051, -0.61615745],
       ...,
       [-0.25802897, -0.31413996,  0.74694779, ..., -0.88164959,
        -0.2412709 , -0.5791932 ],
       [-0.02384506, -0.45223795,  0.80840482, ..., -1.24988511,
         0.        , -0.46158864],
       [ 2.17098483, -1.67295153,  0.96391595, ..., -1.32226763,
        -0.30000391,  0.67940839]])