In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from collections import Counter

In [2]:
dataset = pd.read_csv('final_dataset_3.csv')

dataset = dataset.set_index('Unnamed: 0')

print(dataset.columns)
drop_columns = ['eda_time','form_time','form_time.1', 'gender']
dataset.drop(drop_columns, axis=1, inplace=True)
dataset.sort_values(by='participantID', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last', ignore_index=False, key=None)

ids = dataset['participantID']

ids

Index(['form_time', 'eda_time', 'eda_perc_change', 'perc_bpm', 'abs_bpm',
       'perc_ibi', 'abs_ibi', 'perc_sdnn', 'abs_sdnn', 'perc_sdsd', 'abs_sdsd',
       'perc_rmssd', 'abs_rmssd', 'perc_pnn20', 'abs_pnn20', 'perc_pnn50',
       'abs_pnn50', 'perc_hr_mad', 'abs_hr_mad', 'perc_sd1', 'abs_sd1',
       'perc_sd2', 'abs_sd2', 'perc_s', 'abs_s', 'perc_sd1/sd2', 'abs_sd1/sd2',
       'perc_breathingrate', 'abs_breathingrate', 'form_time.1',
       'participantID', 'gender', 'change_in_stressed', 'change_in_amused',
       'change_in_nervous', 'change_in_engaged', 'change_in_lost',
       'change_in_disappointed', 'change_in_excited', 'change_in_frustrated',
       'change_in_contempt'],
      dtype='object')


Unnamed: 0
2      0
4      1
13     1
3      2
10     3
12     3
15     3
8      4
14     4
5      5
1      6
7      7
11     7
0      8
6      9
9     10
Name: participantID, dtype: int64

In [3]:
perc_columns = [col for col in dataset.columns if 'perc' in col]
dataset_perc = dataset[dataset.columns.drop(list(dataset.filter(regex='abs')))]
dataset_perc

Unnamed: 0_level_0,eda_perc_change,perc_bpm,perc_ibi,perc_sdnn,perc_sdsd,perc_rmssd,perc_pnn20,perc_pnn50,perc_hr_mad,perc_sd1,...,participantID,change_in_stressed,change_in_amused,change_in_nervous,change_in_engaged,change_in_lost,change_in_disappointed,change_in_excited,change_in_frustrated,change_in_contempt
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-18.175823,1.233378,0.810781,1.243654,1.474264,1.457498,0.921053,1.246964,1.75,1.466168,...,0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
4,12.05995,0.94614,1.056926,1.415998,1.209645,1.223343,1.240385,1.791667,1.0,1.214502,...,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13,5.554695,1.026443,0.974238,0.613986,0.427218,0.426255,0.916667,0.666667,0.285714,0.426003,...,1,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,-11.86975,1.259298,0.794093,2.980787,2.251378,2.582955,1.0,1.470588,1.714286,2.584538,...,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,10.470302,1.054041,0.94873,0.875804,0.209045,0.300858,0.818182,0.113636,0.4,0.300691,...,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
12,-16.508557,0.856132,1.168044,1.91619,2.016577,1.764414,1.166667,1.4,2.0,1.784006,...,3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
15,33.068751,1.500757,0.66633,1.380636,2.451917,2.595382,1.269841,2.333333,1.25,2.591627,...,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,57.804164,1.233832,0.810483,1.226543,0.588983,0.838645,1.757396,1.78022,1.333333,0.842642,...,4,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
14,9.757678,1.011955,0.988186,1.551937,2.664646,1.590097,0.857143,0.892857,1.285714,1.54272,...,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,73.181025,1.276181,0.783588,1.665727,2.40052,3.334103,1.944444,8.75,2.666667,3.333646,...,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
hrv_columns = ["perc_breathingrate","perc_sdnn","perc_sdsd","perc_s","perc_bpm","perc_rmssd"]

In [5]:
features = dataset_perc[perc_columns]
features['eda_perc_change'] /= 100 
features['eda_perc_change'] += 1
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,eda_perc_change,perc_bpm,perc_ibi,perc_sdnn,perc_sdsd,perc_rmssd,perc_pnn20,perc_pnn50,perc_hr_mad,perc_sd1,perc_sd2,perc_s,perc_sd1/sd2,perc_breathingrate
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,0.818242,1.233378,0.810781,1.243654,1.474264,1.457498,0.921053,1.246964,1.75,1.466168,1.249142,1.831452,1.17374,0.712531
4,1.120599,0.94614,1.056926,1.415998,1.209645,1.223343,1.240385,1.791667,1.0,1.214502,1.396485,1.696034,0.869685,0.83251
13,1.055547,1.026443,0.974238,0.613986,0.427218,0.426255,0.916667,0.666667,0.285714,0.426003,0.7049,0.30029,0.604346,1.911492
3,0.881303,1.259298,0.794093,2.980787,2.251378,2.582955,1.0,1.470588,1.714286,2.584538,3.431106,8.867824,0.753267,0.654889
10,1.104703,1.054041,0.94873,0.875804,0.209045,0.300858,0.818182,0.113636,0.4,0.300691,0.601345,0.180819,0.50003,0.505947
12,0.834914,0.856132,1.168044,1.91619,2.016577,1.764414,1.166667,1.4,2.0,1.784006,1.916642,3.419301,0.930798,0.414947
15,1.330688,1.500757,0.66633,1.380636,2.451917,2.595382,1.269841,2.333333,1.25,2.591627,1.096942,2.842865,2.362592,1.661831
8,1.578042,1.233832,0.810483,1.226543,0.588983,0.838645,1.757396,1.78022,1.333333,0.842642,1.499373,1.263435,0.561996,0.365168
14,1.097577,1.011955,0.988186,1.551937,2.664646,1.590097,0.857143,0.892857,1.285714,1.54272,2.837654,4.377706,0.543661,1.149916
5,1.73181,1.276181,0.783588,1.665727,2.40052,3.334103,1.944444,8.75,2.666667,3.333646,1.260241,4.201198,2.645244,3.828861


In [10]:
# include_all = True
# if include_all:
#     hr_features = features[perc_columns]
#     hr_features.drop(['eda_perc_change'], axis=1,inplace=True)
# else:
#     hr_features = features[hrv_columns]
# hr_features= StandardScaler().fit_transform(hr_features)

# eda_features = features['eda_perc_change']
# eda_features = StandardScaler().fit_transform(eda_features.to_numpy().reshape(-1,1))
# pca = PCA(n_components=13)
# pca_HRV = pca.fit_transform(hr_features)

# pca_HRV = StandardScaler().fit_transform(pca_HRV)

# transformed_features = np.append(pca_HRV, eda_features, axis=1)

# print('explained_variance:', sum(pca.explained_variance_ratio_))
y_labels = ['change_in_stressed', 'change_in_amused','change_in_engaged','change_in_frustrated']

y_data = dataset_perc[y_labels]
y_data[y_data == -1] = 0
y_data = y_data.to_numpy()
svm = SVC(kernel='linear')
dt = DecisionTreeClassifier(random_state=0)
rt = RandomForestClassifier(n_estimators = 10,max_depth=2, random_state=0,min_samples_leaf=1)
knn = KNeighborsClassifier(n_neighbors = 3)

models_lst = [svm, dt, rt, knn]
# Model selection
def model_selection(features, y_data, include_all=True,max_p=12):
    if include_all:

        hr_features = features[perc_columns]
        
        hr_features.drop(['eda_perc_change'], axis=1,inplace=True)
    else:
        hr_features = features[hrv_columns]
    
    label_model = []
    label_comp = []
    
    hr_features= StandardScaler().fit_transform(hr_features)

    eda_features = features['eda_perc_change']
    eda_features = StandardScaler().fit_transform(eda_features.to_numpy().reshape(-1,1))
    for label_id in range(len(y_labels)):
        results_acc = np.zeros((len(models_lst),max_p))
        for p in range(max_p):

            for j,model in enumerate(models_lst):
                clf = model
                participantIDs = np.arange(11)
                # Start Cross-Validation
                for i, a in enumerate(participantIDs):
                    
                    pca = PCA(n_components=p+2)
                    pca_HRV = pca.fit_transform(hr_features)

                    pca_HRV = StandardScaler().fit_transform(pca_HRV)

                    transformed_features = np.append(pca_HRV, eda_features, axis=1)
                    
                    train_ids = participantIDs[participantIDs!=i]
                    test_ids = participantIDs[participantIDs==i]

                    x_train = transformed_features[(dataset_perc['participantID'].isin(train_ids)).to_numpy()]
                    x_test = transformed_features[(dataset_perc['participantID'].isin(test_ids)).to_numpy()]

                    y_train = y_data[(dataset_perc['participantID'].isin(train_ids)).to_numpy(),label_id]
                    y_test = y_data[(dataset_perc['participantID'].isin(test_ids)).to_numpy(),label_id]



                    clf.fit(x_train, y_train)
                    predict = clf.predict(x_test)

                    if i == 0:
                        predictions = predict
                        gt = y_test
                    else:
                        predictions = np.concatenate([predictions, predict])
                        gt = np.concatenate([gt, y_test])

                results_acc[j,p] = accuracy_score(predictions,gt)

        best_model_ind, num_comp = np.unravel_index(results_acc.argmax(), results_acc.shape)

        label_model.append(best_model_ind)
        label_comp.append(num_comp)
    return label_model, np.array(label_comp)+2



def label_predictions(features, y_data, models, pcs, include_all=True):
    '''
    - Description:
        returns final voting predictions for one label (ex: change_in_stressed)
    - returns:
        total_predictions - > (selected models, prediction(data))
        final_voting -> (1, voting prediction)
    '''

    
    if include_all:

        hr_features = features[perc_columns]
        
        hr_features.drop(['eda_perc_change'], axis=1,inplace=True)
    else:
        hr_features = features[hrv_columns]
    
    label_model = []
    label_comp = []
    
    hr_features= StandardScaler().fit_transform(hr_features)
    

    eda_features = features['eda_perc_change']
    eda_features = StandardScaler().fit_transform(eda_features.to_numpy().reshape(-1,1))
    for y_label in range(len(y_labels)):
        total_predictions = np.zeros(len(features))
        total_train_predictions = np.zeros(170)
        train_gt = np.zeros(170)

        participantIDs = np.arange(11)
        # Start Cross-Validation
        test_data_length = 0
        train_data_length = 0
        print('-' * 100)
        print('-' * 100)
        print('label name:', y_labels[y_label])
        print()

        for i, a in enumerate(participantIDs):
            model_ind = models[y_label]
            pc = pcs[y_label]
            
            pca = PCA(n_components=pc)
            pca_HRV = pca.fit_transform(hr_features)

            pca_HRV = StandardScaler().fit_transform(pca_HRV)

            transformed_features = np.append(pca_HRV, eda_features, axis=1)
            train_ids = participantIDs[participantIDs!=i]
            test_ids = participantIDs[participantIDs==i]

            x_train = transformed_features[(dataset_perc['participantID'].isin(train_ids)).to_numpy()]
            x_test = transformed_features[(dataset_perc['participantID'].isin(test_ids)).to_numpy()]
            
            y_train = y_data[(dataset_perc['participantID'].isin(train_ids)).to_numpy(),y_label]
            y_test = y_data[(dataset_perc['participantID'].isin(test_ids)).to_numpy(),y_label]

            
            clf = models_lst[int(model_ind)]
            clf.fit(x_train, y_train)
            predict = clf.predict(x_test)
            
            predict_train = clf.predict(x_train)
            total_predictions[test_data_length:test_data_length+len(y_test)] = predict
            total_train_predictions[train_data_length:train_data_length+len(y_train)] = predict_train

            train_gt[train_data_length:train_data_length+len(y_train)] = y_train
            test_data_length+=len(y_test)
            train_data_length+=len(y_train)
      
            
      
        final_v = total_predictions
        
        final_train_v = total_train_predictions
        gt = y_data[:,y_label]
        acc = accuracy_score(final_v, gt)
        f1 = f1_score(final_v, gt)

        acc_train = accuracy_score(final_train_v, train_gt)
        f1_train = f1_score(final_train_v, train_gt)
        gt_dict = Counter(gt)

        train_gt_dict = Counter(train_gt)
        
        train_correct_count_1 = 0
        train_correct_count_0 = 0
        for j in range(len(train_gt)):
            if train_gt[j] == final_train_v[j]:
                if train_gt[j] == 1:
                    train_correct_count_1 += 1
                else:
                    train_correct_count_0 += 1


        test_correct_count_1 = 0
        test_correct_count_0 = 0
        for j in range(len(gt)):
            if gt[j] == final_v[j]:
                if gt[j] == 1:
                    test_correct_count_1 += 1
                else:
                    test_correct_count_0 += 1


        print('TRAIN - # of 0 and 1 of Ground Truth', '// 0:', train_gt_dict[0], '    1:', train_gt_dict[1])
        print('TRAIN - # of 0 and 1 correctly predict', '// 0:', train_correct_count_0, '    1:', train_correct_count_1)
        print('TRAIN - ', round(train_correct_count_0/train_gt_dict[0],2), '//', round(train_correct_count_1/train_gt_dict[1],2))
        print()

        print('TEST - # of 0 and 1 of Ground Truth', '// 0:', gt_dict[0], '    1:', gt_dict[1])
        print('TEST - # of 0 and 1 correctly predict', '// 0:', test_correct_count_0, '    1:', test_correct_count_1)
        print('TEST - ', round(test_correct_count_0/gt_dict[0],2), '//', round(test_correct_count_1/gt_dict[1],2))

        if gt_dict[0] > gt_dict[1]:
            print('BASELINE ACCURACY', round(train_gt_dict[0]/(train_gt_dict[0]+train_gt_dict[1]), 2))
        else:
            print('BASELINE ACCURACY', round(train_gt_dict[1]/(train_gt_dict[0]+train_gt_dict[1]), 2))

        print('Total Train Accuracy:', round(acc_train,2))
        print('Total Test Accuracy:', round(acc,2))
        print()
        print('Total Train F1 score:', round(f1_train,2))
        print('Total Test F1 score:', round(f1,2))
        print()
        print('Confusion Matrix - ', y_labels[y_label])
        print(confusion_matrix(gt, final_v))
        
    

#######################################################################################################################

models, pcs = model_selection(features, y_data, include_all=True,max_p=6)
print('models',models)
print('principal_components',pcs)
label_predictions(features, y_data, models,pcs)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


models [2, 1, 0, 0]
principal_components [7 6 7 4]
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
label name: change_in_stressed

TRAIN - # of 0 and 1 of Ground Truth // 0: 60     1: 110
TRAIN - # of 0 and 1 correctly predict // 0: 38     1: 110
TRAIN -  0.63 // 1.0

TEST - # of 0 and 1 of Ground Truth // 0: 5     1: 11
TEST - # of 0 and 1 correctly predict // 0: 2     1: 10
TEST -  0.4 // 0.91
BASELINE ACCURACY 0.65
Total Train Accuracy: 0.87
Total Test Accuracy: 0.75

Total Train F1 score: 0.91
Total Test F1 score: 0.83

Confusion Matrix -  change_in_stressed
[[ 2  3]
 [ 1 10]]
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
label name: change_in_amused

TRAIN - # of 0 and 1 of Gr

(2, 2)

In [20]:
for i,model in enumerate(models_lst):
    print(i,model)

0 SVC(kernel='linear')
1 DecisionTreeClassifier(random_state=0)
2 RandomForestClassifier(max_depth=2, n_estimators=10, random_state=0)
3 KNeighborsClassifier(n_neighbors=3)
