In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from collections import Counter

In [3]:
dataset = pd.read_csv('final_dataset.csv')
dataset = dataset.set_index('Unnamed: 0')
drop_columns = ['eda_time','time', 'gender']
dataset.drop(drop_columns, axis=1, inplace=True)
dataset = dataset.rename(columns={"perc_change": "eda_perc_change", "abs_change": "eda_abs_change"})

In [4]:
perc_columns = [col for col in dataset.columns if 'perc' in col]
dataset_perc = df = dataset[dataset.columns.drop(list(dataset.filter(regex='abs')))]

columns = ['eda_perc_change','perc_bpm','perc_rmssd','perc_sdsd','participantID', 'change_in_stressed', 'change_in_amused','change_in_engaged','change_in_frustrated']
final_dataset = dataset_perc[columns]
final_dataset[final_dataset == -1] = 0
final_dataset['eda_perc_change'] = final_dataset['eda_perc_change']/100 +1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset[final_dataset == -1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['eda_perc_change'] = final_dataset['eda_perc_change']/100 +1


In [5]:
x_feature_lst = final_dataset.columns[:4]
y_labels = ['change_in_stressed', 'change_in_amused','change_in_engaged','change_in_frustrated']

In [11]:
svm = SVC(kernel='linear')
dt = DecisionTreeClassifier(random_state=0)
rt = RandomForestClassifier(n_estimators = 10,max_depth=2, random_state=0,min_samples_leaf=1)
knn = KNeighborsClassifier(n_neighbors = 3)

models_lst = [svm, dt, rt, knn]

['change_in_stressed',
 'change_in_amused',
 'change_in_engaged',
 'change_in_frustrated']

In [24]:
# Model selection
label_feature_model = np.zeros((len(y_labels),len(x_feature_lst)))

for label_id in range(len(y_labels)):
    for feature_id in range(len(x_feature_lst)):
        results_acc = []

        for model in models_lst:
            clf = model
            participantIDs = np.arange(11)
            # Start Cross-Validation
            for i, a in enumerate(participantIDs):
                train_ids = participantIDs[participantIDs!=i]
                test_ids = participantIDs[participantIDs==i]
                train_data = final_dataset.loc[final_dataset['participantID'].isin(train_ids)]
                test_data = final_dataset.loc[final_dataset['participantID'].isin(test_ids)]


                x_train = np.array(train_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
                y_train = np.array(train_data[y_labels[label_id]].values)
                x_test = np.array(test_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
                y_test = np.array(test_data[y_labels[label_id]].values)

                clf.fit(x_train, y_train)
                predict = clf.predict(x_test)

                if i == 0:
                    predictions = predict
                else:
                    predictions = np.concatenate([predictions, predict])


            results_acc.append(accuracy_score(predictions, final_dataset[y_labels[2]].values))

        best_model_ind = np.argmax(results_acc)
        label_feature_model[label_id, feature_id] = best_model_ind


In [25]:
pd_label_feature_model = pd.DataFrame(label_feature_model,
                  index=y_labels, columns=x_feature_lst)

pd_label_feature_model # 0.0 -> svm, 1.0 -> dt, 2.0 -> rf, 3.0 -> knn

Unnamed: 0,eda_perc_change,perc_bpm,perc_rmssd,perc_sdsd
change_in_stressed,1.0,3.0,2.0,1.0
change_in_amused,0.0,3.0,1.0,1.0
change_in_engaged,0.0,3.0,2.0,2.0
change_in_frustrated,0.0,0.0,0.0,1.0


In [68]:
def label_predictions(y_label):
    '''
    - Description:
        returns final voting predictions for one label (ex: change_in_stressed)
    - returns:
        total_predictions - > (selected models, prediction(data))
        final_voting -> (1, voting prediction)
    '''

    total_predictions = np.zeros((len(x_feature_lst), len(final_dataset)))
    # for label_id in range(len(y_labels)):
    for feature_id in range(len(x_feature_lst)):
        results_acc = []

        model_ind = label_feature_model[0, feature_id]
        clf = models_lst[int(model_ind)]
        participantIDs = np.arange(11)

        # Start Cross-Validation
        for i, a in enumerate(participantIDs):
            train_ids = participantIDs[participantIDs!=i]
            test_ids = participantIDs[participantIDs==i]
            train_data = final_dataset.loc[final_dataset['participantID'].isin(train_ids)]
            test_data = final_dataset.loc[final_dataset['participantID'].isin(test_ids)]


            x_train = np.array(train_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
            y_train = np.array(train_data[y_label].values)
            x_test = np.array(test_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
            y_test = np.array(test_data[y_label].values)

            clf.fit(x_train, y_train)
            predict = clf.predict(x_test)

            if i == 0:
                predictions = predict
            else:
                predictions = np.concatenate([predictions, predict])

        total_predictions[feature_id, :] = predictions

    final_voting = np.zeros(len(final_dataset))
    for i in range(len(final_dataset)):
        counts = Counter(total_predictions[:,i])
        final_voting[i] = list(counts.keys())[np.argmax(list(counts.values()))]
        
        
    return total_predictions, final_voting


In [86]:
for i in range(len(y_labels)):
    print('-' * 20)
    print('label name', y_labels[i])
    print()
    final_v = label_predictions(y_labels[i])[1]
    print('Final voting:', final_v)
    gt = np.array(final_dataset[final_dataset.columns[5:]][y_labels[i]])
    acc = accuracy_score(final_v, gt)
    print('Ground Truth:', gt)
    print('Accuracy:', acc)

--------------------
label name change_in_stressed

Final voting: [1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.]
Ground Truth: [1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1.]
Accuracy: 0.5882352941176471
--------------------
label name change_in_amused

Final voting: [1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1.]
Ground Truth: [0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0.]
Accuracy: 0.4117647058823529
--------------------
label name change_in_engaged

Final voting: [0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.]
Ground Truth: [0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1.]
Accuracy: 0.6470588235294118
--------------------
label name change_in_frustrated

Final voting: [1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1.]
Ground Truth: [0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0.]
Accuracy: 0.5294117647058824
