In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

from collections import Counter
from sklearn.metrics import confusion_matrix


In [4]:
dataset = pd.read_csv('final_dataset_3.csv')
dataset = dataset.set_index('Unnamed: 0')
drop_columns = ['eda_time','form_time', 'gender']
dataset.drop(drop_columns, axis=1, inplace=True)
dataset = dataset.rename(columns={"perc_change": "eda_perc_change", "abs_change": "eda_abs_change"})

In [145]:
perc_columns = [col for col in dataset.columns if 'perc' in col]
dataset_perc = df = dataset[dataset.columns.drop(list(dataset.filter(regex='abs')))]

# columns = ['eda_perc_change','participantID', 'change_in_stressed', 'change_in_amused','change_in_engaged','change_in_frustrated']
total_x_feature_lst = dataset_perc.columns.tolist()[:14]
x_feature_lst_SE =  ['eda_perc_change','perc_bpm','perc_rmssd','perc_sdsd'] # stressed, engaged
x_feature_lst_A =  ['eda_perc_change','perc_s', 'perc_breathingrate'] # amused
x_feature_lst_F =  ['eda_perc_change','perc_bpm'] # frustrated

participantID = ['participantID']
y_labels = ['change_in_stressed', 'change_in_amused','change_in_engaged','change_in_frustrated']
corresponding_x_features = [x_feature_lst_SE, x_feature_lst_A, x_feature_lst_SE, x_feature_lst_F]

columns_SE = x_feature_lst_SE + participantID + y_labels
columns_A = x_feature_lst_A + participantID + y_labels
columns_F = x_feature_lst_F + participantID + y_labels

final_dataset_SE = dataset_perc[columns_SE]
final_dataset_SE[final_dataset_SE == -1] = 0
final_dataset_SE['eda_perc_change'] = final_dataset_SE['eda_perc_change']/100 +1

final_dataset_A = dataset_perc[columns_A]
final_dataset_A[final_dataset_SE == -1] = 0
final_dataset_A['eda_perc_change'] = final_dataset_A['eda_perc_change']/100 +1

final_dataset_F = dataset_perc[columns_F]
final_dataset_F[final_dataset_F == -1] = 0
final_dataset_F['eda_perc_change'] = final_dataset_F['eda_perc_change']/100 +1

corresponding_datasets = [final_dataset_SE, final_dataset_A,final_dataset_SE, final_dataset_F]


svm = SVC(kernel='linear')
dt = DecisionTreeClassifier(random_state=0)
rt = RandomForestClassifier(n_estimators = 10,max_depth=2, random_state=0,min_samples_leaf=1)
knn = KNeighborsClassifier(n_neighbors = 3)

models_lst = [svm, dt, rt, knn]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_SE[final_dataset_SE == -1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_SE['eda_perc_change'] = final_dataset_SE['eda_perc_change']/100 +1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [146]:
# Model selection
def model_selection(x_feature_lst, final_dataset):
    label_feature_model = np.zeros((len(y_labels),len(x_feature_lst)))

    for label_id in range(len(y_labels)):
        for feature_id in range(len(x_feature_lst)):
            results_acc = []

            for model in models_lst:
                clf = model
                participantIDs = np.arange(11)
                # Start Cross-Validation
                for i, a in enumerate(participantIDs):
                    train_ids = participantIDs[participantIDs!=i]
                    test_ids = participantIDs[participantIDs==i]
                    train_data = final_dataset.loc[final_dataset['participantID'].isin(train_ids)]
                    test_data = final_dataset.loc[final_dataset['participantID'].isin(test_ids)]


                    x_train = np.array(train_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
                    y_train = np.array(train_data[y_labels[label_id]].values)
                    x_test = np.array(test_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
                    y_test = np.array(test_data[y_labels[label_id]].values)

                    clf.fit(x_train, y_train)
                    predict = clf.predict(x_test)

                    if i == 0:
                        predictions = predict
                    else:
                        predictions = np.concatenate([predictions, predict])


                results_acc.append(accuracy_score(predictions, final_dataset[y_labels[2]].values))

            best_model_ind = np.argmax(results_acc)
            label_feature_model[label_id, feature_id] = best_model_ind
    return label_feature_model

# 0.0 -> svm, 1.0 -> dt, 2.0 -> rf, 3.0 -> knn
SE_label_feature_model = model_selection(x_feature_lst_SE, final_dataset_SE)
pd_SE_label_feature_model = pd.DataFrame(SE_label_feature_model,
                  index=y_labels, columns=x_feature_lst_SE)

A_label_feature_model = model_selection(x_feature_lst_A, final_dataset_A)
pd_A_label_feature_model = pd.DataFrame(A_label_feature_model,
                  index=y_labels, columns=x_feature_lst_A)

F_label_feature_model = model_selection(x_feature_lst_F, final_dataset_F)
pd_F_label_feature_model = pd.DataFrame(F_label_feature_model,
                  index=y_labels, columns=x_feature_lst_F)

corresponding_models = [SE_label_feature_model, A_label_feature_model, SE_label_feature_model, F_label_feature_model]

print('Selected Models')
print()
print('Stressed & Engaged')
print(pd_SE_label_feature_model)
print('-' * 100)
print('Amused')
print(pd_A_label_feature_model)
print('-' * 100)
print('Frustrated')
print(pd_F_label_feature_model)

Selected Models

Stressed & Engaged
                      eda_perc_change  perc_bpm  perc_rmssd  perc_sdsd
change_in_stressed                0.0       1.0         2.0        1.0
change_in_amused                  2.0       1.0         1.0        1.0
change_in_engaged                 0.0       1.0         0.0        0.0
change_in_frustrated              0.0       1.0         1.0        3.0
----------------------------------------------------------------------------------------------------
Amused
                      eda_perc_change  perc_s  perc_breathingrate
change_in_stressed                0.0     0.0                 3.0
change_in_amused                  2.0     0.0                 1.0
change_in_engaged                 0.0     0.0                 2.0
change_in_frustrated              0.0     3.0                 0.0
----------------------------------------------------------------------------------------------------
Frustrated
                      eda_perc_change  perc_bpm
change_in_s

In [147]:
def label_predictions(y_label, x_feature_lst, final_dataset, models):
    '''
    - Description:
        returns final voting predictions for one label (ex: change_in_stressed)
    - returns:
        total_predictions - > (selected models, prediction(data))
        final_voting -> (1, voting prediction)
    '''

    total_predictions = np.zeros((len(x_feature_lst), len(final_dataset)))
    total_train_predictions = np.zeros((len(x_feature_lst), 170))
    train_gt = np.zeros(170)

    participantIDs = np.arange(11)
    # Start Cross-Validation
    test_data_length = 0
    train_data_length = 0
    for i, a in enumerate(participantIDs):
        for feature_id in range(len(x_feature_lst)):
            train_ids = participantIDs[participantIDs!=i]
            test_ids = participantIDs[participantIDs==i]
            train_data = final_dataset.loc[final_dataset['participantID'].isin(train_ids)]
            test_data = final_dataset.loc[final_dataset['participantID'].isin(test_ids)]
            x_train = np.array(train_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
            y_train = np.array(train_data[y_label].values)
            x_test = np.array(test_data[x_feature_lst[feature_id]].values).reshape(-1, 1)
            y_test = np.array(test_data[y_label].values)

            model_ind = models[y_labels.index(y_label), feature_id]
            clf = models_lst[int(model_ind)]
            clf.fit(x_train, y_train)
            predict = clf.predict(x_test)
            predict_train = clf.predict(x_train)

            total_predictions[feature_id, test_data_length:test_data_length+len(y_test)] = predict
            total_train_predictions[feature_id, train_data_length:train_data_length+len(y_train)] = predict_train


        train_gt[train_data_length:train_data_length+len(y_train)] = y_train
        test_data_length+=len(y_test)
        train_data_length+=len(y_train)


    final_test_voting = np.zeros(len(final_dataset))
    for i in range(len(final_dataset)):
        counts = Counter(total_predictions[:,i])
        vals = list(counts.values())
        if len(vals) == 1:
            final_test_voting[i] = list(counts.keys())[np.argmax(list(counts.values()))]
        elif vals[0] == vals[1]:
            final_test_voting[i] = np.random.choice(list(counts.keys()))
        else:
            final_test_voting[i] = list(counts.keys())[np.argmax(list(counts.values()))]

    final_train_voting = np.zeros(170)
    for i in range(170):
        counts_train = Counter(total_train_predictions[:,i])
        train_vals = list(counts_train.values())
        if len(train_vals) == 1:
            final_train_voting[i] = list(counts_train.keys())[np.argmax(list(counts_train.values()))]
        elif train_vals[0] == train_vals[1]:
            final_train_voting[i] = np.random.choice(list(counts_train.keys()))
        else:
            final_train_voting[i] = list(counts_train.keys())[np.argmax(list(counts_train.values()))]
        
    return total_predictions, final_test_voting, final_train_voting, train_gt, total_train_predictions

#######################################################################################################################

for i in range(len(y_labels)):
    print('-' * 100)
    print('-' * 100)
    print('label name:', y_labels[i])
    print()
    final_v = label_predictions(y_labels[i], corresponding_x_features[i], corresponding_datasets[i], corresponding_models[i])[1]
    final_train_v = label_predictions(y_labels[i], corresponding_x_features[i], corresponding_datasets[i], corresponding_models[i])[2]
    train_gt= label_predictions(y_labels[i], corresponding_x_features[i], corresponding_datasets[i], corresponding_models[i])[3]
    # print('Final voting:', final_v)
    gt = np.array(final_dataset_SE[final_dataset_SE.columns[len(x_feature_lst_SE)+1:]][y_labels[i]])
    acc = accuracy_score(final_v, gt)
    acc_train = accuracy_score(final_train_v, train_gt)

    gt_dict = Counter(gt)
    train_gt_dict = Counter(train_gt)

    train_correct_count_1 = 0
    train_correct_count_0 = 0
    for j in range(len(train_gt)):
        if train_gt[j] == final_train_v[j]:
            if train_gt[j] == 1:
                train_correct_count_1 += 1
            else:
                train_correct_count_0 += 1


    test_correct_count_1 = 0
    test_correct_count_0 = 0
    for j in range(len(gt)):
        if gt[j] == final_v[j]:
            if gt[j] == 1:
                test_correct_count_1 += 1
            else:
                test_correct_count_0 += 1


    print('TRAIN - # of 0 and 1 of Ground Truth', '// 0:', train_gt_dict[0], '    1:', train_gt_dict[1])
    print('TRAIN - # of 0 and 1 correctly predict', '// 0:', train_correct_count_0, '    1:', train_correct_count_1)
    print('TRAIN - ', round(train_correct_count_0/train_gt_dict[0],2), '//', round(train_correct_count_1/train_gt_dict[1],2))
    print()

    print('TEST - # of 0 and 1 of Ground Truth', '// 0:', gt_dict[0], '    1:', gt_dict[1])
    print('TEST - # of 0 and 1 correctly predict', '// 0:', test_correct_count_0, '    1:', test_correct_count_1)
    print('TEST - ', round(test_correct_count_0/gt_dict[0],2), '//', round(test_correct_count_1/gt_dict[1],2))

    if gt_dict[0] > gt_dict[1]:
        print('BASELINE ACCURACY', round(train_gt_dict[0]/(train_gt_dict[0]+train_gt_dict[1]), 2))
    else:
        print('BASELINE ACCURACY', round(train_gt_dict[1]/(train_gt_dict[0]+train_gt_dict[1]), 2))

    print('Total Train Accuracy:', round(acc_train,2))
    print('Total Test Accuracy:', round(acc,2))

    print()
    print('Confusion Matrix - ', y_labels[i])
    print(confusion_matrix(final_v, gt))


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
label name: change_in_stressed

TRAIN - # of 0 and 1 of Ground Truth // 0: 60     1: 110
TRAIN - # of 0 and 1 correctly predict // 0: 48     1: 110
TRAIN -  0.8 // 1.0

TEST - # of 0 and 1 of Ground Truth // 0: 5     1: 11
TEST - # of 0 and 1 correctly predict // 0: 2     1: 10
TEST -  0.4 // 0.91
BASELINE ACCURACY 0.65
Total Train Accuracy: 0.93
Total Test Accuracy: 0.75

Confusion Matrix -  change_in_stressed
[[ 2  1]
 [ 3 10]]
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
label name: change_in_amused

TRAIN - # of 0 and 1 of Ground Truth // 0: 70     1: 100
TRAIN - # of 0 and 1 correctly predict // 0: 65     1: 99
TRAIN -  0.93 // 