# 1. Library

In [None]:
import sys
 
# setting path
sys.path.append('../..')

import pandas as pd
import mne
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
import sklearn
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from os import listdir
from os.path import isfile, join
from mna.utils.rnapp_data_format import read_all_lslpresets, return_metadata_from_name, event_data_from_data, read_event_data
from mna.utils.batch_feature_extraction import clean_up_adadrive_trials

# 2. Feature Extraction and Prediction Functions

## 2.1 Features Extraction

In [None]:
def eeg_features(df, data_type = 'processed', features = 'all', ground_truth = 'Steering_Wheel_Degree_Encoded', 
                 cleaned_up = False):
    """Select the EEG feature columns of `df`, optionally as cleaned numpy arrays.

    The EEG block is the contiguous run of columns from "Fp1_4-8_Hz_Power" to
    "O2_32-55_Hz_Sample_entropy" (inclusive); for ``data_type='raw'`` every
    column name carries a "_raw" suffix.

    Args:
        df: DataFrame holding the EEG columns (and, when `cleaned_up` is True,
            the `ground_truth` and autoreject columns).
        data_type: 'processed' or 'raw'; picks which set of columns is used.
        features: 'all' to keep the whole EEG block, otherwise a pattern or an
            iterable of patterns. A column is kept when its name contains any
            of them (patterns are treated as regular expressions by
            ``str.contains``).
        ground_truth: name of the label column; only read when `cleaned_up`.
        cleaned_up: when True, drop trials whose label is null or whose
            autoreject flag is not False, and return arrays.

    Returns:
        The selected DataFrame when `cleaned_up` is False, otherwise a tuple
        ``(features_array, labels_array)`` restricted to valid trials.

    Raises:
        ValueError: if `data_type` is neither 'processed' nor 'raw'.
    """
    suffix_by_type = {'processed': '', 'raw': '_raw'}
    if data_type not in suffix_by_type:
        raise ValueError(f"data_type must be 'processed' or 'raw', got {data_type!r}")
    suffix = suffix_by_type[data_type]

    first_electrode_idx = df.columns.get_loc(f"Fp1_4-8_Hz_Power{suffix}")
    last_electrode_idx = df.columns.get_loc(f"O2_32-55_Hz_Sample_entropy{suffix}")
    all_eeg_features = df.iloc[:, first_electrode_idx:last_electrode_idx + 1]

    if isinstance(features, str) and features == 'all':
        selected = all_eeg_features
    else:
        # A bare string is one pattern; joining it directly would split it into characters.
        patterns = [features] if isinstance(features, str) else features
        pattern = "|".join(map(str, patterns))
        selected = all_eeg_features.loc[:, all_eeg_features.columns.str.contains(pattern)]

    if not cleaned_up:
        # The label / autoreject columns are deliberately not touched here, so
        # the plain selection also works on frames that lack them.
        return selected

    # with autoreject: `.eq(False)` keeps the original `== False` semantics,
    # i.e. rows with a missing flag are treated as invalid as well.
    valid_trial = df[ground_truth].notnull() & df[f"autorejected{suffix}"].eq(False)
    return np.asarray(selected[valid_trial]), np.asarray(df[ground_truth][valid_trial])

In [None]:
def eye_features(df, features = 'all', ground_truth = 'Steering_Wheel_Degree_Encoded', cleaned_up = False):
    """Select eye-tracking feature columns of `df`, optionally as cleaned numpy arrays.

    Args:
        df: DataFrame holding the eye-tracking columns.
        features: 'all' for the default pupil / NSLR column set, otherwise a
            column selector passed straight to ``df[...]``.
        ground_truth: name of the label column; only read when `cleaned_up`.
        cleaned_up: when True, join the label column, drop rows containing any
            missing value and return arrays.

    Returns:
        The selection when `cleaned_up` is False, otherwise a tuple
        ``(features_array, labels_array)``.
    """
    if isinstance(features, str) and features == 'all':
        # A list selector is required here: a bare comma-separated subscript is
        # a tuple key and pandas looks it up as a single (missing) column.
        eye_df = df[['Left Pupil Diameter', 'NSLR_count_Fixation', 'NSLR_count_Saccade',
                     'NSLR_mean_duration_Fixation', 'NSLR_mean_duration_Saccade',
                     'NSLR_first_onset_Fixation', 'NSLR_first_onset_Saccade']]
    else:
        eye_df = df[features]

    if not cleaned_up:
        return eye_df

    # The label is appended as the last column so it can be split off again by position.
    eye_df = eye_df.join(df[ground_truth]).dropna()
    return np.asarray(eye_df.iloc[:, :-1]), np.asarray(eye_df.iloc[:, -1])

In [None]:
def ecg_features(df, features = "all", ground_truth = 'Steering_Wheel_Degree_Encoded', cleaned_up = False):
    """Select ECG feature columns of `df`, optionally as cleaned numpy arrays.

    With ``features='all'`` the selection is the positional column slice that
    starts at "bpm" and stops (exclusively) two positions before
    "breathingrate". Any other value is used directly as a ``df[...]`` selector.

    When `cleaned_up` is True the `ground_truth` column is joined on, rows with
    any missing value are dropped, and ``(features_array, labels_array)`` is
    returned; otherwise the selection itself is returned.
    """
    # Both anchors are looked up unconditionally, so "bpm" and "breathingrate"
    # must exist in `df` even when an explicit feature list is given.
    start = df.columns.get_loc("bpm")
    # NOTE(review): the "- 2" leaves out the two columns before "breathingrate"
    # as well as "breathingrate" itself - presumably intentional; confirm.
    stop = df.columns.get_loc("breathingrate") - 2

    selected = df.iloc[:, start:stop] if features == 'all' else df[features]

    if not cleaned_up:
        return selected

    labelled = selected.join(df[ground_truth]).dropna()
    predictors = labelled.iloc[:, 0:-1]
    target = labelled.iloc[:, -1]
    return np.asarray(predictors), np.asarray(target)

In [None]:
def multimodal_features(df, features = "all", ground_truth = 'Steering_Wheel_Degree_Encoded'):
    """Build a predictor matrix and target vector from several modalities.

    With ``features="all"`` the default EEG, eye and ECG selections are
    concatenated column-wise with the `ground_truth` column; otherwise
    ``df[features]`` is joined with it. Rows containing any missing value are
    dropped, and the last column (the label) is split off from the rest.

    Returns:
        Tuple ``(predictors_array, target_array)``.
    """
    if features == "all":
        blocks = [eeg_features(df), eye_features(df), ecg_features(df), df[ground_truth]]
        table = pd.concat(blocks, axis = 1)
    else:
        table = df[features].join(df[ground_truth])

    # Complete cases only: one missing predictor or label removes the whole row.
    table = table.dropna()

    predictors = table.iloc[:, 0:-1]
    target = table.iloc[:, -1]
    return np.asarray(predictors), np.asarray(target)

## 2.2 Features Normalization

In [None]:
def norm_features(x_train, x_test):
    """Min-max scale both splits using statistics from the training split only.

    The scaler is fitted on `x_train` and then applied to `x_train` and
    `x_test`, so the test split never influences the fitted scaling.

    Returns:
        Tuple ``(x_train_norm, x_test_norm)``.
    """
    min_max = MinMaxScaler()
    min_max.fit(x_train)
    return min_max.transform(x_train), min_max.transform(x_test)

## 2.3 Prediction Model

In [None]:
def trial_prediction(train_data, test_data, train_true, test_true, prediction_type = "classification", 
                     seed = 42, save_plots = False, plot_fig = True):
    """Fit a random forest on the training split and score it.

    Args:
        train_data, test_data: feature matrices for the two splits.
        train_true, test_true: targets for the two splits. For classification
            the ROC/AUC is computed on ``labels - 1`` against the probability of
            the second class - presumably labels are coded 1/2; confirm.
        prediction_type: "classification" (RandomForestClassifier with
            ``class_weight='balanced_subsample'``) or "regression"
            (RandomForestRegressor).
        seed: random_state handed to the forest.
        save_plots: when plotting, also write the figures below
            ``../output/classification_result/``.
        plot_fig: draw the feature-importance plot and, for classification,
            the ROC curves and confusion matrices.

    Returns:
        ``(auc_train, auc_test, importance)`` for classification or
        ``(mse, rmse, importance)`` for regression (errors on the test split),
        where `importance` is the forest's ``feature_importances_``.

    Raises:
        ValueError: if `prediction_type` is not one of the two supported values.
    """
    if prediction_type not in ("classification", "regression"):
        raise ValueError(
            f"prediction_type must be 'classification' or 'regression', got {prediction_type!r}")

    if prediction_type == "regression":
        random_forest = RandomForestRegressor(random_state = seed).fit(train_data, train_true)
        # The `squared=` keyword of mean_squared_error is deprecated/removed in
        # recent scikit-learn, so the RMSE is taken as the root of the MSE.
        # NOTE: identical to `squared=False` for a single target column only.
        mse = mean_squared_error(test_true, random_forest.predict(test_data))
        rmse = np.sqrt(mse)
    else:
        random_forest = RandomForestClassifier(random_state = seed, 
                                               class_weight = 'balanced_subsample').fit(train_data, train_true)

        # Probability of the second class is used as the ROC score.
        score_train = random_forest.predict_proba(train_data)[:,1]
        score_test = random_forest.predict_proba(test_data)[:,1]

        train_pred = random_forest.predict(train_data)
        test_pred = random_forest.predict(test_data)

        fpr_train, tpr_train, thresholds_train = metrics.roc_curve(train_true-1, score_train)
        auc_train = metrics.roc_auc_score(train_true-1, score_train)

        fpr_test, tpr_test, thresholds_test = metrics.roc_curve(test_true-1, score_test)
        auc_test = metrics.roc_auc_score(test_true-1, score_test)

        # Accuracies are computed but not returned (kept from the original).
        train_acc = metrics.accuracy_score(train_true, train_pred)
        test_acc = metrics.accuracy_score(test_true, test_pred)

    importance = random_forest.feature_importances_

    if plot_fig:
        # Features Importance
        fig_importance = plt.figure(figsize = [10 ,3])
        axe = fig_importance.add_subplot(1,1,1)

        markerline, stemline, baseline = axe.stem(np.arange(len(importance)), importance, 
                                                  linefmt='k-',markerfmt='ko',basefmt='k.')
        plt.setp(stemline, linewidth = 1)
        plt.setp(markerline, markersize = 1)
        axe.set_xlabel("Feature")
        axe.set_ylabel("Importance")
        axe.set_title("Coefficient for Each Features")
        if save_plots:
            plt.savefig("../output/classification_result/feature_importance.png")

        if prediction_type == "classification":
            # ROC Curve
            sns.set(font_scale=2)
            # Newer matplotlib renamed the bundled seaborn styles; use whichever
            # spelling this installation provides.
            white_style = 'seaborn-v0_8-white'
            if white_style not in plt.style.available:
                white_style = 'seaborn-white'
            plt.style.use(white_style)
            fig = plt.figure(figsize = [25,7])

            roc_panels = [(1, fpr_train, tpr_train, "Training ROC Curve", auc_train),
                          (2, fpr_test, tpr_test, "Testing ROC Curve", auc_test)]
            for position, fpr, tpr, title, auc in roc_panels:
                axe = fig.add_subplot(1,2,position)
                axe.plot(fpr, tpr)
                axe.set_xlabel("False Positive Rate")
                axe.set_ylabel("True Positive Rate")
                axe.set_title(title)
                axe.text(0.6,0.2,"AUC = {:.2f}".format(auc))

            if save_plots:
                plt.savefig("../output/classification_result/training_testing_ROC_Curve.png", dpi=300)

            # Confusion Matrix
            fig_cnf = plt.figure(figsize = [20, 5])
            ax1 = fig_cnf.add_subplot(1,2,1)
            ax2 = fig_cnf.add_subplot(1,2,2)

            cnf_matrix_train = metrics.confusion_matrix(train_true, train_pred)
            cnf_matrix_test = metrics.confusion_matrix(test_true, test_pred)

            sns.heatmap(cnf_matrix_train, fmt = 'g', annot = True, xticklabels = ['Easy','Hard'], yticklabels = ['Easy','Hard'],ax=ax1)
            ax1.set_title("Training Confusion Matrix")
            sns.heatmap(cnf_matrix_test, fmt = 'g', annot = True, xticklabels = ['Easy','Hard'], yticklabels = ['Easy','Hard'],ax=ax2)
            ax2.set_title("Testing Confusion Matrix")

            if save_plots:
                plt.savefig("../output/classification_result/training_testing_confusion_matrix.png", dpi=300)

        plt.show()

    if prediction_type == 'classification':
        return auc_train, auc_test, importance
    return mse, rmse, importance

## 2.4 Cross validation

In [None]:
def model_cv(x_modality, y_modality, n_folds = 10, model_type = 'classification', seed = 42):
    """Cross-validate `trial_prediction` and average its metrics over the folds.

    Each fold is min-max normalised with `norm_features` (fitted on the fold's
    training part) before the model is trained without plotting.

    Args:
        x_modality: 2-D feature array, indexed by row with the fold indices.
        y_modality: target array aligned with `x_modality`.
        n_folds: number of cross-validation folds.
        model_type: 'classification' or 'regression', forwarded to
            `trial_prediction`.
        seed: random_state for the fold shuffling and for the model.

    Returns:
        Tuple ``(mean_metrics, model_importance)``: the fold-average of the two
        metrics returned by `trial_prediction` (shape ``(2,)``) and the
        per-fold feature importances (shape ``(n_folds, n_features)``).
    """
    # Function-scope import: KFold is not among the notebook's top-level imports.
    from sklearn.model_selection import KFold

    if model_type == 'regression':
        # Stratification needs discrete classes; StratifiedKFold rejects a
        # continuous target, so regression uses plain shuffled K-fold.
        splitter = KFold(n_splits = n_folds, random_state = seed, shuffle = True)
    else:
        splitter = StratifiedKFold(n_splits = n_folds, random_state = seed, shuffle = True)

    all_model_metrics = np.empty((2, n_folds))
    model_importance = np.empty((n_folds, x_modality.shape[1]))

    for i, (train_index, test_index) in enumerate(splitter.split(x_modality, y_modality)):

        x_train_norm, x_test_norm = norm_features(x_modality[train_index], x_modality[test_index])
        model_metric_1, model_metric_2, coefs = trial_prediction(x_train_norm, x_test_norm, 
                                                      y_modality[train_index], y_modality[test_index],
                                                      prediction_type = model_type, seed = seed,
                                                      plot_fig = False)
        all_model_metrics[0,i] = model_metric_1
        all_model_metrics[1,i] = model_metric_2
        model_importance[i,:] = coefs

    return np.mean(all_model_metrics, axis = 1), model_importance


# 3. Motor Events Trial Level Classification/Regression

## 3.1 Dataframe Processing and Extraction

In [None]:
# Directory that holds the per-participant "ppid*_motor.csv" files.
motor_output_dir = "../../output/batch_analysis/"

In [None]:
import glob
import re

def str_list_to_list(lst):
    """Parse a bracketed, whitespace-separated list of numbers into floats.

    Square brackets are removed and the remainder is split on any run of
    whitespace (spaces, tabs, newlines), e.g. ``"[1. 2.5\\n 3.]"`` becomes
    ``[1.0, 2.5, 3.0]``.

    Raises:
        ValueError: if a token cannot be converted with ``float``.
    """
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    str_no_brackets = re.sub(r"[\[\]]", "", lst)
    return [float(n) for n in str_no_brackets.split()]

# Load and label every per-participant motor CSV.
def read_motor_csvs():
    """Read all "ppid*_motor.csv" files in `motor_output_dir` into one DataFrame.

    For each file the 'post_steer_event_raw' strings are parsed with
    `str_list_to_list` and three columns are derived:

    * 'steering_wheel_degree': absolute difference between the last and the
      first value of each parsed event trace,
    * 'steering_wheel_degree_categorical': per-file median split ("Low"/"High"),
    * 'steering_wheel_degree_encoded': the same split with the categories
      renamed to 1 (Low) and 2 (High).

    Returns:
        The row-wise concatenation of all files (fresh integer index) without
        the columns whose name contains "Unnamed".

    Raises:
        FileNotFoundError: if no matching CSV file exists.
    """
    # Sorted so the row order does not depend on the directory listing order.
    csv_files = sorted(glob.glob(os.path.join(motor_output_dir, "ppid*_motor.csv")))
    if not csv_files:
        raise FileNotFoundError(
            f"No 'ppid*_motor.csv' files found in {motor_output_dir!r}")

    per_file_dfs = []
    for f in csv_files:
        # read the csv file and add column for labels
        temp_df = pd.read_csv(f)

        steer_traces = temp_df['post_steer_event_raw'].apply(str_list_to_list)
        net_rotation = steer_traces.apply(lambda trace: trace[-1] - trace[0])

        temp_df['steering_wheel_degree'] = net_rotation.abs()
        temp_df['steering_wheel_degree_categorical'] = pd.qcut(
            temp_df['steering_wheel_degree'], 2, labels=["Low", "High"])  # 2=High, 1=Low
        # rename_categories is the supported way to relabel a categorical;
        # `replace` on categorical data is deprecated in recent pandas.
        temp_df['steering_wheel_degree_encoded'] = (
            temp_df['steering_wheel_degree_categorical'].cat.rename_categories({'Low': 1, 'High': 2}))

        per_file_dfs.append(temp_df)

    all_dfs = pd.concat(per_file_dfs, ignore_index=True)

    # Drop the index columns that pandas names "Unnamed: N" when a CSV was saved with its index.
    return all_dfs.drop(columns=all_dfs.filter(regex='Unnamed').columns)

In [None]:
# Load every motor CSV once; later cells work on a copy so the loaded frame stays as read.
all_dfs_final = read_motor_csvs()
motor_all_dfs = all_dfs_final.copy()

# (ppid, session) pairs whose rows are excluded from motor_all_dfs further below.
remove_sessions = [(13,1),(15,1),(22,1),(22,102)]

In [None]:
# Composite "<ppid>_<session>" key, kept as a column.
motor_all_dfs['sub_sess'] = motor_all_dfs.ppid.astype(str) + "_" + motor_all_dfs.session.astype(str)

# Exclude the sessions listed in remove_sessions. The (ppid, session) pairs are
# compared numerically, so the filter works whether the columns were read as
# ints or floats (a formatted "13.0_1.0" key only matches float columns).
excluded_sessions = set(remove_sessions)
is_excluded = pd.Series(
    [pair in excluded_sessions for pair in zip(motor_all_dfs.ppid, motor_all_dfs.session)],
    index=motor_all_dfs.index, dtype=bool)
# .copy() so the column assignments below write to an independent frame.
motor_all_dfs = motor_all_dfs.loc[~is_excluded].copy()

# Grand mean of the steering magnitude, broadcast to every row; the regression
# section compares 'steering_wheel_degree' against this column as a baseline.
# NOTE(review): the original read `Steering_Wheel_Degree`, which this notebook
# never creates - confirm the CSVs do not carry a different column of that name.
motor_all_dfs['mean_steering_wheel_degree'] = motor_all_dfs['steering_wheel_degree'].mean()

# Per-participant median split of the pupil diameter, as labels and as 1/2 codes.
pupil_by_ppid = motor_all_dfs.groupby(['ppid'])['Left Pupil Diameter']
motor_all_dfs['pupil_bin'] = pupil_by_ppid.transform(
    lambda x: pd.qcut(x, 2, labels=['low', 'high']))
motor_all_dfs['pupil_bin_encoded'] = pupil_by_ppid.transform(
    lambda x: pd.qcut(x, 2, labels=[1, 2]))

motor_all_dfs = clean_up_adadrive_trials(motor_all_dfs)

In [None]:
# luminance effect removal
#
# For every row that directly continues the previous row (same ppid, same
# session, trial number + 1), subtract weight * (density change) from the pupil
# diameter - but only for participants whose fitted opacity effect is
# significant (p_opacities < p_val_criteria). The unadjusted values are kept in
# 'Raw Left Pupil Diameter'.

pupil_df = pd.read_csv("../output/pupil_exposure/participant_level_exposure_fits.csv")
motor_all_dfs['Raw Left Pupil Diameter'] = motor_all_dfs['Left Pupil Diameter']
p_val_criteria = 0.05

# Scalar weight per participant with a significant effect. Participants without
# a (significant) fit are simply absent and therefore left unadjusted.
significant_fits = pupil_df.loc[pupil_df['p_opacities'] < p_val_criteria]
opacity_weight_by_ppid = dict(zip(significant_fits['sub'], significant_fits['w_opacities']))

# Positional arrays avoid repeated row lookups and index-label pitfalls.
ppids = motor_all_dfs['ppid'].to_numpy()
sessions = motor_all_dfs['session'].to_numpy()
trials = motor_all_dfs['trial'].to_numpy()
densities = motor_all_dfs['density'].to_numpy()
adjusted_pupil = motor_all_dfs['Left Pupil Diameter'].to_numpy(dtype=float, copy=True)

for pos in range(1, len(motor_all_dfs)):
    prev = pos - 1
    is_continuous = (ppids[pos] == ppids[prev]
                     and sessions[pos] == sessions[prev]
                     and trials[pos] == trials[prev] + 1)
    if not is_continuous:
        continue
    weight = opacity_weight_by_ppid.get(ppids[prev])
    if weight is None:
        continue
    adjusted_pupil[pos] -= (densities[pos] - densities[prev]) * weight

motor_all_dfs['Left Pupil Diameter'] = adjusted_pupil

In [None]:
# Histogram of the 'steering_wheel_degree' column over all rows of motor_all_dfs.
sns.histplot(data=motor_all_dfs, x="steering_wheel_degree")
# plt.savefig(f"../output/plots/steering_wheel_turned_deg.png", dpi=300)

In [None]:
# motor_all_dfs.to_csv(f"../output/batch_analysis/motor_df_label.csv")
# motor_all_dfs.to_excel(f"../output/batch_analysis/motor_df_label.xlsx")

## 3.2 Classification - Voice, Pupil-linked Arousal, Steering Wheel Turned

In [None]:
def pred_func(motor_dfs, predictions = 'spoken_difficulty_encoded', pred_type = 'classification'):
    """Cross-validate two fixed predictor sets against one target column.

    Two modalities are evaluated with `model_cv`:

    * "pupil_eeg": density, pupil diameter and fronto-central band powers,
    * "eye_ecg": density, pupil diameter, NSLR eye-movement statistics and
      heart-rate measures.

    Rows whose target equals 0 are dropped first. When the target is
    'pupil_bin_encoded', 'Left Pupil Diameter' is removed from both predictor
    sets, since that target is derived from it.

    Args:
        motor_dfs: DataFrame with the predictor columns and the target column.
        predictions: name of the target column.
        pred_type: 'classification' or 'regression', forwarded to `model_cv`.

    Returns:
        Tuple ``(modalities_metric, modalities_coefs, predictors)``:
        a DataFrame with one column per modality and rows
        'Train AUC'/'Test AUC' (classification) or 'MSE'/'RMSE' (regression);
        a dict mapping modality name to the per-fold importance array returned
        by `model_cv`; and ``[predictors_1, predictors_2]``, the predictor
        names of "pupil_eeg" and "eye_ecg" in that order.

    Raises:
        ValueError: if `pred_type` is not one of the two supported values.
    """
    metric_names_by_type = {'classification': ['Train AUC', 'Test AUC'],
                            'regression': ['MSE', 'RMSE']}
    if pred_type not in metric_names_by_type:
        raise ValueError(
            f"pred_type must be 'classification' or 'regression', got {pred_type!r}")

    predictors_1 = ['density', 'Left Pupil Diameter', 
                  'FC3_4-8_Hz_Power', 'FC3_8-15_Hz_Power', 'FC3_15-32_Hz_Power', 'FC3_32-55_Hz_Power', 
                  'FC1_4-8_Hz_Power', 'FC1_8-15_Hz_Power', 'FC1_15-32_Hz_Power', 'FC1_32-55_Hz_Power', 
                  'FCz_4-8_Hz_Power', 'FCz_8-15_Hz_Power', 'FCz_15-32_Hz_Power', 'FCz_32-55_Hz_Power',
                  'FC2_4-8_Hz_Power', 'FC2_8-15_Hz_Power', 'FC2_15-32_Hz_Power', 'FC2_32-55_Hz_Power', 
                  'FC4_4-8_Hz_Power', 'FC4_8-15_Hz_Power', 'FC4_15-32_Hz_Power', 'FC4_32-55_Hz_Power']

    predictors_2 = ['density', 'Left Pupil Diameter', 
                  'NSLR_count_Fixation', 'NSLR_count_Saccade', 
                  'NSLR_mean_duration_Fixation', 'NSLR_mean_duration_Saccade', 
                  'NSLR_first_onset_Fixation', 'NSLR_first_onset_Saccade', 
                  'bpm', 'sdnn', 'rmssd', 'pnn50']

    # NOTE(review): 0 presumably marks "no label" for the encoded targets - confirm.
    motor_dfs_filtered = motor_dfs[motor_dfs[predictions] != 0].copy()

    if predictions == 'pupil_bin_encoded':
        # The target is binned from the pupil diameter; keeping it would leak the label.
        predictors_1.remove('Left Pupil Diameter')
        predictors_2.remove('Left Pupil Diameter')

    modalities_dict = {
        "pupil_eeg": multimodal_features(motor_dfs_filtered, features = predictors_1,
                                         ground_truth = predictions),
        "eye_ecg": multimodal_features(motor_dfs_filtered, features = predictors_2,
                                       ground_truth = predictions),
    }

    modalities_metric = {}
    modalities_coefs = {}

    for modality, (x_modality, y_modality) in modalities_dict.items():
        metric, coef = model_cv(x_modality, y_modality, model_type = pred_type)
        modalities_metric[modality] = metric
        modalities_coefs[modality] = coef

    modalities_metric = pd.DataFrame(modalities_metric, index = metric_names_by_type[pred_type])

    return modalities_metric, modalities_coefs, [predictors_1, predictors_2]

In [None]:
# Run the default (classification) pipeline for three targets: the encoded spoken
# difficulty, the encoded pupil bin and the encoded steering-wheel degree.
auc_df_voice, modalities_coefs_voice, features_voice = pred_func(motor_all_dfs, predictions = 'spoken_difficulty_encoded')
auc_df_arousal, modalities_coefs_arousal, features_arousal = pred_func(motor_all_dfs, predictions = 'pupil_bin_encoded')
auc_df_wheel_deg, modalities_coefs_wheel_deg, features_wheel_deg = pred_func(motor_all_dfs, predictions = 'steering_wheel_degree_encoded')

In [None]:
def cv_importance_plot(pred_importance_avg, pred_importance_std, pred_feature_list):
    """Draw a bar chart of mean feature importance with error bars.

    Args:
        pred_importance_avg: one mean importance per feature.
        pred_importance_std: matching spread, drawn as the error bars.
        pred_feature_list: feature names used as x tick labels, in the same
            order as the importances.
    """
    importance_fig = plt.figure(figsize = [10 ,3])
    axe = importance_fig.add_subplot(1,1,1)

    features_no = np.arange(len(pred_importance_avg))

    # bar() returns a single container, not the three artists that stem()
    # returns, so its result is not unpacked.
    axe.bar(features_no, pred_importance_avg, yerr=pred_importance_std, 
            align='center', alpha=0.5, ecolor='black', capsize=10)
    axe.set_xticks(features_no)
    # Rotated so the long feature names do not overlap on the narrow figure.
    axe.set_xticklabels(pred_feature_list, rotation=90)
    axe.set_ylabel("Importance")
    axe.set_title("Importance for Each Features")

In [None]:
# pred_func keys the per-fold importances by modality name ("pupil_eeg",
# "eye_ecg"); "pupil_eeg" is the modality built from features_voice[0].
importance_voice_clf_avg = np.mean(modalities_coefs_voice["pupil_eeg"], axis = 0)
importance_voice_clf_std = np.std(modalities_coefs_voice["pupil_eeg"], axis = 0)
feature_list_voice_clf = features_voice[0]

cv_importance_plot(importance_voice_clf_avg, importance_voice_clf_std, feature_list_voice_clf)

## 3.3 Motor Event Regression

In [None]:
# Baseline RMSE of predicting the 'mean_steering_wheel_degree' column for every event.
# The root is taken explicitly because the `squared=` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn.
rmse_mean = np.sqrt(mean_squared_error(motor_all_dfs['steering_wheel_degree'], 
                                       motor_all_dfs['mean_steering_wheel_degree']))

In [None]:
# Same pipeline as the classification runs, but as a regression on the 'steering_wheel_degree' column.
mse_rmse, motor_reg_coefs, motor_reg_features = pred_func(motor_all_dfs, predictions = 'steering_wheel_degree', 
                                   pred_type = 'regression')

In [None]:
# pred_func keys the per-fold importances by modality name ("pupil_eeg",
# "eye_ecg"); "pupil_eeg" is the modality built from motor_reg_features[0].
importance_motor_pred_avg = np.mean(motor_reg_coefs["pupil_eeg"], axis = 0)
importance_motor_pred_std = np.std(motor_reg_coefs["pupil_eeg"], axis = 0)
feature_list_motor_reg = motor_reg_features[0]

cv_importance_plot(importance_motor_pred_avg, importance_motor_pred_std, feature_list_motor_reg)

In [None]:

# Export the combined results table to Excel.
# NOTE(review): `results_clf_reg` is not assigned anywhere in the visible part of
# this notebook - confirm where it is built before running this cell.
# results_clf_reg.to_csv(f'../output/saved_files/all_modality_rmse_auc.csv')
results_clf_reg.to_excel(f'../output/saved_files/all_modality_rmse_auc.xlsx')