# Muse Data Elaborator

1. Data Recording
    1. (done) Choose the App
    1. (done) Download the data
1. Data Cleaning
    1. (done) Remove noise
    1. (done) Remove invalid parts: when the sensor signal is not reliable, that row is removed
1. Feature Extraction
    1. (done) Calculation of features (five brain wave frequency bands)
    1. (done) Normalize data using the baseline.
1. "Machine Learning"
    1. (done) Split data. (window of 30 seconds, taking a sample from it)
    1. (done) Calculate the stress level.
    1. (done) Label data.
1. Data Visualization
    1. (done) Sections visualization.
    1. (done) Scenarios comparison.

#### Waves information

1. Alpha:
    - increases when a person is relaxed (the lower the alpha, the busier the person)
    - blocked in task engagement
    
    
1. Beta:
    - increases in task engagement
    
    
1. Theta:
    - increases in demand and working memory load
    - sensitive to task difficulty
    - suppressed in task engagement
    
    
1. Delta:
    - sensitive to task difficulty

### Code

In [1]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file, save, curdoc
from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter, Title
from bokeh.models.widgets import Select
from bokeh.layouts import column, row, gridplot
from datetime import datetime as dt
from math import pi, sqrt
import os
import csv
import statistics

# Scenario titles
experiment_labels = ['scenario 1', 'scenario 2', 'scenario 3']
all_scenario_labels = ['fishes 1', 'scenario 1', 'fishes 2', 'scenario 2', 'fishes 3', 'scenario 3']

# Y-axis limits for the plots. They match the -100..200 interval that
# extract_features rescales the band averages into (lower bound first, so the
# axis is not drawn upside down).
min_range = -100
max_range = 200

# Threshold for a good sensor signal. Each sensor reports one of 3 HSI values
# (1=Good, 2=Medium, 4=Bad); clean_data sums the four sensors' values and keeps
# the rows whose sum is <= this threshold.
hsi_threshold = 8

# Window length in seconds, passed to split_in_windows
windows_no = 30

In [2]:
# 0. Calls all the algorithms in the right order
def execute_class(participant_no, experiment_order):
    """Run the whole pipeline for one participant and save the results.

    Parameters
    ----------
    participant_no : str
        Participant identifier; it is used to build the input and output
        file names.
    experiment_order : sequence of str
        Scenario codes in the order the participant performed them; passed
        on to save_sections.

    Returns
    -------
    None
        Nothing is saved when clean_data reports invalid markers.
    """

    # Read the file
    df = read_file(participant_no)

    # Clean the data and split it based on content
    df_records, df_elements, df_markers = clean_data(df)

    # clean_data returns empty strings instead of dataframes when the markers
    # are not valid: in that case there is nothing to elaborate
    if isinstance(df_records, str):
        return

    # Calculating the frequencies using the correct range
    df_prepared = extract_features(df_records)
    #visualize_simple_plot([df_prepared], "initial", ['All'])

    # Split the data into baseline data and experiment data
    sections = split_data(df_prepared, df_markers)
    #visualize_simple_plot(sections, "sections", all_scenario_labels)

    # Normalize the experiment based on its baseline and the quality of data
    normalized_sections = normalize_data(sections)

    # Splitting data into comparable windows
    window_stress_values = split_in_windows(normalized_sections, windows_no)

    # Save file in CSV
    save_sections(participant_no, window_stress_values, experiment_order)
        

In [3]:
# 1. Load the data into a dataframe
def read_file(participant_no):
    """Load the CSV recording of one participant.

    The file is looked up as ``aData/<participant_no>.csv`` below the
    absolute path of the directory part of ``<participant_no>.csv`` (the
    current working directory when participant_no has no directory part).

    Parameters
    ----------
    participant_no : str
        Participant identifier, i.e. the file name without the extension.

    Returns
    -------
    pandas.DataFrame
        The content of the comma-separated file.
    """
    # A plain local name: the original rebound __file__, which hides the
    # module attribute of the same name
    file_name = participant_no + '.csv'
    base_dirpath = os.path.abspath(os.path.dirname(file_name))

    # os.path.join picks the right separator, so the path is not tied to
    # Windows backslashes
    file_path = os.path.join(base_dirpath, 'aData', file_name)
    return pd.read_csv(file_path, sep=",")

In [4]:
# 2. Removes unnecessary columns and checks the correctness
def clean_data(df):
    """Validate the markers and keep only the usable recording rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw recording. The columns read here are 'TimeStamp', 'Elements',
        'HeadBandOn' and the four 'HSI_*' columns; the gyroscope,
        accelerometer, 'AUX_RIGHT' and 'Battery' columns are dropped.
        The frame itself is not modified.

    Returns
    -------
    tuple
        ``(df_records, df_elements, df_markers)``:
        - df_records: rows with HeadBandOn == 1 whose summed HSI value is
          <= hsi_threshold (with an extra 'Sensor_Quality' column);
        - df_elements: 'TimeStamp'/'Elements' of the rows whose element
          contains '/';
        - df_markers: 'TimeStamp'/'Elements' of the markers, without the
          first one.
        When the markers are not exactly 1x '/Marker/5', 3x '/Marker/1' and
        3x '/Marker/2', a diagnostic is printed and ``('', '', '')`` is
        returned instead.
    """

    # Removing unnecessary columns and formatting. assign() and drop() both
    # return new frames, so the caller's dataframe is left untouched.
    unnecessary_columns = ['Gyro_X', 'Gyro_Y', 'Gyro_Z', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'AUX_RIGHT', 'Battery']
    df_cleaned = df.assign(TimeStamp=pd.to_datetime(df['TimeStamp'], errors='coerce'))
    df_cleaned = df_cleaned.drop(columns=unnecessary_columns)

    # Extracting the markers from the dataframe
    df_markers = df_cleaned[df_cleaned['Elements'].str.contains('/Marker/', na=False)]
    df_markers = df_markers.reset_index(drop=True)
    df_markers = df_markers[['TimeStamp', 'Elements']]

    # Checking if the markers are correct - Markers are 7: 1x "marker 5", 3x "marker 1", 3x "marker 2"
    markers_no = df_markers.shape[0]
    marker_five_no = df_markers[df_markers['Elements'] == '/Marker/5'].shape[0]
    marker_one_no = df_markers[df_markers['Elements'] == '/Marker/1'].shape[0]
    marker_two_no = df_markers[df_markers['Elements'] == '/Marker/2'].shape[0]
    if not (markers_no == 7 and marker_five_no == 1 and marker_one_no == 3 and marker_two_no == 3):
        print("Markers are not of the desired number or type")
        print("Marker 5: " +str(marker_five_no))
        print("Marker 2: " +str(marker_two_no))
        print("Marker 1: " +str(marker_one_no))
        print("Total Markers: "+str(markers_no))
        return '', '', ''

    # Deleting all records before the initial marker timestamp (time spent positioning the sensor)
    df_start_markers = df_markers[df_markers['Elements'] == '/Marker/5']
    timestamp_start = df_start_markers['TimeStamp'].iloc[-1]
    df_formatted = df_cleaned[df_cleaned['TimeStamp'] >= timestamp_start]

    # Removing marker 5 from all dataframes (the first row of each frame is dropped)
    df_markers = df_markers.iloc[1:]
    df_formatted = df_formatted.iloc[1:]

    # Extracting the elements - markers, blinks, clinches
    df_elements = df_formatted[df_formatted['Elements'].str.contains('/', na=False)]
    df_elements = df_elements[['TimeStamp', 'Elements']]
    df_elements = df_elements.dropna()
    df_elements = df_elements.reset_index(drop=True)

    # Extracting the records with the HeadBandOn
    df_records_headbandon = df_formatted[df_formatted['HeadBandOn'] == 1]
    df_records_headbandon = df_records_headbandon.drop(columns=['Elements'])
    df_records_headbandon = df_records_headbandon.reset_index(drop=True)

    # Removing low quality data: the four HSI values are summed per row
    df_records_headbandon['Sensor_Quality'] = df_records_headbandon['HSI_TP9'] + df_records_headbandon['HSI_AF7'] + df_records_headbandon['HSI_AF8'] + df_records_headbandon['HSI_TP10']
    df_records = df_records_headbandon[df_records_headbandon['Sensor_Quality'] <= hsi_threshold]

    # Defining and printing low quality data proportion
    lines = df_records_headbandon.shape[0]
    usable_lines = df_records.shape[0]
    # Guard against a recording without any HeadBandOn row, which would
    # otherwise raise ZeroDivisionError here
    usable_percentage = 100*usable_lines/lines if lines else 0.0
    print(str(usable_lines) + "/" + str(lines) + " usable lines ")
    print("Total usable lines for this participant is "+
          ('%.2f' % (usable_percentage,)).rstrip('0').rstrip('.')+
          "% - threshold: "+str(hsi_threshold))
    print()

    return df_records, df_elements, df_markers

In [5]:
# 3. Transforming and extracting data, including data split and normalization
def extract_features(df_records):
    """Build the per-band averages and the two ratios used downstream.

    For each of the five bands the four sensor columns
    (``<Band>_TP9/_AF7/_AF8/_TP10``) are averaged, and the average is mapped
    linearly from the interval [-3, 3] onto [-100, 200].

    Parameters
    ----------
    df_records : pandas.DataFrame
        Records containing the twenty ``<Band>_<Sensor>`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the per-sensor band columns and with the added
        columns 'Alpha_Avg', 'Beta_Avg', 'Delta_Avg', 'Gamma_Avg',
        'Theta_Avg', 'First_Ratio' and 'Second_Ratio'.
    """
    # Source and target intervals of the linear rescaling
    source_low, source_high = -3, 3
    target_low, target_high = -100, 200
    source_span = source_high - source_low
    target_span = target_high - target_low

    sensors = ['TP9', 'AF7', 'AF8', 'TP10']

    # The per-sensor columns are replaced by one averaged column per band
    per_sensor_columns = [
        band + '_' + sensor
        for band in ['Delta', 'Theta', 'Alpha', 'Beta', 'Gamma']
        for sensor in sensors
    ]
    features = df_records.drop(columns=per_sensor_columns)

    for band in ['Alpha', 'Beta', 'Delta', 'Gamma', 'Theta']:
        tp9, af7, af8, tp10 = (df_records[band + '_' + sensor] for sensor in sensors)
        sensor_average = (tp9 + af7 + af8 + tp10)/4
        features[band + '_Avg'] = (((sensor_average - source_low) * target_span) / source_span) + target_low

    alpha = features['Alpha_Avg']
    beta = features['Beta_Avg']
    theta = features['Theta_Avg']

    # First ratio: Beta / (Alpha + Theta) --> task difficulty indicator + task engagement
    features['First_Ratio'] = beta / (alpha + theta)

    # Second ratio: Theta / (Alpha + Beta) --> task difficulty indicator
    features['Second_Ratio'] = theta / (alpha + beta)

    return features


In [6]:
# 4. Splitting data based on markers
def split_data(df, df_markers):
    """Cut the records into one section per marker.

    Section ``n`` holds the rows whose 'TimeStamp' is after marker ``n`` and
    up to (and including) marker ``n + 1``; the last section holds every row
    after the last marker. Rows before the first marker belong to no section.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with a 'TimeStamp' column.
    df_markers : pandas.DataFrame
        Markers with a 'TimeStamp' column, in chronological order.

    Returns
    -------
    list of pandas.DataFrame
        One section per marker, i.e. 6 sections - (baseline + scenario) x3 -
        for the 6 markers left by clean_data. Empty when there is no marker.
    """
    markers_timestamps = df_markers['TimeStamp'].tolist()

    # Without markers there is no boundary to split on
    if not markers_timestamps:
        return []

    # Sections enclosed between two consecutive markers
    sections = []
    for start, end in zip(markers_timestamps, markers_timestamps[1:]):
        in_section = (df['TimeStamp'] > start) & (df['TimeStamp'] <= end)
        sections.append(df[in_section])

    # The last part of the experiment: everything after the last marker
    sections.append(df[df['TimeStamp'] > markers_timestamps[-1]])

    return sections

In [7]:
# 5. Normalizing the data based on the baseline
def normalize_data(sections):
    """Subtract each baseline's mean ratios from the experiment that follows it.

    The sections are read in pairs: ``sections[0]`` is the baseline of
    ``sections[1]`` and so on, for the first three pairs. Only the last
    minute of each baseline (measured back from its last 'TimeStamp') is
    used. The means of the five band averages of that minute are printed.

    Parameters
    ----------
    sections : list of pandas.DataFrame
        At least 6 frames with 'TimeStamp', the five '<Band>_Avg' columns,
        'First_Ratio' and 'Second_Ratio'. They are not modified.

    Returns
    -------
    list of pandas.DataFrame
        Three copies of the experiment sections whose 'First_Ratio' and
        'Second_Ratio' have the baseline mean subtracted.

    Raises
    ------
    IndexError
        If fewer than 6 sections are given or a baseline section is empty.
    """
    frequency_columns = ['Alpha_Avg', 'Beta_Avg', 'Delta_Avg', 'Gamma_Avg', 'Theta_Avg']
    ratio_columns = ['First_Ratio', 'Second_Ratio']
    new_sections = []

    # Iterating through the 3 baseline-experiment sections
    for pair_no, i in enumerate(range(0, 6, 2)):
        df_baseline = sections[i]

        # Work on a copy: the sections are slices of a larger dataframe, so
        # assigning to them triggers pandas' SettingWithCopyWarning and would
        # also alter the caller's data
        df_experiment = sections[i+1].copy()

        # Extracting the last minute of recording
        baseline_last_timestamp = df_baseline['TimeStamp'].iloc[-1]
        baseline_start = baseline_last_timestamp - pd.Timedelta(minutes=1)
        df_baseline_min = df_baseline[df_baseline['TimeStamp'] >= baseline_start]

        # Printing the mean of the baseline
        print("Frequency Mean of baseline " +str(pair_no)+":")
        for column in frequency_columns:
            column_mean = df_baseline_min[column].mean()
            print("-- " + column + ": " +('%.2f' % (column_mean,)).rstrip('0').rstrip('.'))
        print()

        # Subtracting baseline from data
        for column in ratio_columns:
            baseline_avg = df_baseline_min[column].mean()
            df_experiment[column] = df_experiment[column] - baseline_avg

        # Storing the elaborated sections
        new_sections.append(df_experiment)

    return new_sections

In [8]:
# 6. Create windows
def split_in_windows(sections, k):
    """Sample each section once per window and average the samples.

    Starting from the first row of a section, that row is taken as a sample
    and every row less than ``k`` seconds after it is skipped; the next
    remaining row becomes the following sample, and so on until no row is
    left.

    Parameters
    ----------
    sections : iterable of pandas.DataFrame
        Frames with 'TimeStamp', 'First_Ratio' and 'Second_Ratio' columns.
    k : int or float
        Window length in seconds.

    Returns
    -------
    list of list
        For each section ``[mean of First_Ratio samples, mean of
        Second_Ratio samples]``; the pair is also printed.
    """
    results = []

    for section in sections:
        first_samples = []
        second_samples = []

        # Rows not yet covered by a window; a copy, so the section is untouched
        remaining = section.copy()

        while not remaining.empty:
            # The first remaining row opens a new window and is its sample
            sample = remaining.iloc[0]
            window_end = sample['TimeStamp'] + pd.Timedelta(seconds=k)
            first_samples.append(sample['First_Ratio'])
            second_samples.append(sample['Second_Ratio'])

            # Dropping the rows that fall inside the window just sampled
            remaining = remaining[remaining['TimeStamp'] >= window_end]

        section_means = [statistics.mean(first_samples), statistics.mean(second_samples)]

        print("Mean values for First_Ratio, Second_Ratio")
        print(section_means)

        results.append(section_means)

    return results

In [9]:
# END: Visualizing the data for the given simple sections
def visualize_simple_plot(sections, extra_title, labels):
    """Show one Bokeh line plot of the five band averages per section.

    Parameters
    ----------
    sections : iterable of pandas.DataFrame
        Frames with 'TimeStamp' and the five '<Band>_Avg' columns.
    extra_title : str
        Prefix of every plot title.
    labels : sequence of str
        ``labels[n]`` completes the title of the n-th section.

    Returns
    -------
    None
        Each plot is displayed with ``show``.
    """
    # Band name and the colour of its line, in drawing order
    band_colors = [
        ("Alpha", "#3DB3FE"),
        ("Beta", "#38A967"),
        ("Delta", "#C93030"),
        ("Gamma", "#F1A219"),
        ("Theta", "#A822F3"),
    ]

    for index, frame in enumerate(sections):
        label = labels[index]

        # Collecting the plotted columns for the ColumnDataSource
        columns = {'TimeStamp': np.array(frame['TimeStamp'], dtype='i8').view('datetime64[ms]').tolist()}
        for band, _ in band_colors:
            columns[band] = list(frame[band + '_Avg'])
        columns['TimeStamp_tooltip'] = [stamp.strftime("%Y-%m-%d %H:%M:%S") for stamp in frame['TimeStamp']]
        source = ColumnDataSource(data=columns)

        # The y range is the module-level min_range/max_range pair, padded by 5
        y_start = min_range - 5
        y_end = max_range + 5

        # Plotting the data
        p = figure(
            x_range=(min(columns['TimeStamp']), max(columns['TimeStamp'])),
            y_range=(y_start, y_end),
            plot_width=1500,
            plot_height=600,
            title=extra_title+": Plot of "+label,
        )
        for band, color in band_colors:
            p.line(x='TimeStamp', y=band, source=source, color=color, line_width=2, legend=dict(value=band))

        # Adding hover and other visualization tools
        hover = HoverTool()
        hover.tooltips = [("Value", "$y"), ("Timestamp", "@TimeStamp_tooltip")]
        p.add_tools(hover)
        p.add_layout(Title(text="TimeStamp", align="center"), "below")
        p.add_layout(Title(text="Frequency", align="center"), "left")
        p.xaxis.major_label_orientation = np.pi / 4
        p.legend.location = 'top_right'

        show(p)
    

In [10]:
def save_sections(participant_no, mean_stress_values, experiment_order):
    """Write one result CSV per scenario for the given participant.

    Each file is ``aResults/normal/results_<scenario>_<participant_no>.csv``
    (relative to the current working directory) and contains a header row
    followed by one row of values.

    Parameters
    ----------
    participant_no : str
        Participant identifier, used in the file names.
    mean_stress_values : sequence of sequence
        ``mean_stress_values[n]`` is the row written for the n-th scenario
        (First_Ratio and Second_Ratio); the first three entries are used.
    experiment_order : sequence of str
        ``experiment_order[n]`` is the code of the n-th scenario, used in the
        file names; the first three entries are used.

    Returns
    -------
    None
    """

    # Defining header
    header = ["First_Ratio", "Second_Ratio"]

    # Creating the output directory when it is missing: open() does not
    # create directories and would fail with FileNotFoundError
    output_dir = os.path.join('aResults', 'normal')
    os.makedirs(output_dir, exist_ok=True)

    # Iterating through the sections
    for i in range(3):

        # Creating the name of the file
        file_name = os.path.join(output_dir, 'results_'+experiment_order[i]+'_'+participant_no+'.csv')

        # Writing on the CSV (newline='' as required by the csv module)
        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerow(mean_stress_values[i])

In [12]:
# -------------------Main Method-------------------------------------------

# Experiment order: R=Rational, S=StringUtil, U=UtilObject

# Participant identifiers; experiments_order[n] is the scenario order
# followed by participants[n]
participants = ['01', '02', '03', '04', '05', '06']
experiments_order = [
    ['R', 'S', 'U'],
    ['U', 'R', 'S'],
    ['S', 'U', 'R'],
    ['R', 'S', 'U'],
    ['U', 'R', 'S'],
    ['S', 'U', 'R'],
]

# Running the whole pipeline for every participant, one after the other
for participant, scenario_order in zip(participants, experiments_order):
    print("==== Participant "+participant+" ====")
    execute_class(participant, scenario_order)
    print()

==== Participant 01 ====
2139/2157 usable lines 
Total usable lines for this participant is 99.17% - threshold: 8

Frequency Mean of baseline 0:
-- Alpha_Avg: 65.97
-- Beta_Avg: 64.79
-- Delta_Avg: 44.31
-- Gamma_Avg: 52.36
-- Theta_Avg: 49.35

Frequency Mean of baseline 1:
-- Alpha_Avg: 61.54
-- Beta_Avg: 53.19
-- Delta_Avg: 43.4
-- Gamma_Avg: 42.11
-- Theta_Avg: 46.55

Frequency Mean of baseline 2:
-- Alpha_Avg: 64.85
-- Beta_Avg: 63.67
-- Delta_Avg: 46.83
-- Gamma_Avg: 50.75
-- Theta_Avg: 50.11

Mean values for First_Ratio, Second_Ratio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[-0.025694344555805475, 0.04295663125925457]
Mean values for First_Ratio, Second_Ratio
[-0.01860701281307172, 0.05278986940912149]
Mean values for First_Ratio, Second_Ratio
[-0.009977100568162773, 0.03270648565041725]

==== Participant 02 ====
1535/1682 usable lines 
Total usable lines for this participant is 91.26% - threshold: 8

Frequency Mean of baseline 0:
-- Alpha_Avg: 75.1
-- Beta_Avg: 65.12
-- Delta_Avg: 67.44
-- Gamma_Avg: 47.18
-- Theta_Avg: 60.88

Frequency Mean of baseline 1:
-- Alpha_Avg: 89.92
-- Beta_Avg: 76.91
-- Delta_Avg: 88.87
-- Gamma_Avg: 59.62
-- Theta_Avg: 80.9

Frequency Mean of baseline 2:
-- Alpha_Avg: 83.6
-- Beta_Avg: 76.57
-- Delta_Avg: 78.24
-- Gamma_Avg: 83.72
-- Theta_Avg: 67.49

Mean values for First_Ratio, Second_Ratio
[0.03027016030396739, -0.030295412229587364]
Mean values for First_Ratio, Second_Ratio
[0.0257171142057903, -0.023954890963405074]
Mean values for First_Ratio, Second_Ratio
[-0.023844705361432718, 0.03318262517658982]

==== Participant 