# Part 1: Preprocessing the heartrate data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#initiate dict with participant numbers as keys and dataframes as values
participants_dict = {}
for x in [i for i in range(1,38)]:
    df = pd.read_csv(f'raw_heartdata/{x}.CSV', header=2) # read in heartrate data
    participants_dict[x] = df[['Time','HR (bpm)']] #only select time and HR columns

In [3]:
# To show what a single dataframe looks like
participants_dict[1]

Unnamed: 0,Time,HR (bpm)
0,00:00:00,80
1,00:00:01,80
2,00:00:02,81
3,00:00:03,81
4,00:00:04,82
...,...,...
725,00:12:05,80
726,00:12:06,79
727,00:12:07,79
728,00:12:08,78


In [4]:
# To show what dataframe have missing values
for i in participants_dict.keys():
    if participants_dict[i].isnull().any().any():
        print("DataFrame with missing values:")
        print(i)

DataFrame with missing values:
5
DataFrame with missing values:
10
DataFrame with missing values:
19
DataFrame with missing values:
28


This is correct.\
Participant 5 & 28 did not have any heartrate measurements, and should therefore be excluded from the analysis.\
Participant 10 & 19 had a hiccup in the heartrate measurement.\
To solve this, the missing values should be replaced with the average in that period.

In [5]:
# read in the raw participants data
raw_participants_data = pd.read_csv('raw_participants_data.csv', index_col=0)

In [6]:
# replace NaN values of 10 & 19

def fill_missing_values(participant_id) -> pd.DataFrame:
    """Replaces NaN values for a given participant"""
    start = raw_participants_data['Start'][participant_id]
    end = raw_participants_data['End'][participant_id]
    
    before = participants_dict[participant_id].loc[:start-1]
    during = participants_dict[participant_id].loc[start:end]
    #calculate average heartrate per period
    average_heart_rate_before = before['HR (bpm)'].mean()
    average_heart_rate_during = during['HR (bpm)'].mean()

    # replace missing values with the average heart rate
    before['HR (bpm)'].fillna(average_heart_rate_before, inplace=True)
    during['HR (bpm)'].fillna(average_heart_rate_during, inplace=True)
    return pd.concat([before, during], ignore_index=True)

participants_dict[10] = fill_missing_values(10)
participants_dict[19] = fill_missing_values(19)
# print(participants_dict[10])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  before['HR (bpm)'].fillna(average_heart_rate_before, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  during['HR (bpm)'].fillna(average_heart_rate_during, inplace=True)


In [7]:
# Check if the missing values are replaced
for i in participants_dict.keys():
    if participants_dict[i].isnull().any().any():
        print("DataFrame with missing values:")
        print(i)

DataFrame with missing values:
5
DataFrame with missing values:
28


Indeed, the missing values are replaced.\
5 & 28 will later be excluded from analysis.

In [8]:
raw_participants_data_with_heartrate_data = raw_participants_data.copy()

def calculate_heartrate(participant_id):
    """Function to calculate all heartrate attributes for the final dataset
    Average_Hearrate_Before
    Average_Heartrate_During
    Max_Heartrate_Before
    Max_Heartrate_During
    Min_Heartrate_Before
    Min_Heartrate_During"""
    start = raw_participants_data['Start'][participant_id]
    end = raw_participants_data['End'][participant_id]
    
    before = participants_dict[participant_id].loc[:start-1].copy()
    during = participants_dict[participant_id].loc[start:end].copy()

    # Calculate average heartrate per period
    average_heart_rate_before = before['HR (bpm)'].mean()
    average_heart_rate_during = during['HR (bpm)'].mean()

    # Calculate max heartrate per period
    max_heart_rate_before = before['HR (bpm)'].max()
    max_heart_rate_during = during['HR (bpm)'].max()

    # Calculate min heartrate per period
    min_heart_rate_before = before['HR (bpm)'].min()
    min_heart_rate_during = during['HR (bpm)'].min()

    # Write out all data to target dataframe
    raw_participants_data_with_heartrate_data.at[participant_id, 'Average_Heartrate_Before'] = average_heart_rate_before
    raw_participants_data_with_heartrate_data.at[participant_id, 'Average_Heartrate_During'] = average_heart_rate_during
    raw_participants_data_with_heartrate_data.at[participant_id, 'Max_Heartrate_Before'] = max_heart_rate_before
    raw_participants_data_with_heartrate_data.at[participant_id, 'Max_Heartrate_During'] = max_heart_rate_during
    raw_participants_data_with_heartrate_data.at[participant_id, 'Min_Heartrate_Before'] = min_heart_rate_before
    raw_participants_data_with_heartrate_data.at[participant_id, 'Min_Heartrate_During'] = min_heart_rate_during
    
    return


In [9]:
# Apply the above function to all participants
for i in participants_dict.keys():
    calculate_heartrate(i)

In [10]:
# Manually add missing values for participant 5 and 28
for participant_id in [5,28]:
    raw_participants_data_with_heartrate_data.at[participant_id, 'Average_Heartrate_Before'] = np.nan
    raw_participants_data_with_heartrate_data.at[participant_id, 'Average_Heartrate_During'] = np.nan
    raw_participants_data_with_heartrate_data.at[participant_id, 'Max_Heartrate_Before'] = np.nan
    raw_participants_data_with_heartrate_data.at[participant_id, 'Max_Heartrate_During'] = np.nan
    raw_participants_data_with_heartrate_data.at[participant_id, 'Min_Heartrate_Before'] = np.nan
    raw_participants_data_with_heartrate_data.at[participant_id, 'Min_Heartrate_During'] = np.nan

In [11]:
# raw_participants_data_with_heartrate_data.to_csv('raw_participants_data_with_heartrate_data.csv')

# Part 2: Preprocessing the survey data

In [12]:
A = pd.read_csv('survey_responses/Enquete A OIS.csv', index_col=0)
B = pd.read_csv('survey_responses/Enquete B OIS.csv', index_col=0)
participants_data_with_heartrate = pd.read_csv('raw_participants_data_with_heartrate_data.csv', index_col=0)

In [13]:
# dictionary to pair final csv file (key) with survey responses (value)
pairing_dict= {
    'Age': 'Age',
    'Gender': 'Wat is uw gender?',
    'VR_know_how': 'In hoeverre zou u uw kennis van VR omschrijven?',
    'VR_headset_experience': 'Heeft u al eens eerder een VR bril gebruikt',
    
    'b_Item_1_C': 'Mijn ademhaling is sneller dan normaal',
    'b_Item_2_C': 'Mijn hartslag is hoger dan normaal',
    'b_Item_3_M': 'Mijn spieren voelen gespannen (gesloten vuisten, strakke kaak/wenkbrauwen etc.)',
    'b_Item_4_M': 'Mijn spieren voelen relaxed',
    'b_Item_5_M': 'Mijn spieren voelen losjes',
    'b_Item_6_G': 'Ik voel mij relaxed',
    'b_Item_7_G': 'Op het moment, ben ik volkomen kalm',
    'b_Item_8_S': 'Ik voel mij slaperig en/of moe',
    'b_Item_9_S': 'Ik sta op het punt om in slaap te vallen',
    'b_Item_10_S': 'Ik voel mij verfrist en wakker',
    
    'a_Item_1_C': 'Mijn ademhaling is sneller dan normaal.1',
    'a_Item_2_C': 'Mijn hartslag is hoger dan normaal.1',
    'a_Item_3_M': 'Mijn spieren voelen gespannen (gesloten vuisten, strakke kaak/wenkbrauwen etc.).1',
    'a_Item_4_M': 'Mijn spieren voelen relaxed.1',
    'a_Item_5_M': 'Mijn spieren voelen losjes.1',
    'a_Item_6_G': 'Ik voel mij relaxed.1',
    'a_Item_7_G': 'Op het moment, ben ik volkomen kalm.1',
    'a_Item_8_S': 'Ik voel mij slaperig en/of moe.1',
    'a_Item_9_S': 'Ik sta op het punt om in slaap te vallen.1',
    'a_Item_10_S': 'Ik voel mij verfrist en wakker.1'
}

In [14]:
# initiate a finale to be exported dataframe
participants_data_final = participants_data_with_heartrate

# loop to write all the data to the final dataframe
for id in [x for x in range(1, 38) if x != 14]:
    for final, current in pairing_dict.items():
        # participant 14 had no survey data
        if id==14:
            pass
        try:
            # Attempt to retrieve data from DataFrame A
            participants_data_final.at[id, final] = A[current][A['ID'] == id].iloc[0]
        except IndexError:
            try:
                # If not found in DataFrame A, attempt to retrieve data from DataFrame B
                participants_data_final.at[id, final] = B[current][B['ID'] == id].iloc[0]
            except IndexError:
                # If not found in both DataFrames, you may want to handle this case appropriately
                participants_data_final.at[id, final] = None

  participants_data_final.at[id, final] = A[current][A['ID'] == id].iloc[0]


In [15]:
# Processing the Gender, and VR_headset_experience column to correct datatypes
participants_data_final.loc[participants_data_final["Gender"] == "Man", "Gender"] = 'M'
participants_data_final.loc[participants_data_final["Gender"] == "Vrouw", "Gender"] = 'V'
participants_data_final.loc[participants_data_final["VR_headset_experience"] == "Nee", "VR_headset_experience"] = '0'
participants_data_final.loc[participants_data_final["VR_headset_experience"] == "Ja", "VR_headset_experience"] = '1'
participants_data_final.loc[participants_data_final["VR_headset_experience"] == "Weet ik niet", "VR_headset_experience"] = '0'

In [16]:
# participants_data_final.to_csv('participants_data_final.csv')