Function to pre-process open reflections from the Sensory Tool 

@Author : Romy Beauté\
@Corresp : r.beaut@sussex.ac.uk\
@Last modified : 26/07/2024

### Data loading and splitting into sentences

In [2]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Load the free-text answers of each dataset. Every file is read as
# tab-separated and only its 'reflection_answer' column is kept.
HS_reflections, DL_reflections, HW_reflections = (
    pd.read_csv(csv_path, sep='\t')['reflection_answer']
    for csv_path in ('HS_reflections.csv', 'DL_reflections.csv', 'HW_reflections.csv')
)


def split_sentences(reflections):
    """Split every reflection into sentences and return them as one flat list.

    Args:
        reflections: Iterable of reflection texts (e.g. a pandas Series).
            Entries that are not strings -- such as the NaN pandas stores
            for an empty cell, or None -- are skipped instead of being
            handed to ``sent_tokenize``, which expects text.

    Returns:
        list: The sentences of all reflections, in input order.
    """
    sentences = []
    for reflection in reflections:
        # Missing answers are not text; tokenizing them would raise.
        if not isinstance(reflection, str):
            continue
        sentences.extend(sent_tokenize(reflection))
    return sentences

# Tokenize each dataset's reflections into sentences.
DL_sentences = split_sentences(DL_reflections)
HS_sentences = split_sentences(HS_reflections)
HW_sentences = split_sentences(HW_reflections)

# Summary table: one row per dataset with the number of reflections and
# the number of sentences obtained after tokenization.
_per_dataset = {
    'DL': (DL_reflections, DL_sentences),
    'HS': (HS_reflections, HS_sentences),
    'HW': (HW_reflections, HW_sentences),
}
data = {
    'Dataset': list(_per_dataset),
    'Number of Reflections': [len(refl) for refl, _ in _per_dataset.values()],
    'Number of Tokenized Sentences': [len(sents) for _, sents in _per_dataset.values()],
}

df = pd.DataFrame(data)
print(df)


[nltk_data] Downloading package punkt to /home/romy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  Dataset  Number of Reflections  Number of Tokenized Sentences
0      DL                     95                            182
1      HS                    334                            691
2      HW                   3662                          16291


### Gathering other freeform responses from the SensoryTool dataset

In [2]:
sensory_tool_data = pd.read_excel('SensoryTool_CombinedData_v24_20230912_2.xlsx')

# Columns corresponding to freeform responses that we might want to extract
freeform_responses = [
    'emo_freeformResponse',    # user's freeform emotional response, if present
    'aud_freeformResponse',
    'vizCol_freeformAnswer',
    'vizPat_freeformAnswer',
    'vizMov_freeform_answer',  # user's freeform answer for movement types
    'cogBody_freeformAnswer',  # user's freeform additional body sensations, if they chose to add this
]

# New dataframe storing only the freeform responses and the metadata column.
# .copy() keeps the relabelling below from touching sensory_tool_data.
freeform_data = sensory_tool_data[freeform_responses + ['meta_HighSensory']].copy()
freeform_data['meta_HighSensory'] = freeform_data['meta_HighSensory'].replace(
    {True: 'High Sensory', False: 'Deep Listening'})

# Delete subjects that have NaN in all freeform responses
freeform_data = freeform_data.dropna(subset=freeform_responses, how='all')
freeform_data.to_excel('freeform_SensoryTool_complete.xlsx', index=False)

# Split by condition using the ORIGINAL flag from sensory_tool_data, restricted
# to the rows that survived dropna. The 'meta_HighSensory' column of
# freeform_data may already hold the text labels assigned above, in which case
# comparing it with 1 / 0 would select no rows at all.
high_sensory_flag = sensory_tool_data.loc[freeform_data.index, 'meta_HighSensory']
HS_freeform_data = freeform_data[high_sensory_flag == 1]
DL_freeform_data = freeform_data[high_sensory_flag == 0]

# Optional per-condition exports:
# HS_freeform_data.to_csv('freeform_HS_SensoryTool_complete.csv', index=False)
# DL_freeform_data.to_csv('freeform_DL_SensoryTool_complete.csv', index=False)
