#### Imports

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial import distance
from matplotlib import pyplot as plt
from itertools import permutations
import glob
import csv
import os
import re
import json


#### Get Data

In [2]:
# Load the complete export of app records (tasks and questionnaires).
dataset = pd.read_csv("../original data/HowRU_app_data_tasks_and_questionnaire_summer_only.csv")

# Keep just the identifying columns and the raw payload column.
data = dataset[["id", "user_id", "user", "data"]]

display(data)

# Check how the payload column is stored (a <class 'str'> result means it still has to be parsed).
print(type(data["data"].tolist()[0]))


Unnamed: 0,id,user_id,user,data
0,64b1bc41e21b8b02b9bd07dd,64b039620ccdf8768472d8b8,rtocs_imani,"{""data"": {""test_type"": ""Demo"", ""finger_tap"": {..."
1,64abc00a0e51fb55283e3951,64a4ccfbf93736cd307a5066,rtocs_sacha,"{""data"": {""where_EMA_evening"": ""Home"", ""excite..."
2,64ab0b89133b98be33469f23,64a4ccfbf93736cd307a5066,rtocs_sacha,"{""data"": {""where_EMA_morning"": ""Home"", ""excite..."
3,64aa75d35cab48548216fd3d,64a4ccfbf93736cd307a5066,rtocs_sacha,"{""data"": {""where_EMA_evening"": ""Home"", ""excite..."
4,64a9e33e9a987115adbff6aa,64a4ccfbf93736cd307a5066,rtocs_sacha,"{""data"": {""where_EMA"": ""Home"", ""excited_scale""..."
...,...,...,...,...
127,64a66536304d98857d698c90,6423241561f11182a22e06c5,rtocs_999_3,"{""data"": {""test_type"": ""Night Testing"", ""finge..."
128,64a6645be409f9233fe1ba0d,6423241561f11182a22e06c5,rtocs_999_3,"{""data"": {""rst"": [[{""startTime"": 1688626189154..."
129,64a663fe304d98857d698c8f,6423241561f11182a22e06c5,rtocs_999_3,"{""data"": {""ospan_1"": [[{""equation"": {""a"": 12, ..."
130,64a66338e409f9233fe1ba0c,6423241561f11182a22e06c5,rtocs_999_3,"{""data"": {""test_type"": ""Night Training"", ""wpa_..."


<class 'str'>


#### Data Cleanup

In [3]:
def cleanup(filename, task_string, records=None, output_dir='../clean data', exact=False):
    """Extract the records of one task/questionnaire and write them to a CSV file.

    A record is selected when every token in ``task_string`` occurs in its raw
    JSON text (the ``data`` column). Each selected record becomes one row:
    ``id``, ``user_id``, ``user``, one stringified column per key of the
    record's ``data`` object (``'N/A'`` where the record lacks that key), and
    the last two values of its ``metadata`` object, labelled ``started_at`` and
    ``finished_at``.

    Parameters
    ----------
    filename : str
        Base name of the output file; ``<output_dir>/<filename>.csv`` is written.
    task_string : str or list of str
        A token, or a list of tokens that must all be present, identifying the
        records to keep.
    records : pandas.DataFrame, optional
        Frame with ``id``, ``user_id``, ``user`` and ``data`` columns. Defaults
        to the notebook-level ``data`` frame.
    output_dir : str, optional
        Directory the CSV is written to; it is created when missing.
    exact : bool, optional
        ``False`` (default) keeps the plain substring test, so a short token
        such as ``'rst'`` also selects records that merely contain those
        letters inside a longer word. ``True`` requires the token to appear as
        a complete double-quoted JSON string (a key or a string value).

    Raises
    ------
    ValueError
        If no record matches ``task_string``.
    KeyError
        If a selected record has no ``data`` or ``metadata`` object.
    """
    frame = data if records is None else records

    # Treat a single token and a list of tokens uniformly.
    tokens = task_string if isinstance(task_string, (list, tuple)) else [task_string]
    # With exact=True the token is wrapped in quotes ('rst' -> '"rst"') so it
    # can only match a whole JSON string, not a fragment of a longer one.
    needles = [json.dumps(token) if exact else token for token in tokens]

    # Select first and parse once. Rows that do not match are never parsed, so
    # an unrelated or malformed record cannot abort this export. Iteration is
    # positional, so it does not depend on the frame having a 0..n-1 index.
    matched = []
    for row_id, row_user_id, row_user, raw in zip(
        frame['id'], frame['user_id'], frame['user'], frame['data']
    ):
        if not isinstance(raw, str):
            continue  # missing payload (e.g. NaN) cannot match anything
        if all(needle in raw for needle in needles):
            matched.append((row_id, row_user_id, row_user, json.loads(raw)))

    if not matched:
        raise ValueError(f'no records match {task_string!r}; nothing written for {filename!r}')

    # Column order follows the first record with the most keys; keys that only
    # appear in other records are appended afterwards instead of being dropped.
    data_columns = []
    for _, _, _, parsed in matched:
        keys = list(parsed['data'])
        if len(keys) > len(data_columns):
            data_columns = keys
    known = set(data_columns)
    for _, _, _, parsed in matched:
        for key in parsed['data']:
            if key not in known:
                known.add(key)
                data_columns.append(key)

    rows = []
    for row_id, row_user_id, row_user, parsed in matched:
        payload = parsed['data']
        data_values = [
            str(payload[column]) if column in payload else 'N/A'
            for column in data_columns
        ]
        # Positional: the last two metadata values are taken as-is.
        # NOTE(review): presumably these are the start/finish timestamps - confirm
        # that every record stores them last.
        metadata_values = list(parsed['metadata'].values())[-2:]
        rows.append([row_id, row_user_id, row_user, *data_values, *metadata_values])

    columns = ['id', 'user_id', 'user', *data_columns, 'started_at', 'finished_at']

    # dtype=object keeps each value as it is, so the CSV text is not altered by
    # numeric up-casting when a column has gaps.
    dataframe = pd.DataFrame(rows, columns=columns, dtype=object)

    os.makedirs(output_dir, exist_ok=True)
    dataframe.to_csv(os.path.join(output_dir, f'{filename}.csv'))


#### Cleanup Sleep Diary Morning

In [25]:
# Records whose JSON text contains 'where_EMA_morning' -> sleep_diary_morning.csv
cleanup(filename='sleep_diary_morning', task_string='where_EMA_morning')

#### Cleanup Finger Tapping Night Testing

In [26]:
# Records containing both 'Night Testing' and 'finger_tap' -> finger_tapping_night_testing.csv
cleanup(
    filename='finger_tapping_night_testing',
    task_string=['Night Testing', 'finger_tap'],
)

#### Cleanup Finger Tapping Night Training

In [27]:
# Records containing both 'Night Training' and 'finger_tap' -> finger_tapping_night_training.csv
cleanup(
    filename='finger_tapping_night_training',
    task_string=['Night Training', 'finger_tap'],
)

#### Cleanup Finger Tapping Morning Testing

In [28]:
# Records containing both 'Morning Testing' and 'finger_tap' -> finger_tapping_morning_testing.csv
cleanup(
    filename='finger_tapping_morning_testing',
    task_string=['Morning Testing', 'finger_tap'],
)

#### Cleanup WPA Night Training

In [29]:
# Records containing both 'wpa_training' and 'Night Training' -> WPA_night_training.csv
cleanup(
    filename='WPA_night_training',
    task_string=['wpa_training', "Night Training"],
)

#### Cleanup WPA Morning Testing

In [30]:
# Records containing both 'wpa_morning_testing' and 'Morning Testing' -> WPA_morning_testing.csv
cleanup(
    filename='WPA_morning_testing',
    task_string=['wpa_morning_testing', 'Morning Testing'],
)

#### Cleanup Sleep Diary Evening

In [31]:
# Records whose JSON text contains 'where_EMA_evening' -> sleep_diary_evening.csv
cleanup(filename='sleep_diary_evening', task_string='where_EMA_evening')

#### Cleanup Sleep Diary Forget Night

In [32]:
# Records whose JSON text contains 'caffeinated_beverages' -> sleep_diary_forget_night.csv
cleanup(filename='sleep_diary_forget_night', task_string='caffeinated_beverages')

#### Cleanup Sleep Diary Forget Morning

In [33]:
# Records whose JSON text contains 'get_out_of_bed' -> sleep_diary_forget_morning.csv
cleanup(filename='sleep_diary_forget_morning', task_string='get_out_of_bed')

#### Cleanup VGE Questionnaire

In [34]:
# Records whose JSON text contains 'regularly_plays_games' -> VGE_questionnaire.csv
cleanup(filename='VGE_questionnaire', task_string='regularly_plays_games')

#### Cleanup PSAM Questionnaire

In [35]:
# Records whose JSON text contains 'sure_parent' -> PSAM_questionnaire.csv
cleanup(filename='PSAM_questionnaire', task_string='sure_parent')

#### Cleanup FSS Questionnaire

In [36]:
# Records whose JSON text contains 'household_income' -> FSS_questionnaire.csv
cleanup(filename='FSS_questionnaire', task_string='household_income')

#### Cleanup APQ Parent Questionnaire

In [37]:
# Records whose JSON text contains 'talk_child' -> APQ_parent_questionnaire.csv
cleanup(filename='APQ_parent_questionnaire', task_string='talk_child')

#### Cleanup PSS Questionnaire

In [38]:
# Records whose JSON text contains 'upset_something_unexpected' -> PSS_questionnaire.csv
cleanup(filename='PSS_questionnaire', task_string='upset_something_unexpected')

#### Cleanup GAD-7 Questionnaire

In [39]:
# Records whose JSON text contains 'nervous_anxious_onedge' -> GAD-7_questionnaire.csv
cleanup(filename='GAD-7_questionnaire', task_string='nervous_anxious_onedge')

#### Cleanup CASSS Questionnaire

In [40]:
# Records whose JSON text contains 'parent_proud' -> CASSS_questionnaire.csv
cleanup(filename='CASSS_questionnaire', task_string='parent_proud')

#### Cleanup CES-D Questionnaire

In [41]:
# Records whose JSON text contains 'more_bother' -> CES-D_questionnaire.csv
cleanup(filename='CES-D_questionnaire', task_string='more_bother')

#### Cleanup IPPA Questionnaire

In [42]:
# Records whose JSON text contains 'respect_feeling' -> IPPA_questionnaire.csv
cleanup(filename='IPPA_questionnaire', task_string='respect_feeling')

#### Cleanup PDS Questionnaire

In [43]:
# Records whose JSON text contains 'height_growth' -> PDS_questionnaire.csv
cleanup(filename='PDS_questionnaire', task_string='height_growth')

#### Cleanup OSPAN

In [None]:
# Records whose JSON text contains 'ospan_1' -> OSPAN.csv
cleanup(filename='OSPAN', task_string='ospan_1')

#### Cleanup RST

In [4]:
# NOTE: cleanup() tests 'rst' as a plain substring of each record's JSON text, so records
# that merely contain those letters inside a longer word are selected as well. The original
# author flagged this result as definitely not working (pointing to OSPAN.csv as the
# evidence) - verify RST.csv before using it.
cleanup(filename='RST', task_string='rst')

#### Cleanup FTT Night Testing

In [5]:
# Same tokens as the 'finger_tapping_night_testing' cell earlier in the notebook;
# only the output filename (FTT_night_testing.csv) differs.
cleanup(
    filename='FTT_night_testing',
    task_string=['Night Testing', 'finger_tap'],
)

#### Cleanup FTT Night Training

In [None]:
# Same tokens as the 'finger_tapping_night_training' cell earlier in the notebook;
# only the output filename (FTT_night_training.csv) differs.
cleanup(
    filename='FTT_night_training',
    task_string=['Night Training', 'finger_tap'],
)

#### Cleanup FTT Morning Testing

In [None]:
# Same tokens as the 'finger_tapping_morning_testing' cell earlier in the notebook;
# only the output filename (FTT_morning_testing.csv) differs.
cleanup(
    filename='FTT_morning_testing',
    task_string=['Morning Testing', 'finger_tap'],
)