In [None]:
import pandas as pd

## Cognition dataset

Downloaded from https://github.com/MartinGjoreski/martingjoreski.github.io/blob/master/files/CogDatasets.rar

In [None]:
# Load the personality table; its `user_id` column is used in the next cell
# to locate each user's sensor file and to merge the traits onto sensor rows.
data = pd.read_csv('raw_data/personality.csv')
# Bare last expression so the notebook renders the loaded frame.
data

In [None]:
# Build one long frame by stacking, for every user listed in `data`, the first
# `num_rows` complete sensor rows merged with that user's personality traits.
users = data['user_id'].tolist()
num_rows = 240  # maximum number of rows kept per user
sensor_columns = ['user_id','level','hr','gsr','rr','temperature','click_per_second','points']

user_frames = []
for user_id in users:
    user_df = pd.read_csv('raw_data/{}_sensors.csv'.format(user_id))
    # keep only the columns of interest and drop rows missing any of them
    user_df = user_df[sensor_columns].dropna()
    # merge with personality traits
    user_df = pd.merge(user_df, data, on='user_id', how='left')
    # cap the contribution of each user at `num_rows` rows
    user_df = user_df[0:num_rows]
    print("adding {} rows for user {}".format(len(user_df), user_id))
    user_frames.append(user_df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration, and seeding the accumulator
# with an empty DataFrame is unnecessary. The per-user row index is kept as-is
# (no ignore_index) so downstream cells see the same index as before.
# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no users.
df = pd.concat(user_frames) if user_frames else pd.DataFrame()

In [None]:
# One-hot encode `level` into `level_<value>` indicator columns appended on the
# right, then remove the original `level` column.
level_dummies = pd.get_dummies(df['level'], prefix='level')
df = pd.concat([df, level_dummies], axis=1)
df = df.drop(columns='level')

In [None]:
# map every user_id to a consecutive integer task index, numbered in order of
# first appearance in df
unique_users = df['user_id'].unique()
task_map = {user: index for index, user in enumerate(unique_users)}
df['task'] = df['user_id'].map(task_map)

# drop columns not needed anymore
df = df.drop(columns=['user_id'])

# rename the label column to 'Y' and prefix every remaining feature column with 'X_'
df = df.rename(columns={'points':'Y'})
df = df.rename(columns={feature:'X_{}'.format(feature) for feature in df.columns if feature not in ['Y','task']})

# order rows by task; a stable sort is requested explicitly because the default
# quicksort may permute rows that share the same task value, which would
# scramble the original row order within each user
df = df.sort_values(by=['task'], kind='mergesort')


In [None]:
import sys
sys.path.insert(1, '../')
from utils import get_train_val_test_data

# Export settings: how many dataset variants to request, the output filename
# prefix, and the columns passed to the splitter as interventions
# (presumably the one-hot `level` columns after the 'X_' rename; verify).
NUM_DATASETS = 6
DATASET_NAME = 'cognition'
INTERVENTIONS = ['X_level_0', 'X_level_1', 'X_level_2']

# NOTE(review): get_train_val_test_data comes from the project-local `utils`
# module; here it is only assumed to return two containers indexable by
# 0..NUM_DATASETS-1 whose items have a `to_csv` method.
full_datasets, full_interv_masks = get_train_val_test_data(df, NUM_DATASETS, INTERVENTIONS)

# Write each dataset and its intervention mask next to the notebook, without
# the row index.
for dataset in range(NUM_DATASETS):
    output_stem = f'{DATASET_NAME}_dataset{dataset}'
    full_datasets[dataset].to_csv(output_stem + '.csv', index=False)
    full_interv_masks[dataset].to_csv(output_stem + '_mask.csv', index=False)