In [1]:
%run window_extraction.ipynb
%run time_series_functions.ipynb
%run helper_functions.ipynb
%run time_series_preprocessing.ipynb
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder 
from collections import defaultdict
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, LSTM, concatenate, Dense, Embedding, TimeDistributed, Flatten, Masking, Dropout, GRU, Concatenate, RNN
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam, RMSprop
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report
import random

In [2]:
with open('/Users/finnschonknecht/Desktop/all_participants_data.pkl', 'rb') as fp:
    all_participants_data = pickle.load(fp)

In [3]:
all_participants_data = convert_to_korean_time_all(all_participants_data, ['timestamp'])
all_participants_data = set_index_to_timestamp(all_participants_data, 'timestamp')

## Preprocess the data

In [4]:
columns_to_drop_per_df = {
    'MessageEvent.csv': ['isPinned', 'isStarred', 'messageClass', 'contact'],
    'AppUsageEvent.csv': ['isSystemApp', 'isUpdatedSystemApp', 'name', 'packageName'],
    'HR.csv': ['Quality'],
    'Location.csv': ['speed', 'accuracy', 'altitude'],
    'Calorie.csv': ['CaloriesToday'],
    'AppUsageEvent.csv': ['isSystemApp', 'name', 'packageName', 'isUpdatedSystemApp'],
    'CallEvent.csv': ['isPinned', 'presentation', 'dataUsage', 'contact', 'isStarred'],
    'Distance.csv': ['DistanceToday', 'Pace', 'Speed'],
    'StepCount.csv': ['StepsToday', 'steps'],
    'UltraViolet.csv': ['UVIndexLevel'],
    'Distance.csv': ['DistanceToday', 'MotionType', 'Pace', 'Speed']
    
}

all_participants_data = drop_columns_in_participant_data(all_participants_data, columns_to_drop_per_df)

In [5]:
preprocessing_functions = {
    'Acceleration.csv': acceleration_preprocess,
    'WiFi.csv': delete_preprocess,
    'StepCount.csv': step_count_preprocess,
    'MessageEvent.csv': messageevent_preprocess,
    'Locaiton.csv': location_preprocess,
    'InstalledApp.csv': delete_preprocess,
    'HR.csv' : hr_preprocess,
    'Distance.csv' : distance_preprocess,
    'DataTraffic.csv' : delete_preprocess,
    'Connectivity.csv' : delete_preprocess,
    'EDA.csv' : delete_preprocess,
    'ActivityTransition.csv' : delete_preprocess,
    'MediaEvent.csv' : delete_preprocess,
    'InstalledApp.csv' : delete_preprocess,
    'BatteryEvent.csv' : delete_preprocess,
    'ActivityEvent.csv' : activityevent_preprocess,
    'Calorie.csv' : calorie_preprocess,
    'Location.csv' : location_preprocess,
    'HR.csv' : hr_preprocess,
    'SkinTemperature.csv': skintemp_preprocess,
    'DeviceEvent.csv': calculate_top_sleep_proxies
    
}

In [6]:
dataframe_names = ['DataTraffic.csv', 'Connectivity.csv', 'EDA.csv', 'ActivityTransition.csv',  
                   'MediaEvent.csv', 'InstalledApp.csv', 'BatteryEvent.csv', 'WiFi.csv']
def apply_preprocessing(preprocessing_functions, all_participants_data, dataframe_names):
    for dataframe_name, function in preprocessing_functions.items():
        if dataframe_name in dataframe_names:
            print(f"Processing {dataframe_name}")
            function(all_participants_data, dataframe_name)
            print(f"Finished processing {dataframe_name}")
        else:
            print(f"Processing {dataframe_name}")
            function(all_participants_data)  # Pass the dataframe_name here as well if needed
            print(f"Finished processing {dataframe_name}")

# Call apply_preprocessing function
apply_preprocessing(preprocessing_functions, all_participants_data, dataframe_names)

Processing Acceleration.csv
Finished processing Acceleration.csv
Processing WiFi.csv
Finished processing WiFi.csv
Processing StepCount.csv
Finished processing StepCount.csv
Processing MessageEvent.csv
Finished processing MessageEvent.csv
Processing Locaiton.csv
Finished processing Locaiton.csv
Processing InstalledApp.csv
Finished processing InstalledApp.csv
Processing HR.csv
Finished processing HR.csv
Processing Distance.csv
Finished processing Distance.csv
Processing DataTraffic.csv
Finished processing DataTraffic.csv
Processing Connectivity.csv
Finished processing Connectivity.csv
Processing EDA.csv
Finished processing EDA.csv
Processing ActivityTransition.csv
Finished processing ActivityTransition.csv
Processing MediaEvent.csv
Finished processing MediaEvent.csv
Processing BatteryEvent.csv
Finished processing BatteryEvent.csv
Processing ActivityEvent.csv
Finished processing ActivityEvent.csv
Processing Calorie.csv
Finished processing Calorie.csv
Processing Location.csv
Finished proce

In [7]:
columns_to_drop_per_df = {
    'Location.csv': ['latitude', 'longitude']
}

all_participants_data = drop_columns_in_participant_data(all_participants_data, columns_to_drop_per_df)

In [8]:
def add_empty_dataframe_to_participant(data, participant_id, dataframe_name, columns):
    if participant_id in data:
        # Create an empty DataFrame with specified columns
        empty_df = pd.DataFrame(columns=columns)
        # Add the empty DataFrame to the participant's entry with the specified name
        data[participant_id][dataframe_name] = empty_df
    else:
        print(f"Participant {participant_id} does not exist in the data.")

    return data

In [9]:
all_participants_data = add_empty_dataframe_to_participant(all_participants_data, 'P55', 'Distance.csv', ['TotalDistance.csv'])

In [10]:
def add_column_to_dataframe(data, participant_id, dataframe_name, column_name, position):
    if participant_id in data:
        participant_data = data[participant_id]
        if dataframe_name in participant_data:
            # Get the DataFrame
            dataframe = participant_data[dataframe_name]
            # Insert the new column filled with zeros at the specified position
            dataframe.insert(position, column_name, 0)
            # Update the participant's data
            participant_data[dataframe_name] = dataframe
            data[participant_id] = participant_data
        else:
            print(f"DataFrame '{dataframe_name}' does not exist for participant {participant_id}.")
    else:
        print(f"Participant {participant_id} does not exist in the data.")

    return data

In [11]:
all_participants_data = add_column_to_dataframe(all_participants_data, 'P66', 'ActivityEvent.csv', 'TILTING', 5)

## Desired structure

If data has more than 1 data point per second then we conduct feature extraction

## Checking if there is enough data

In [12]:
esm_responses = pd.read_csv('/Users/finnschonknecht/Desktop/SubjData/preprocessed_esm_responses.csv')
cols_to_drop = ['ScheduledTime', 'ReactionTime', 'Valence', 'Arousal', 'Stress', 'Window']
esm_responses = esm_responses.drop(columns=cols_to_drop)

In [13]:
esm_responses = convert_to_korean_time(esm_responses, ['ResponseTime'])

In [14]:
# Function to check if there is at least 2 hours of data prior to the response time
def check_data_availability(participant_data, response_time):
    response_time = pd.Timestamp(response_time)
    two_hours_before = response_time - pd.Timedelta(hours=2)
    if two_hours_before < participant_data.index[0]:
        return False  # Not enough data available
    return True

# Iterate through the rows of the ESM dataframe
for index, row in esm_responses.iterrows():
    pcode = row['Pcode']
    response_time = row['ResponseTime']
    
    # Find corresponding participant's data
    if pcode in all_participants_data:
        participant_data = all_participants_data[pcode]
        
        # Initialize sequence validity flag as True
        sequence_validity = True
        
        # Check data availability for all dataframes of the participant
        for dataframe_name, dataframe in participant_data.items():
            if not dataframe.empty:  # Check if dataframe is not empty
                enough_data = check_data_availability(dataframe, response_time)
                if not enough_data:
                    sequence_validity = False
                    break  # No need to check further, set validity flag to False
            else:
                # Skip empty dataframes
                continue
        
        # Update 'sequence validity' column in esm_responses dataframe
        esm_responses.loc[index, 'sequence_validity'] = sequence_validity
    else:
        esm_responses.loc[index, 'sequence_validity'] = False  # Participant not found in all_participants_data
esm_responses_valid = esm_responses[esm_responses['sequence_validity']]

  if two_hours_before < participant_data.index[0]:


## Sequence Creation

In [14]:
user_info = pd.read_csv('/Users/finnschonknecht/Desktop/SubjData/UserInfo.csv')

In [145]:
all_sequences = sequence_creation(all_participants_data, esm_responses, user_info)

In [147]:
all_sequences_reversed = {}

for participant, sequences in all_sequences.items():
    all_sequences_reversed[participant] = {}
    for sequence, data in sequences.items():
        reversed_features = list(reversed(data['features']))
        all_sequences_reversed[participant][sequence] = {
            'features': reversed_features,
            'target': data['target']
        }

## Splitting Targets into training and testing sets

In [148]:
targets = {}

for participant_id, sequences in all_sequences_reversed.items():
    for sequence_id, sequence_data in sequences.items():
        target = sequence_data['target']
        if participant_id not in targets:
            targets[participant_id] = []
        targets[participant_id].append(target)

In [149]:
unique_ids = list(all_sequences_reversed.keys())

random.seed(150)   
test_ids = np.random.choice(unique_ids, 15, replace=False)

In [150]:
test_targets = {}
for ids in test_ids:
    for target in targets[ids]:
        current = target
        if ids not in test_targets:
            test_targets[ids] = []
        test_targets[ids].append(current)

In [151]:
for key in test_ids:
    if key in targets:
        del targets[key]

In [152]:
train_targets = {'stress': [], 'valence': [], 'arousal': []}
test_targets = {'stress': [], 'valence': [], 'arousal': []}

for participant_id, participant_data in all_sequences_reversed.items():
    for esm_timestamp, esm_data in participant_data.items():
        if participant_id not in test_ids:
            # Append targets to train_targets
            train_targets['stress'].append(esm_data['target'][0])
            train_targets['valence'].append(esm_data['target'][1])
            train_targets['arousal'].append(esm_data['target'][2])

train_targets = {key: np.array(value) for key, value in train_targets.items()}

## Splitting sequences into training and testing

In [153]:
test_sequences = {}
for ids in test_ids:
    for sequence in all_sequences_reversed[ids]:
        current = sequence
        if ids not in test_sequences:
            test_sequences[ids] = []
        test_sequences[ids].append(current)

In [154]:
test_sequences = {}

# Populate the test_sequences dictionary with sequences from test_ids
for participant_id in test_ids:
    if participant_id in all_sequences_reversed:
        test_sequences[participant_id] = all_sequences_reversed[participant_id]

In [155]:
for key in test_ids:
    if key in all_sequences_reversed:
        del all_sequences_reversed[key]

In [177]:
for participant_id, participant_data in test_sequences.items():
    for esm_timestamp, esm_data in participant_data.items():
        if participant_id in test_ids:
            # Append targets to train_targets
            test_targets['stress'].append(esm_data['target'][0])
            test_targets['valence'].append(esm_data['target'][1])
            test_targets['arousal'].append(esm_data['target'][2])
            
test_targets = {key: np.array(value) for key, value in test_targets.items()}

In [178]:
train_stress = train_targets['stress']
train_valence = train_targets['valence']
train_arousal = train_targets['arousal']

test_stress = test_targets['stress']
test_valence = test_targets['valence']
test_arousal = test_targets['arousal']

## Preprocessing Sequences

In [157]:
def preprocess_sequences(sequences, categorical_indexes):
    categorical_features = {}
    numeric_features = {}
    
    for participant_id, participant_data in sequences.items():
        categorical_features[participant_id] = []
        numeric_features[participant_id] = []

        for esm_timestamp, esm_data in participant_data.items():
            features = esm_data['features']
            
            processed_categorical_features = []
            processed_numeric_features = []

            # Iterate over each timestep in the features list
            for timestep in features:
                # Initialize lists for categorical and numeric values
                categorical_values = []
                numeric_values = []

                # Separate the categorical and numeric values based on the specified indexes
                for i, value in enumerate(timestep):
                    if i in categorical_indexes:
                        # Handle categorical values
                        if isinstance(value, str) and value not in ['nan', '-1999']:
                            categorical_values.append(value)
                        else:
                            categorical_values.append('missing')
                    else:
                        # Handle numeric values
                        try:
                            numeric_value = float(value)
                        except ValueError:
                            numeric_value = -1999
                        numeric_values.append(numeric_value)

                # Ensure the lengths are consistent by filling missing values
                while len(categorical_values) < len(categorical_indexes):
                    categorical_values.append('missing')
                while len(numeric_values) < (len(timestep) - len(categorical_indexes)):
                    numeric_values.append(0)

                # Apply the threshold rule for numeric values
                numeric_array = np.array(numeric_values, dtype=float)
                num_invalid_values = np.sum(numeric_array == -1999)
                if num_invalid_values > len(numeric_values) / 2:
                    numeric_array[:] = -1999
                else:
                    numeric_array[numeric_array == -1999] = 0

                # Append to the processed lists
                processed_categorical_features.append(categorical_values)
                processed_numeric_features.append(numeric_array.tolist())

            categorical_features[participant_id].append(processed_categorical_features)
            numeric_features[participant_id].append(processed_numeric_features)
    
    return categorical_features, numeric_features


In [158]:
categorical_indexes = [0, 2, 72, 83, 84, 85, 86, 87, 99]

train_categorical_sequences, train_numeric_sequences = preprocess_sequences(all_sequences_reversed, categorical_indexes)
test_categorical_sequences, test_numeric_sequences = preprocess_sequences(test_sequences, categorical_indexes)

In [159]:
def check_overall_shape(sequences):
    num_samples = 0
    num_timesteps = None
    num_features = None

    for participant_id, participant_sequences in sequences.items():
        for sequence in participant_sequences:
            current_num_timesteps = len(sequence)
            current_num_features = len(sequence[0]) if sequence else 0

            if num_timesteps is None:
                num_timesteps = current_num_timesteps
            if num_features is None:
                num_features = current_num_features

            if current_num_timesteps != num_timesteps:
                print(f"Inconsistent number of timesteps for participant {participant_id}. Expected {num_timesteps}, found {current_num_timesteps}")
            if current_num_features != num_features:
                print(f"Inconsistent number of features for participant {participant_id}. Expected {num_features}, found {current_num_features}")

            num_samples += 1

    return num_samples, num_timesteps, num_features

In [112]:
tes_par = all_participants_data['P73']
cal = tes_par['UltraViolet.csv']
timestamp = pd.Timestamp('2019-04-30 14:14:51+09:00')
window = {'30min': 60 * 30}

In [113]:
extract_generic_window_features(cal, window, timestamp)

{'30min_UVExposureToday_mean': 0.0,
 '30min_UVExposureToday_median': 0.0,
 '30min_UVExposureToday_std': 0.0,
 '30min_UVExposureToday_entropy': nan}

In [160]:
train_categorical_shape = check_overall_shape(train_categorical_sequences)
train_numeric_shape = check_overall_shape(train_numeric_sequences)
test_categorical_shape = check_overall_shape(test_categorical_sequences)
test_numeric_shape = check_overall_shape(test_numeric_sequences)

print(f"Train Categorical Shape: {train_categorical_shape}")
print(f"Train Numeric Shape: {train_numeric_shape}")
print(f"Test Categorical Shape: {test_categorical_shape}")
print(f"Test Numeric Shape: {test_numeric_shape}")

Train Categorical Shape: (4307, 25, 9)
Train Numeric Shape: (4307, 25, 112)
Test Categorical Shape: (1131, 25, 9)
Test Numeric Shape: (1131, 25, 112)


## Encdoding Categorical Variables

In [161]:
def encode_categorical_sequences_train(categorical_sequences):
    # Collect unique values for each categorical index from the training data
    unique_values = defaultdict(set)
    for participant_id, participant_data in categorical_sequences.items():
        for sequence in participant_data:
            for timestep in sequence:
                for i, value in enumerate(timestep):
                    unique_values[i].add(value)
    
    # Add 'missing' to the set of unique values for each categorical index
    for i in unique_values:
        unique_values[i].add('missing')

    # Print the number of unique values for each categorical variable
    print("Number of unique values for each categorical variable:")
    for i, values in unique_values.items():
        print(f"Feature {i}: {len(values)}")
    
    # Initialize LabelEncoders for each categorical index
    label_encoders = {i: LabelEncoder() for i in unique_values}
    
    # Fit LabelEncoders with the unique values including 'missing'
    for i, values in unique_values.items():
        label_encoders[i].fit(list(values))
    
    # Encode the categorical sequences
    encoded_sequences = {}
    for participant_id, participant_data in categorical_sequences.items():
        encoded_sequences[participant_id] = []
        for sequence in participant_data:
            encoded_sequence = []
            for timestep in sequence:
                encoded_timestep = []
                for i, value in enumerate(timestep):
                    if value not in unique_values[i]:
                        value = 'missing'
                    encoded_value = label_encoders[i].transform([value])[0]
                    encoded_timestep.append(encoded_value)
                encoded_sequence.append(encoded_timestep)
            encoded_sequences[participant_id].append(encoded_sequence)

    return encoded_sequences, label_encoders

def encode_categorical_sequences_test(categorical_sequences, label_encoders):
    # Encode the categorical sequences using fitted encoders
    encoded_sequences = {}
    for participant_id, participant_data in categorical_sequences.items():
        encoded_sequences[participant_id] = []
        for sequence in participant_data:
            encoded_sequence = []
            for timestep in sequence:
                encoded_timestep = []
                for i, value in enumerate(timestep):
                    if value not in label_encoders[i].classes_:
                        value = 'missing'
                    encoded_value = label_encoders[i].transform([value])[0]
                    encoded_timestep.append(encoded_value)
                encoded_sequence.append(encoded_timestep)
            encoded_sequences[participant_id].append(encoded_sequence)

    return encoded_sequences

In [162]:
encoded_train_sequences, label_encoders = encode_categorical_sequences_train(train_categorical_sequences)

Number of unique values for each categorical variable:
Feature 0: 8
Feature 1: 3
Feature 2: 1114
Feature 3: 31
Feature 4: 31
Feature 5: 31
Feature 6: 30
Feature 7: 1
Feature 8: 1


In [163]:
encoded_test_sequences = encode_categorical_sequences_test(test_categorical_sequences, label_encoders)

In [164]:
def calculate_cardinality(categorical_sequences):
    unique_values = {}

    # Iterate over each participant's data
    for participant_id, participant_data in categorical_sequences.items():
        for sequence in participant_data:
            for timestep in sequence:
                for i, value in enumerate(timestep):
                    if i not in unique_values:
                        unique_values[i] = set()
                    unique_values[i].add(value)

    # Calculate cardinality for each categorical index
    cardinality = {i: len(values) for i, values in unique_values.items()}

    return cardinality

In [165]:
cardinality = calculate_cardinality(encoded_train_sequences)
print(cardinality)

{0: 7, 1: 2, 2: 1114, 3: 31, 4: 31, 5: 31, 6: 30, 7: 1, 8: 1}


In [166]:
cardinality = calculate_cardinality(encoded_test_sequences)
print(cardinality)

{0: 7, 1: 2, 2: 156, 3: 28, 4: 29, 5: 30, 6: 30, 7: 1, 8: 1}


In [167]:
def calculate_embedding_sizes(cardinalities):
    embedding_info = {}
    for index, cardinality in cardinalities.items():
        embedding_size = min(50, (cardinality // 2) + 1)
        embedding_info[index] = (cardinality, embedding_size)
    return embedding_info

cardinalities = {0: 7, 1: 2, 2: 2398, 3: 31, 4: 31, 5: 31, 6: 31, 7: 31, 8: 31}
embedding_info = calculate_embedding_sizes(cardinalities)
print(f"Embedding Info: {embedding_info}")

Embedding Info: {0: (7, 4), 1: (2, 2), 2: (2398, 50), 3: (31, 16), 4: (31, 16), 5: (31, 16), 6: (31, 16), 7: (31, 16), 8: (31, 16)}


In [168]:
def replace_nans_with_zeros(numeric_sequences):
    cleaned_sequences = {}

    for participant_id, participant_data in numeric_sequences.items():
        cleaned_participant_data = []
        for sequence in participant_data:
            cleaned_sequence = []
            for timestep in sequence:
                cleaned_timestep = [0 if np.isnan(value) else value for value in timestep]
                cleaned_sequence.append(cleaned_timestep)
            cleaned_participant_data.append(cleaned_sequence)
        cleaned_sequences[participant_id] = cleaned_participant_data

    return cleaned_sequences

In [169]:
cleaned_numerical_train_sequences = replace_nans_with_zeros(train_numeric_sequences)
cleaned_numerical_test_sequences = replace_nans_with_zeros(test_numeric_sequences)

In [170]:
numerical_sequences_train_flat = []
numerical_sequences_test_flat = []
categorical_sequences_train_flat = []
categorical_sequences_test_flat = []

In [171]:
def flatten_sequences(sequences, name_of_flat):
    for participant_sequences in sequences.values():
        for sequence in participant_sequences:
            name_of_flat.append(sequence)

In [172]:
flatten_sequences(cleaned_numerical_train_sequences, numerical_sequences_train_flat)
flatten_sequences(cleaned_numerical_test_sequences, numerical_sequences_test_flat)
flatten_sequences(encoded_train_sequences, categorical_sequences_train_flat)
flatten_sequences(encoded_test_sequences, categorical_sequences_test_flat)

In [173]:
numerical_sequences_train_flat = np.array(numerical_sequences_train_flat)
numerical_sequences_test_flat = np.array(numerical_sequences_test_flat)
categorical_sequences_train_flat = np.array(categorical_sequences_train_flat)
categorical_sequences_test_flat = np.array(categorical_sequences_test_flat)

In [636]:
arrays_to_save = {
    'numerical_sequences_train_flat': numerical_sequences_train_flat,
    'numerical_sequences_test_flat': numerical_sequences_test_flat,
    'categorical_sequences_train_flat': categorical_sequences_train_flat,
    'categorical_sequences_test_flat': categorical_sequences_test_flat,
    'arousal_train': train_arousal,
    'valence_train': train_valence,
    'stress_train': train_stress,
    'arousal_test': test_arousal,
    'valence_test': test_valence,
    'stress_test': test_stress,
    # Add other arrays as needed
}

# Save arrays
for name, array in arrays_to_save.items():
    np.save(f'{name}.npy', array)