In [7]:
def extract_message_time_series(df, windows, timestamp):
    """Summarise message traffic in 30-minute windows leading up to ``timestamp``.

    For ``k = 0 .. 24`` the window ``[t - 30 min, t)`` ending at
    ``t = timestamp - k * 30 min`` is described by five values::

        [messages_outgoing, messages_incoming,
         unique_numbers_outgoing, unique_numbers_incoming, total_messages]

    ``total_messages`` counts every row in the window, including rows whose
    ``messageBox`` is neither ``'SENT'`` nor ``'INBOX'``.

    Args:
        df: Message events indexed by event time, with ``messageBox`` and
            ``number`` columns.
        windows: Unused here; accepted so that all extractors can be called
            with the same arguments.
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict mapping each window-end ``pd.Timestamp`` to a 5-element list.
        If ``df`` is empty, or holds no row at or before ``timestamp``, all
        25 rows are zeros. Otherwise windows ending before the later of
        (earliest retained event, ``timestamp`` - 12 h) are padded with
        ``-1999``. The dict may hold one extra padded key one step past the
        12-hour cutoff.
    """
    # Normalise first so string/datetime inputs also work on the early returns.
    timestamp = pd.Timestamp(timestamp)
    step = pd.Timedelta(minutes=30)

    if df.empty:
        return {timestamp - step * i: [0, 0, 0, 0, 0] for i in range(25)}

    before_esm = df[df.index <= timestamp]
    if before_esm.empty:
        # Every event lies after the timestamp: report "no activity" explicitly
        # instead of relying on NaT comparison semantics further down.
        return {timestamp - step * i: [0, 0, 0, 0, 0] for i in range(25)}

    max_timestamp = max(before_esm.index.min(), timestamp - pd.Timedelta(hours=12))

    windowed_messageevent = {}
    current_time = timestamp
    # Walk backwards; the loop runs one step past the cutoff so that the first
    # out-of-range window is padded as well.
    while current_time >= max_timestamp - step:
        if current_time < max_timestamp:
            windowed_messageevent[current_time] = [-1999] * 5
        else:
            window_start = current_time - step
            in_window = before_esm[
                (before_esm.index >= window_start) & (before_esm.index < current_time)
            ]

            outgoing_numbers = set()
            incoming_numbers = set()
            messages_outgoing = 0
            messages_incoming = 0
            for message_type, number in zip(in_window['messageBox'], in_window['number']):
                if message_type == 'SENT':
                    messages_outgoing += 1
                    outgoing_numbers.add(number)
                elif message_type == 'INBOX':
                    messages_incoming += 1
                    incoming_numbers.add(number)

            # An empty window naturally yields [0, 0, 0, 0, 0].
            windowed_messageevent[current_time] = [
                messages_outgoing,
                messages_incoming,
                len(outgoing_numbers),
                len(incoming_numbers),
                len(in_window),
            ]

        current_time -= step

    # Any of the 25 timesteps not visited above lies before the cutoff.
    for i in range(25):
        windowed_messageevent.setdefault(timestamp - step * i, [-1999] * 5)

    return windowed_messageevent


In [8]:
def extract_deviceevent_time_series(df, windows, timestamp):
    """Summarise phone unlock activity in 30-minute windows before ``timestamp``.

    For ``k = 0 .. 24`` the window ``[t - 30 min, t)`` ending at
    ``t = timestamp - k * 30 min`` is described by::

        [times_unlocked, proportion_time_spent_on_phone]

    Phone time is the elapsed time between an ``'UNLOCK'`` event and the next
    ``'SCREEN_OFF'`` event inside the same window, divided by
    ``windows['30min']`` (the window length in seconds). Sessions that cross a
    window boundary are not counted.

    Args:
        df: Device events indexed by event time, with a ``type`` column.
        windows: Mapping that must contain ``'30min'`` (window size in seconds).
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict mapping each window-end ``pd.Timestamp`` to a 2-element list.
        Rows are ``[-1999, -1999]`` when ``df`` is empty, when no event is at
        or before ``timestamp``, or when the window ends before the later of
        (earliest retained event, ``timestamp`` - 12 h). The dict may hold one
        extra padded key one step past the 12-hour cutoff.

    Raises:
        KeyError: If history exists and ``windows`` has no ``'30min'`` entry.
    """
    # Normalise first so string/datetime inputs also work on the early returns.
    timestamp = pd.Timestamp(timestamp)
    step = pd.Timedelta(minutes=30)

    if df.empty:
        return {timestamp - step * i: [-1999, -1999] for i in range(25)}

    # Stable chronological order so UNLOCK/SCREEN_OFF pairing is meaningful.
    before_esm = df[df.index <= timestamp].sort_index(kind='mergesort')
    if before_esm.empty:
        return {timestamp - step * i: [-1999, -1999] for i in range(25)}

    max_timestamp = max(before_esm.index.min(), timestamp - pd.Timedelta(hours=12))
    window_size = windows['30min']  # loop-invariant, so looked up once

    windowed_deviceevent = {}
    current_time = timestamp
    # Walk backwards; the loop runs one step past the cutoff so that the first
    # out-of-range window is padded as well.
    while current_time >= max_timestamp - step:
        if current_time >= max_timestamp:
            window_start = current_time - step
            in_window = before_esm[
                (before_esm.index >= window_start) & (before_esm.index < current_time)
            ]

            times_unlocked = 0
            time_spent_on_phone = 0
            unlock_time = None  # time of the most recent unmatched UNLOCK
            for event_time, event_type in zip(in_window.index, in_window['type']):
                if event_type == 'UNLOCK':
                    times_unlocked += 1
                    unlock_time = event_time
                elif event_type == 'SCREEN_OFF' and unlock_time is not None:
                    # Real session length, not a fixed 30 minutes per pair.
                    time_spent_on_phone += (event_time - unlock_time).total_seconds()
                    unlock_time = None

            windowed_deviceevent[current_time] = [
                times_unlocked,
                time_spent_on_phone / window_size,
            ]
        else:
            windowed_deviceevent[current_time] = [-1999, -1999]

        current_time -= step

    # Any of the 25 timesteps not visited above lies before the cutoff.
    for i in range(25):
        windowed_deviceevent.setdefault(timestamp - step * i, [-1999, -1999])

    return windowed_deviceevent


In [18]:
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of labels.

    The distribution is the relative frequency of each distinct label.
    Inputs with at most one element short-circuit to the integer ``0``.
    """
    total = len(labels)
    if total <= 1:
        return 0

    # Only the per-label occurrence counts are needed, not the labels themselves.
    occurrences = np.unique(labels, return_counts=True)[1]
    shares = occurrences / total
    return -np.sum(shares * np.log2(shares))


def extract_appusage_time_series(df, windows, timestamp):
    """Summarise app usage in 30-minute windows leading up to ``timestamp``.

    For ``k = 0 .. 24`` the window ending at ``t = timestamp - k * 30 min`` is
    described by a 17-element list:

    * ``[0:5]``   the up-to-five most frequent categories (NaN-padded),
    * ``[5:10]``  minutes each of those categories spent in the foreground,
      measured from every ``'MOVE_TO_FOREGROUND'`` event to the next later
      ``'MOVE_TO_BACKGROUND'`` event of the same category in the window,
    * ``[10:15]`` number of events of each of those categories,
    * ``[15]``    Shannon entropy (bits) of the category distribution,
    * ``[16]``    the most common category.

    A window that is in range but holds no events is all zeros.

    NOTE(review): unlike the sibling extractors this window is closed on both
    ends (``window_start <= event <= t``), so an event exactly on a boundary
    is counted in two windows; confirm whether that is intended.

    Args:
        df: App usage events indexed by event time, with ``category`` and
            ``type`` columns.
        windows: Unused here; accepted so that all extractors can be called
            with the same arguments.
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict mapping each of the 25 window-end ``pd.Timestamp`` values to a
        17-element list. Rows are ``[-1999] * 17`` when ``df`` is empty, when
        no event is at or before ``timestamp``, when the window starts before
        the earliest event, or when it ends before ``timestamp`` - 12 h.
    """
    # Normalise first so string/datetime inputs also work on the early returns.
    timestamp = pd.Timestamp(timestamp)
    step = pd.Timedelta(minutes=30)

    if df.empty:
        return {timestamp - step * i: [-1999] * 17 for i in range(25)}

    before_esm = df[df.index <= timestamp]
    if before_esm.empty:
        return {timestamp - step * i: [-1999] * 17 for i in range(25)}

    # before_esm is non-empty, so its minimum is also the minimum of df.
    first_event_time = before_esm.index.min()
    max_timestamp = max(first_event_time, timestamp - pd.Timedelta(hours=12))

    windowed_appevent = {}
    current_time = timestamp
    while current_time >= max_timestamp:
        window_start = current_time - step

        if window_start < first_event_time:
            # The window reaches back past the first recorded event.
            windowed_appevent[current_time] = [-1999] * 17
        else:
            in_window = before_esm[
                (before_esm.index >= window_start) & (before_esm.index <= current_time)
            ]
            sequence_data = [0] * 17

            if not in_window.empty:
                categories = in_window['category']
                category_counts = categories.value_counts()

                top_categories = list(category_counts.head(5).index)
                top_categories.extend([np.nan] * (5 - len(top_categories)))
                sequence_data[:5] = top_categories

                for i, category in enumerate(top_categories):
                    if pd.isna(category):
                        continue
                    category_data = in_window[categories == category]
                    foreground_times = category_data[
                        category_data['type'] == 'MOVE_TO_FOREGROUND'
                    ].index
                    background_times = category_data[
                        category_data['type'] == 'MOVE_TO_BACKGROUND'
                    ].index

                    category_time_spent = 0
                    for foreground_time in foreground_times:
                        next_background = min(
                            background_times[background_times > foreground_time],
                            default=None,
                        )
                        if next_background is not None:
                            category_time_spent += (
                                next_background - foreground_time
                            ).total_seconds()

                    sequence_data[5 + i] = category_time_spent / 60  # minutes
                    sequence_data[10 + i] = len(category_data)

                # Entropy of the category distribution itself. Feeding the
                # counts in as if they were labels would measure how varied
                # the count values are, not how varied the categories are.
                shares = category_counts / category_counts.sum()
                sequence_data[15] = -np.sum(shares * np.log2(shares))

                modes = categories.mode()
                # mode() is empty when every category in the window is missing.
                sequence_data[16] = modes.iloc[0] if not modes.empty else np.nan

            windowed_appevent[current_time] = sequence_data

        current_time -= step

    # Any of the 25 timesteps not visited above lies before the cutoff.
    for i in range(25):
        windowed_appevent.setdefault(timestamp - step * i, [-1999] * 17)

    return windowed_appevent


In [10]:
def extract_call_timeseries(df, windows, timestamp):
    """Summarise call activity in 30-minute windows leading up to ``timestamp``.

    For ``k = 0 .. 24`` the window ``[t - 30 min, t)`` ending at
    ``t = timestamp - k * 30 min`` is described by::

        [unique_numbers_outgoing, unique_numbers_incoming, time_spent_calling]

    ``time_spent_calling`` is the sum of ``duration / 60`` over every row in
    the window, whatever its ``type``.

    Args:
        df: Call events indexed by event time, with ``type``, ``number`` and
            ``duration`` columns.
        windows: Unused here; accepted so that all extractors can be called
            with the same arguments.
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict mapping each window-end ``pd.Timestamp`` to a 3-element list.
        Rows are ``[-1999, -1999, -1999]`` when ``df`` is empty, when no event
        is at or before ``timestamp``, or when the window ends before the
        later of (earliest retained event, ``timestamp`` - 12 h). The dict may
        hold one extra padded key one step past the 12-hour cutoff.
    """
    # Normalise first so string/datetime inputs also work on the early returns.
    timestamp = pd.Timestamp(timestamp)
    step = pd.Timedelta(minutes=30)

    if df.empty:
        return {timestamp - step * i: [-1999, -1999, -1999] for i in range(25)}

    before_esm = df[df.index <= timestamp]
    if before_esm.empty:
        return {timestamp - step * i: [-1999, -1999, -1999] for i in range(25)}

    max_timestamp = max(before_esm.index.min(), timestamp - pd.Timedelta(hours=12))

    windowed_callevent = {}
    current_time = timestamp
    # Walk backwards; the loop runs one step past the cutoff so that the first
    # out-of-range window is padded as well.
    while current_time >= max_timestamp - step:
        if current_time >= max_timestamp:
            window_start = current_time - step
            in_window = before_esm[
                (before_esm.index >= window_start) & (before_esm.index < current_time)
            ]

            unique_callers_outgoing = set()
            unique_callers_incoming = set()
            time_spent_calling = 0
            for call_type, number, duration in zip(
                in_window['type'], in_window['number'], in_window['duration']
            ):
                time_spent_calling += duration / 60
                if call_type == 'OUTGOING':
                    unique_callers_outgoing.add(number)
                elif call_type == 'INCOMING':
                    unique_callers_incoming.add(number)

            windowed_callevent[current_time] = [
                len(unique_callers_outgoing),
                len(unique_callers_incoming),
                time_spent_calling,
            ]
        else:
            windowed_callevent[current_time] = [-1999, -1999, -1999]

        current_time -= step

    # Any of the 25 timesteps not visited above lies before the cutoff.
    for i in range(25):
        windowed_callevent.setdefault(timestamp - step * i, [-1999, -1999, -1999])

    return windowed_callevent

In [22]:
#Location.csv
def calculate_entropy(cluster_counts):
    """Return the Shannon entropy, in bits, of a Series of per-cluster counts.

    Counts are converted to proportions of their total; zero proportions are
    skipped so that ``log2(0)`` is never evaluated.
    """
    proportions = (cluster_counts / cluster_counts.sum()).values

    terms = []
    for share in proportions:
        if share != 0:
            terms.append(share * np.log2(share))

    return -np.sum(terms)


def extract_location_time_series(df, windows, timestamp):
    """Summarise location clusters in 30-minute windows before ``timestamp``.

    For ``k = 0 .. 24`` the window ``[t - 30 min, t)`` ending at
    ``t = timestamp - k * 30 min`` is described by::

        [most_common_cluster, cluster_entropy]

    where the entropy is ``calculate_entropy`` applied to the window's
    per-cluster counts. A window that is in range but holds no rows yields
    ``[nan, nan]``.

    Args:
        df: Location samples indexed by time, with a ``cluster`` column.
        windows: Unused here; accepted so that all extractors can be called
            with the same arguments.
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict mapping each window-end ``pd.Timestamp`` to a 2-element list.
        Rows are ``[-1999, -1999]`` when ``df`` is empty, when no sample is at
        or before ``timestamp``, or when the window ends before the later of
        (earliest retained sample, ``timestamp`` - 12 h). The dict may hold
        one extra padded key one step past the 12-hour cutoff.
    """
    # Normalise first so string/datetime inputs also work on the early returns.
    timestamp = pd.Timestamp(timestamp)
    step = pd.Timedelta(minutes=30)

    if df.empty:
        return {timestamp - step * i: [-1999, -1999] for i in range(25)}

    before_esm = df[df.index <= timestamp]
    if before_esm.empty:
        return {timestamp - step * i: [-1999, -1999] for i in range(25)}

    max_timestamp = max(before_esm.index.min(), timestamp - pd.Timedelta(hours=12))

    windowed_location = {}
    current_time = timestamp
    # Walk backwards; the loop runs one step past the cutoff so that the first
    # out-of-range window is padded as well.
    while current_time >= max_timestamp - step:
        if current_time >= max_timestamp:
            window_start = current_time - step
            in_window = before_esm[
                (before_esm.index >= window_start) & (before_esm.index < current_time)
            ]

            if in_window.empty:
                most_common_cluster = np.nan
                window_entropy = np.nan
            else:
                clusters = in_window['cluster']
                modes = clusters.mode()
                # mode() is empty when every cluster value in the window is missing.
                most_common_cluster = modes.iloc[0] if not modes.empty else np.nan
                window_entropy = calculate_entropy(clusters.value_counts())

            windowed_location[current_time] = [most_common_cluster, window_entropy]
        else:
            windowed_location[current_time] = [-1999, -1999]

        current_time -= step

    # Any of the 25 timesteps not visited above lies before the cutoff.
    for i in range(25):
        windowed_location.setdefault(timestamp - step * i, [-1999, -1999])

    return windowed_location

In [24]:
def generic_entropy(data):
    """Return the Shannon entropy, in bits, of the values in a pandas Series.

    The distribution is the relative frequency of each distinct non-missing
    value. Missing values are ignored.
    """
    value_counts = data.value_counts()
    # value_counts() drops NaN by default, so normalise by the counted total
    # rather than len(data); otherwise the probabilities would not sum to 1
    # whenever the data contains missing values.
    probabilities = value_counts / value_counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

def extract_generic_time_series(df, windows, timestamp):
    """Compute per-column statistics in 30-minute windows before ``timestamp``.

    For every numeric column, each window ``[t - 30 min, t)`` contributes
    ``[mean, median, std, entropy]`` (entropy via ``generic_entropy``); the
    per-column groups are concatenated in column order.

    Args:
        df: Sensor readings indexed by time.
        windows: Unused here; accepted so that all extractors can be called
            with the same arguments.
        timestamp: End of the look-back period; anything ``pd.Timestamp``
            accepts.

    Returns:
        dict keyed by window-end ``pd.Timestamp`` covering the 25 timesteps
        ``timestamp - k * 30 min``. An empty ``df`` yields ``[0, 0, 0, 0]``
        for every timestep regardless of its columns. Windows in range but
        without rows are NaN-filled; timesteps before the later of (earliest
        retained row, ``timestamp`` - 12 h) are filled with ``-1999``.
    """
    timestamp = pd.Timestamp(timestamp)
    half_hour = pd.Timedelta(minutes=30)
    window_ends = [timestamp - half_hour * k for k in range(25)]

    # Nothing recorded at all: four zeros per timestep.
    if df.empty:
        return {window_end: [0, 0, 0, 0] for window_end in window_ends}

    before_esm = df[df.index <= timestamp]
    max_timestamp = max(before_esm.index.min(), timestamp - pd.Timedelta(hours=12))
    all_numeric_cols = df.select_dtypes(include=np.number).columns

    windowed_features_dict = {}
    window_end = timestamp
    while window_end >= max_timestamp:
        lower_bound = window_end - half_hour
        in_window = before_esm[
            (before_esm.index >= lower_bound) & (before_esm.index < window_end)
        ]

        if in_window.empty:
            stats = [np.nan, np.nan, np.nan, np.nan] * len(all_numeric_cols)
        else:
            stats = []
            for col in in_window.select_dtypes(include=np.number).columns:
                series = in_window[col]
                stats.extend([
                    series.mean(),
                    series.median(),
                    series.std(),
                    generic_entropy(series),
                ])

        windowed_features_dict[window_end] = stats
        window_end = window_end - half_hour

    # Back-fill whichever of the 25 timesteps the loop above did not reach.
    for window_end in window_ends:
        if window_end in windowed_features_dict:
            continue
        if window_end < max_timestamp:
            filler = [-1999, -1999, -1999, -1999]
        else:
            filler = [0, 0, 0, 0]
        windowed_features_dict[window_end] = filler * len(all_numeric_cols)

    return windowed_features_dict

In [23]:
# def sequence_creation(all_participants_data, esm_responses, user_info):
#     desired_structure = {}
#     days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    
#     window = {'30min': 60 * 30}
    
#     # Preprocess user_info to avoid repeated lookups
#     user_info_dict = user_info.set_index('Pcode').to_dict(orient='index')
    
#     # Preprocess external functions
#     external_functions = [
#         (extract_generic_time_series, 'Calorie.csv'),
#         (extract_generic_time_series, 'SkinTemperature.csv'),
#         (extract_generic_time_series, 'AmbientLight.csv'),
#         (extract_generic_time_series, 'RRI.csv'),
#         (extract_generic_time_series, 'StepCount.csv'),
#         (extract_message_time_series, 'MessageEvent.csv'),
#         (extract_call_timeseries, 'CallEvent.csv'),
#         (extract_generic_time_series, 'ActivityEvent.csv'),
#         (extract_location_time_series, 'Location.csv'),
#         (extract_generic_time_series, 'HR.csv'),
#         (extract_generic_time_series, 'Distance.csv'),
#         (extract_appusage_time_series, 'AppUsageEvent.csv'),
#         (extract_generic_time_series, 'Acceleration.csv'),
#         (extract_generic_time_series, 'UltraViolet.csv'),
#         (extract_deviceevent_time_series, 'DeviceEvent.csv')
#     ]
    
#     for participant_id, participant_data in all_participants_data.items():
#         sleep_proxies = participant_data['sleep_proxies']

#         participant_esm_responses = esm_responses[esm_responses['Pcode'] == participant_id]
#         for index, esm_response in participant_esm_responses.iterrows():
#             timestamp = pd.Timestamp(esm_response['ResponseTime'])
#             sequence_name = f"{timestamp}_{index}"
#             day_of_week = days_of_week[timestamp.weekday()]
#             user_info_row = user_info_dict.get(participant_id, {})
#             static_features = [day_of_week, user_info_row.get('Age', 0), user_info_row.get('Gender', ''), 
#                                user_info_row.get('Openness', 0), user_info_row.get('Conscientiousness', 0),
#                                user_info_row.get('Neuroticism', 0), user_info_row.get('Extraversion', 0),
#                                user_info_row.get('Agreeableness', 0), user_info_row.get('PSS10', 0),
#                                user_info_row.get('PHQ9', 0), user_info_row.get('GHQ12', 0)]
            
            
#             date_of_timestamp = timestamp.date()  # Normalize to date without time
#             if date_of_timestamp in sleep_proxies.index:
#                 sleep_proxy_for_timestamp = sleep_proxies.loc[date_of_timestamp, 'SleepProxy']
#                 sleep_proxy_hours = sleep_proxy_for_timestamp.total_seconds() / 3600
#                 static_features.append(sleep_proxy_hours)
#             else:
#                 static_features.append(0)  # Appending 0 if there's no sleep proxy data


#             # Initialize desired structure with 49 timesteps
#             timestep_features = [[] for _ in range(25)]
#             for func, dataframe_name in external_functions:
#                 dataframe = participant_data.get(dataframe_name)
#                 if dataframe is not None:
#                     timestep_features_func = func(dataframe, window, timestamp)
#                     for i in range(25):
#                         timestep_timestamp = timestamp - pd.Timedelta(minutes=30) * i
#                         if timestep_timestamp in timestep_features_func:
#                             timestep_features[i].extend(timestep_features_func[timestep_timestamp])

#             desired_structure.setdefault(participant_id, {}).setdefault(sequence_name, {
#                 'features': [static_features + timestep for timestep in timestep_features],
#                 'target': [esm_response['Stress_binary'], esm_response['Valence_binary'], esm_response['Arousal_binary']]
#             })

#     return desired_structure

def sequence_creation(all_participants_data, esm_responses, user_info):
    """Build one 25-step feature sequence per ESM response of every participant.

    For each row of ``esm_responses`` belonging to a participant, static
    features (weekday name, the participant's ``user_info`` fields and the
    sleep proxy in hours for that date, or ``0`` if absent) are prepended to
    the features every extractor returns for the 25 half-hour timesteps
    ending at the response time.

    Args:
        all_participants_data: Mapping of participant id to a mapping that
            holds ``'sleep_proxies'`` plus, optionally, one frame per sensor
            file name.
        esm_responses: Frame with ``Pcode``, ``ResponseTime``,
            ``Stress_binary``, ``Valence_binary`` and ``Arousal_binary``.
        user_info: Frame with a ``Pcode`` column and the profile columns.

    Returns:
        ``{participant_id: {sequence_name: {'features', 'target',
        'feature_counts'}}}`` where ``sequence_name`` is
        ``"<timestamp>_<row index>"``.
    """
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    window = {'30min': 60 * 30}
    half_hour = pd.Timedelta(minutes=30)

    # One dict lookup per participant instead of repeated frame lookups.
    profiles = user_info.set_index('Pcode').to_dict(orient='index')

    # Order matters: features are concatenated in exactly this sequence.
    extractors = [
        (extract_generic_time_series, 'Calorie.csv'),
        (extract_generic_time_series, 'SkinTemperature.csv'),
        (extract_generic_time_series, 'AmbientLight.csv'),
        (extract_generic_time_series, 'RRI.csv'),
        (extract_generic_time_series, 'StepCount.csv'),
        (extract_message_time_series, 'MessageEvent.csv'),
        (extract_call_timeseries, 'CallEvent.csv'),
        (extract_generic_time_series, 'ActivityEvent.csv'),
        (extract_location_time_series, 'Location.csv'),
        (extract_generic_time_series, 'HR.csv'),
        (extract_generic_time_series, 'Distance.csv'),
        (extract_appusage_time_series, 'AppUsageEvent.csv'),
        (extract_generic_time_series, 'Acceleration.csv'),
        (extract_generic_time_series, 'UltraViolet.csv'),
        (extract_deviceevent_time_series, 'DeviceEvent.csv'),
    ]

    # Profile fields in output order, each with the fallback used when absent.
    profile_fields = (
        ('Age', 0), ('Gender', ''),
        ('Openness', 0), ('Conscientiousness', 0),
        ('Neuroticism', 0), ('Extraversion', 0),
        ('Agreeableness', 0), ('PSS10', 0),
        ('PHQ9', 0), ('GHQ12', 0),
    )

    desired_structure = {}
    for participant_id, participant_data in all_participants_data.items():
        sleep_proxies = participant_data['sleep_proxies']
        profile = profiles.get(participant_id, {})
        own_responses = esm_responses[esm_responses['Pcode'] == participant_id]

        for index, esm_response in own_responses.iterrows():
            timestamp = pd.Timestamp(esm_response['ResponseTime'])
            sequence_name = f"{timestamp}_{index}"

            static_features = [weekday_names[timestamp.weekday()]]
            static_features.extend(profile.get(field, fallback) for field, fallback in profile_fields)

            response_date = timestamp.date()  # date only, no time of day
            if response_date in sleep_proxies.index:
                sleep_duration = sleep_proxies.loc[response_date, 'SleepProxy']
                static_features.append(sleep_duration.total_seconds() / 3600)
            else:
                static_features.append(0)  # no sleep proxy for that date

            window_ends = [timestamp - half_hour * i for i in range(25)]
            timestep_features = [[] for _ in window_ends]
            feature_counts = dict.fromkeys((name for _, name in extractors), 0)

            for func, dataframe_name in extractors:
                dataframe = participant_data.get(dataframe_name)
                if dataframe is None:
                    continue
                per_window = func(dataframe, window, timestamp)
                for slot, window_end in zip(timestep_features, window_ends):
                    if window_end not in per_window:
                        continue
                    extracted_features = per_window[window_end]
                    slot.extend(extracted_features)
                    feature_counts[dataframe_name] += len(extracted_features)

            participant_sequences = desired_structure.setdefault(participant_id, {})
            participant_sequences.setdefault(sequence_name, {
                'features': [static_features + timestep for timestep in timestep_features],
                'target': [
                    esm_response['Stress_binary'],
                    esm_response['Valence_binary'],
                    esm_response['Arousal_binary'],
                ],
                'feature_counts': feature_counts,
            })

    return desired_structure