# What is this?
Now that I have entered thousands of sets into this application over the last year, I might as well analyze them a bit.

# 1. Getting the data into a tensor

In [None]:
import boto3
import os 
ddb = boto3.resource('dynamodb')

# Table name and the user to analyze are supplied through the environment.
sets_table_name = os.getenv("GAINS_IQ_SETS_TABLE_NAME")
target_user = os.getenv("GAINS_IQ_TARGET_USERNAME")

sets_table = ddb.Table(sets_table_name)

# Only rows belonging to the target user are wanted.
scan_kwargs = {
    'FilterExpression': 'username = :u',
    'ExpressionAttributeValues': {
        ':u': target_user
    }
}

# A scan is returned in pages; 'LastEvaluatedKey' is present in the response
# for as long as there is another page to fetch.
response = sets_table.scan(**scan_kwargs)
raw_set_data = response['Items']

while 'LastEvaluatedKey' in response:
    # The filter must be re-sent with every page request. Passing only
    # ExclusiveStartKey would return the remaining pages unfiltered, mixing
    # other users' rows into the result.
    response = sets_table.scan(
        ExclusiveStartKey=response['LastEvaluatedKey'],
        **scan_kwargs,
    )
    raw_set_data.extend(response['Items'])

In [None]:
weight_table_name = os.getenv("GAINS_IQ_WEIGHT_TABLE_NAME")
weight_table = ddb.Table(weight_table_name)

# Only rows belonging to the target user are wanted.
scan_kwargs = {
    'FilterExpression': 'username = :u',
    'ExpressionAttributeValues': {
        ':u': target_user
    }
}

# A scan is returned in pages; 'LastEvaluatedKey' is present in the response
# for as long as there is another page to fetch.
response = weight_table.scan(**scan_kwargs)
raw_weight_data = response['Items']

while 'LastEvaluatedKey' in response:
    # The filter must be re-sent with every page request. Passing only
    # ExclusiveStartKey would return the remaining pages unfiltered, mixing
    # other users' rows into the result.
    response = weight_table.scan(
        ExclusiveStartKey=response['LastEvaluatedKey'],
        **scan_kwargs,
    )
    raw_weight_data.extend(response['Items'])
raw_weight_data


In [None]:
import pandas as pd 
import re

# To handle "16 or above" and similar rep entries
def clean_reps(rep_string):
    """Return the first run of digits in a rep entry as an int.

    Free-form entries such as "16 or above" yield their leading number (16).
    Missing values (None / NaN) and entries without any digit yield 0.
    """
    if rep_string is None or pd.isna(rep_string):
        return 0
    first_number = re.search(r'\d+', str(rep_string))
    return int(first_number.group(0)) if first_number else 0

df_sets = pd.DataFrame(raw_set_data)

# workoutId is meaningless and this is only run on a single user at a time,
# so the identifier columns carry no information. The result has to be bound
# back to df_sets; assigning it to a different name leaves the columns in place.
df_sets = df_sets.drop(columns=['userId', 'username', 'workoutId'])

# Cast the numeric fields to plain floats so they can be used in arithmetic.
numerical_fields = ['sets', 'timestamp', 'weight']
df_sets[numerical_fields] = df_sets[numerical_fields].astype(float)

# Categorical feature encoding: each distinct value becomes an integer code
# (pandas uses -1 for missing values).
df_sets['exercise_idx'] = df_sets['exercise'].astype('category').cat.codes
df_sets['modulation_idx'] = df_sets['weight_modulation'].astype('category').cat.codes

# Rep entries can be free-form text ("16 or above"); reduce them to an integer.
df_sets['reps'] = df_sets['reps'].apply(clean_reps).astype(int)

In [None]:
# More complex feature engineering
#
# How far into a workout a set took place matters. There is no usable workout
# id, so sets are grouped into "virtual" workouts: whenever the gap to the
# previous set exceeds the threshold below, a new workout is assumed to start
# (workouts are probably at least 8 hours apart).

# 8 hours in seconds
NEW_WORKOUT_THRESHOLD_SEC = 8 * 60 * 60

# Gaps are only meaningful in chronological order.
df_sets = df_sets.sort_values(by='timestamp').reset_index(drop=True)

# The very first set has no predecessor (NaN gap); give it a gap just above
# the threshold so that it opens the first workout.
df_sets['time_diff'] = df_sets['timestamp'].diff().fillna(NEW_WORKOUT_THRESHOLD_SEC + 1)
df_sets['is_new_workout'] = df_sets['time_diff'].gt(NEW_WORKOUT_THRESHOLD_SEC)

# Running count of workout starts = incremental workout id. Because the first
# row is always flagged as a start, ids begin at 1 (1, 2, 3, ...).
df_sets['virtual_workout_id'] = df_sets['is_new_workout'].cumsum()

# Seconds elapsed since the earliest set of the same virtual workout.
grouped = df_sets.groupby('virtual_workout_id')
first_timestamp = grouped['timestamp'].transform('min')
df_sets['time_elapsed_sec'] = df_sets['timestamp'].sub(first_timestamp)

# Datetime view of the timestamp (interpreted as epoch seconds).
df_sets['time'] = pd.to_datetime(df_sets['timestamp'], unit='s')
df_sets['weight'] = df_sets['weight'].astype(float)
df_sets

In [None]:
# Body-weight measurements as a DataFrame. The raw 'weight' and 'timestamp'
# columns are kept as-is; float copies are added under new names, plus a
# datetime column built from the float timestamp (interpreted as epoch seconds).
df_bw = pd.DataFrame(raw_weight_data).assign(
    bodyweight=lambda frame: frame['weight'].astype(float),
    timestamp_bw=lambda frame: frame['timestamp'].astype(float),
    time=lambda frame: pd.to_datetime(frame['timestamp_bw'], unit='s'),
)
df_bw

In [None]:
# Attach a body weight to every set: for each set, take the most recent
# body-weight row at or before the set's time.
# NOTE(review): with direction='backward', sets logged before the first
# body-weight entry have nothing to match and presumably end up with NaN
# body weight — confirm this is acceptable downstream.
df_merged = pd.merge_asof(
    df_sets.sort_values('time'),
    df_bw.sort_values('time'),
    left_on='time',
    right_on='time',
    direction='backward',
)

# Columns to carry forward and their final names. Both inputs have 'timestamp'
# and 'weight' columns, so after the merge the set-side versions carry the
# '_x' suffix.
column_mapping = {
    # Core workout data (renamed for clarity)
    'timestamp_x': 'set_timestamp',  # the set's own timestamp
    'weight_x': 'set_weight_lb',     # the weight used for the set
    'reps': 'reps',
    'sets': 'sets',
    'exercise': 'exercise_name',
    'weight_modulation': 'modulation_name',
    'bodyweight': 'bodyweight_lb',   # body weight brought in by the merge
    'exercise_idx': 'exercise_idx',
    'modulation_idx': 'modulation_idx',
    'virtual_workout_id': 'virtual_workout_id',
    'time_elapsed_sec': 'time_elapsed_sec',
}

columns_to_keep = list(column_mapping)
df_clean = df_merged.loc[:, columns_to_keep].rename(columns=column_mapping).copy()

# Make sure both weight columns are plain floats.
for weight_column in ('set_weight_lb', 'bodyweight_lb'):
    df_clean[weight_column] = df_clean[weight_column].astype(float)

df_clean

In [None]:
# Adding in hour of day and day of the week, using sin/cos to embed their cyclical nature
import numpy as np

HOURS_IN_DAY = 24
DAYS_IN_WEEK = 7

# Temporary helper columns; they are removed again at the end of the cell.
# NOTE(review): hour and weekday come straight from the epoch-second value,
# so they are presumably in UTC rather than local time — confirm.
df_clean['datetime'] = pd.to_datetime(df_clean['set_timestamp'], unit='s')
df_clean['hour'] = df_clean['datetime'].dt.hour  # 0 to 23
df_clean['day_of_week'] = df_clean['datetime'].dt.dayofweek  # 0=Monday, 6=Sunday

# Map each value onto an angle of a full circle.
hour_angle = 2 * np.pi * df_clean['hour'] / HOURS_IN_DAY
day_angle = 2 * np.pi * df_clean['day_of_week'] / DAYS_IN_WEEK

df_clean['hour_sin'] = np.sin(hour_angle)
df_clean['hour_cos'] = np.cos(hour_angle)
df_clean['day_of_week_sin'] = np.sin(day_angle)
df_clean['day_of_week_cos'] = np.cos(day_angle)

# The sine/cosine columns carry the information, and the original timestamp is also kept.
df_clean = df_clean.drop(columns=['datetime', 'hour', 'day_of_week'], errors='ignore')
df_clean

In [None]:
import torch 

# Columns that make up one row of the tensor, in tensor column order.
feature_columns = [
    # when the set happened
    'set_timestamp',
    'hour_sin',
    'hour_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'time_elapsed_sec',
    # what was done
    'sets',
    'reps',
    'set_weight_lb',
    'exercise_idx',
    'modulation_idx',
    # context
    'bodyweight_lb',
    'virtual_workout_id',
]

# NOTE(review): float32 keeps only about 7 significant digits, so epoch-second
# values in 'set_timestamp' are rounded when stored in this tensor — confirm
# that is acceptable for the analysis that follows.
data_numpy = df_clean.loc[:, feature_columns].to_numpy()
raw_tensor = torch.tensor(data_numpy, dtype=torch.float32)
raw_tensor

# Seeing what we can derive