# What is this?
Now that I have thousands of sets entered into this application over the last year, might as well analyze it a bit.

# 1. Getting the data in a tensor

In [12]:
import boto3
import os 
ddb = boto3.resource('dynamodb')

sets_table_name = os.getenv("GAINS_IQ_SETS_TABLE_NAME")
target_user = os.getenv("GAINS_IQ_TARGET_USERNAME")

sets_table = ddb.Table(sets_table_name)

scan_kwargs = {
    'FilterExpression': 'username = :u',
    'ExpressionAttributeValues': {
        ':u': target_user
    }
}


response = sets_table.scan(**scan_kwargs)
raw_set_data = response['Items']

while 'LastEvaluatedKey' in response:
    response = sets_table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    raw_set_data.extend(response['Items'])

In [13]:
weight_table_name = os.getenv("GAINS_IQ_WEIGHT_TABLE_NAME")
weight_table = ddb.Table(weight_table_name)
scan_kwargs = {
    'FilterExpression': 'username = :u',
    'ExpressionAttributeValues': {
        ':u': target_user
    }
}

response = weight_table.scan(**scan_kwargs)
raw_weight_data = response['Items']

while 'LastEvaluatedKey' in response:
    response = weight_table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    raw_weight_data.extend(response['Items'])
raw_weight_data


[{'weight': Decimal('156.2'),
  'username': 'hammero',
  'timestamp': Decimal('1760020549')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('150.4'),
  'timestamp': Decimal('1752161818')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('155.6'),
  'timestamp': Decimal('1744986997')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('155.2'),
  'timestamp': Decimal('1757082570')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('150.6'),
  'timestamp': Decimal('1752680445')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('158.2'),
  'timestamp': Decimal('1743779426')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('148.2'),
  'timestamp': Decimal('1748961134')},
 {'username': 'hammero',
  'userId': 'hammero',
  'weight': Decimal('151.8'),
  'timestamp': Decimal('1753718015')},
 {'weight': Decimal('156.6'),
  'username': 'hammero',
  'timestamp': Decimal('17603647

In [14]:
import pandas as pd 
import re

# To handle "16 or above" and similar rep entries
def clean_reps(rep_string):
    if pd.isna(rep_string) or rep_string is None:
        return 0
    match = re.search(r'\d+', str(rep_string)) 
    if match:
        return int(match.group(0))
    else:
        return 0

df_sets = pd.DataFrame(raw_set_data)
df_set = df_sets.drop(columns=['userId', 'username', 'workoutId']) # workoutId is meaningless and this is only ran on a single user at a time

numerical_fields = ['sets', 'timestamp', 'weight']
df_sets[numerical_fields] = df_sets[numerical_fields].astype(float)

# categorical feature encoding
df_sets['exercise_idx'] = df_sets['exercise'].astype('category').cat.codes
df_sets['modulation_idx'] = df_sets['weight_modulation'].astype('category').cat.codes

df_sets['reps'] = df_sets['reps'].apply(clean_reps)
df_sets['reps'] = df_sets['reps'].astype(int)

In [15]:
# More complex feature engineering 

# Something important is how far into a workout a set took place. We can find this by grouping sets into workouts and then calculating from there
# each workout is at least 8 hours apart (probably), so we can use that to do the grouping

# 12 hours in seconds
NEW_WORKOUT_THRESHOLD_SEC = 8 * 60 * 60

df_sets = df_sets.sort_values(by='timestamp').reset_index(drop=True)

df_sets['time_diff'] = df_sets['timestamp'].diff().fillna(NEW_WORKOUT_THRESHOLD_SEC + 1)
df_sets['is_new_workout'] = df_sets['time_diff'] > NEW_WORKOUT_THRESHOLD_SEC

# Create the virtual workout ID by cumulatively summing the 'is_new_workout' column.
# This assigns a unique, incremental ID (0, 1, 2, ...) to each hypothesized workout
df_sets['virtual_workout_id'] = df_sets['is_new_workout'].cumsum()

# Now, calculate time elapsed using the new 'virtual_workout_id'
grouped = df_sets.groupby('virtual_workout_id')
first_timestamp = grouped['timestamp'].transform('min')
df_sets['time_elapsed_sec'] = df_sets['timestamp'] - first_timestamp
df_sets['time'] = pd.to_datetime(df_sets['timestamp'], unit='s')
df_sets['weight'] = df_sets['weight'].astype(float)
df_sets

Unnamed: 0,sets,reps,userId,timestamp,username,exercise,workoutId,weight_modulation,weight,exercise_idx,modulation_idx,time_diff,is_new_workout,virtual_workout_id,time_elapsed_sec,time
0,1.0,8,hammero,1.726535e+09,hammero,Preacher Curls (HS),a68e9678-1a1f-4d4f-adde-93558463e92b,,90.0,57,-1,28801.0,True,1,0.0,2024-09-17 01:06:15
1,2.0,7,hammero,1.726535e+09,hammero,Preacher Curls (HS),3d996989-d0b2-438e-ad56-83a7f100edfc,,90.0,57,-1,12.0,False,1,12.0,2024-09-17 01:06:27
2,3.0,7,hammero,1.726535e+09,hammero,Preacher Curls (HS),84a8763b-09f9-463b-8dad-1063f00eb606,,90.0,57,-1,183.0,False,1,195.0,2024-09-17 01:09:30
3,4.0,6,hammero,1.726536e+09,hammero,Preacher Curls (HS),2fd94a04-8bb4-4b54-a9b0-6ae2926d4eca,,90.0,57,-1,240.0,False,1,435.0,2024-09-17 01:13:30
4,5.0,7,hammero,1.726536e+09,hammero,Preacher Curls (HS),e4c36666-1bbd-4abb-a35e-2f4432787306,,90.0,57,-1,213.0,False,1,648.0,2024-09-17 01:17:03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3917,2.0,9,,1.765505e+09,hammero,Chest flies (WP),bebc6290-c597-4f4a-bbf2-5f4f8806b751,Cutting,160.0,14,1,218.0,False,302,2229.0,2025-12-12 02:09:19
3918,1.0,15,,1.765506e+09,hammero,Rope Pushdowns (WP),d38e2cb7-0e45-46bd-a998-d66536338238,Cutting,115.0,66,1,356.0,False,302,2585.0,2025-12-12 02:15:15
3919,2.0,11,,1.765506e+09,hammero,Rope Pushdowns (WP),fc5341c0-c83c-47b4-a2f8-381ad4c08691,Cutting,115.0,66,1,284.0,False,302,2869.0,2025-12-12 02:19:59
3920,1.0,10,,1.765506e+09,hammero,Incline DB Curls,81e0beda-8681-4cb4-92e0-a2dc21db976c,Cutting,45.0,34,1,190.0,False,302,3059.0,2025-12-12 02:23:09


In [16]:
# Making a DataFrame for weight data
df_bw = pd.DataFrame(raw_weight_data)
df_bw['bodyweight'] = df_bw['weight'].astype(float) 
df_bw['timestamp_bw'] = df_bw['timestamp'].astype(float) 
df_bw['time'] = pd.to_datetime(df_bw['timestamp_bw'], unit='s')
df_bw

Unnamed: 0,weight,username,timestamp,userId,bodyweight,timestamp_bw,time
0,156.2,hammero,1760020549,,156.2,1.760021e+09,2025-10-09 14:35:49
1,150.4,hammero,1752161818,hammero,150.4,1.752162e+09,2025-07-10 15:36:58
2,155.6,hammero,1744986997,hammero,155.6,1.744987e+09,2025-04-18 14:36:37
3,155.2,hammero,1757082570,hammero,155.2,1.757083e+09,2025-09-05 14:29:30
4,150.6,hammero,1752680445,hammero,150.6,1.752680e+09,2025-07-16 15:40:45
...,...,...,...,...,...,...,...
173,148.6,hammero,1763051078,,148.6,1.763051e+09,2025-11-13 16:24:38
174,153.6,hammero,1761320709,,153.6,1.761321e+09,2025-10-24 15:45:09
175,152.2,hammero,1754579438,hammero,152.2,1.754579e+09,2025-08-07 15:10:38
176,152.4,hammero,1754062246,hammero,152.4,1.754062e+09,2025-08-01 15:30:46


In [17]:
# Merging set and weight data together
df_merged = pd.merge_asof(
    df_sets.sort_values('time'),
    df_bw.sort_values('time'), 
    left_on='time', 
    right_on='time', 
    direction='backward' # Finds the nearest preceding or exact match
)
column_mapping = {
    # Core Workout Data (Renamed for clarity)
    'timestamp_x': 'set_timestamp', # Keep the set's timestamp
    'weight_x': 'set_weight_lb',   # Keep the set's weight
    'reps': 'reps',
    'sets': 'sets',
    'exercise': 'exercise_name',
    'weight_modulation': 'modulation_name',
    'bodyweight': 'bodyweight_lb', # The integrated bodyweight
    'exercise_idx': 'exercise_idx',
    'modulation_idx': 'modulation_idx',
    'virtual_workout_id': 'virtual_workout_id',
    'time_elapsed_sec': 'time_elapsed_sec',
}

columns_to_keep = list(column_mapping.keys())
df_clean = df_merged[columns_to_keep].rename(columns=column_mapping).copy()

df_clean['set_weight_lb'] = df_clean['set_weight_lb'].astype(float)
df_clean['bodyweight_lb'] = df_clean['bodyweight_lb'].astype(float)

df_clean

Unnamed: 0,set_timestamp,set_weight_lb,reps,sets,exercise_name,modulation_name,bodyweight_lb,exercise_idx,modulation_idx,virtual_workout_id,time_elapsed_sec
0,1.726535e+09,90.0,8,1.0,Preacher Curls (HS),,,57,-1,1,0.0
1,1.726535e+09,90.0,7,2.0,Preacher Curls (HS),,,57,-1,1,12.0
2,1.726535e+09,90.0,7,3.0,Preacher Curls (HS),,,57,-1,1,195.0
3,1.726536e+09,90.0,6,4.0,Preacher Curls (HS),,,57,-1,1,435.0
4,1.726536e+09,90.0,7,5.0,Preacher Curls (HS),,,57,-1,1,648.0
...,...,...,...,...,...,...,...,...,...,...,...
3917,1.765505e+09,160.0,9,2.0,Chest flies (WP),Cutting,142.4,14,1,302,2229.0
3918,1.765506e+09,115.0,15,1.0,Rope Pushdowns (WP),Cutting,142.4,66,1,302,2585.0
3919,1.765506e+09,115.0,11,2.0,Rope Pushdowns (WP),Cutting,142.4,66,1,302,2869.0
3920,1.765506e+09,45.0,10,1.0,Incline DB Curls,Cutting,142.4,34,1,302,3059.0


In [18]:
# Adding in day of the week, using sin/cos the embed the cyclical nature
import numpy as np 
df_clean['datetime'] = pd.to_datetime(df_clean['set_timestamp'], unit='s')

# Hour of Day (0 to 23)
df_clean['hour'] = df_clean['datetime'].dt.hour
HOURS_IN_DAY = 24

# Day of Week (0=Monday, 6=Sunday)
df_clean['day_of_week'] = df_clean['datetime'].dt.dayofweek
DAYS_IN_WEEK = 7

# Hour Sin/Cos
df_clean['hour_sin'] = np.sin(2 * np.pi * df_clean['hour'] / HOURS_IN_DAY)
df_clean['hour_cos'] = np.cos(2 * np.pi * df_clean['hour'] / HOURS_IN_DAY)

# Day of Week Sin/Cos
df_clean['day_of_week_sin'] = np.sin(2 * np.pi * df_clean['day_of_week'] / DAYS_IN_WEEK)
df_clean['day_of_week_cos'] = np.cos(2 * np.pi * df_clean['day_of_week'] / DAYS_IN_WEEK)

# (The sine/cosine columns carry the information, and the original timestamp is also kept)
df_clean = df_clean.drop(columns=['datetime', 'hour', 'day_of_week'], errors='ignore')
df_clean

Unnamed: 0,set_timestamp,set_weight_lb,reps,sets,exercise_name,modulation_name,bodyweight_lb,exercise_idx,modulation_idx,virtual_workout_id,time_elapsed_sec,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos
0,1.726535e+09,90.0,8,1.0,Preacher Curls (HS),,,57,-1,1,0.0,0.258819,0.965926,0.781831,0.623490
1,1.726535e+09,90.0,7,2.0,Preacher Curls (HS),,,57,-1,1,12.0,0.258819,0.965926,0.781831,0.623490
2,1.726535e+09,90.0,7,3.0,Preacher Curls (HS),,,57,-1,1,195.0,0.258819,0.965926,0.781831,0.623490
3,1.726536e+09,90.0,6,4.0,Preacher Curls (HS),,,57,-1,1,435.0,0.258819,0.965926,0.781831,0.623490
4,1.726536e+09,90.0,7,5.0,Preacher Curls (HS),,,57,-1,1,648.0,0.258819,0.965926,0.781831,0.623490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3917,1.765505e+09,160.0,9,2.0,Chest flies (WP),Cutting,142.4,14,1,302,2229.0,0.500000,0.866025,-0.433884,-0.900969
3918,1.765506e+09,115.0,15,1.0,Rope Pushdowns (WP),Cutting,142.4,66,1,302,2585.0,0.500000,0.866025,-0.433884,-0.900969
3919,1.765506e+09,115.0,11,2.0,Rope Pushdowns (WP),Cutting,142.4,66,1,302,2869.0,0.500000,0.866025,-0.433884,-0.900969
3920,1.765506e+09,45.0,10,1.0,Incline DB Curls,Cutting,142.4,34,1,302,3059.0,0.500000,0.866025,-0.433884,-0.900969


In [19]:
import torch 

feature_columns = [
    'set_timestamp',
    'hour_sin',
    'hour_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'time_elapsed_sec',
    'sets',
    'reps',
    'set_weight_lb',
    'exercise_idx',    
    'modulation_idx',
    'bodyweight_lb',
    'virtual_workout_id' 
]

data_numpy = df_clean[feature_columns].values
raw_tensor = torch.tensor(data_numpy, dtype=torch.float32)
raw_tensor

tensor([[ 1.7265e+09,  2.5882e-01,  9.6593e-01,  ..., -1.0000e+00,
                 nan,  1.0000e+00],
        [ 1.7265e+09,  2.5882e-01,  9.6593e-01,  ..., -1.0000e+00,
                 nan,  1.0000e+00],
        [ 1.7265e+09,  2.5882e-01,  9.6593e-01,  ..., -1.0000e+00,
                 nan,  1.0000e+00],
        ...,
        [ 1.7655e+09,  5.0000e-01,  8.6603e-01,  ...,  1.0000e+00,
          1.4240e+02,  3.0200e+02],
        [ 1.7655e+09,  5.0000e-01,  8.6603e-01,  ...,  1.0000e+00,
          1.4240e+02,  3.0200e+02],
        [ 1.7655e+09,  5.0000e-01,  8.6603e-01,  ...,  1.0000e+00,
          1.4240e+02,  3.0200e+02]])

# Seeing what we can derive