# What is this?
Now that I have entered thousands of sets into this application over the last year, I might as well analyze them a bit.

# 1. Getting the data into a tensor

In [27]:
import boto3
import os 
ddb = boto3.resource('dynamodb')

# The table name and the user to analyse are read from the environment.
# Fail fast when either is missing: an unset username would otherwise be sent
# to DynamoDB as a null filter value and the scan would silently match nothing.
sets_table_name = os.getenv("GAINS_IQ_SETS_TABLE_NAME")
target_user = os.getenv("GAINS_IQ_TARGET_USERNAME")
if not sets_table_name or not target_user:
    raise RuntimeError(
        "GAINS_IQ_SETS_TABLE_NAME and GAINS_IQ_TARGET_USERNAME must both be set"
    )
sets_table = ddb.Table(sets_table_name)

# Only keep items that belong to the target user.
scan_kwargs = {
    'FilterExpression': 'username = :u',
    'ExpressionAttributeValues': {
        ':u': target_user
    }
}

# Scan results are paginated: keep requesting pages while the response carries
# a LastEvaluatedKey. The filter has to be passed on EVERY request, otherwise
# the follow-up pages come back unfiltered and include other users' sets.
response = sets_table.scan(**scan_kwargs)
raw_set_data = list(response['Items'])

while 'LastEvaluatedKey' in response:
    response = sets_table.scan(
        ExclusiveStartKey=response['LastEvaluatedKey'],
        **scan_kwargs,
    )
    raw_set_data.extend(response['Items'])

In [None]:
import pandas as pd 
import re

# To handle "16 or above" and similar rep entries
def clean_reps(rep_string):
    """Extract the first run of digits from a rep entry as an int.

    Free-text entries such as "16 or above" yield their leading number (16).
    Missing values (None / NaN) and entries without any digit yield 0.
    """
    # Guard clause: nothing to parse for missing values.
    if rep_string is None or pd.isna(rep_string):
        return 0

    first_number = re.search(r'\d+', str(rep_string))
    return int(first_number.group(0)) if first_number else 0

# Build the working frame from the raw items and discard the identifier
# columns: workoutId is meaningless and the notebook is only run for a single
# user at a time, so userId / username carry no information.
df = pd.DataFrame(raw_set_data).drop(columns=['userId', 'username', 'workoutId'])

# Cast the numeric columns to float.
numerical_fields = ['sets', 'timestamp', 'weight']
df = df.astype({field: float for field in numerical_fields})

# Categorical feature encoding: replace each label by its integer category
# code (missing values are encoded as -1 by pandas).
for label_col, code_col in (
    ('exercise', 'exercise_idx'),
    ('weight_modulation', 'modulation_idx'),
):
    df[code_col] = df[label_col].astype('category').cat.codes

# Rep entries can be free text (e.g. "16 or above"); reduce them to an int.
df['reps'] = df['reps'].apply(clean_reps).astype(int)

In [36]:
# More complex feature engineering

# Something important is how far into a workout a set took place. We can find
# this by grouping sets into workouts and measuring from each workout's start.
# Workouts are assumed to be at least 8 hours apart, so any gap larger than
# that between two consecutive sets is treated as the start of a new workout.

# 8 hours in seconds
NEW_WORKOUT_THRESHOLD_SEC = 8 * 60 * 60

df = df.sort_values(by='timestamp').reset_index(drop=True)

# diff() is NaN for the very first set; fill it with a value just above the
# threshold so that the first row always opens a workout.
df['time_diff'] = df['timestamp'].diff().fillna(NEW_WORKOUT_THRESHOLD_SEC + 1)
df['is_new_workout'] = df['time_diff'] > NEW_WORKOUT_THRESHOLD_SEC

# Create the virtual workout ID by cumulatively summing the 'is_new_workout'
# flags. Because the first row is always flagged, the IDs run 1, 2, 3, ...
df['virtual_workout_id'] = df['is_new_workout'].cumsum()

# Time elapsed since the earliest set of the same virtual workout.
grouped = df.groupby('virtual_workout_id')
first_timestamp = grouped['timestamp'].transform('min')
df['time_elapsed_sec'] = df['timestamp'] - first_timestamp

# Show a small sample rather than dumping the entire frame.
df.head(10)

Unnamed: 0,sets,reps,timestamp,exercise,weight_modulation,weight,exercise_idx,modulation_idx,time_diff,is_new_workout,virtual_workout_id,time_elapsed_sec
0,1.0,8,1726535000.0,Preacher Curls (HS),,90.0,57,-1,28801.0,True,1,0.0
1,2.0,7,1726535000.0,Preacher Curls (HS),,90.0,57,-1,12.0,False,1,12.0
2,3.0,7,1726535000.0,Preacher Curls (HS),,90.0,57,-1,183.0,False,1,195.0
3,4.0,6,1726536000.0,Preacher Curls (HS),,90.0,57,-1,240.0,False,1,435.0
4,5.0,7,1726536000.0,Preacher Curls (HS),,90.0,57,-1,213.0,False,1,648.0
5,1.0,8,1726536000.0,Overhead Extensions,,115.74255,54,-1,314.0,False,1,962.0
6,2.0,8,1726536000.0,Overhead Extensions,,115.74255,54,-1,171.0,False,1,1133.0
7,3.0,9,1726537000.0,Overhead Extensions,,115.74255,54,-1,270.0,False,1,1403.0
8,4.0,8,1726537000.0,Overhead Extensions,,115.74255,54,-1,309.0,False,1,1712.0
9,1.0,10,1726537000.0,Pushdowns,,115.74255,60,-1,90.0,False,1,1802.0


In [37]:
import torch 

# Columns that make up the raw feature matrix, in tensor column order.
feature_columns = [
    'timestamp',
    'sets',
    'reps',
    'weight',
    'exercise_idx',
    'modulation_idx'
]

# Keep the raw tensor in float64. 'timestamp' holds epoch seconds (~1.7e9) and
# float32 only has a 24-bit significand, so at that magnitude values are
# rounded to multiples of 128 seconds and sets logged seconds apart would
# become indistinguishable. Rescale / rebase the timestamp before casting
# anything derived from this tensor down to float32 for modelling.
data_numpy = df[feature_columns].to_numpy(dtype='float64')
raw_tensor = torch.tensor(data_numpy, dtype=torch.float64)
raw_tensor

tensor([[ 1.7265e+09,  1.0000e+00,  8.0000e+00,  9.0000e+01,  5.7000e+01,
         -1.0000e+00],
        [ 1.7265e+09,  2.0000e+00,  7.0000e+00,  9.0000e+01,  5.7000e+01,
         -1.0000e+00],
        [ 1.7265e+09,  3.0000e+00,  7.0000e+00,  9.0000e+01,  5.7000e+01,
         -1.0000e+00],
        ...,
        [ 1.7652e+09,  1.0000e+00,  2.5000e+01,  4.0000e+02,  2.0000e+00,
          1.0000e+00],
        [ 1.7652e+09,  2.0000e+00,  2.0000e+01,  4.0000e+02,  2.0000e+00,
          1.0000e+00],
        [ 1.7652e+09,  3.0000e+00,  1.8000e+01,  4.0000e+02,  2.0000e+00,
          1.0000e+00]])