# Cycling Data LLM with PyTorch
Building an LLM that can answer questions about my cycling data from fit files downloaded from my TrainingPeaks account.

## Step 1 is to extract the fit files
I will likely use the fit_file_analysis.py file that I already created, since it is set up to handle the .gz file extension that comes from the zipped bulk download.  I believe there is potential to set this up as an API.

In [None]:
import pandas as pd
from fitparse import FitFile

def parse_fit_file(filepath):
    """
    Read a .fit file and collect its 'record', 'session' and 'lap' messages.

    Returns a dict keyed by message type. Each value is a pandas DataFrame
    with one row per message and one column per field name. A message type
    that has no messages in the file is left out of the dict. When a frame
    has a 'timestamp' column it is converted with pd.to_datetime.
    """
    fit = FitFile(filepath)

    # 'record' messages are read first, then 'session', then 'lap'.
    # Every field on a message is kept; no filtering is done at this stage.
    rows_by_type = {}
    for kind in ('record', 'session', 'lap'):
        for message in fit.get_messages(kind):
            row = {field.name: field.value for field in message.fields}
            rows_by_type.setdefault(kind, []).append(row)

    # Turn each list of row dicts into a DataFrame for easier manipulation.
    frames = {}
    for kind, rows in rows_by_type.items():
        frame = pd.DataFrame(rows)
        if 'timestamp' in frame.columns:
            frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        frames[kind] = frame

    return frames

# ex usage:
# fit_file_path = 'path/to/your/fitfiles.fit'
# cycling_data = parse_fit_file(fit_file_path)
# print("record data (first 5 rows):")
# print(cycling_data['record'].head())
# print("\nAvailable fields in record data:", cycling_data['record'].columns.tolist())

# if 'session' in cycling_data:
#   print("\nSession Data:")
#   print(cycling_data['session'])





## Step 2 will be to generate question-answer pairs
The LLM needs to learn from examples, so I need to create a dataset where each entry is a (question, answer) pair.

There are several strategies for doing this. TODO: come back and outline/discuss them.

In [None]:
def _numeric_column(df, column):
    """Return *column* of *df* coerced to numeric, or None if it is absent.

    Entries that cannot be read as numbers (e.g. None for records that lack
    the field) become NaN so callers can skip them when aggregating.
    """
    if column not in df.columns:
        return None
    return pd.to_numeric(df[column], errors='coerce')


def _has_values(series):
    """Return True when *series* exists and holds at least one non-NaN value."""
    return series is not None and bool(series.notna().any())


def generate_qa_pairs(cycling_data_df, ride_date, hr_thresholds=(150, 160, 170)):
    """
    Build template-based question/answer pairs summarising one ride.

    Args:
        cycling_data_df: DataFrame of per-record ride data. The columns
            'power', 'heart_rate', 'distance' and 'speed' are each optional;
            a question is only generated when its column holds at least one
            numeric value. A 'timestamp' column, when present, becomes the
            index.
        ride_date: Value interpolated into every question to identify the ride.
        hr_thresholds: Heart-rate cut-offs (bpm) used for the "average speed
            when my heart rate was above X" questions.

    Returns:
        A list of {"question": ..., "answer": ...} dicts. The list is empty
        when the DataFrame has no rows.
    """
    qa_pairs = []

    # Nothing can be summarised from an empty frame; bail out instead of
    # failing on .iloc[-1] or emitting "nan" answers.
    if cycling_data_df.empty:
        return qa_pairs

    # Index by timestamp when available (kept for future time-based queries);
    # none of the statistics below depend on it.
    df = cycling_data_df
    if 'timestamp' in df.columns:
        df = df.set_index('timestamp')

    power = _numeric_column(df, 'power')
    heart_rate = _numeric_column(df, 'heart_rate')
    distance = _numeric_column(df, 'distance')
    speed = _numeric_column(df, 'speed')

    # Example: Average Power
    if _has_values(power):
        avg_power = power.mean()
        qa_pairs.append({
            "question": f"What was my average power for the ride on {ride_date}?",
            "answer": f"Your average power was {avg_power:.1f} watts."
        })

    # Example: Max Heart Rate
    if _has_values(heart_rate):
        max_hr = heart_rate.max()
        qa_pairs.append({
            "question": f"What was my maximum heart rate during the ride on {ride_date}?",
            "answer": f"Your maximum heart rate was {max_hr:.0f} bpm."
        })

    # Example: Total Distance
    if _has_values(distance):
        # Use the last *valid* reading: trailing records may lack a distance.
        # Assuming distance in meters, convert to km.
        total_distance = distance.dropna().iloc[-1] / 1000
        qa_pairs.append({
            "question": f"How far did I cycle on {ride_date}?",
            "answer": f"You cycled {total_distance:.2f} kilometers."
        })

    # Example: Average Speed
    if _has_values(speed):
        avg_speed = speed.mean() * 3.6  # Assuming speed in m/s, convert to km/h
        qa_pairs.append({
            "question": f"What was my average speed for the ride on {ride_date}?",
            "answer": f"Your average speed was {avg_speed:.2f} km/h."
        })

    # More complex: Speed when HR > X
    if _has_values(speed) and _has_values(heart_rate):
        for hr_threshold in hr_thresholds:
            # Positional mask: rows with a missing heart rate compare False.
            above = (heart_rate > hr_threshold).to_numpy()
            filtered_speed = speed[above].dropna()
            if not filtered_speed.empty:
                avg_speed_filtered = filtered_speed.mean() * 3.6
                qa_pairs.append({
                    "question": f"What was my average speed when my heart rate was above {hr_threshold} bpm on {ride_date}?",
                    "answer": f"Your average speed when your heart rate was above {hr_threshold} bpm was {avg_speed_filtered:.2f} km/h."
                })

    return qa_pairs

# To generate data for multiple files:
# all_qa_pairs = []
# for fit_file_path in list_of_your_fit_files:
#     cycling_data = parse_fit_file(fit_file_path)
#     if 'record' in cycling_data:
#         # Extract date from filename or timestamp
#         ride_date = cycling_data['record']['timestamp'].iloc[0].strftime('%Y-%m-%d')
#         all_qa_pairs.extend(generate_qa_pairs(cycling_data['record'], ride_date))

# print(f"Generated {len(all_qa_pairs)} QA pairs.")
# print(all_qa_pairs[0]) # Example of a generated pair

## Step 3 tokenization and vocabulary
Text needs to be converted into numerical tokens before being fed into the LLM:
- build the vocabulary
- tokenize the text

In [None]:
from collections import Counter
import torch

def build_vocabulary(qa_pairs):
    """
    Create word->index and index->word lookup tables from QA pairs.

    Questions and answers are lower-cased and split on whitespace. The
    special tokens <pad>, <sos>, <eos> and <unk> take indices 0-3; every
    other distinct word follows in sorted order, starting at index 4.
    """
    special_tokens = ("<pad>", "<sos>", "<eos>", "<unk>")
    word_to_idx = {token: position for position, token in enumerate(special_tokens)}

    # Gather every distinct lower-cased, whitespace-separated word.
    unique_words = set()
    for pair in qa_pairs:
        unique_words.update(pair['question'].lower().split())
        unique_words.update(pair['answer'].lower().split())

    # A corpus word that spells a special token keeps the reserved index.
    regular_words = [w for w in sorted(unique_words) if w not in word_to_idx]
    for position, word in enumerate(regular_words, start=len(special_tokens)):
        word_to_idx[word] = position

    idx_to_word = {position: word for word, position in word_to_idx.items()}

    return word_to_idx, idx_to_word

def tokenize_and_pad(text, word_to_idx, max_len):
    """
    Convert text to a fixed-length tensor of vocabulary indices.

    The text is lower-cased and split on whitespace; words missing from
    word_to_idx map to <unk>. The sequence is wrapped in <sos> ... <eos>
    and then right-padded with <pad> up to max_len. When the text is too
    long, the *word* tokens are truncated so that the closing <eos> is
    always kept (for max_len >= 2).

    Args:
        text: The string to tokenize.
        word_to_idx: Mapping from word to index; must contain the special
            tokens "<pad>", "<sos>", "<eos>" and "<unk>".
        max_len: Length of the returned tensor (non-negative).

    Returns:
        A 1-D torch.long tensor with exactly max_len elements.

    Raises:
        ValueError: If max_len is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    unk_idx = word_to_idx["<unk>"]
    word_indices = [word_to_idx.get(word, unk_idx) for word in text.lower().split()]

    # Reserve two slots for <sos>/<eos> BEFORE truncating, so an over-long
    # text still ends with <eos> instead of having it sliced off.
    word_indices = word_indices[:max(max_len - 2, 0)]
    indexed_tokens = [word_to_idx["<sos>"]] + word_indices + [word_to_idx["<eos>"]]

    # Only relevant when max_len < 2: there is no room for both markers.
    indexed_tokens = indexed_tokens[:max_len]

    padding = [word_to_idx["<pad>"]] * (max_len - len(indexed_tokens))
    return torch.tensor(indexed_tokens + padding, dtype=torch.long)

# Example Usage:
# word_to_idx, idx_to_word = build_vocabulary(all_qa_pairs)
# max_seq_len = 50 # Determine a suitable max length based on your data
# tokenized_question = tokenize_and_pad(all_qa_pairs[0]['question'], word_to_idx, max_seq_len)
# tokenized_answer = tokenize_and_pad(all_qa_pairs[0]['answer'], word_to_idx, max_seq_len)
# print("Tokenized Question:", tokenized_question)