In [0]:
# imports
import requests
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, concat_ws, regexp_replace, to_date, trim, lit, when, length, udf, broadcast
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType, ArrayType
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from functools import partial
import sys
import traceback
import json
from langdetect import detect
import re
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple, Set
import pandas as pd
import ast
from json_repair import repair_json

def get_exercises(api_url, timeout=30):
    """Retrieve the complete, paginated list of exercises from the wger public API.

    Each response page is expected to carry its records under ``results`` and the
    URL of the following page under ``next``; paging stops when ``next`` is empty.

    Args:
        api_url: URL of the first page to request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list: The concatenated ``results`` of every page fetched. When a request
        fails the error is printed and the records collected so far (possibly an
        empty list) are returned, so callers can always iterate over the result.
    """
    all_exercises = []
    current_url = api_url

    try:
        while current_url:
            # without a timeout a stalled connection would block the notebook indefinitely
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            all_exercises.extend(data['results'])
            current_url = data.get('next')
    except Exception as e:
        print('error fetching exercise data from wger API:', e)

    # always hand back a list (never an implicit None) so downstream loops stay valid
    return all_exercises

def save_data(df, table_name: str, mode: str = "overwrite"):
    """Persist a PySpark DataFrame as a Delta table.

    Args:
        df: DataFrame to write.
        table_name: Name of the target table.
        mode: Save mode handed to the writer. In ``"overwrite"`` mode the
            ``overwriteSchema`` option is also enabled.
    """
    extra_options = {"overwriteSchema": "true"} if mode == "overwrite" else {}

    delta_writer = df.write.format("delta").mode(mode)
    for option_name, option_value in extra_options.items():
        delta_writer = delta_writer.option(option_name, option_value)

    delta_writer.saveAsTable(table_name)

def is_english_content(description):
    """Report whether a description appears to be written in English.

    A missing, empty or whitespace-only description is accepted as English.
    Otherwise only the first 100 characters are passed to the language detector.
    Any error raised along the way is treated as "not English".
    """
    try:
        has_text = bool(description) and bool(description.strip())
        if not has_text:
            return True

        # the language is judged on a sample: the first 100 characters only
        return detect(description[:100]) == 'en'
    except Exception:
        # detection failed, so do not assume the text is English
        return False
    
def normalize_record(record):
    """Flatten a raw exercise record into the dictionary used to build the DataFrame.

    The name and description come from the first translation that is flagged as
    English (``language == 2``) and whose description passes
    ``is_english_content``; the API is known to return non-English text under the
    English language id, so the flag alone is not trusted.

    Args:
        record: One exercise dictionary as returned by the exercise API.

    Returns:
        dict | None: The normalized record, or ``None`` when no usable English
        name exists (including a name that is empty once HTML tags are removed).
        Missing or null nested values (category, muscles, equipment, timestamps)
        yield ``None`` / empty lists instead of raising.
    """
    def _strip_html(text):
        # data cleaning step: drop HTML tags and surrounding whitespace
        return re.sub(r'<[^>]*>', '', text).strip()

    def _parse_timestamp(value):
        # ISO-8601 string -> datetime; a trailing 'Z' is rewritten as an explicit UTC offset.
        # Unparseable or non-string values become None rather than aborting the whole load.
        if not value:
            return None
        try:
            return datetime.fromisoformat(value.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError):
            return None

    def _names(key):
        # tolerate a key that is absent or explicitly null
        return [item["name"] for item in (record.get(key) or [])]

    name = None
    description = None
    for translation in record.get('translations') or []:
        if translation.get("language") != 2:
            continue
        candidate_description = translation.get("description")
        # verify the content really is English before accepting it
        if is_english_content(candidate_description):
            name = translation.get("name")
            description = candidate_description
            break

    # no verified English content: the record is dropped by the caller
    if not name:
        return None

    name = _strip_html(name)
    if not name:
        # the name consisted of markup only
        return None
    if description:
        description = _strip_html(description)

    # category may be missing or null; indexing it directly would raise TypeError
    category = record.get("category") or {}

    return {
        "id": record.get("id"),
        "uuid": record.get("uuid"),
        "name": name.upper(),
        "description": description,
        "created": _parse_timestamp(record.get("created")),
        "last_update": _parse_timestamp(record.get("last_update")),
        "category": category.get("name"),
        "muscles": _names("muscles"),
        "muscles_secondary": _names("muscles_secondary"),
        "equipment": _names("equipment"),
        "variations": record.get("variations") or [],
        "license_author": record.get("license_author")}

def query_databricks_foundation_model(prompt, model_name="databricks-llama-4-maverick", max_tokens=512, temperature=0.5, timeout=120):
    """
    Query a Databricks foundation model through the serving endpoint

    The endpoint URL and bearer token are built from the module-level
    DATABRICKS_INSTANCE and DATABRICKS_TOKEN values. Three request body shapes
    are attempted in order ("messages", "prompt", "input"); the parsed JSON of
    the first request that succeeds is returned.

    Args:
        prompt: Text sent to the model.
        model_name: Name of the serving endpoint to invoke.
        max_tokens: Generation limit forwarded in the request body.
        temperature: Sampling temperature forwarded in the request body.
        timeout: Seconds to wait for each HTTP response before treating the
            attempt as failed.

    Returns:
        dict | None: Parsed JSON response, or None when every format failed.
    """
    # construct the API endpoint URL
    api_url = f"https://{DATABRICKS_INSTANCE}/serving-endpoints/{model_name}/invocations"

    headers = {
        "Authorization": f"Bearer {DATABRICKS_TOKEN}",
        "Content-Type": "application/json"}

    generation_params = {"max_tokens": max_tokens, "temperature": temperature}

    # candidate request bodies: chat payload, prompt payload, input payload
    payloads = [
        ("messages", {"messages": [{"role": "user", "content": prompt}], **generation_params}),
        ("prompt", {"prompt": prompt, **generation_params}),
        ("input", {"input": prompt, **generation_params}),
    ]

    # attempt each format above
    for payload_name, payload in payloads:
        try:
            print(f"Trying payload format: {payload_name}")
            # a timeout keeps an unresponsive endpoint from hanging the notebook
            response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()
            print(f"Success with {payload_name} format!")
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Failed with {payload_name} format: {e}")
            if getattr(e, 'response', None) is not None:
                print(f"Response content: {e.response.text}")

    print("all payload formats failed")
    return None

# Alternative approach using the newer Databricks SDK
def query_with_databricks_sdk(prompt, model_name="databricks-llama-4-maverick", max_tokens=5000, temperature=0.5):
    """
    Alternative approach using Databricks SDK (if available)

    Sends the prompt as a single user chat message to the named serving
    endpoint through ``WorkspaceClient().serving_endpoints.query``.

    Args:
        prompt: Text sent to the model as the user message.
        model_name: Name of the serving endpoint to query.
        max_tokens: Generation limit forwarded to the endpoint.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The SDK response object, or None when the SDK is not installed or the
        query raises.
    """
    try:
        from databricks.sdk import WorkspaceClient
        from databricks.sdk.service.serving import ChatMessage, ChatMessageRole
    except ImportError:
        print("Databricks SDK not available, use the API approach instead")
        return None

    try:
        w = WorkspaceClient()

        # the SDK serializes chat messages itself, so they must be ChatMessage
        # objects; plain dicts are rejected when the request body is built
        response = w.serving_endpoints.query(
            name=model_name,
            messages=[ChatMessage(role=ChatMessageRole.USER, content=prompt)],
            max_tokens=max_tokens,
            temperature=temperature
        )

        return response
    except Exception as e:
        print(f"Error with SDK approach: {e}")
        return None

def format_prompt(instruction, system_prompt=None):
    """
    Wrap an instruction and a system prompt in the [INST] / <<SYS>> template.
    Template based on the documentation here:
    https://github.com/databricks/databricks-ml-examples/blob/master/llm-models/mistral/mistral-7b/01_load_inference.py

    When no system prompt is given, a default one is used that tells the model
    to answer with JSON only, in the "groups" format.
    """
    sys_text = system_prompt
    if sys_text is None:
        # whitespace is spelled out explicitly because it is part of the prompt text
        sys_text = (
            "You are a JSON-only exercise standardization assistant. \n"
            "        ALWAYS respond with valid JSON in this exact format:\n"
            "        {\"groups\":[{\"standardized_name\":\"NAME\",\"variations\":[\"var1\",\"var2\"]}]}\n"
            "        Never include explanations, markdown, misspellings, or extra text."
        )

    return "<s>[INST]<<SYS>>{}<</SYS>>{}[/INST]".format(sys_text, instruction)

# Batch text generation: REST endpoint first, Databricks SDK as fallback
def _extract_rest_text(response):
    """Pull the generated text out of a parsed (dict) REST response."""
    if 'choices' in response and response['choices']:
        # OpenAI-style response
        first_choice = response['choices'][0]
        return first_choice.get('message', {}).get('content', '') or first_choice.get('text', '')

    if 'predictions' in response:
        # MLflow-style response; entries may be dicts or plain strings
        predictions = response['predictions']
        if isinstance(predictions, list):
            first_prediction = predictions[0] if predictions else ''
        else:
            first_prediction = predictions
        if isinstance(first_prediction, dict):
            return first_prediction.get('generated_text', '')
        return str(first_prediction)

    if response.get('candidates'):
        # Gemini-style response
        return response['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text', '')

    # unknown shape: hand back the whole response as text
    return str(response)


def _extract_sdk_text(sdk_response):
    """Pull the generated text out of an SDK response object, else return its str()."""
    choices = getattr(sdk_response, 'choices', None) or []
    if choices:
        message = getattr(choices[0], 'message', None)
        content = getattr(message, 'content', None) or getattr(choices[0], 'text', None)
        if content:
            return content
    return str(sdk_response)


def gen_text_databricks(prompts, use_template=True, **kwargs):
    """
    Generate text using Databricks foundation model

    Args:
        prompts: Iterable of prompt strings; one result is produced per prompt.
        use_template: When True each prompt is wrapped with ``format_prompt``.
        **kwargs: ``max_new_tokens`` (default 512) and ``temperature``
            (default 0.5) are forwarded to the model call.

    Returns:
        list[str]: The generated text for each prompt, in order. When neither
        the REST call nor the SDK fallback returns anything, the entry is
        "Error: Could not get response from model".
    """
    max_tokens = kwargs.get('max_new_tokens', 512)
    temperature = kwargs.get('temperature', 0.5)
    results = []

    for prompt in prompts:
        formatted_prompt = format_prompt(prompt) if use_template else prompt

        # First try the API approach
        response = query_databricks_foundation_model(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )
        if response:
            results.append(_extract_rest_text(response))
            continue

        # Try SDK approach as fallback
        sdk_response = query_with_databricks_sdk(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )
        if sdk_response:
            # return the generated text rather than the repr of the response object
            results.append(_extract_sdk_text(sdk_response))
        else:
            results.append("Error: Could not get response from model")

    return results


# workspace API token and host, read from the notebook context / Spark configuration
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)
DATABRICKS_INSTANCE = spark.conf.get("spark.databricks.workspaceUrl")

# documentation link: https://exercise.hellogym.io/nl/software/api
# the exerciseinfo endpoint returns denormalized records with nested objects
# (translations, category, muscles, equipment), flattened by normalize_record below
wger_api_url = "https://wger.de/api/v2/exerciseinfo/?status=2&language=2"
# `or []` keeps the steps below iterable even when the fetch yields nothing
exercises = get_exercises(wger_api_url) or []

ex_schema = StructType([
    StructField("id", IntegerType()),
    StructField("uuid", StringType()),
    StructField("name", StringType()),
    StructField("description", StringType()),
    StructField("created", TimestampType()),
    StructField("last_update", TimestampType()),
    StructField("category", StringType()),
    StructField("muscles",  StringType()),
    StructField("muscles_secondary",  StringType()),
    StructField("equipment",  StringType()),
    StructField("variations", StringType()),
    StructField("license_author", StringType())])

# normalize every record; normalize_record returns None for records without
# usable English content, and those are filtered out for data quality
exercise_abbv = [normalize_record(record) for record in exercises]
exercise_abbv = [record for record in exercise_abbv if record is not None]  # Remove non-English records

try:
    exercise_df = spark.createDataFrame(exercise_abbv, schema=ex_schema)
    exercise_df = exercise_df.drop("uuid", "created", "last_update", "license_author")
    display(exercise_df.sort("name"))
except Exception as e:
    print("error creating dataframe:", e)
    traceback.print_exc()
    # later cells need exercise_df; stop here instead of failing there with a NameError
    raise


In [0]:
from pyspark.sql.functions import broadcast

def validate_json_structure(result):
    """Check that a parsed LLM response has the expected "groups" layout.

    Expected shape: a dict with a ``groups`` list, where every group is a dict
    holding a standardized name (under ``standardized_name`` or
    ``standard_name``) and a ``variations`` list.

    Returns:
        bool: True when every check passes.

    Raises:
        ValueError: Describing the first check that failed.
    """
    def _require(condition, message):
        if not condition:
            raise ValueError(message)

    _require(isinstance(result, dict), "Result is not a dictionary")
    _require('groups' in result, "Missing 'groups' key")

    groups = result['groups']
    _require(isinstance(groups, list), "'groups' is not a list")

    # the standardized name is accepted under either of these keys
    name_keys = ('standardized_name', 'standard_name')

    for index, entry in enumerate(groups):
        _require(isinstance(entry, dict), f"Group {index} is not a dictionary")
        _require(any(key in entry for key in name_keys),
                 f"Group {index} missing standardized name key")
        _require('variations' in entry, f"Group {index} missing 'variations' key")
        _require(isinstance(entry['variations'], list),
                 f"Group {index} 'variations' is not a list")

    return True

# dataset augmentation module
class AdvancedExerciseStandardizer:
    """Group near-duplicate exercise names and map each one to a standardized name.

    Pipeline: feature-aware sentence embeddings -> DBSCAN clustering -> LLM
    validation of every cluster -> regex clean-up for names left unclustered.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize with a sentence transformer model optimized for semantic similarity
        """
        self.model = SentenceTransformer(model_name)

        # equipment terms that should NOT be ignored as they define different exercises
        # for example, resistance band curl is very different than ez bar curl
        # thus, they should never be clustered/harmonized into the same representation
        self.critical_equipment = {
            'BARBELL', 'DUMBBELL', 'DUMBBELLS', 'EZ BAR', 'EZ-BAR', 'EZBAR',
            'CABLE', 'MACHINE', 'KETTLEBELL', 'RESISTANCE BAND', 'TRX',
            'SMITH MACHINE', 'PREACHER', 'HAMMER', 'ROPE', 'T-BAR'}

        # equipment terms that can be normalized/ignored
        # for example, curl on machine or curl using machine
        self.non_critical_equipment = {'ON MACHINE', 'USING MACHINE', 'WITH MACHINE'}

        # movement patterns that define exercise families
        # the list is not exhaustive, but helps direct the LLM
        self.movement_patterns = {
            'CURL', 'PRESS', 'ROW', 'RAISE', 'EXTENSION', 'SQUAT', 'DEADLIFT',
            'PULL', 'PUSH', 'FLY', 'DIP', 'LUNGE', 'PLANK', 'CRUNCH'}

    def extract_exercise_features(self, exercise_name: str) -> Dict:
        """
        Extract key features that distinguish exercises

        Matching is by case-insensitive substring. The term sets are walked in
        sorted order because set iteration order is not stable between
        interpreter runs; without it the equipment order and the chosen movement
        (and therefore the embeddings built from them) could change run to run.

        Returns a dict with keys 'equipment' (list), 'movement' (str or None),
        'body_parts' (list), 'modifiers' (list) and 'raw_name'.
        """
        name_upper = exercise_name.upper()

        # extract equipment
        equipment = [equip for equip in sorted(self.critical_equipment) if equip in name_upper]

        # extract movement pattern: first match in alphabetical order, None if no match
        movement = next(
            (pattern for pattern in sorted(self.movement_patterns) if pattern in name_upper),
            None)

        # extract body part/muscle groups
        body_part_terms = ['BICEP', 'TRICEP', 'CHEST', 'BACK', 'SHOULDER', 'LEG', 'CALF', 'ABS']
        body_parts = [part for part in body_part_terms if part in name_upper]

        # extract modifiers (grip, stance, etc.)
        modifier_terms = ['NARROW', 'WIDE', 'CLOSE', 'HAMMER', 'REVERSE', 'OVERHEAD']
        modifiers = [mod for mod in modifier_terms if mod in name_upper]

        return {
            'equipment': equipment,
            'movement': movement,
            'body_parts': body_parts,
            'modifiers': modifiers,
            'raw_name': exercise_name
        }

    def create_feature_aware_embeddings(self, exercise_names: List[str]) -> np.ndarray:
        """
        Create embeddings that emphasize critical distinguishing features

        Each name is extended with its detected equipment (repeated twice for
        extra weight), movement pattern and modifiers before being encoded.
        """
        enhanced_texts = []

        for name in exercise_names:
            features = self.extract_exercise_features(name)

            # create enhanced text that emphasizes critical features
            enhanced_parts = [name]
            # emphasize equipment (doubled weight for critical equipment)
            for equip in features['equipment']:
                enhanced_parts.extend([equip, equip])
            # add movement pattern
            if features['movement']:
                enhanced_parts.append(features['movement'])
            # modifiers...
            enhanced_parts.extend(features['modifiers'])
            enhanced_texts.append(' '.join(enhanced_parts))

        return self.model.encode(enhanced_texts)

    def are_exercises_equivalent(self, name1: str, name2: str, threshold: float = 0.85) -> bool:
        """
        Decide whether two exercise names should be considered equivalent

        Uses extracted features as a quick filter and the cosine similarity of
        the two name embeddings as the final check (no LLM call is made).
        """
        # Quick feature check first
        features1 = self.extract_exercise_features(name1)
        features2 = self.extract_exercise_features(name2)

        # if equipment differs, they're likely different exercises
        if set(features1['equipment']) != set(features2['equipment']):
            # exception: if one has no equipment specified, might be same
            if features1['equipment'] and features2['equipment']:
                return False

        # if movement patterns differ significantly, likely different
        if features1['movement'] != features2['movement'] and \
           features1['movement'] and features2['movement']:
            return False

        # semantic similarity as final check
        embeddings = self.model.encode([name1, name2])
        similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        # plain bool rather than a numpy scalar, to match the annotation
        return bool(similarity >= threshold)

    def llm_validate_cluster(self, exercise_names: List[str], max_attempts: int = 3) -> Dict[str, str]:
        """
        use LLM to validate and standardize a cluster of exercise names

        Args:
            exercise_names: Names that were clustered together.
            max_attempts: How many times the model is queried before giving up
                on an unparseable or structurally invalid response.

        Returns:
            Mapping from each variation returned by the model to its
            standardized name. Input names the model left out are mapped to
            their ``clean_exercise_name`` form so that no input is lost.

        Raises:
            ValueError: When no valid response is obtained in ``max_attempts``
                tries. Any other exception from the model call is re-raised.
        """
        if not exercise_names:
            return {}
        if len(exercise_names) == 1:
            return {exercise_names[0]: self.clean_exercise_name(exercise_names[0])}

        # JSON rendering keeps apostrophes inside names intact; the prompt travels in an
        # HTTP JSON body, so SQL-style quote doubling would only corrupt the names
        names_json = json.dumps(list(exercise_names), ensure_ascii=False)

        # custom prompt for LLM validation
        prompt = f"""
        Analyze these exercise names and determine if they represent the same exercise or different exercises.
        If they are the same exercise, provide one standardized name.
        If they are different exercises, group them appropriately and provide standardized names for each group.

        IMPORTANT RULES:
        1. Different equipment (Barbell vs Dumbbell vs EZ Bar) = DIFFERENT exercises
        2. Different grips (Narrow vs Wide) = DIFFERENT exercises  
        3. Different positions (Incline vs Decline vs Flat) = DIFFERENT exercises
        4. Only spelling/formatting differences = SAME exercise
        5. Left and right variations = SAME exercise

        Exercise names: {names_json}

        CRITICAL: Return ONLY valid JSON. No explanations, no markdown, no extra text.
        Start your response with {{ and end with }}.

        Format:
        {{"groups":[{{"standardized_name":"EXERCISE NAME","variations":["var1","var2"]}}]}}

        Example:
        {{"groups":[{{"standardized_name":"BARBELL BICEP CURL","variations":["Barbell Biceps Curl","Bar Bell Bicep Curls"]}},{{"standardized_name":"EZ BAR BICEP CURL","variations":["EZ Bar Biceps Curl","Ez-Bar Bicep Curl"]}}]}}
        """

        try:
            result = None
            last_error = None
            # bounded retry: every attempt asks the model again, so a bad response
            # cannot be re-parsed forever
            for _ in range(max_attempts):
                # leveraging Databricks foundation model (free tier...)
                response = gen_text_databricks([prompt], temperature=0.1, max_new_tokens=5000, use_template=True)[0]
                response = response.replace('```json', "").replace("```", "").strip()
                try:
                    candidate = json.loads(repair_json(response))

                    # Validate structure
                    validate_json_structure(candidate)

                    result = candidate
                    print('Success:')
                    break
                except Exception as e:
                    last_error = e
                    print(f"Error: {e}")
                    print('failure:', response)

            if result is None:
                raise ValueError(
                    f"no valid JSON response after {max_attempts} attempts: {last_error}")

            # then create mapping from variations to standardized names
            mapping = {}
            for group in result['groups']:
                # plan for the LLM to hallucinate the standardized name key
                if 'standardized_name' in group:
                    standardized = group['standardized_name']
                else:
                    standardized = group['standard_name']
                for variation in group['variations']:
                    mapping[variation] = standardized

            # names the model omitted (or re-spelled) would otherwise vanish from the result
            for name in exercise_names:
                if name not in mapping:
                    mapping[name] = self.clean_exercise_name(name)

            return mapping

        except Exception as e:
            print(f"LLM validation failed: {e}")
            raise

    def clean_exercise_name(self, name: str) -> str:
        """
        Clean exercise name while preserving critical distinctions

        Drops parenthetical text, normalizes common equipment and exercise
        spellings, turns hyphens/underscores into spaces, collapses whitespace
        and upper-cases the result.
        """
        # remove parenthetical information
        name = re.sub(r'\s*\([^)]*\)', '', name)

        # standardize equipment terminology
        equipment_replacements = {
            r'DUMBBELLS?': 'DUMBBELL',
            r'BARBELLS?': 'BARBELL',
            r'EZ[-\s]?BAR': 'EZ BAR',
            r'T[-\s]?BAR': 'T-BAR'}

        for pattern, replacement in equipment_replacements.items():
            name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)

        # standardize exercise terms
        exercise_replacements = {
            r'BICEPS?': 'BICEP',
            r'TRICEPS?': 'TRICEP',
            r'PULL[-\s]?UPS?': 'PULL UP',
            r'PUSH[-\s]?UPS?': 'PUSH UP',
            r'SIT[-\s]?UPS?': 'SIT UP'}

        for pattern, replacement in exercise_replacements.items():
            name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)

        # whitespace/formatting adjustment
        name = re.sub(r'[-_]+', ' ', name)
        name = ' '.join(name.split())

        return name.strip().upper()

    def smart_clustering(self, exercise_names: List[str], eps: float = 0.25, min_samples: int = 2) -> Dict[str, str]:
        """
        perform intelligent clustering with LLM validation

        Names are embedded and clustered with DBSCAN (cosine metric). Every
        cluster is validated by the LLM; noise points (label -1) are treated as
        unique exercises and only cleaned. Returns a mapping from name to
        standardized name; an empty input returns an empty mapping.
        """
        print(f"starting smart clustering for {len(exercise_names)} exercises...")

        # nothing to embed or cluster
        if not exercise_names:
            return {}

        # create feature-aware embeddings
        embeddings = self.create_feature_aware_embeddings(exercise_names)
        # initial clustering
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
        clusters = clustering.fit_predict(embeddings)

        print(f"Initial clustering found {len(set(clusters))} clusters")

        # bucket the names by cluster label in a single pass
        members = {}
        for exercise, cluster_id in zip(exercise_names, clusters):
            members.setdefault(cluster_id, []).append(exercise)

        final_mapping = {}

        # process each real cluster individually, in a stable order
        for cluster_id in sorted(label for label in members if label != -1):
            cluster_exercises = members[cluster_id]

            print(f"processing cluster {cluster_id} with {len(cluster_exercises)} exercises")

            # validate cluster with LLM
            final_mapping.update(self.llm_validate_cluster(cluster_exercises))

        # handle noise points (unique exercises) last, so their cleaned names take precedence
        for exercise in members.get(-1, []):
            final_mapping[exercise] = self.clean_exercise_name(exercise)

        return final_mapping

    def standardize_exercise_dataset(self, exercise_names: List[str]) -> Tuple[Dict[str, str], pd.DataFrame]:
        """
        Complete standardization pipeline

        Filters out names containing REST, JOGGING, WALKING or CYCLING, clusters
        the rest, and returns ``(mapping, review_df)``. ``review_df`` has one row
        per standardized name with columns 'standardized_name',
        'original_count' and 'original_names', sorted by count descending; it
        is empty (with those columns) when nothing was mapped.
        """
        print("starting exercise standardization pipeline...")

        # remove obvious invalid exercises
        invalid_terms = ['REST', 'JOGGING', 'WALKING', 'CYCLING']
        valid_exercises = [name for name in exercise_names
                           if not any(invalid in name.upper() for invalid in invalid_terms)]

        print(f"filtered out {len(exercise_names) - len(valid_exercises)} invalid exercises")

        # perform smart clustering
        mapping = self.smart_clustering(valid_exercises)
        # generate summary to print...
        standardized_names = list(set(mapping.values()))

        # group by standardized name for review
        groups = {}
        for original, standardized in mapping.items():
            groups.setdefault(standardized, []).append(original)

        # Create DataFrame for review
        review_columns = ['standardized_name', 'original_count', 'original_names']
        review_data = [
            {'standardized_name': standardized,
             'original_count': len(originals),
             'original_names': ' | '.join(originals)}
            for standardized, originals in groups.items()]

        if review_data:
            review_df = pd.DataFrame(review_data).sort_values('original_count', ascending=False)
        else:
            # sorting an empty, column-less frame by 'original_count' would raise KeyError
            review_df = pd.DataFrame(columns=review_columns)

        # guard the percentage against an empty input
        if valid_exercises:
            reduction = (len(valid_exercises) - len(standardized_names)) / len(valid_exercises) * 100
        else:
            reduction = 0.0

        print(f"Standardization complete:")
        print(f"  Original exercises: {len(exercise_names)}")
        print(f"  Valid exercises: {len(valid_exercises)}")
        print(f"  Standardized exercises: {len(standardized_names)}")
        print(f"  Reduction: {reduction:.1f}%")

        return mapping, review_df

# implementation
def apply_standardization(df, exercise_column='name'):
    """
    Apply standardization to your exercise DataFrame using broadcast join

    Args:
        df: Spark DataFrame holding the exercise names.
        exercise_column: Name of the column with the raw exercise names; it is
            used for collecting the distinct names and as the join key.

    Returns:
        tuple: ``(df, mapping, review_df)`` where ``df`` has a new
        ``standardized_name`` column (rows without one are dropped),
        ``mapping`` is the original -> standardized name mapping and
        ``review_df`` is the review report from the standardizer.
    """
    standardizer = AdvancedExerciseStandardizer()
    distinct_rows = df.select(exercise_column).distinct().collect()
    unique_exercises = [row[exercise_column] for row in distinct_rows
                        if row[exercise_column] is not None]
    mapping, review_df = standardizer.standardize_exercise_dataset(unique_exercises)

    print("Type of mapping:", type(mapping))
    print("First few items:", list(mapping.items())[:5] if hasattr(mapping, 'items') else mapping.head())

    # mapping df containing original name and standardized name
    if isinstance(mapping, dict):
        session = SparkSession.getActiveSession()
        # explicit schema: column types cannot be inferred when the mapping is empty
        mapping_schema = f"`{exercise_column}` string, standardized_name string"
        mapping_df = session.createDataFrame(list(mapping.items()), mapping_schema)
    else:
        # a non-dict mapping is joined as-is
        mapping_df = mapping

    # can safely use a broadcast join here, mapping_df is quite small (< 500 records)
    df = df.join(broadcast(mapping_df), on=exercise_column, how="left")

    # drop records where a standardized name was not successfully generated
    df = df.dropna(subset=['standardized_name'])

    return df, mapping, review_df

# apply gen AI standardization of exercise names (default exercise column: 'name');
# yields the joined DataFrame, the original -> standardized name mapping and a review report
standardized_df, name_mapping, review_report = apply_standardization(exercise_df)

In [0]:
# keep and order the columns to show: original and standardized names first, then the
# exercise attributes (the 'id' and 'variations' columns are left out)
standardized_df = standardized_df.select('name', 'standardized_name', 'category', 'equipment',  'muscles', 'muscles_secondary', 'description')
display(standardized_df)