In [0]:
# imports
import requests
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, concat_ws, regexp_replace, to_date, trim, lit, when, length, udf
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType, ArrayType
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from functools import partial
import sys

def get_exercises(api_url, timeout=30):
    '''Retrieves complete list of exercises from the wger public API.

    The endpoint is paginated: the ``results`` of each page are collected and
    the page's ``next`` URL is requested until it is empty.

    Args:
        api_url: URL of the first page to request.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with the ``results`` entries of every page, in page order.
        If any page fails (request error, bad status, malformed payload) the
        error is printed and an empty list is returned instead of ``None``,
        so callers can always iterate over the result.
    '''
    all_exercises = []
    current_url = api_url

    try:
        while current_url:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            all_exercises.extend(data['results'])
            current_url = data.get('next')
    except Exception as e:
        # Best-effort fetch: report the problem and hand back "no data"
        # rather than a partial list that could be mistaken for a full one.
        print('error fetching exercise data from wger API:', e)
        return []
    return all_exercises

# documentation link: https://exercise.hellogym.io/nl/software/api
# the exerciseinfo endpoint returns denormalized data with nested objects
# (e.g. translations, muscles, muscles_secondary, equipment)
# NOTE(review): status=2 and language=2 are numeric IDs defined by the API —
# presumably "accepted" exercises and English; confirm against the docs above
wger_api_url = "https://wger.de/api/v2/exerciseinfo/?status=2&language=2"
# fetch every page of the listing; `exercises` holds whatever get_exercises returns
exercises = get_exercises(wger_api_url)

In [0]:
import traceback
import json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime

# Define the schema
# Column layout for the normalized exercise records, given as
# (column name, Spark type) pairs in column order. Each StructField is built
# with only a name and a type, so the remaining StructField defaults apply.
# NOTE(review): muscles, muscles_secondary and equipment are declared as
# strings although normalize_record builds Python lists for them — confirm
# that the string form Spark produces for those columns is intended.
ex_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("uuid", StringType()),
        ("name", StringType()),
        ("description", StringType()),
        ("created", TimestampType()),
        ("last_update", TimestampType()),
        ("category", StringType()),
        ("muscles", StringType()),
        ("muscles_secondary", StringType()),
        ("equipment", StringType()),
        ("variations", StringType()),
        ("license_author", StringType()),
    )
])

# Normalize the data
def normalize_record(record, language_id=2):
    """Flatten one raw exercise record into the flat dict used for the DataFrame.

    Args:
        record: dict describing a single exercise. Reads the keys ``id``,
            ``uuid``, ``translations``, ``created``, ``last_update``,
            ``category``, ``muscles``, ``muscles_secondary``, ``equipment``,
            ``variations`` and ``license_author``; all are optional.
        language_id: value of a translation's ``language`` field whose
            ``name``/``description`` should be used. Defaults to 2, the value
            this function previously hard-coded.

    Returns:
        dict with the keys ``id``, ``uuid``, ``name``, ``description``,
        ``created``, ``last_update``, ``category``, ``muscles``,
        ``muscles_secondary``, ``equipment``, ``variations`` and
        ``license_author``.

        * ``name`` / ``description`` come from the first translation matching
          ``language_id``; both are ``None`` when there is no such
          translation (previously this raised ``IndexError``).
        * ``created`` / ``last_update`` are parsed with
          ``datetime.fromisoformat`` or ``None`` when absent/empty.
        * ``muscles``, ``muscles_secondary`` and ``equipment`` are lists of
          the nested items' ``name`` values (empty when missing or ``None``).
        * ``variations`` is the raw value, or ``[]`` when it is ``None``.
    """
    def _parse_timestamp(value):
        # fromisoformat does not accept a trailing 'Z' on every Python
        # version, so spell the UTC offset out explicitly.
        return datetime.fromisoformat(value.replace('Z', '+00:00')) if value else None

    def _names(key):
        # `or []` also covers a key that is present but explicitly None.
        return [item["name"] for item in record.get(key) or []]

    # Scan the translations once; an empty dict stands in when nothing matches
    # so the .get() lookups below simply yield None.
    translation = next(
        (entry for entry in record.get("translations") or []
         if entry.get("language") == language_id),
        {},
    )
    variations = record.get("variations")

    return {
        "id": record.get("id"),
        "uuid": record.get("uuid"),
        "name": translation.get("name"),
        "description": translation.get("description"),
        "created": _parse_timestamp(record.get("created")),
        "last_update": _parse_timestamp(record.get("last_update")),
        "category": record.get("category"),
        "muscles": _names("muscles"),
        "muscles_secondary": _names("muscles_secondary"),
        "equipment": _names("equipment"),
        "variations": [] if variations is None else variations,
        "license_author": record.get("license_author"),
    }

# `exercises` may be None (or empty) when the API fetch failed; treat that as
# "no records" instead of raising TypeError while iterating.
exercise_abbv = [normalize_record(record) for record in (exercises or [])]

try:
    # Build the frame against the explicit schema and drop the columns that
    # are excluded from the displayed frame, in a single assignment so
    # re-running the cell always starts from the freshly created frame.
    exercise_df = (
        spark.createDataFrame(exercise_abbv, schema=ex_schema)
        .drop("uuid", "created", "last_update", "license_author")
    )
    display(exercise_df)
except Exception as e:
    # Keep the notebook running but surface the full traceback for debugging.
    print("error creating dataframe:", e)
    traceback.print_exc()