In [0]:
# imports
import requests
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, concat_ws, regexp_replace, to_date, trim, lit, when, length, udf
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType, ArrayType
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from functools import partial
import sys

def get_exercises(api_url: str, timeout: float = 30.0) -> List[Dict]:
    '''Retrieve the complete list of exercises from a paginated wger API endpoint.

    Follows the ``next`` link found in each JSON response until it is missing
    or empty, concatenating every page's ``results`` list along the way.

    Args:
        api_url: URL of the first page to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        A list holding the items of every page's ``results``, in page order.

    Raises:
        requests.HTTPError: If any page responds with an error status code.
        requests.Timeout: If a request takes longer than ``timeout``.
        KeyError: If a response body has no ``results`` key.
    '''
    all_exercises = []
    current_url = api_url

    # A single Session is shared by all page requests so the underlying
    # connection can be reused instead of being re-established per page.
    with requests.Session() as session:
        while current_url:
            response = session.get(current_url, timeout=timeout)
            response.raise_for_status()  # surface 4xx/5xx instead of parsing an error body

            data = response.json()
            all_exercises.extend(data['results'])
            # A missing or null 'next' marks the last page and ends the loop.
            current_url = data.get('next')

    return all_exercises

# First page of the wger `exerciseinfo` listing; get_exercises walks the
# remaining pages from here.
wger_api_url = "https://wger.de/api/v2/exerciseinfo/"
exercises = get_exercises(api_url=wger_api_url)

In [0]:

# Spark schema describing the exercise columns kept by this notebook.
# Every field uses StructField's default nullability.
# NOTE(review): the array columns use an empty StructType() as element type,
# and this schema is not passed to createDataFrame in this cell — confirm
# whether it is consumed elsewhere or still needs its element fields filled in.
ex_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("id", IntegerType()),
        ("uuid", StringType()),
        ("created", TimestampType()),
        ("last_update", TimestampType()),
        ("category", ArrayType(StructType())),
        ("muscles", ArrayType(StructType())),
        ("muscles_secondary", ArrayType(StructType())),
        ("equipment", ArrayType(StructType())),
        ("variations", ArrayType(StructType())),
        ("license_author", StringType()),
    ]
])

# Keys to keep from each raw API record.
fields = ["id", "uuid", "created", "last_update", "category", "muscles", "muscles_secondary", "equipment", "variations", "license_author"]

# Project every record onto `fields`; a key that a record lacks is skipped
# for that record instead of raising KeyError.
exercise_abbv = [{key: record[key] for key in fields if key in record} for record in exercises]

# Peek at one projected record before handing the data to Spark.
print(exercise_abbv[0])

# No schema is supplied here, so Spark infers column types from the dicts.
exercise_df = spark.createDataFrame(exercise_abbv)

# Remove bookkeeping columns. The third name was previously misspelled as
# 'ast_update'; DataFrame.drop ignores names that are not in the schema, so
# 'last_update' was silently left in the frame.
exercise_df = exercise_df.drop('uuid', 'created', 'last_update', 'license_author')
exercise_df.show()
