In [10]:
import requests
import json
import base64
import pandas as pd
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pandas import json_normalize
from notebookutils import mssparkutils
from datetime import datetime, timedelta

# --- VolunteerHub API credentials and request headers ---
# Secrets are read at run time from the Key Vault addressed by key_vault_uri,
# so no credential is stored in the notebook source.
key_vault_name = 'LevelUpKeyVault-Prod'
username_secret_name = 'VolunteerHubAPI-Username'
password_secret_name = 'VolunteerHubAPI-Password'
key_vault_uri = f"https://{key_vault_name}.vault.azure.net/"

# Retrieve username and password
username = mssparkutils.credentials.getSecret(key_vault_uri, username_secret_name)
api_key = mssparkutils.credentials.getSecret(key_vault_uri, password_secret_name)

subdomain = '3deschools'  # e.g., 'myorg' from myorg.volunteerhub.com

# Create Basic Auth header
credentials = f"{username}:{api_key}"
encoded_credentials = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
# Never print encoded_credentials: Base64 is an encoding, not encryption, so
# echoing it writes the username and password into the saved cell output.
headers = {
    'Authorization': f'Basic {encoded_credentials}',
    'Accept': 'application/json'
}

StatementMeta(, 044449c9-6ad5-4365-a4a5-968825b88a6e, 12, Finished, Available, Finished)

[REDACTED: the Base64-encoded Basic Auth credentials were printed here. Clear this cell output and rotate the VolunteerHub API password.]


In [7]:
# USERS Dataset: Spark handles and request settings for the users endpoint
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

base_url = f"https://{subdomain}.volunteerhub.com/api/v1/users"

# Users are queried by last-update date, starting from today's date.
# NOTE(review): with a start date of "today", changes made after a run but
# before midnight may not be returned by a later run; confirm the intended window.
params_base = dict(
    query='LastUpdate',
    earliestLastUpdate=datetime.now().strftime("%Y-%m-%d"),
    pageSize=50,
)

# UID Map: FormQuestionUid of every form question that becomes an output column
qid_map = dict(
    name="e750db65-2ee0-4538-92a3-4e733d701381",
    email="34e7cf97-7b65-4853-ba30-1bfe2a2fa0ad",
    phone="44bb4e5c-ad6e-4108-a521-322bfeb4edf2",
    dob="294fe8a5-3b9e-4401-acaf-68801569eb98",
    gender="5e562907-c7d0-42fe-90f3-eee4363164e6",
    race="ec21cce1-716f-4ab3-bdf7-1ac491f7dee9",
    org="8f02ea11-97a8-49bc-8b6e-60a5913cbefb",
    approved="5c3142d2-d6a9-40d7-ba9f-3da34e7d1753",
    address="7f124ac4-a3ad-4bd5-89b5-f5ade6282868",
)

# API fetch per page
def fetch_page(page):
    """Fetch one page of records from ``base_url``.

    The request uses the module-level ``headers`` and a copy of
    ``params_base`` with ``page`` added, so the shared dict is never mutated.

    Args:
        page: Page index sent as the ``page`` query parameter.

    Returns:
        The decoded JSON payload when the API answers HTTP 200 with a JSON
        list. Any other outcome (non-200 status, network failure, undecodable
        body, non-list payload) yields ``[]`` so that a single bad page does
        not abort the caller; the failure is logged instead of being hidden.
    """
    import logging  # local import keeps the function self-contained

    params = params_base.copy()
    params['page'] = page
    try:
        # Without a timeout, requests may wait indefinitely on a stalled connection.
        r = requests.get(base_url, headers=headers, params=params, timeout=30)
        if r.status_code != 200:
            logging.warning("Page %s returned HTTP %s", page, r.status_code)
            return []
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers network/timeout errors; ValueError covers a
        # body that is not valid JSON.
        logging.warning("Page %s could not be fetched: %s", page, exc)
        return []
    # Callers iterate the result item by item; a dict payload (e.g. an error
    # object) would otherwise be iterated key by key.
    return payload if isinstance(payload, list) else []

# Form answer parsing
def extract_form_data(answers):
    """Flatten a user's form answers into a dict of named columns.

    Each answer is matched on its ``FormQuestionUid`` against ``qid_map``.
    Name and address answers are only accepted when their ``$type`` contains
    ``FormAnswerName`` / ``FormAnswerAddress``; every other mapped question
    contributes its ``Value``.

    Args:
        answers: Iterable of answer dicts, or ``None``.

    Returns:
        A dict holding only the columns for which a matching answer was found.
        ``None``, an empty list, and entries that are not dicts produce no
        columns instead of raising.
    """
    # Questions whose answer is a single "Value" field, keyed by question UID.
    value_columns = {
        qid_map["email"]: "Email",
        qid_map["phone"]: "Phone",
        qid_map["dob"]: "DOB",
        qid_map["gender"]: "Gender",
        qid_map["race"]: "Race",
        qid_map["org"]: "Organization",
        qid_map["approved"]: "ApprovedStatus",
    }

    data = {}
    for ans in answers or []:  # a null FormAnswers field arrives as None
        if not isinstance(ans, dict):
            continue
        qid = ans.get("FormQuestionUid")
        qtype = ans.get("$type") or ""  # tolerate an explicit null $type

        if qid == qid_map["name"]:
            if "FormAnswerName" in qtype:
                data["FirstName"] = ans.get("FirstName")
                data["LastName"] = ans.get("LastName")
        elif qid == qid_map["address"]:
            if "FormAnswerAddress" in qtype:
                data["Address1"] = ans.get("Address1")
                data["City"] = ans.get("City")
                data["State"] = ans.get("State")
                data["PostalCode"] = ans.get("PostalCode")
        elif qid in value_columns:
            data[value_columns[qid]] = ans.get("Value")
    return data

# Format full row
def process_user(user):
    """Build one flat output row for a user record.

    The row starts with the top-level user fields (missing ones become
    ``None``) and is then extended with whatever columns
    ``extract_form_data`` derives from the record's ``FormAnswers``.
    """
    top_level_fields = ("UserUid", "Username", "EmailAllowed", "Created")
    row = {field: user.get(field) for field in top_level_fields}
    row.update(extract_form_data(user.get("FormAnswers", [])))
    return row

# Parallelize pages 0 to 999 (up to ~50,000 records at 50 per page)
pages = list(range(0, 1000))  # Adjust if needed
rdd = sc.parallelize(pages, numSlices=50).flatMap(fetch_page).map(process_user)
users = rdd.collect()

# Pandas DataFrame
users_df = pd.DataFrame(users)

# The "Email" column only exists when at least one fetched user has an email
# answer; guard the lookup so a small or empty fetch does not raise KeyError.
if 'Email' in users_df.columns:
    # Strip any HTML markup from the email value, leaving missing values untouched.
    users_df['Email'] = users_df['Email'].apply(
        lambda x: BeautifulSoup(x, "html.parser").get_text() if pd.notna(x) else x
    )

# Convert to Spark DataFrame
spark_df_users = spark.createDataFrame(users_df)
display(spark_df_users)

StatementMeta(, a617b7f3-649c-494b-a0f0-fcd6d9f62999, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d6afc30a-f796-4844-9e75-2baa33e08617)

In [None]:
LAKEHOUSE_NAME = "VolunteerHub_Bronze"

# Append the users DataFrame to the <lakehouse>.users table. The table name is
# built from LAKEHOUSE_NAME so the target is defined in exactly one place.
# NOTE: mode("append") adds rows on every run, so re-running this cell with the
# same data writes the same rows again.
spark_df_users.write.mode("append").saveAsTable(f"{LAKEHOUSE_NAME}.users")

StatementMeta(, a617b7f3-649c-494b-a0f0-fcd6d9f62999, -1, Cancelled, , Cancelled)

In [6]:
# EVENTS Dataset: Spark handles and request settings for the events endpoint
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

base_url = f"https://{subdomain}.volunteerhub.com/api/v1/events"

# Events are queried by time, starting from today's date.
params_base = dict(
    query='Time',
    earliestTime=datetime.now().strftime("%Y-%m-%d"),
    pageSize=50,  # max per API spec
)

# --- FETCH ONE PAGE FUNCTION ---
def fetch_event_page(page):
    """Fetch one page of event records from ``base_url``.

    The request uses the module-level ``headers`` and a copy of
    ``params_base`` with ``page`` added, so the shared dict is never mutated.

    Args:
        page: Page index sent as the ``page`` query parameter.

    Returns:
        The decoded JSON payload when the API answers HTTP 200 with a JSON
        list. Any other outcome (non-200 status, network failure, undecodable
        body, non-list payload) yields ``[]`` so that a single bad page does
        not abort the caller; the failure is logged instead of being hidden.
    """
    import logging  # local import keeps the function self-contained

    params = params_base.copy()
    params['page'] = page
    try:
        # Without a timeout, requests may wait indefinitely on a stalled connection.
        r = requests.get(base_url, headers=headers, params=params, timeout=30)
        if r.status_code != 200:
            logging.warning("Event page %s returned HTTP %s", page, r.status_code)
            return []
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers network/timeout errors; ValueError covers a
        # body that is not valid JSON.
        logging.warning("Event page %s could not be fetched: %s", page, exc)
        return []
    # Callers iterate the result item by item; a dict payload (e.g. an error
    # object) would otherwise be iterated key by key.
    return payload if isinstance(payload, list) else []

# --- PARSE ONE EVENT FUNCTION ---
def process_event(event):
    """Project an event record onto the output columns.

    Each output column is read from the correspondingly named source key of
    ``event``; keys absent from the record come back as ``None``.
    """
    # (output column, source key), in output order
    column_sources = (
        ('Event_Name', 'Name'),
        ('Start_Time', 'StartTime'),
        ('End_Time', 'EndTime'),
        ('Location', 'Location'),
        ('Short_Description', 'ShortDescription'),
        ('Long_Description', 'LongDescription'),
        ('Event_UID', 'EventUid'),
        ('Event_Group_UID', 'EventGroupUid'),
        ('Slot_Limit', 'SlotLimit'),
        ('Version', 'Version'),
        ('Raisers_Edge_Job_ID', 'RaisersEdgeJobId'),
        ('User_Group_Registrations', 'UserGroupRegistrations'),
    )
    return {column: event.get(source) for column, source in column_sources}

# --- PARALLEL FETCH ---
# Page indices 0..999; raise the upper bound if more data is expected
pages = list(range(1000))

# Spread the page indices over 50 partitions, fetch each page, and map every
# returned event to its flat column dict.
page_rdd = sc.parallelize(pages, numSlices=50)
rdd = page_rdd.flatMap(fetch_event_page).map(process_event)
events_raw = rdd.collect()

# One row per event
events_df = pd.DataFrame(events_raw)

# --- OPTIONAL CLEANING ---
def strip_html(html):
    """Return the text content of ``html``; any falsy input yields an empty string."""
    if not html:
        return ""
    return BeautifulSoup(html, "html.parser").get_text()

# Strip HTML from the description columns. A fetch that returned no events
# produces a frame without these columns, so check before indexing.
for description_col in ('Short_Description', 'Long_Description'):
    if description_col in events_df.columns:
        events_df[description_col] = events_df[description_col].apply(strip_html)

# parse User Group Registrations column:
# one output row per user registration, or one per user group registration
# when that group has no user registrations.
flat_rows = []

for _, row in events_df.iterrows():
    ugr_list = row['User_Group_Registrations']

    # Safety check in case the field is empty or malformed
    if not isinstance(ugr_list, list) or not ugr_list:
        continue

    # Event-level columns are identical for every registration of this event,
    # so build them once per event instead of once per registration.
    row_flat = row.drop(labels=['User_Group_Registrations']).to_dict()

    for ugr in ugr_list:
        ugr_meta = {
            'User_Group_UID': ugr.get('UserGroupUid'),
            'Group_Slots_Reserved': ugr.get('SlotsReserved'),
            'Anonymous_Group_Slots_Used': ugr.get('AnonymousSlotsUsed'),
            'Deleted_Group_Registration': ugr.get('Deleted')
        }

        user_regs = ugr.get('UserRegistrations', [])

        if not isinstance(user_regs, list) or not user_regs:
            # If no user registrations, still log the UGR-level data
            flat_rows.append({**row_flat, **ugr_meta})
            continue

        for reg in user_regs:
            reg_data = {
                'User_UID': reg.get('UserUid'),
                'Hours': reg.get('Hours'),
                'Registration_Date': reg.get('RegistrationDate'),
                'Anonymous_Slots_Used_Reg': reg.get('AnonymousSlotsUsed'),
                'Waitlisted': reg.get('Waitlisted'),
                'Deleted_User_Registration': reg.get('Deleted')
            }
            flat_rows.append({**row_flat, **ugr_meta, **reg_data})

# Final flattened DataFrame
parsed_events_df = pd.DataFrame(flat_rows)

# --- CONVERT TO SPARK ---
spark_df_events = spark.createDataFrame(parsed_events_df)

# --- DISPLAY ---
display(spark_df_events)

StatementMeta(, 85d43fab-68ed-44ac-904d-6b4cf4067356, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2e20b64a-ee86-443c-a3e7-2e3c753c9ff0)

In [7]:
# Append the flattened event/registration rows to the VolunteerHub_Bronze.events table.
spark_df_events.write.mode("append").saveAsTable("VolunteerHub_Bronze.events")

StatementMeta(, 85d43fab-68ed-44ac-904d-6b4cf4067356, 9, Finished, Available, Finished)

In [4]:
# GET request to Organization endpoint
url = f'https://{subdomain}.volunteerhub.com/api/v1/organization'

# Bounded wait so an unresponsive API cannot hang the notebook session.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    organization = response.json()  # decode once and reuse below
    print("Success:")
    # Pretty-print the JSON response
    formatted_json = json.dumps(organization, indent=4)
    print(formatted_json)
    # Load the single JSON document into a Spark DataFrame.
    df = spark.read.json(sc.parallelize([json.dumps(organization)]))
else:
    print(f"Error: {response.status_code} - {response.text}")
    # Do not build a DataFrame from an error body, and do not leave a stale
    # value from an earlier run for later cells to display.
    df = None

StatementMeta(, 044449c9-6ad5-4365-a4a5-968825b88a6e, 6, Finished, Available, Finished)

Success:
{
    "ContactUserUid": "502d687a-f8c1-4e6c-82cc-3d80cd2df48e",
    "Name": "3DE Schools",
    "Url": "3deschools.org",
    "Forms": [
        {
            "Name": "User Info",
            "FormQuestions": [
                {
                    "FormQuestionUid": "e750db65-2ee0-4538-92a3-4e733d701381",
                    "Editability": "AdministratorsAndUsers",
                    "FormQuestionType": "Name",
                    "Name": "Full Name",
                    "Ordinal": 4,
                    "Prompt": "Full Name",
                    "Required": true,
                    "SubPrompt": "<p>Please use your full legal name.</p>",
                    "Visibility": "AdministratorsAndUsers",
                    "Tokens": [
                        "UserName"
                    ]
                },
                {
                    "FormQuestionUid": "34e7cf97-7b65-4853-ba30-1bfe2a2fa0ad",
                    "Editability": "AdministratorsAndUsers",
                  

In [6]:
# Render the DataFrame built from the organization endpoint response.
display(df)

StatementMeta(, 044449c9-6ad5-4365-a4a5-968825b88a6e, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8da410ee-66f3-4319-b8d0-7b574ed255df)

In [4]:
# GET request to Event Groups endpoint
url = f'https://{subdomain}.volunteerhub.com/api/v1/eventgroups'

# Bounded wait so an unresponsive API cannot hang the notebook session.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    print("Success:")
    # Pretty-print the JSON response
    formatted_json = json.dumps(response.json(), indent=4)
    print(formatted_json)
else:
    print(f"Error: {response.status_code} - {response.text}")

StatementMeta(, fa6328eb-bbee-4596-8444-6e664610bf5e, 6, Finished, Available, Finished)

Success:
[
    {
        "EventGroupUid": "a73c93f6-2d1e-46f8-80d0-5edfead75e39",
        "Name": "All Events",
        "ParentEventGroupId": null
    },
    {
        "EventGroupUid": "6c9bac34-4640-4578-b5fa-4595893caa68",
        "Name": "3DE National",
        "ParentEventGroupId": "a73c93f6-2d1e-46f8-80d0-5edfead75e39"
    },
    {
        "EventGroupUid": "c976600f-9dbf-4969-9c17-3ec819d93201",
        "Name": "Templates",
        "ParentEventGroupId": "6c9bac34-4640-4578-b5fa-4595893caa68"
    },
    {
        "EventGroupUid": "0d3a13d1-6859-44df-9ed6-dc2a98dd346d",
        "Name": "Training",
        "ParentEventGroupId": "6c9bac34-4640-4578-b5fa-4595893caa68"
    },
    {
        "EventGroupUid": "3a82eef5-67c6-410b-98cb-fc14329a8fa5",
        "Name": "Examples",
        "ParentEventGroupId": "0d3a13d1-6859-44df-9ed6-dc2a98dd346d"
    },
    {
        "EventGroupUid": "1e6ee492-98cb-4ee0-b96a-d63b03c8c976",
        "Name": "JA Arizona",
        "ParentEventGroupId": "a73c93f6-

In [None]:
# GET request to User Groups endpoint
url = f'https://{subdomain}.volunteerhub.com/api/v1/userGroups'

# Bounded wait so an unresponsive API cannot hang the notebook session.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    print("Success:")
    # Pretty-print the JSON response
    formatted_json = json.dumps(response.json(), indent=4)
    print(formatted_json)
else:
    print(f"Error: {response.status_code} - {response.text}")

In [20]:
# GET request to Events endpoint
url = f'https://{subdomain}.volunteerhub.com/api/v1/events'

# Query events by time from a fixed start date (no page or pageSize is sent).
params_base = {
    'query': 'Time',
    'earliestTime': '2024-01-01',
}

# Bounded wait so an unresponsive API cannot hang the notebook session.
response = requests.get(url, headers=headers, params=params_base, timeout=30)

if response.status_code == 200:
    print("Success:")
    # Pretty-print the JSON response
    formatted_json = json.dumps(response.json(), indent=4)
    print(formatted_json)
else:
    print(f"Error: {response.status_code} - {response.text}")

StatementMeta(, 2cedae7a-566f-44a7-8af6-e90a241e945b, 30, Finished, Available, Finished)

Success:
[
    {
        "EventUid": "7669dd84-0147-42cb-be22-810996461b6e",
        "EndTime": "2024-02-28T10:15:00",
        "EventGroupUid": "2aa87db5-0f30-4585-baff-b03de66d6b45",
        "Location": "Hillsborough High School @ 5000 N Central Ave, Tampa, FL 33603",
        "LongDescription": "<p><strong><span style=\"color:#000000;\">Your Role</span></strong></p>\r\n<p><span style=\"color:#000000;\">As a <strong>Business Client,</strong> you will act as an advisor to a team of 4-6 3DE high school seniors who are consulting on a project for your company. Your role will be to help provide guidance and feedback to the students as they research, brainstorm, and develop solutions.</span></p>\r\n<p><span style=\"color:#000000;\"></span></p>\r\n<p><strong><span style=\"color:#000000;\">Consultancy Overview</span></strong></p>\r\n<p><span style=\"color:#000000;\">The 3DE Senior Consultancy culminates the high school experience in a unique and meaningful way - where students are able to app

In [24]:
# GET request to Users endpoint
url = f'https://{subdomain}.volunteerhub.com/api/v1/users'

# Query users by last-update date from a fixed start date (no page or pageSize is sent).
params_base = {
    'query': 'LastUpdate',
    'earliestLastUpdate': '2025-01-01',
}

# Bounded wait so an unresponsive API cannot hang the notebook session.
response = requests.get(url, headers=headers, params=params_base, timeout=30)

if response.status_code == 200:
    print("Success:")
    # Pretty-print the JSON response
    formatted_json = json.dumps(response.json(), indent=4)
    print(formatted_json)
else:
    print(f"Error: {response.status_code} - {response.text}")

StatementMeta(, 2cedae7a-566f-44a7-8af6-e90a241e945b, 34, Finished, Available, Finished)

Success:
[
    {
        "UserUid": "eefce3a1-ab6a-427a-a2ec-5b62a2dd86a7",
        "LastUpdate": "2025-01-02T05:22:00",
        "FormAnswers": [
            {
                "$type": "VolunteerHub.Data.Api.FormAnswerAddress, VolunteerHub.Data.Api",
                "Address1": "1084 Simonton Hill Court",
                "Address2": "",
                "Address3": "",
                "City": "Lawrenceville",
                "State": "GA",
                "PostalCode": "30045",
                "FormQuestionUid": "7f124ac4-a3ad-4bd5-89b5-f5ade6282868"
            },
            {
                "$type": "VolunteerHub.Data.Api.FormAnswerBoolean, VolunteerHub.Data.Api",
                "Value": false,
                "FormQuestionUid": "5990f1a1-9306-48a7-8b7d-4d46b2e31e2e"
            },
            {
                "$type": "VolunteerHub.Data.Api.FormAnswerDate, VolunteerHub.Data.Api",
                "Value": "1972-07-24T00:00:00",
                "FormQuestionUid": "294fe8a5-3b9e-440