In [1]:
import requests
import json
import io
import zipfile
import pandas as pd
import time
from pyspark.sql import SparkSession
import re
from datetime import datetime

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 3, Finished, Available, Finished)

In [2]:
# Obtain the Spark session used by the rest of the notebook.
spark = SparkSession.builder.getOrCreate()

# The bearer token is stored as a JSON string in the first column of the
# tokens table. Only one row is needed, so take(1) is used instead of
# collect(): it avoids pulling the whole table to the driver and lets an
# empty table be reported clearly rather than as an opaque IndexError.
tokens_df = spark.table("my3de_survey_bronze.tokens")
token_rows = tokens_df.take(1)
if not token_rows:
    raise ValueError("my3de_survey_bronze.tokens is empty; cannot obtain an access token.")

token_json = token_rows[0][0]
token_data = json.loads(token_json)
access_token = token_data['access_token']

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 4, Finished, Available, Finished)

In [3]:
# Configuration constants
BASE_URL = "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/"
LAKEHOUSE_NAME = "my3de_survey_bronze"

# Each entry pairs a dataset name with its "extracts" endpoint. The two public
# lists below are derived from these pairs so that position i of DATASET_NAMES
# always corresponds to position i of EXTRACTS_LINKS_FULL.
_FULL_EXTRACT_SOURCES = [
    ("rubricObjects",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/c473942c-74e4-4ef2-83a6-638434a7db26/plugins/841308a2-e761-498e-a4cc-0c3619791c19/extracts"),
    ("surveyAttempts",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/c597fc3e-b7ee-4d30-af3f-a7c825cc59f3/plugins/10c06cbe-7171-4d6f-bd06-8330a93e5d2e/extracts"),
    ("rubricObjectCriteria",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/fac02315-302f-41b8-8b07-10c00f7a8d1d/plugins/df537dc9-8358-4c28-9ab9-ddb8d364a9fc/extracts"),
    ("users",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/b21a6414-38f8-4da8-9a65-8b5586f9fe3b/plugins/1d6d722e-b572-456f-97c1-d526570daa6b/extracts"),
    ("userEnrollments",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/5ced736e-4c4c-4b01-96c8-1c7d404ac5c2/plugins/533f84c8-b2ad-4688-94dc-c839952e9c4f/extracts"),
    ("organizationalUnits",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/53d5273c-1dc0-412b-beb3-417298bd0c6d/plugins/07a9e561-e22f-4e82-8dd6-7bfb14c91776/extracts"),
    ("courseAccess",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/2386cc16-2058-4495-a3ee-2148e7dddf0f/plugins/e260902a-582c-48c9-8dd8-80aa7dfa6b76/extracts"),
    ("rubricAssessment",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/d197b592-0c59-438c-8186-f42af6fddd35/plugins/cd7fa762-841e-48c5-abd7-6379b84963bf/extracts"),
    ("surveyObjects",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/ed15df0c-49e7-4ace-aeb3-22a415975a3f/plugins/6bb3c6c2-7a61-44df-a081-d8762d93a3b5/extracts"),
    ("roleDetails",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/d70f64e0-ad63-4140-aac5-e337560b8371/plugins/bd61f20b-be91-4b93-b449-46361e2c323f/extracts"),
    ("surveyUserAnswerResponses",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/54d695fe-1c34-4fae-b1d5-a5923ca933a2/plugins/20923295-981b-4d3c-8ab8-aa149abfdb45/extracts"),
    ("questionLibrary",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/c9edec37-1322-44ed-a922-f68d11472f6e/plugins/5c0f2c70-4737-44ee-8780-be67bfa43594/extracts"),
    ("surveyQuestionAnswers",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/b22fc5b0-cc1d-460b-8f5c-8e0140ad4669/plugins/365132d5-49dd-4014-b405-802dae49e4d7/extracts"),
    ("quizobjects",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/f6fc270a-20ec-4fe0-9e90-c461fb2c53b2/plugins/eef7ca81-86bb-430c-96ee-382b83f5c0f9/extracts"),
    ("quizattempts",
     "https://learning.my3de.org/d2l/api/lp/1.46/datasets/bds/a7d6e843-bf8d-4965-9274-95028f3c4d86/plugins/f1623581-c5d7-4562-93fe-6ad16010c96b/extracts"),
    ("quizuseranswers",
     "https://learning.my3de.org/d2l/api/lp/1.43/datasets/bds/241c5dfb-4807-4f54-a7c4-78fbd7fb2671/plugins/93d6063b-61d4-4629-a6af-b4fad71f8c55/extracts"),
]

DATASET_NAMES = [source_name for source_name, _ in _FULL_EXTRACT_SOURCES]
EXTRACTS_LINKS_FULL = [source_link for _, source_link in _FULL_EXTRACT_SOURCES]

# Sanity check: every dataset name must have exactly one extracts link.
assert len(DATASET_NAMES) == len(EXTRACTS_LINKS_FULL)

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 5, Finished, Available, Finished)

In [4]:
def _normalize_created_date(raw_created_date):
    """Reduce an ISO-8601 style timestamp string to ``YYYY-MM-DD``.

    Returns ``"unknown_date"`` when the value is missing or its date part
    cannot be parsed, so a malformed timestamp never discards the dataset.
    """
    if not raw_created_date or raw_created_date == "unknown_date":
        return "unknown_date"
    try:
        # Only the date part is kept, so parse just that. This tolerates any
        # fractional-second precision (or none) in the time part.
        date_part = str(raw_created_date).split("T", 1)[0]
        return datetime.strptime(date_part, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        print(f"Could not parse CreatedDate {raw_created_date!r}; using 'unknown_date'.")
        return "unknown_date"


def fetch_dataset_info(extracts_links, access_token, dataset_names=None, timeout=60):
    """Fetch the download link and CreatedDate of the first extract behind each link.

    Args:
        extracts_links: Iterable of "extracts" endpoint URLs to query.
        access_token: Bearer token sent in the ``Authorization`` header.
        dataset_names: Names paired positionally with ``extracts_links``.
            Defaults to the module-level ``DATASET_NAMES``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of ``(download_link, created_date, bds_type, dataset_name)``
        tuples, one per link that yielded an extract with a download link,
        or ``None`` when no link did. ``created_date`` is ``"YYYY-MM-DD"``
        or ``"unknown_date"``; ``bds_type`` falls back to ``"unknown_type"``.

    Failures for an individual link are printed and that link is skipped;
    they do not abort the remaining links.
    """
    if dataset_names is None:
        dataset_names = DATASET_NAMES

    extracts_links = list(extracts_links)
    dataset_names = list(dataset_names)
    if len(extracts_links) != len(dataset_names):
        # zip() below silently truncates to the shorter list; make that visible.
        print(
            f"Warning: {len(extracts_links)} links but {len(dataset_names)} dataset names; "
            "surplus entries are ignored."
        )

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Accept": "application/json"
    }

    datasets = []

    for extracts_link, dataset_name in zip(extracts_links, dataset_names):
        try:
            response = requests.get(extracts_link, headers=headers, timeout=timeout)
            response.raise_for_status()

            api_objects = response.json().get("Objects") or []
            if not api_objects:
                print(f"No datasets found for {dataset_name} at {extracts_link}")
                continue

            # NOTE(review): assumes the first object is the latest extract - confirm
            # against the API's documented ordering.
            latest_dataset = api_objects[0]

            download_link = latest_dataset.get("DownloadLink")
            if not download_link:
                # Without a link there is nothing to download later on.
                print(f"No download link found for {dataset_name} at {extracts_link}")
                continue

            created_date = _normalize_created_date(latest_dataset.get("CreatedDate"))
            bds_type = latest_dataset.get("BdsType", "unknown_type")

            datasets.append((download_link, created_date, bds_type, dataset_name))

        except requests.exceptions.HTTPError as err:
            print(f"HTTP error occurred for {dataset_name} at {extracts_link}: {err}")
        except Exception as err:
            print(f"An error occurred for {dataset_name} at {extracts_link}: {err}")

    return datasets if datasets else None

# for FULL Brightspace Datasets
def process_and_store_csv_full(download_link, access_token, created_date, bds_type, dataset_name, timeout=300):
    """Download a zipped extract and overwrite its Delta table with the CSV contents.

    Args:
        download_link: URL of the zip archive to download.
        access_token: Bearer token sent in the ``Authorization`` header.
        created_date: Value written to every row's ``data_retrieval_date`` column.
        bds_type: Accepted for interface compatibility; not used by this function.
        dataset_name: Used to build the table name
            ``{LAKEHOUSE_NAME}.bds_full_<dataset_name>`` (spaces become underscores).
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        None. Progress and errors are reported with ``print``; errors are
        caught here and do not propagate to the caller.
    """
    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    try:
        # The archive is buffered fully in memory via response.content, so the
        # request is made without streaming.
        response = requests.get(download_link, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Use dataset_name for the table name, under a uniform prefix.
        table_name = dataset_name.replace(' ', '_')
        uniform_table_name = f"bds_full_{table_name}"
        qualified_table_name = f"{LAKEHOUSE_NAME}.{uniform_table_name}"

        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            csv_files = [f for f in z.namelist() if f.endswith(".csv")]

            if not csv_files:
                print(f"No CSV files found in the archive for {dataset_name}.")
                return
            if len(csv_files) > 1:
                # Every CSV is written with mode("overwrite") to the same table.
                print(
                    f"Warning: {len(csv_files)} CSV files found for {dataset_name}; "
                    "each overwrites the same table, so only the last one is kept."
                )

            # The session does not change between files; look it up once.
            spark = SparkSession.builder.getOrCreate()

            for csv_file in csv_files:
                csv_content = z.read(csv_file)
                df = pd.read_csv(io.BytesIO(csv_content), low_memory=False)

                # Normalise headers: lower-case, spaces to underscores.
                # str() guards against non-string column labels.
                df.columns = [str(col).lower().replace(' ', '_') for col in df.columns]

                # Add the "Data Retrieval Date" column
                df['data_retrieval_date'] = created_date

                # Convert to Spark DataFrame
                spark_df = spark.createDataFrame(df)

                # Make sure the table exists with column mapping enabled before writing.
                spark.sql(f"""
                CREATE TABLE IF NOT EXISTS {qualified_table_name}
                USING DELTA
                TBLPROPERTIES (
                    'delta.columnMapping.mode' = 'name',
                    'delta.minReaderVersion' = '2',
                    'delta.minWriterVersion' = '5'
                )
                """)

                # Replace both data and schema with the freshly downloaded extract.
                (
                    spark_df.write
                    .format("delta")
                    .mode("overwrite")
                    .option("overwriteSchema", "true")
                    .saveAsTable(qualified_table_name)
                )

                print(f"Stored: {LAKEHOUSE_NAME}.{uniform_table_name}")

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except zipfile.BadZipFile:
        print("Downloaded file is not a valid zip archive.")
    except Exception as err:
        print(f"An error occurred: {err}")

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 6, Finished, Available, Finished)

In [5]:
# FULL BDS: look up the current extract for every configured dataset, then
# download each one and store it in the lakehouse.
print("Fetching dataset information...")
datasets_info = fetch_dataset_info(EXTRACTS_LINKS_FULL, access_token)

if not datasets_info:
    # fetch_dataset_info returns None when no link produced a usable extract.
    print("Failed to retrieve any dataset information.")
else:
    for dataset_info in datasets_info:
        download_link, created_date, bds_type, dataset_name = dataset_info
        print(f"Processing dataset: {dataset_name}")
        process_and_store_csv_full(
            download_link,
            access_token,
            created_date,
            bds_type,
            dataset_name,
        )

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 7, Finished, Available, Finished)

Fetching dataset information...
Processing dataset: rubricObjects
Stored: my3de_survey_bronze.bds_full_rubricObjects
Processing dataset: surveyAttempts
Stored: my3de_survey_bronze.bds_full_surveyAttempts
Processing dataset: rubricObjectCriteria
Stored: my3de_survey_bronze.bds_full_rubricObjectCriteria
Processing dataset: users
Stored: my3de_survey_bronze.bds_full_users
Processing dataset: userEnrollments
Stored: my3de_survey_bronze.bds_full_userEnrollments
Processing dataset: organizationalUnits
Stored: my3de_survey_bronze.bds_full_organizationalUnits
Processing dataset: courseAccess
Stored: my3de_survey_bronze.bds_full_courseAccess
Processing dataset: rubricAssessment
Stored: my3de_survey_bronze.bds_full_rubricAssessment
Processing dataset: surveyObjects
Stored: my3de_survey_bronze.bds_full_surveyObjects
Processing dataset: roleDetails
Stored: my3de_survey_bronze.bds_full_roleDetails
Processing dataset: surveyUserAnswerResponses
Stored: my3de_survey_bronze.bds_full_surveyUserAnswerRes

In [6]:
# Read the stored survey-attempts table back and pull it into pandas for
# interactive inspection in the following cells.
SURVEY_ATTEMPTS_QUERY = "SELECT * FROM my3de_survey_bronze.bds_full_surveyattempts"

df = spark.sql(SURVEY_ATTEMPTS_QUERY)
test = df.toPandas()



StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 8, Finished, Available, Finished)

In [7]:
# Show the (rows, columns) dimensions of the pandas frame `test`.
test.shape

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 9, Finished, Available, Finished)

(139065, 12)

In [8]:
# Display the pandas frame `test`; the notebook elides the middle rows.
# NOTE(review): the rendered rows include identifier columns such as `userid`;
# consider clearing this output before sharing the notebook.
test

StatementMeta(, 3d73cea5-d595-40c9-9f55-7c662210e9a4, 10, Finished, Available, Finished)

Unnamed: 0,attemptid,surveyid,userid,orgunitid,attemptnumber,timestarted,timecompleted,attemptedfromorgunitid,oldattemptnumber,isdeleted,version,data_retrieval_date
0,586263,1368694,703963,164418,2,2025-04-04T13:04:33.2030000Z,2025-04-04T13:05:08.5800000Z,164418,,False,281956515,2025-07-09
1,582502,1368694,703914,164418,2,2025-04-02T15:45:23.9600000Z,2025-04-02T15:45:43.8000000Z,164418,,False,281361416,2025-07-09
2,582280,1368694,703680,164418,2,2025-04-02T14:44:55.0200000Z,2025-04-02T14:45:40.1430000Z,164418,,False,281319536,2025-07-09
3,581387,1368694,703530,164418,2,2025-04-02T00:50:08.2930000Z,2025-04-02T00:50:30.8870000Z,164418,,False,281205796,2025-07-09
4,595657,1368694,703006,164418,2,2025-04-09T22:36:03.6170000Z,2025-04-09T22:36:45.0800000Z,164418,,False,283441588,2025-07-09
...,...,...,...,...,...,...,...,...,...,...,...,...
139060,1796,15751,1132,8835,1,2023-07-20T13:47:35.3330000Z,,8835,,False,38501099,2025-07-09
139061,1517,15751,635,8835,1,2023-07-19T14:18:09.1030000Z,,8835,,False,38379431,2025-07-09
139062,1463,15751,632,8835,1,2023-07-19T12:40:47.2170000Z,,8835,,False,38281758,2025-07-09
139063,2166,15751,630,8835,1,2023-07-21T19:45:11.3370000Z,,8835,,False,38769979,2025-07-09
