## Bronze Layer - Raw Data Ingestion 


In [0]:
import pandas as pd
import numpy as np
import os
# importing required libraries


In [0]:
# Directory containing the raw CSV files, and the file name of each dataset.
# Keeping them together here lets the load cells look paths up by dataset key
# instead of repeating string literals.
BASE_PATH = "/Volumes/student_risk_data/default/data"

files = dict(
    demographics="student_demographics.csv",
    academics="student_academic_performance.csv",
    attendance="student_attendance.csv",
    retention="student_retention_history.csv",
)


In [0]:
def load_csv(file_path):
    """Read a CSV file into a pandas DataFrame, reporting the outcome via print.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading failed. Failures are
        printed rather than raised, so callers must check for ``None``.
    """
    loaded = None  # stays None unless the read and the success message both succeed
    try:
        frame = pd.read_csv(file_path)
        print(f"Loaded {os.path.basename(file_path)} successfully.")
        loaded = frame
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except pd.errors.EmptyDataError:
        print(f"File is empty: {file_path}")
    except Exception as err:
        # Catch-all so that any other read problem is reported, not raised.
        print(f"Error loading {file_path}: {err}")
    return loaded


In [0]:
# Read the student demographics CSV into a pandas DataFrame
# (load_csv prints the outcome and yields None if the read fails).
demographics_path = os.path.join(BASE_PATH, files["demographics"])
df_demographics = load_csv(demographics_path)

Loaded student_demographics.csv successfully.


In [0]:
# Read the student attendance CSV into a pandas DataFrame
# (load_csv prints the outcome and yields None if the read fails).
attendance_path = os.path.join(BASE_PATH, files["attendance"])
df_attendance = load_csv(attendance_path)

Loaded student_attendance.csv successfully.


In [0]:
# Read the student academic performance CSV into a pandas DataFrame
# (load_csv prints the outcome and yields None if the read fails).
academics_path = os.path.join(BASE_PATH, files["academics"])
df_academics = load_csv(academics_path)

Loaded student_academic_performance.csv successfully.


In [0]:
# Read the student retention history CSV into a pandas DataFrame
# (load_csv prints the outcome and yields None if the read fails).
retention_path = os.path.join(BASE_PATH, files["retention"])
df_retention = load_csv(retention_path)

Loaded student_retention_history.csv successfully.


In [0]:
# Save the raw data into bronze Delta tables.
#
# load_csv reports read failures by printing and returning None, so check every
# dataset first. This stops with a clear message before any Spark conversion
# or table overwrite is attempted.
_not_loaded = [
    dataset
    for dataset, frame in (
        ("academics", df_academics),
        ("attendance", df_attendance),
        ("demographics", df_demographics),
        ("retention", df_retention),
    )
    if frame is None
]
if _not_loaded:
    raise ValueError(
        "Cannot write bronze tables; these datasets failed to load: "
        + ", ".join(_not_loaded)
    )

# Convert the pandas DataFrames to Spark DataFrames. The spark_* names stay at
# module level so that other cells can keep using them.
spark_academics = spark.createDataFrame(df_academics)
spark_attendance = spark.createDataFrame(df_attendance)
spark_demographics = spark.createDataFrame(df_demographics)
spark_retention = spark.createDataFrame(df_retention)

# Write each dataset to its bronze table, in the original order
# (demographics, attendance, academics, retention).
for _table_name, _spark_df in (
    ("student_risk_data.default.bronze_demographics", spark_demographics),
    ("student_risk_data.default.bronze_attendance", spark_attendance),
    ("student_risk_data.default.bronze_academics", spark_academics),
    ("student_risk_data.default.bronze_retention", spark_retention),
):
    (
        _spark_df.write
        .format("delta")
        .mode("overwrite")  # each run replaces the existing table contents
        .saveAsTable(_table_name)
    )
    

In [0]:
%sql
-- Count the rows stored in the bronze academics table.
SELECT COUNT(*) FROM student_risk_data.default.bronze_academics;

COUNT(*)
5025


In [0]:
%sql
-- Count the rows stored in the bronze attendance table.
SELECT COUNT(*) FROM student_risk_data.default.bronze_attendance;

COUNT(*)
5025


In [0]:
%sql
-- Count the rows stored in the bronze demographics table.
SELECT COUNT(*) FROM student_risk_data.default.bronze_demographics;

COUNT(*)
1000


In [0]:
%sql
-- Count the rows stored in the bronze retention table.
SELECT COUNT(*) FROM student_risk_data.default.bronze_retention;

COUNT(*)
2000
