In [0]:
import pandas as pd
import numpy as np
import os
from pyspark.sql import functions as F
from pyspark.sql.types import *
# importing required libraries

## Silver Layer - Cleaning & Feature Engineering

In [0]:
# Read Bronze Delta tables
df_bronze_demographics = spark.table(
    "student_risk_data.default.bronze_demographics"
)

df_bronze_attendance = spark.table(
    "student_risk_data.default.bronze_attendance"
)

df_bronze_academics = spark.table(
    "student_risk_data.default.bronze_academics"
)

df_bronze_retention = spark.table(
    "student_risk_data.default.bronze_retention"
)



In [0]:
# Add ingestion & lineage metadata
def add_metadata(df, source_table):
    return (
        df
        .withColumn("ingestion_timestamp", F.current_timestamp())
        .withColumn("source_table", F.lit(source_table))
    )

df_bronze_demographics = add_metadata(
    df_bronze_demographics, "bronze_demographics"
)

df_bronze_attendance = add_metadata(
    df_bronze_attendance, "bronze_attendance"
)

df_bronze_academics = add_metadata(
    df_bronze_academics, "bronze_academics"
)

df_bronze_retention = add_metadata(
    df_bronze_retention, "bronze_retention"
)


In [0]:
# enforce a strict schema
demo_typed = (
    df_bronze_demographics
    .select(
        F.col("student_id").cast("string"),
        F.col("student_name").cast("string"),
        F.col("gender").cast("string"),
       F.to_date(F.col("date_of_birth"), "dd-MM-yyyy").alias("date_of_birth"),
        F.col("academic_year").cast("string"),
        F.col("disability_flag").cast("int"),
        F.col("annual_family_income").cast("int"),
        F.col("parental_education").cast("string"),
        F.col("first_generation_student").cast("int"),
        F.col("urban_rural").cast("string"),
        F.col("admission_type").cast("string"),
        F.col("institution_code").cast("string"),
        "ingestion_timestamp",
        "source_table"
    )
)
att_typed = (
    df_bronze_attendance
    .select(
        F.col("student_id").cast("string"),
        F.col("subject_code").cast("string"),
        F.col("attendance_percentage").cast("int"),
        F.col("participation_score").cast("int"),
        F.col("academic_year").cast("string"),
        "ingestion_timestamp",
        "source_table"
    )
)
acad_typed = (
    df_bronze_academics
    .select(
        F.col("student_id").cast("string"),
        F.col("subject_code").cast("string"),
        F.col("subject_name").cast("string"),
        F.col("internal_marks").cast("int"),
        F.col("external_mark").cast("int"),
        F.col("academic_year").cast("string"),
        "ingestion_timestamp",
        "source_table"
    )
)
ret_typed = (
    df_bronze_retention
    .select(
        F.col("student_id").cast("string"),
        F.col("academic_year").cast("string"),
        F.col("dropout_flag").cast("int"),
        F.to_date(F.col("dropout_date"), "dd-MM-yyyy").alias("dropout_date"),
        F.col("dropout_stage").cast("string"),
        F.col("dropout_reason").cast("string"),
        F.col("overall_attendance_percentage").cast("int"),
        F.col("family_income_band").cast("string"),
        F.col("academic_score").cast("int"),
        F.col("institution_code").cast("string"),
        F.col("age_at_enrollment").cast("int"),
        F.col("gender").cast("string"),
        "ingestion_timestamp",
        "source_table"
    )
)


In [0]:
demo_validated = (
    demo_typed
    .withColumn(
        "valid_student_id",
        F.col("student_id").rlike("^S[0-9]{3}-[0-9]{2}$")
    )
)
att_validated = (
    att_typed
    .withColumn(
        "valid_attendance",
        F.when(
            F.col("attendance_percentage").isNull(),
            F.lit(False)
        ).otherwise(
            F.col("attendance_percentage").between(0, 100)
        )
    )
    .withColumn(
        "valid_participation",
        F.when(
            F.col("participation_score").isNull(),
            F.lit(False)
        ).otherwise(
            F.col("participation_score").between(0, 10)
        )
    )
)

acad_validated = (
    acad_typed
    .withColumn(
        "valid_subject",
        F.col("subject_code").isin(
            "SUB01", "SUB02", "SUB03", "SUB04", "SUB05"
        )
    )
    .withColumn(
        "valid_score",
        (F.coalesce(F.col("external_mark").between(0, 60), F.lit(False)) &
        F.coalesce(F.col("internal_marks").between(0, 40), F.lit(False)))
    )
)
ret_validated = (
    ret_typed
    .withColumn(
        "valid_flag",
        F.col("dropout_flag").isin(0, 1)
    )
    .withColumn(
        "valid_reason",
        (F.col("dropout_flag") == 0) |
        ((F.col("dropout_flag") == 1) & F.col("dropout_reason").isNotNull())
    )
    .withColumn(
        "valid_attendance",
        F.coalesce(F.col("overall_attendance_percentage").between(0, 100),
        F.lit(False))
    )
)



In [0]:
valid_student_ids = demo_validated.select("student_id").distinct()

# Cross-table validation
acad_validated = (
    acad_validated
    .join(
        valid_student_ids.withColumnRenamed("student_id", "sid"),
        acad_validated.student_id == F.col("sid"),
        "left"
    )
    .withColumn("valid_student", F.col("sid").isNotNull())
    .drop("sid")
)
acad_final = (
    acad_validated
    .withColumn(
        "is_valid_record",
        F.col("valid_student") &
        F.col("valid_score") &
        F.col("valid_subject")
    )
)
# Quarantine Table for unusual records 
acad_quarantine = (
    acad_final
    .filter(~F.col("is_valid_record"))
    .withColumn("quarantine_timestamp", F.current_timestamp())
)

acad_quarantine.write.mode("append").saveAsTable(
    "student_risk_data.default.bronze_academics_quarantine"
)

acad_valid = acad_final.filter("is_valid_record = true")



In [0]:
print("Final:", acad_final.count())
print("Valid:", acad_valid.count())
print("Quarantine DF:", acad_quarantine.count())


Final: 5025
Valid: 4533
Quarantine DF: 492


In [0]:
# Quarantine Attendance table for unusual records
att_validated = (
    att_validated
    .join(
        valid_student_ids.withColumnRenamed("student_id", "sid"),
        att_validated.student_id == F.col("sid"),
        "left"
    )
    .withColumn("valid_student", F.col("sid").isNotNull())
    .drop("sid")
)
att_final = (
    att_validated
    .withColumn(
        "is_valid_record",
        F.col("valid_attendance") &
        F.col("valid_participation") &
        F.col("valid_student")
    )
)
att_quarantine = (
    att_final
    .filter(~F.col("is_valid_record"))
    .withColumn("quarantine_timestamp", F.current_timestamp())
)

att_quarantine.write.mode("append").saveAsTable(
    "student_risk_data.default.bronze_attendance_quarantine"
)
att_valid = att_final.filter("is_valid_record = true")



In [0]:
print("Final:", att_final.count())
print("Valid:", att_valid.count())
print("Quarantine DF:", att_quarantine.count())


Final: 5025
Valid: 4550
Quarantine DF: 475


In [0]:
# Convert to Pandas for cleaning & FE logic
df_demographics = demo_validated.toPandas()
df_attendance   = att_valid.toPandas()
df_academics    = acad_valid.toPandas()
df_retention    = ret_validated.toPandas()

In [0]:
df_demographics

Unnamed: 0,student_id,student_name,gender,date_of_birth,academic_year,disability_flag,annual_family_income,parental_education,first_generation_student,urban_rural,admission_type,institution_code,ingestion_timestamp,source_table,valid_student_id
0,S001-24,Aarav Sharma,MALE,2003-11-21,2023-2024,0.0,259805.0,none,0,URBAN,RESERVATION,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True
1,S002-25,Aarav Verma,female,2001-05-14,2024-2025,0.0,1899189.0,graduate,1,URBAN,merit,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True
2,S003-24,Aarav Patel,FEMALE,2002-08-12,2023-2024,1.0,3903935.0,primary,0,Rural,merit,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True
3,S004-24,Aarav Iyer,Female,2002-08-12,2023-2024,0.0,3155303.0,PRIMARY,1,Rural,RESERVATION,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True
4,S005-24,Aarav Reddy,female,2002-08-12,2023-2024,1.0,1971394.0,primary,1,URBAN,merit,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,S996-24,Naveen Singh,Male,2003-11-21,2023-2024,1.0,2589446.0,Primary,1,Rural,merit,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True
996,S997-24,Naveen Gupta,male,2002-08-12,2023-2024,0.0,4145970.0,SECONDARY,1,urban,merit,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True
997,S998-25,Naveen Mehta,female,2002-08-12,2024-2025,0.0,497906.0,none,1,Rural,management,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True
998,S999-24,Naveen Nair,female,2002-08-12,2023-2024,0.0,1109720.0,NONE,1,Urban,MANAGEMENT,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True


## Cleaning Student demographics data

In [0]:
# Standardization
df_demographics["academic_year"] = df_demographics["academic_year"].astype(str).str.strip()
df_demographics["institution_code"] = df_demographics["institution_code"].str.upper().str.strip() # capitalize
df_demographics["date_of_birth"] = pd.to_datetime(
    df_demographics["date_of_birth"],
    errors="coerce"
)


In [0]:
# handling missing values
df_demographics["annual_family_income"] = (
    df_demographics["annual_family_income"]
    .fillna(df_demographics["annual_family_income"].median()) # impute income with median
)

df_demographics["disability_flag"] = (
    df_demographics["disability_flag"].fillna(0) # assume 0 if missing
)

df_demographics["parental_education"] = df_demographics["parental_education"].fillna("NONE") 
# fill missing values with a placeholder

df_demographics["first_generation_student"] = df_demographics["first_generation_student"].fillna(1)

df_demographics["urban_rural"] = df_demographics["urban_rural"].fillna("URBAN")

df_demographics["admission_type"] = df_demographics["admission_type"].fillna("MERIT")

In [0]:
# Normalization of categorical values
gender_map = {
    "M": "MALE", "F": "FEMALE",
    "male": "MALE", "female": "FEMALE"
}

df_demographics["gender"] = (
    df_demographics["gender"]
    .map(gender_map)
    .fillna("OTHER") # impute missing valued with label OTHER
)

df_demographics["parental_education"] = df_demographics["parental_education"].str.upper().str.strip()
df_demographics["urban_rural"] = df_demographics["urban_rural"].str.upper().str.strip()
df_demographics["admission_type"] = df_demographics["admission_type"].str.upper().str.strip()

In [0]:
# feature engineering
# Age is calculated as of academic year start (June 1)
df_demographics["age"] = (
    pd.to_datetime(
        df_demographics["academic_year"].str[:4] + "-06-01"
    ) - df_demographics["date_of_birth"]
).dt.days // 365


In [0]:
# classify income into bands
''' 
< 2,00,000 = 1 (low income)
betweeen 2,00,000 and 10,00,000 = 2 (medium income)
> 10,00,000 = 3 (high income)
'''
def income_band(income):
    if pd.isna(income):
        return np.nan
    elif income < 200000:
        return 1 
    elif income <= 1000000:
        return 2
    else:
        return 3

df_demographics["income_band"] = df_demographics["annual_family_income"].apply(income_band)

In [0]:
df_demographics["age_risk_flag"] = (
    (df_demographics["age"] < 18) | (df_demographics["age"] > 22)
).astype(int)
# Overage and underage students correlate with dropout risk

In [0]:
df_demographics["socio_economic_risk_flag"] = (
    (df_demographics["income_band"] == 1) &
    (df_demographics["first_generation_student"] == 1)
).astype(int)
# Low income + first generation is a risk 

In [0]:
df_demographics

Unnamed: 0,student_id,student_name,gender,date_of_birth,academic_year,disability_flag,annual_family_income,parental_education,first_generation_student,urban_rural,admission_type,institution_code,ingestion_timestamp,source_table,valid_student_id,age,income_band,age_risk_flag,socio_economic_risk_flag
0,S001-24,Aarav Sharma,OTHER,2003-11-21,2023-2024,0.0,259805.0,NONE,0,URBAN,RESERVATION,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True,19,2,0,0
1,S002-25,Aarav Verma,FEMALE,2001-05-14,2024-2025,0.0,1899189.0,GRADUATE,1,URBAN,MERIT,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True,23,3,1,0
2,S003-24,Aarav Patel,OTHER,2002-08-12,2023-2024,1.0,3903935.0,PRIMARY,0,RURAL,MERIT,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True,20,3,0,0
3,S004-24,Aarav Iyer,OTHER,2002-08-12,2023-2024,0.0,3155303.0,PRIMARY,1,RURAL,RESERVATION,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True,20,3,0,0
4,S005-24,Aarav Reddy,FEMALE,2002-08-12,2023-2024,1.0,1971394.0,PRIMARY,1,URBAN,MERIT,INST02,2026-01-05 06:41:23.409796,bronze_demographics,True,20,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,S996-24,Naveen Singh,OTHER,2003-11-21,2023-2024,1.0,2589446.0,PRIMARY,1,RURAL,MERIT,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True,19,3,0,0
996,S997-24,Naveen Gupta,MALE,2002-08-12,2023-2024,0.0,4145970.0,SECONDARY,1,URBAN,MERIT,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True,20,3,0,0
997,S998-25,Naveen Mehta,FEMALE,2002-08-12,2024-2025,0.0,497906.0,NONE,1,RURAL,MANAGEMENT,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True,21,2,0,0
998,S999-24,Naveen Nair,FEMALE,2002-08-12,2023-2024,0.0,1109720.0,NONE,1,URBAN,MANAGEMENT,INST01,2026-01-05 06:41:23.409796,bronze_demographics,True,20,3,0,0


## Cleaning Attendance Records


In [0]:
# Standardization
df_attendance["academic_year"] = df_attendance["academic_year"].astype(str).str.strip()
df_attendance["subject_code"] = df_attendance["subject_code"].str.upper().str.strip()


In [0]:
# deduplication
df_attendance = (
    df_attendance
    .sort_values(
        by=["student_id", "academic_year", "subject_code"]
    )
    .drop_duplicates( 
        subset=["student_id", "academic_year", "subject_code"],
        keep="first"
    )
    .reset_index(drop=True)
)


In [0]:
df_attendance["attendance_percentage"] = (
    df_attendance
    .groupby("subject_code")["attendance_percentage"] 
    .transform(lambda x: x.fillna(x.median())) # fill missing values with subject wise median
)

df_attendance["participation_score"] = (
    df_attendance["participation_score"]
    .fillna(df_attendance["participation_score"].median()) # fill missing values with overall median
)


In [0]:
# feature engineering
df_attendance["low_attendance"] = (
    np.where(df_attendance["attendance_percentage"]<60, 1, 0) # 1 = low, 0 = high
)
df_attendance["low_participation"] = (
    np.where(df_attendance["participation_score"]<6, 1, 0) # 1 = low, 0 = high
)

## Cleaning Academic Performance Data

In [0]:
# standardization
df_academics["academic_year"] = df_academics["academic_year"].astype(str).str.strip()
df_academics["subject_code"] = df_academics["subject_code"].str.upper().str.strip() # converting to upper case
df_academics["subject_name"] = df_academics["subject_name"].str.upper().str.strip() 


In [0]:
df_academics = (
    df_academics
    .sort_values(
        by=["student_id", "academic_year", "subject_code"]
    )
    .drop_duplicates( # dropping duplicate records and marks
        subset=["student_id", "academic_year", "subject_code"],
        keep="first"
    )
    .reset_index(drop=True)
)


In [0]:
# Handle missing marks
df_academics["internal_marks"] = (
    df_academics
    .groupby("subject_code")["internal_marks"]
    .transform(lambda x: x.fillna(x.mean())) # imputing with subject wise mean
)

df_academics["external_mark"] = (
    df_academics
    .groupby("subject_code")["external_mark"]
    .transform(lambda x: x.fillna(x.mean()))
)

# feature engineering
df_academics["total_marks"] = (
    df_academics["internal_marks"] +
    df_academics["external_mark"]
)

conditions = [
    df_academics["total_marks"] >= 85,
    df_academics["total_marks"].between(70, 84),
    df_academics["total_marks"].between(55, 69),
    df_academics["total_marks"].between(40, 54),
    df_academics["total_marks"] < 40
]
grades = ["A", "B", "C", "D", "F"]  

# assigning a grade
df_academics["grade"] = np.select(conditions, grades, default='')

# assigning a pass/fail status
df_academics["pass_status"] = np.where(
    df_academics["grade"] == "F",
    "FAIL",
    "PASS"
)


## Dropout & Retention

In [0]:
# Standardization
df_retention["academic_year"] = df_retention["academic_year"].astype(str).str.strip()
df_retention["institution_code"] = df_retention["institution_code"].str.upper().str.strip()
df_retention["dropout_date"] = pd.to_datetime(
    df_retention["dropout_date"],
    errors="coerce"
)

In [0]:
# Handle missing values
df_retention["overall_attendance_percentage"] = (
    df_retention["overall_attendance_percentage"]
    .fillna(df_retention["overall_attendance_percentage"].median())
)

df_retention["family_income_band"] = df_retention["family_income_band"].fillna("Unknown")

df_retention["academic_score"] = (
    df_retention["academic_score"]
    .fillna(df_retention["academic_score"].median())
)

df_retention["age_at_enrollment"] = (
    df_retention["age_at_enrollment"]
    .fillna(df_retention["age_at_enrollment"].median())
)

df_retention["dropout_reason"] = (
    df_retention["dropout_reason"].fillna("Unknown")
)

df_retention["dropout_stage"] = (
    df_retention["dropout_stage"].fillna("Unknown")
)

In [0]:
gender_map = {
    "M": "MALE", "F": "FEMALE",
    "male": "MALE", "female": "FEMALE"
}

df_retention["gender"] = (
    df_retention["gender"]
    .map(gender_map)
    .fillna("OTHER") # impute missing valued with label OTHER
)

df_retention["family_income_band"] = df_retention["family_income_band"].str.upper().str.strip()

In [0]:
df_retention["low_attendance_flag"] = (
    df_retention["overall_attendance_percentage"] < 60
).astype(int)

df_retention["low_academic_flag"] = (
    df_retention["academic_score"] < 50
).astype(int)

In [0]:
df_retention = df_retention.drop(columns=["dropout_date"])


In [0]:
df_retention


Unnamed: 0,student_id,academic_year,dropout_flag,dropout_stage,dropout_reason,overall_attendance_percentage,family_income_band,academic_score,institution_code,age_at_enrollment,gender,ingestion_timestamp,source_table,valid_flag,valid_reason,valid_attendance,low_attendance_flag,low_academic_flag
0,H1703-23,2022-2023,1,Mid,Health,52.0,HIGH,42.0,INST02,23.0,FEMALE,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
1,H1181-23,2022-2023,1,Mid,Academic,51.0,LOW,42.0,INST02,21.0,OTHER,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
2,H1586-22,2021-2022,1,Early,Academic,51.0,MEDIUM,42.0,INST01,22.0,MALE,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
3,H1927-23,2022-2023,1,Early,Health,51.0,LOW,42.0,INST02,25.0,MALE,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
4,H786-21,2020-2021,1,Early,Health,49.0,MEDIUM,42.0,INST02,18.0,OTHER,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,H996-22,2021-2022,0,Unknown,Unknown,86.0,MEDIUM,88.0,INST02,24.0,FEMALE,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,0,0
1996,H573-23,2022-2023,1,Early,Financial,54.0,HIGH,42.0,INST02,21.0,OTHER,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
1997,H1376-23,2022-2023,1,Early,Financial,54.0,MEDIUM,42.0,INST02,24.0,MALE,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1
1998,H509-22,2021-2022,1,Late,Financial,53.0,HIGH,42.0,INST01,23.0,OTHER,2026-01-05 06:41:25.069327,bronze_retention,True,True,True,1,1


## Data quality validation checks

In [0]:
# Checking for null values
print("df_demographics null count: ",df_demographics.isna().sum().sum())
print("df_attendance null count: ", df_attendance.isna().sum().sum())
print("df_academics null count: ", df_academics.isna().sum().sum())
print("df_retention null count: ", df_retention.isna().sum().sum())

df_demographics null count:  0
df_attendance null count:  0
df_academics null count:  0
df_retention null count:  0


In [0]:
# check for duplicates
duplicate_count = df_academics.duplicated(subset=["student_id", "academic_year", "subject_code"]).sum()
print("Academics Dataset Duplicate records:", duplicate_count)

duplicate_count = df_attendance.duplicated(subset=["student_id", "academic_year", "subject_code"]).sum()
print("Attendance Dataset Duplicate records:", duplicate_count)

duplicate_count = df_demographics.duplicated(subset=["student_id"]).sum()
print("demographics Dataset Duplicate records:", duplicate_count)

duplicate_count = df_retention.duplicated(subset=["student_id", "academic_year"]).sum()
print("retention Dataset Duplicate records:", duplicate_count)

Academics Dataset Duplicate records: 0
Attendance Dataset Duplicate records: 0
demographics Dataset Duplicate records: 0
retention Dataset Duplicate records: 0


## Saving cleaned tables for gold aggregation

In [0]:
# convert pandas df to spark for persisting tables
spark_academics = spark.createDataFrame(df_academics)
spark_attendance = spark.createDataFrame(df_attendance)
spark_demographics = spark.createDataFrame(df_demographics)
spark_retention = spark.createDataFrame(df_retention)

In [0]:
spark_academics.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("student_risk_data.default.silver_academics")

spark_attendance.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("student_risk_data.default.silver_attendance")

spark_demographics.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("student_risk_data.default.silver_demographics")

spark_retention.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("student_risk_data.default.silver_retention")


In [0]:
%sql
SELECT * FROM student_risk_data.default.silver_academics;

student_id,subject_code,subject_name,internal_marks,external_mark,academic_year,ingestion_timestamp,source_table,valid_subject,valid_score,valid_student,is_valid_record,total_marks,grade,pass_status
S597-25,SUB03,ENGLISH,23,51,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,74,B,PASS
S597-25,SUB04,SOCIAL STUDIES,23,31,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,54,D,PASS
S597-25,SUB05,COMPUTER SCIENCE,8,31,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,39,F,FAIL
S598-25,SUB01,MATHEMATICS,34,33,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,67,C,PASS
S598-25,SUB02,SCIENCE,7,37,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,44,D,PASS
S598-25,SUB03,ENGLISH,25,38,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,63,C,PASS
S598-25,SUB04,SOCIAL STUDIES,5,39,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,44,D,PASS
S598-25,SUB05,COMPUTER SCIENCE,14,45,2024-2025,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,59,C,PASS
S599-24,SUB01,MATHEMATICS,35,29,2023-2024,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,64,C,PASS
S599-24,SUB02,SCIENCE,30,39,2023-2024,2026-01-05T06:41:24.365Z,bronze_academics,True,True,True,True,69,C,PASS


In [0]:
%sql
SELECT * FROM student_risk_data.default.silver_retention;

student_id,academic_year,dropout_flag,dropout_stage,dropout_reason,overall_attendance_percentage,family_income_band,academic_score,institution_code,age_at_enrollment,gender,ingestion_timestamp,source_table,valid_flag,valid_reason,valid_attendance,low_attendance_flag,low_academic_flag
H1703-23,2022-2023,1,Mid,Health,52.0,HIGH,42.0,INST02,23.0,FEMALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H1181-23,2022-2023,1,Mid,Academic,51.0,LOW,42.0,INST02,21.0,OTHER,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H1586-22,2021-2022,1,Early,Academic,51.0,MEDIUM,42.0,INST01,22.0,MALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H1927-23,2022-2023,1,Early,Health,51.0,LOW,42.0,INST02,25.0,MALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H786-21,2020-2021,1,Early,Health,49.0,MEDIUM,42.0,INST02,18.0,OTHER,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H1006-22,2021-2022,0,Unknown,Unknown,72.0,LOW,90.0,INST02,25.0,OTHER,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,0,0
H073-23,2022-2023,1,Late,Personal,48.0,LOW,42.0,INST02,20.0,FEMALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H1008-21,2020-2021,0,Unknown,Unknown,52.0,LOW,52.0,INST02,18.0,FEMALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,0
H156-22,2021-2022,1,Mid,Health,47.0,MEDIUM,42.0,INST01,22.0,OTHER,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1
H495-22,2021-2022,1,Late,Personal,47.0,MEDIUM,42.0,INST02,18.0,MALE,2026-01-05T06:41:25.069Z,bronze_retention,True,True,True,1,1


In [0]:
%sql
SELECT * FROM student_risk_data.default.silver_attendance;


student_id,subject_code,attendance_percentage,participation_score,academic_year,ingestion_timestamp,source_table,valid_attendance,valid_participation,valid_student,is_valid_record,low_attendance,low_participation
S198-24,SUB03,46,7,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,0
S198-24,SUB05,76,5,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,0,1
S199-24,SUB01,38,3,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,1
S199-24,SUB02,56,0,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,1
S199-24,SUB03,34,2,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,1
S199-24,SUB04,63,2,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,0,1
S199-24,SUB05,52,3,2023-2024,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,1
S200-25,SUB01,82,7,2024-2025,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,0,0
S200-25,SUB02,49,5,2024-2025,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,1
S200-25,SUB03,48,7,2024-2025,2026-01-05T06:41:23.710Z,bronze_attendance,True,True,True,True,1,0


In [0]:
%sql
SELECT * FROM student_risk_data.default.silver_demographics;

student_id,student_name,gender,date_of_birth,academic_year,disability_flag,annual_family_income,parental_education,first_generation_student,urban_rural,admission_type,institution_code,ingestion_timestamp,source_table,valid_student_id,age,income_band,age_risk_flag,socio_economic_risk_flag
S001-24,Aarav Sharma,OTHER,2003-11-21T00:00:00.000Z,2023-2024,0.0,259805.0,NONE,0,URBAN,RESERVATION,INST01,2026-01-05T06:41:23.409Z,bronze_demographics,True,19,2,0,0
S002-25,Aarav Verma,FEMALE,2001-05-14T00:00:00.000Z,2024-2025,0.0,1899189.0,GRADUATE,1,URBAN,MERIT,INST02,2026-01-05T06:41:23.409Z,bronze_demographics,True,23,3,1,0
S003-24,Aarav Patel,OTHER,2002-08-12T00:00:00.000Z,2023-2024,1.0,3903935.0,PRIMARY,0,RURAL,MERIT,INST02,2026-01-05T06:41:23.409Z,bronze_demographics,True,20,3,0,0
S004-24,Aarav Iyer,OTHER,2002-08-12T00:00:00.000Z,2023-2024,0.0,3155303.0,PRIMARY,1,RURAL,RESERVATION,INST02,2026-01-05T06:41:23.409Z,bronze_demographics,True,20,3,0,0
S005-24,Aarav Reddy,FEMALE,2002-08-12T00:00:00.000Z,2023-2024,1.0,1971394.0,PRIMARY,1,URBAN,MERIT,INST02,2026-01-05T06:41:23.409Z,bronze_demographics,True,20,3,0,0
S006-25,Aarav Singh,MALE,2003-11-21T00:00:00.000Z,2024-2025,0.0,3086646.0,PRIMARY,1,URBAN,MANAGEMENT,INST01,2026-01-05T06:41:23.409Z,bronze_demographics,True,20,3,0,0
S007-24,Aarav Singh,OTHER,2001-05-14T00:00:00.000Z,2023-2024,0.0,4554433.0,GRADUATE,1,URBAN,RESERVATION,INST01,2026-01-05T06:41:23.409Z,bronze_demographics,True,22,3,0,0
S008-24,Aarav Mehta,MALE,2003-11-21T00:00:00.000Z,2023-2024,1.0,3186616.0,SECONDARY,0,URBAN,MANAGEMENT,INST01,2026-01-05T06:41:23.409Z,bronze_demographics,True,19,3,0,0
S009-24,Aarav Nair,OTHER,2001-05-14T00:00:00.000Z,2023-2024,1.0,4036957.0,PRIMARY,0,RURAL,MANAGEMENT,INST01,2026-01-05T06:41:23.409Z,bronze_demographics,True,22,3,0,0
S010-24,Aarav Khan,MALE,2001-05-14T00:00:00.000Z,2023-2024,0.0,1934897.0,NONE,0,URBAN,MERIT,INST02,2026-01-05T06:41:23.409Z,bronze_demographics,True,22,3,0,0


In [0]:
%sql
SELECT * FROM student_risk_data.default.bronze_attendance_quarantine LIMIT 10;

student_id,subject_code,attendance_percentage,participation_score,academic_year,ingestion_timestamp,source_table,valid_attendance,valid_participation,valid_student,is_valid_record,quarantine_timestamp
S004-24,SUB03,59.0,,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,True,False,True,False,2026-01-05T05:31:13.377Z
S005-24,SUB02,,6.0,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z
S013-24,SUB02,83.0,,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,True,False,True,False,2026-01-05T05:31:13.377Z
S014-24,SUB05,,9.0,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z
S019-24,SUB05,66.0,,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,True,False,True,False,2026-01-05T05:31:13.377Z
S020-24,SUB02,,3.0,2023-2024,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z
S022-25,SUB02,52.0,,2024-2025,2026-01-05T05:31:13.377Z,bronze_attendance,True,False,True,False,2026-01-05T05:31:13.377Z
S025-25,SUB03,,8.0,2024-2025,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z
S028-25,SUB03,,7.0,2024-2025,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z
S028-25,SUB05,,6.0,2024-2025,2026-01-05T05:31:13.377Z,bronze_attendance,False,True,True,False,2026-01-05T05:31:13.377Z


In [0]:
%sql
SELECT * FROM student_risk_data.default.bronze_academics_quarantine LIMIT 10;

student_id,subject_code,subject_name,internal_marks,external_mark,academic_year,ingestion_timestamp,source_table,valid_subject,valid_score,valid_student,is_valid_record,quarantine_timestamp
S719-25,SUB04,Social Studies,134,25.0,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S636-25,SUB05,Computer Science,133,34.0,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S046-25,SUB04,Social Studies,35,,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S237-24,SUB04,Social Studies,35,,2023-2024,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S439-24,SUB02,Science,35,,2023-2024,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S665-24,SUB02,Science,35,,2023-2024,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S720-25,SUB02,Science,35,,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S755-25,SUB02,Science,35,,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S274-25,SUB04,Social Studies,34,,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
S274-25,SUB05,Computer Science,30,,2024-2025,2026-01-05T06:41:14.699Z,bronze_academics,True,False,True,False,2026-01-05T06:41:14.699Z
