# DAMO-630-29 Assignment 01

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport

## BUSINESS CHALLENGE #01

# TASK I - Exploratory Data Analysis

The EDA offers an initial overview of the dataset by inspecting its structure, detecting missing values or outliers, and applying descriptive statistics with visualizations. These insights provide the foundation for subsequent synthetic data generation and evaluation.

In [None]:
# 1.1. Load dataset
# A forward slash is used as the separator: it is accepted on Windows, macOS and
# Linux alike, and avoids a backslash being read as the start of an escape sequence.
df = pd.read_csv("Datasets/HealthInsurance.csv")  # adjust file name as needed

In [None]:
# 1.2. Shape — (number of rows, number of columns) of the loaded table
print(f"Shape: {df.shape}")

In [None]:
# 1.3. Preview — first five rows, rendered with the notebook's rich display
display(df.head(5))

In [None]:
# 1.4. Info — prints a concise summary of the DataFrame
# (column names, non-null counts, dtypes and memory usage).
df.info()

In [None]:
# 1.5. Descriptive statistics
# Called with default arguments, so for a mixed-dtype frame pandas summarizes
# only the numeric columns (count, mean, std, min, quartiles, max).
display(df.describe())

In [None]:
# 1.6. Missing values — number of null entries in each column
print(df.isna().sum())

# 1.7. Distribution plots (only the first three numeric columns are drawn)
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols[:3]:
    # One figure per column; nulls are dropped before binning.
    fig, ax = plt.subplots()
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f"Distribution: {col}")
    plt.show()

# Task II — Baseline Synthetic Data

In [None]:
# 2.1 Random noise baseline
# Standard-normal noise with the same shape and column names as the real table.
# Every column (including non-numeric ones in the real data) becomes a float here;
# this is a deliberately naive reference point, not a realistic synthesizer.
# A dedicated seeded generator makes the baseline identical on every
# Restart & Run All instead of depending on NumPy's global random state.
_baseline_rng = np.random.default_rng(42)
synthetic_baseline = pd.DataFrame(
    _baseline_rng.standard_normal(size=df.shape),
    columns=df.columns,
)
display(synthetic_baseline.head())

# Task III — Advanced Synthetic Data (SDV)

In [None]:
# 3.1 Infer table metadata from the real DataFrame.
# The result is registered under the table name "my_table" and is reused by both
# synthesizers below and by the sdmetrics conversion step.
# NOTE(review): detection is automatic — inspect the inferred column types before
# relying on them.
metadata = Metadata.detect_from_dataframe(data=df, table_name="my_table")

In [None]:
# 3.2 GaussianCopula
# Fit on the real table, then sample exactly as many rows as the real table has.
gc = GaussianCopulaSynthesizer(metadata)
gc.fit(df)
synthetic_gc = gc.sample(num_rows=df.shape[0])
display(synthetic_gc.head(5))

In [None]:
# 3.3 CTGAN
# Same fit-then-sample flow as the GaussianCopula cell; training settings are
# spelled out one per line so they are easy to tune.
ctgan = CTGANSynthesizer(
    metadata,
    epochs=200,
    batch_size=100,
    verbose=True,
)
ctgan.fit(df)
synthetic_ctgan = ctgan.sample(num_rows=df.shape[0])
display(synthetic_ctgan.head(5))

# Task IV — Evaluation

Convert metadata for sdmetrics (single table)

In [None]:
# Convert metadata for sdmetrics (single table)
# If the exported dict is keyed by table, take the first table's entry;
# otherwise the dict is used unchanged.
_meta_dict = metadata.to_dict()
if "tables" not in _meta_dict:
    single_table_meta = _meta_dict
else:
    _table_name, single_table_meta = next(iter(_meta_dict["tables"].items()))

In [None]:
# 4.1 Quality and Diagnostics
# Quality reports: compare each synthetic table against the real one.
qr_gc = QualityReport()
qr_gc.generate(df, synthetic_gc, single_table_meta)
qr_ct = QualityReport()
qr_ct.generate(df, synthetic_ctgan, single_table_meta)

print("Quality — GC:", qr_gc.get_score())
print("Quality — CTGAN:", qr_ct.get_score())

# Diagnostic reports: generated for both synthesizers and, unlike before,
# their overall scores are now printed so the results are actually visible.
dr_gc = DiagnosticReport()
dr_gc.generate(df, synthetic_gc, single_table_meta)
dr_ct = DiagnosticReport()
dr_ct.generate(df, synthetic_ctgan, single_table_meta)

print("Diagnostic — GC:", dr_gc.get_score())
print("Diagnostic — CTGAN:", dr_ct.get_score())

In [None]:
# 4.2 Correlation Preservation
def corr_rmse(a, b):
    """Root-mean-square difference between the correlation matrices of two frames.

    Only numeric columns present in both ``a`` and ``b`` are considered, and only
    the strictly-upper triangle of the correlation difference is used (each column
    pair once, diagonal excluded). Pairs whose difference is NaN are skipped.

    Returns ``np.nan`` when fewer than two shared numeric columns exist or when no
    finite pairwise difference remains.
    """
    numeric_a = a.select_dtypes(include=np.number).columns
    numeric_b = b.select_dtypes(include=np.number).columns
    shared = numeric_a.intersection(numeric_b)
    if len(shared) < 2:
        return np.nan

    delta = (a[shared].corr() - b[shared].corr()).to_numpy()
    # Strictly-upper triangle, row-major: one value per unordered column pair.
    pairwise = delta[np.triu_indices(len(shared), k=1)]
    pairwise = pairwise[~np.isnan(pairwise)]
    if len(pairwise) == 0:
        return np.nan
    return np.sqrt(np.mean(pairwise**2))

# Lower is better: 0 means the pairwise correlations match the real data exactly.
print(f"Correlation RMSE — GC: {corr_rmse(df, synthetic_gc)}")
print(f"Correlation RMSE — CTGAN: {corr_rmse(df, synthetic_ctgan)}")

In [None]:
# 4.3 Utility — TSTR (Train on Synthetic, Test on Real)
def tstr_classification(real_df, synth_df, target):
    """Train a classifier on synthetic data and score it on real data (TSTR).

    Parameters:
        real_df: real table, used only for evaluation.
        synth_df: synthetic table, used only for training.
        target: name of the label column present in both tables.

    Only numeric feature columns are used; missing values are filled with each
    table's own column medians.

    Returns:
        dict with "accuracy" and "f1_macro"; "roc_auc" is added when the fitted
        model has exactly two classes.
    """
    synth_features = synth_df.drop(columns=[target])
    synth_labels = synth_df[target]
    real_features = real_df.drop(columns=[target])
    real_labels = real_df[target]

    # Medians come from the un-filtered feature frame of the same table.
    synth_medians = synth_features.median(numeric_only=True)
    real_medians = real_features.median(numeric_only=True)
    X_train = synth_features.select_dtypes(include=np.number).fillna(synth_medians)
    X_test = real_features.select_dtypes(include=np.number).fillna(real_medians)

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, synth_labels)
    predicted = model.predict(X_test)

    scores = {}
    scores["accuracy"] = accuracy_score(real_labels, predicted)
    scores["f1_macro"] = f1_score(real_labels, predicted, average="macro")
    if len(model.classes_) == 2:
        # Probability of the second class in model.classes_ is used as the score.
        positive_proba = model.predict_proba(X_test)[:, 1]
        scores["roc_auc"] = roc_auc_score(real_labels, positive_proba)
    return scores

def tstr_regression(real_df, synth_df, target):
    """Train a regressor on synthetic data and score it on real data (TSTR).

    Parameters:
        real_df: real table, used only for evaluation.
        synth_df: synthetic table, used only for training.
        target: name of the numeric column to predict, present in both tables.

    Only numeric feature columns are used; missing values are filled with each
    table's own column medians.

    Returns:
        dict with "r2" and "mae" computed on the real table.
    """
    def _features_and_target(frame):
        # Drop the target, keep numeric columns, impute with that frame's medians.
        features = frame.drop(columns=[target])
        medians = features.median(numeric_only=True)
        numeric = features.select_dtypes(include=np.number).fillna(medians)
        return numeric, frame[target]

    X_train, y_train = _features_and_target(synth_df)
    X_test, y_test = _features_and_target(real_df)

    model = RandomForestRegressor(n_estimators=400, random_state=42)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    return {
        "r2": r2_score(y_test, y_hat),
        "mae": mean_absolute_error(y_test, y_hat),
    }

# Example (uncomment and set target column)
# print(tstr_classification(df, synthetic_gc, "your_target"))
# print(tstr_classification(df, synthetic_ctgan, "your_target"))

In [None]:
# 4.4 Privacy — exact duplicates
def exact_dup_rate(real_df, synth_df):
    """Fraction of synthetic rows that are exact copies of some real row.

    Rows are compared cell by cell after casting every value to ``str``, with
    columns matched by position. Each synthetic row counts individually, so a
    leaked real record that is emitted several times is counted several times.
    Rows are compared as tuples rather than as one joined string, so a separator
    character inside a value cannot make two different rows look identical.

    Parameters:
        real_df: the real table.
        synth_df: the synthetic table.

    Returns:
        float in [0, 1]; ``0.0`` when ``synth_df`` has no rows.
    """
    n_synth = len(synth_df)
    if n_synth == 0:
        return 0.0
    real_rows = set(real_df.astype(str).itertuples(index=False, name=None))
    synth_rows = synth_df.astype(str).itertuples(index=False, name=None)
    copied = sum(1 for row in synth_rows if row in real_rows)
    return copied / n_synth

# Share of synthetic rows flagged as exact copies of real rows, per synthesizer.
print(f"Duplication rate — GC: {exact_dup_rate(df, synthetic_gc)}")
print(f"Duplication rate — CTGAN: {exact_dup_rate(df, synthetic_ctgan)}")

## BUSINESS CHALLENGE #02

Preparation

In [None]:
import os, subprocess, sys

# 1) Point to your Temurin JDK 17 install (adjust if your folder name differs)
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-17"
# Put the JDK's bin directory first on PATH exactly once: any earlier occurrence is
# removed before prepending, so re-running this cell does not keep growing PATH.
_java_bin = os.path.join(os.environ["JAVA_HOME"], "bin")
_other_dirs = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p != _java_bin]
os.environ["PATH"] = os.pathsep.join([_java_bin] + _other_dirs)

# 2) Quick check: Java visible to this kernel?
try:
    out = subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT)
    # errors="replace": a non-UTF-8 byte in the version banner must not be
    # misreported as "Java not visible".
    print(out.decode("utf-8", errors="replace"))
except (OSError, subprocess.CalledProcessError) as e:
    # OSError covers a missing executable; CalledProcessError a non-zero exit code.
    print("Java not visible to the kernel:", e)

# 3) (Optional) Confirm PySpark version
try:
    import pyspark
    print("PySpark:", pyspark.__version__)
except Exception as e:
    # Deliberately broad: this is a best-effort diagnostic and must not stop the notebook.
    print("PySpark not importable:", e, "\nTip: pip install -U 'pyspark>=3.5,<4.0'")


Create SparkSession

In [None]:
# =========================
# BC2 — Setup (PySpark)
# =========================
# Create (or reuse) the SparkSession that the following cells access as `spark`.
# Imported here rather than in the top import cell so that Business Challenge #01
# still runs on machines without PySpark installed.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DAMO-630-29-BC2")
    .getOrCreate()
)

Load parquet file - dataset

In [None]:
# A forward slash is used as the separator: it works on every platform and avoids
# a backslash being read as the start of an escape sequence.
TAXI_PATH = "Datasets/yellow_tripdata_2025-03.parquet"
# NOTE: from here on `df` is a Spark DataFrame; it replaces the pandas `df`
# that Business Challenge #01 used.
df = spark.read.parquet(TAXI_PATH)

# count() scans the data; printSchema()/show() give a quick structural preview.
print("Row count:", df.count())
df.printSchema()
df.show(5, truncate=False)