In [22]:
import warnings
import dask.dataframe as dd
from ydata_profiling import ProfileReport
from dask.diagnostics import ProgressBar  # Progress bar support
import time

# 1. Configuration and Initialization
warnings.filterwarnings("ignore", category=UserWarning, message="Glyph 9")

# 2. Define Data Structure
COLUMNS = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
NUM_FEATURES = [f'I{i}' for i in range(1, 14)]  # 13 numerical features
CAT_FEATURES = [f'C{i}' for i in range(1, 27)]  # 26 categorical features

print("⏳ Starting data loading...")
start_time = time.time()

# 3. Optimized Data Loading (with progress display)
with ProgressBar():
    ddf = dd.read_csv(
        "../../data/raw/train.txt",
        sep="\t",
        header=None,
        names=COLUMNS,
        dtype={**{col: 'float32' for col in NUM_FEATURES},
               **{col: 'category' for col in CAT_FEATURES},
               'label': 'float32'}
    )
    # Force computation to display progress
    row_count = len(ddf)

print(f"✅ Data loading completed! Total rows: {row_count:,} | Time taken: {time.time() - start_time:.1f} seconds\n")

# 4. Intelligent Sampling (with progress)
print("⏳ Randomly sampling data...")
start_time = time.time()

with ProgressBar():
    sample = ddf.head(1000)
    # sample = ddf.sample(frac=min(0.1, sample_size/len(ddf))).compute()   # 1% random sample

print(f"✅ Sampling completed! Sample size: {len(sample):,} rows | Time taken: {time.time() - start_time:.1f} seconds\n")

# 5. Generate Report (with stage prompts)
print("📊 Generating EDA report (this process may take several minutes)...")
start_time = time.time()

profile = ProfileReport(
    sample,
    # Explicitly disable discretization for numerical columns
    vars={"num": {"discretize": False, "bins": None}},
    title="Large-Scale Feature Analysis",
    progress_bar=True,
    variables={
        "descriptions": {
            "label": "Target variable (classification label)",
            **{f"I{i}": f"Numerical feature I{i}" for i in range(1, 14)},
            **{f"C{i}": f"Categorical feature C{i}" for i in range(1, 27)}
        }
    },
    # Optional: Simplify configuration if correlation calculations don't need discretization results
    correlations={
        "pearson": {"calculate": True},
        "cramers": {"calculate": True}
    },
    interactions={
        "targets": ["label"]
    }
)

# 6. Output Report
print("💾 Saving HTML report...")
profile.to_file("08_eda_dask_ydata_profiling_report.html")

print(f"""
🎉 EDA report generation completed! Key information:
- Numerical features: {len(NUM_FEATURES)}
- Categorical features: {len(CAT_FEATURES)}
- Sample size: {len(sample):,} records (original data: {row_count:,} records)
- Total time taken: {time.time() - start_time:.1f} seconds
Report saved to: 08_eda_dask_ydata_profiling_report.html
""")

⏳ Starting data loading...
[########################################] | 100% Completed | 163.92 s
✅ Data loading completed! Total rows: 45,840,617 | Time taken: 163.9 seconds

⏳ Randomly sampling data...
[########################################] | 100% Completed | 1.25 ss
✅ Sampling completed! Sample size: 1,000 rows | Time taken: 1.3 seconds

📊 Generating EDA report (this process may take several minutes)...
💾 Saving HTML report...


Summarize dataset:  38%|███▊      | 17/45 [00:00<00:00, 82.48it/s, Describe variable: C12] 
Summarize dataset:  69%|██████▉   | 31/45 [00:00<00:00, 67.74it/s, Describe variable: C26]
100%|██████████| 40/40 [00:00<00:00, 153.31it/s]<00:00, 59.55it/s, Describe variable: C26]
Summarize dataset: 100%|██████████| 64/64 [00:01<00:00, 33.87it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.87it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 243.90it/s]


🎉 EDA report generation completed! Key information:
- Numerical features: 13
- Categorical features: 26
- Sample size: 1,000 records (original data: 45,840,617 records)
- Total time taken: 6.7 seconds
Report saved to: 08_eda_dask_ydata_profiling_report.html




