In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Set seed for reproducibility. This seeds NumPy's legacy global RNG; the
# DataFrame.sample() call in step 2 passes no random_state of its own, so it
# relies on pandas falling back to that same global RNG.
np.random.seed(42)
n_rows = 100000

# 1. Generate realistic distributions
# NOTE: the dict entries are evaluated top to bottom, so the order of the keys
# is also the order in which random numbers are drawn. Reordering them changes
# the generated data.
data = {
    # randint's upper bound is exclusive: ages fall in 5..89.
    "Q1_Please_enter_your_age": np.random.randint(5, 90, n_rows),
    "Q2_What_is_your_gender": np.random.choice(["Male", "Female", "Other", "Prefer not to say"], n_rows),
    # Stored as float from the start. Step 2 writes NaN into this column, and
    # NaN cannot be held by an int64 column: assigning it there depends on
    # silent upcasting, which pandas >= 2.1 warns about and plans to turn into
    # an error. The resulting column (float64 with NaN) is the same either way.
    "Q3_How_satisfied_are_you_with_our_service": np.random.choice(
        [1, 2, 3, 4, 5], n_rows, p=[0.05, 0.1, 0.2, 0.4, 0.25]
    ).astype(float),
    # Gamma draw truncated to whole seconds: a right-skewed distribution.
    "completion_time_sec": np.random.gamma(shape=2, scale=30, size=n_rows).astype(int),
}

# 2. Add some missing values (to test our dropna logic)
df_large = pd.DataFrame(data)
# Blank out the satisfaction answer for a random 5% of the rows.
df_large.loc[df_large.sample(frac=0.05).index, "Q3_How_satisfied_are_you_with_our_service"] = np.nan

# 3. Save to CSV
input_path = Path("data/raw/survey_sample.csv")
input_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the file.
df_large.to_csv(input_path, index=False)

print(f"🚀 Success: Large-scale dummy data ({n_rows} rows) created at {input_path}")

🚀 Success: Large-scale dummy data (100000 rows) created at data/raw/survey_sample.csv


In [3]:
# Cell 1: Setup
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (presumably the local `preprocess` module imported below) are picked
# up without restarting the kernel.
# NOTE(review): re-running this cell in a live kernel prints "The autoreload
# extension is already loaded" (visible in this cell's output); the notice
# itself suggests `%reload_ext autoreload` as the alternative.
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pandas as pd
from preprocess import load_config, apply_cleaning, optimize_memory

# Cell 2: Configuration & Path Handling
# All paths come from the project config rather than being hard-coded here.
config = load_config()
input_file = Path(config['paths']['input_raw'])
output_path = Path(config['paths']['output_dir']) / config['paths']['output_name']

# Development sampling knobs: how many rows to keep, and the seed that makes
# the sample (and therefore the cleaning trace below) identical on every run.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

# Cell 3: Data Ingestion (Sampling for performance)
# Work on at most SAMPLE_SIZE records to prevent Jupyter from freezing during
# development. The whole CSV is still read here; only the downstream pipeline
# steps run on the reduced frame.
df_raw = pd.read_csv(input_file).rename(columns=config['column_settings']['mapping'])

# Safety sample: takes SAMPLE_SIZE rows, or every row if the file is smaller.
# random_state pins the selection so Restart & Run All reproduces the outputs.
df_raw = df_raw.sample(n=min(SAMPLE_SIZE, len(df_raw)), random_state=SAMPLE_SEED)

# Cell 4: Execute Pipeline
df_cleaned = apply_cleaning(df_raw, config)
df_final = optimize_memory(df_cleaned, config)

# Cell 5: Save to Parquet (Preserves schema and saves space)
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_parquet(output_path, engine='pyarrow')

print(f"Pipeline complete. Final memory usage: {df_final.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
# Last expression of the cell: rendered as a rich table preview.
df_final.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
          Data Cleaning Trace           
----------------------------------------
Initial records          :    500
Time filter applied      :    358 (Dropped: 142)
Null values removed      :    343 (Dropped: 15)
Age range filter applied :    256 (Dropped: 87)
----------------------------------------
Pipeline complete. Final memory usage: 0.00 MB


Unnamed: 0,age,gender,satisfaction,completion_time_sec
12545,35,Other,3,141
17478,27,Prefer not to say,4,154
24648,54,Other,4,80
39320,38,Male,5,44
5215,25,Female,4,32


In [None]:

# Persist the processed frame to a fixed location under data/processed/.
# `df_final` is the pipeline result produced by the cell above; the previous
# reference to `df` pointed at a name no cell defines, so this cell raised
# NameError on a fresh kernel.
output_file = Path("data/processed/cleaned_survey.parquet")
output_file.parent.mkdir(parents=True, exist_ok=True)
# Same parquet engine as the config-driven save, so both files are written alike.
df_final.to_parquet(output_file, engine='pyarrow')