In [8]:
import sys 
from pathlib import Path
# Make ../src (resolved against the current working directory) importable so
# that modules living there can be imported by the cells below.
SRC_DIR = str(Path("..").resolve() / "src")
# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from clean_prep import load_and_clean

# Destination directory for exported files: ../data, resolved against the
# current working directory. Missing parents are created; an already existing
# directory is accepted as-is.
OUT_DIR = Path("..").resolve().joinpath("data")
OUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# 1) Load the cleaned data via the project helper (called with "train")
df = load_and_clean("train")

# Quick sanity summary: row count and the span of the timestamp column
timestamps = df["timestamp"]
print("Total rows:", len(df))
print("Timestamp min/max:", timestamps.min(), timestamps.max())

Total rows: 16468027
Timestamp min/max: 0 2671199


In [None]:
# 2) Carve off the first 200k rows as a small sample (quick testing, and
#    avoids large computes given limited Snowflake credits)
SAMPLE_N = 200_000
df_sample = df.iloc[:SAMPLE_N].copy()

# Everything after the sample is kept for the incremental batches
df_rest = df.iloc[SAMPLE_N:].copy()

print("Sample rows:", len(df_sample))
print("Rest rows:", len(df_rest))

Sample rows: 200000
Rest rows: 16268027


In [10]:
# Split the remaining rows into three batches for incremental loads.
# Two timestamp cutoffs (the 1/3 and 2/3 quantiles, truncated to int) define
# the ranges, so the batches are chronological:
#   batch_1: timestamp <= q1
#   batch_2: q1 < timestamp <= q2
#   batch_3: timestamp > q2
rest_ts = df_rest["timestamp"]
q1, q2 = rest_ts.quantile([1/3, 2/3]).astype(int).tolist()

batch_1 = df_rest[rest_ts <= q1]
batch_2 = df_rest[(rest_ts > q1) & (rest_ts <= q2)]
batch_3 = df_rest[rest_ts > q2]

print("\nCutoffs:", q1, q2)
print("Batch sizes:", len(batch_1), len(batch_2), len(batch_3))
print("Batch timestamp ranges:")
for label, batch in (("  b1:", batch_1), ("  b2:", batch_2), ("  b3:", batch_3)):
    print(label, batch["timestamp"].min(), batch["timestamp"].max())

# Every remaining row must land in exactly one batch. A row whose timestamp
# fails all three comparisons (e.g. NaN) would otherwise be silently left out
# of the exported batches, so fail loudly here instead.
batch_total = len(batch_1) + len(batch_2) + len(batch_3)
assert batch_total == len(df_rest), (
    f"batches cover {batch_total} rows, expected {len(df_rest)}"
)



Cutoffs: 889986 1770562
Batch sizes: 5422682 5422672 5422673
Batch timestamp ranges:
  b1: 42386 889986
  b2: 889987 1770562
  b3: 1770563 2671199


In [11]:

# 4) Export to Parquet (fastest for Snowflake COPY)
# One mapping of output file name -> frame drives both the write and the
# summary listing, so the two cannot drift apart.
exports = {
    "criteo_sample_200k.parquet": df_sample,
    "criteo_batch_1.parquet": batch_1,
    "criteo_batch_2.parquet": batch_2,
    "criteo_batch_3.parquet": batch_3,
}

# Write every frame into OUT_DIR without the index column
for filename, frame in exports.items():
    frame.to_parquet(OUT_DIR / filename, index=False)

# List the written files in the same order they were exported
print("\n✅ Export complete:")
for filename in exports:
    print(" -", OUT_DIR / filename)


✅ Export complete:
 - C:\Users\User\Documents\Projects\marketing_analytics_engineering_pipeline\data\criteo_sample_200k.parquet
 - C:\Users\User\Documents\Projects\marketing_analytics_engineering_pipeline\data\criteo_batch_1.parquet
 - C:\Users\User\Documents\Projects\marketing_analytics_engineering_pipeline\data\criteo_batch_2.parquet
 - C:\Users\User\Documents\Projects\marketing_analytics_engineering_pipeline\data\criteo_batch_3.parquet
