In [None]:
# Cell 1 – metadata
# Direct Pandas Pipeline – chunked CSV → Parquet

In [2]:
# Cell 2 – imports & config
import pandas as pd
from pathlib import Path
CSV1 = Path("../data/raw/dataset1_final.csv")
CSV2 = Path("../data/raw/dataset2_final.csv")
CHUNK = 50_000
PARQUET_DIR = Path("../data/parquet")
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
pd.read_csv(CSV1, nrows=5)


Unnamed: 0,user_pseudo_id,event_name,category,city,region,country,source,medium,purchase_revenue,total_item_quantity,transaction_id,eventDate,eventTimestamp,gender,Age,page_type,income_group,page_path
0,1789251000.0,session_start,mobile,Poquoson,Virginia,United States,Facebook,PaidSocial,,,,2025-05-13,2025-05-13 10:21:57.850268,male,35-44,homepage,Top 10%,https://demo.com/
1,1789251000.0,page_view,mobile,Poquoson,Virginia,United States,Facebook,PaidSocial,,,,2025-05-13,2025-05-13 10:21:57.850268,female,above 64,homepage,below 50%,https://demo.com/
2,1788384000.0,session_start,mobile,Carthage,New York,United States,Facebook,PaidSocial,,,,2025-05-13,2025-05-13 12:38:06.968220,male,45-54,collections,11-20%,https://demo.com/collections/
3,1788384000.0,page_view,mobile,Carthage,New York,United States,Facebook,PaidSocial,,,,2025-05-13,2025-05-13 12:38:06.968220,male,45-54,collections,11-20%,https://demo.com/collections/
4,1198796000.0,page_view,mobile,Phoenix,Arizona,United States,(direct),(none),,,,2025-05-13,2025-05-13 14:20:32.933828,male,25-34,products,Top 10%,https://demo.com/products/ITEM377/


In [4]:
def csv_to_parquet(src: Path, dst_dir: Path, chunksize: int = 50_000):
    """Read a large CSV in chunks and write each chunk to Parquet."""
    import time, pandas as pd
    t0 = time.time()

    for i, chunk in enumerate(pd.read_csv(src, chunksize=chunksize, dtype_backend="pyarrow")):
        outfile = dst_dir / f"{src.stem}_part{i:03d}.parquet"
        chunk.to_parquet(outfile, engine="pyarrow", index=False)
        print(f"[{i:03d}] wrote {outfile.name}  rows={len(chunk):,}")

    print("✅ done in", round(time.time() - t0, 1), "s")


In [5]:
# convert both datasets
csv_to_parquet(CSV1, PARQUET_DIR)
csv_to_parquet(CSV2, PARQUET_DIR)


[000] wrote dataset1_final_part000.parquet  rows=50,000
[001] wrote dataset1_final_part001.parquet  rows=50,000
[002] wrote dataset1_final_part002.parquet  rows=50,000
[003] wrote dataset1_final_part003.parquet  rows=50,000
[004] wrote dataset1_final_part004.parquet  rows=50,000
[005] wrote dataset1_final_part005.parquet  rows=50,000
[006] wrote dataset1_final_part006.parquet  rows=50,000
[007] wrote dataset1_final_part007.parquet  rows=50,000
[008] wrote dataset1_final_part008.parquet  rows=50,000
[009] wrote dataset1_final_part009.parquet  rows=50,000
[010] wrote dataset1_final_part010.parquet  rows=50,000
[011] wrote dataset1_final_part011.parquet  rows=50,000
[012] wrote dataset1_final_part012.parquet  rows=50,000
[013] wrote dataset1_final_part013.parquet  rows=50,000
[014] wrote dataset1_final_part014.parquet  rows=50,000
[015] wrote dataset1_final_part015.parquet  rows=50,000
[016] wrote dataset1_final_part016.parquet  rows=50,000
[017] wrote dataset1_final_part017.parquet  rows

  for i, chunk in enumerate(pd.read_csv(src, chunksize=chunksize, dtype_backend="pyarrow")):


ArrowInvalid: ("Could not convert '452362343.1719424151' with type str: tried to convert to double", 'Conversion failed for column user_pseudo_id with type object')

chunksize:        50 000
parquet_engine:   pyarrow
user_pseudo_id:   forced string  (avoid ArrowInvalid on mixed types)


In [6]:
def csv_to_parquet(src: Path, dst_dir: Path, chunksize: int = 50_000):
    """Read a large CSV in chunks and write each chunk to Parquet."""
    import time, pandas as pd
    t0 = time.time()

    # ↓ Explicit dtype prevents mixed-type surprises
    explicit = {"user_pseudo_id": "string"}      # keep IDs as strings
    for i, chunk in enumerate(
        pd.read_csv(src,
                    chunksize=chunksize,
                    dtype=explicit,          # <- key change
                    low_memory=False)
    ):
        # (optional) drop the .000 / scientific notation if you prefer integers
        # chunk["user_pseudo_id"] = (
        #     pd.to_numeric(chunk["user_pseudo_id"], errors="coerce")
        #       .astype("Int64")  # nullable int
        # )

        outfile = dst_dir / f"{src.stem}_part{i:03d}.parquet"
        chunk.to_parquet(outfile, engine="pyarrow", index=False)
        print(f"[{i:03d}] wrote {outfile.name}  rows={len(chunk):,}")

    print("✅ done in", round(time.time() - t0, 1), "s")


In [7]:
csv_to_parquet(CSV1, PARQUET_DIR)
csv_to_parquet(CSV2, PARQUET_DIR)


[000] wrote dataset1_final_part000.parquet  rows=50,000
[001] wrote dataset1_final_part001.parquet  rows=50,000
[002] wrote dataset1_final_part002.parquet  rows=50,000
[003] wrote dataset1_final_part003.parquet  rows=50,000
[004] wrote dataset1_final_part004.parquet  rows=50,000
[005] wrote dataset1_final_part005.parquet  rows=50,000
[006] wrote dataset1_final_part006.parquet  rows=50,000
[007] wrote dataset1_final_part007.parquet  rows=50,000
[008] wrote dataset1_final_part008.parquet  rows=50,000
[009] wrote dataset1_final_part009.parquet  rows=50,000
[010] wrote dataset1_final_part010.parquet  rows=50,000
[011] wrote dataset1_final_part011.parquet  rows=50,000
[012] wrote dataset1_final_part012.parquet  rows=50,000
[013] wrote dataset1_final_part013.parquet  rows=50,000
[014] wrote dataset1_final_part014.parquet  rows=50,000
[015] wrote dataset1_final_part015.parquet  rows=50,000
[016] wrote dataset1_final_part016.parquet  rows=50,000
[017] wrote dataset1_final_part017.parquet  rows

In [8]:
len(list(PARQUET_DIR.glob("dataset1_final_part*.parquet")))


132

Smoke-test one chunk

In [9]:
import pyarrow.parquet as pq
tbl = pq.read_table(PARQUET_DIR / "dataset1_final_part000.parquet")
print(tbl.num_rows, "rows | schema →", tbl.schema)


50000 rows | schema → user_pseudo_id: string
event_name: string
category: string
city: string
region: string
country: string
source: string
medium: string
purchase_revenue: double
total_item_quantity: double
transaction_id: string
eventDate: string
eventTimestamp: string
gender: string
Age: string
page_type: string
income_group: string
page_path: string
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 2221


chunksize = 50 000  
dataset1_final → 132 files, 36 s  
dataset2_final → 1 file, 0.1 s  


In [None]:
#STEP 1,2
# ── 0. Imports & paths
import pandas as pd
from pathlib import Path
PARQUET_DIR = Path("../data/parquet")


In [12]:
# ── 1. Load all event chunks
df_events = pd.read_parquet(
    list(PARQUET_DIR.glob("dataset1_final_part*.parquet")),
    engine="pyarrow"
)
print("events:", len(df_events))

events: 6593721


In [13]:
# ── 2. Load transactions (single part)
df_txn = pd.read_parquet(
    list(PARQUET_DIR.glob("dataset2_final_part*.parquet"))[0],
    engine="pyarrow"
)
print("txn rows:", len(df_txn))

txn rows: 27500


In [14]:
# ── 3. Parse & sort timestamps
df_events["eventTimestamp"] = pd.to_datetime(df_events["eventTimestamp"], utc=True)
df_events.sort_values(["user_pseudo_id", "eventTimestamp"], inplace=True)

In [15]:
# ── 4. Build 30-minute sessions
df_events["prev_ts"] = (
    df_events.groupby("user_pseudo_id")["eventTimestamp"].shift()
)
gap = df_events["eventTimestamp"] - df_events["prev_ts"]
df_events["new_session"] = gap.gt(pd.Timedelta("30min")).fillna(True)
df_events["session_id"] = (
    df_events.groupby("user_pseudo_id")["new_session"].cumsum()
)

In [17]:
# 4 bis – harmonise dtypes before merging
df_events["transaction_id"] = df_events["transaction_id"].astype("string")
df_txn["Transaction_ID"]    = df_txn["Transaction_ID"].astype("string")


In [18]:
# ── 5. Merge revenue onto events
df = df_events.merge(
    df_txn,
    left_on="transaction_id",
    right_on="Transaction_ID",
    how="left"
)

In [19]:
print("✅ merged shape:", df.shape)

✅ merged shape: (6602845, 29)


In [20]:
# ── 6. Aggregate to session level
session_df = (
    df.groupby(["user_pseudo_id", "session_id"], sort=False)
      .agg(
          page_views=("event_name", "size"),
          session_start=("eventTimestamp", "min"),
          session_end=("eventTimestamp", "max"),
          revenue=("purchase_revenue", "sum"),
          items=("total_item_quantity", "sum")
      )
      .reset_index()
)

print("sessions:", len(session_df))
session_df.to_parquet(PARQUET_DIR / "sessions.parquet", index=False)
print("✅ sessions.parquet written")

sessions: 1004534
✅ sessions.parquet written


Session rule  : new session if time gap > 30 min  
Output rows   : 1004534
