## Data Cleaning and Merging

### This script:
 1) samples 100,000 impressions (chunked)
 2) collects unique userId/mlogId/creatorId
 3) filters user/card/creator tables to those keys
 4) merges
 5) saves CSV to "data/"

In [4]:

from pathlib import Path
import os
from pathlib import Path
import pandas as pd

DATA_DIR = Path("../csv_data")
OUT_DIR = Path("../data")
N = 100_000
CHUNKSIZE = 250_000
SEED = 42

# File paths
IMP_PATH = DATA_DIR / "impression_data.csv"
USR_PATH = DATA_DIR / "user_demographics.csv"
CRD_PATH = DATA_DIR / "mlog_demographics.csv"
CRT_PATH = DATA_DIR / "creator_demographics.csv"

def sample_impressions(path, n=N, chunksize=CHUNKSIZE, seed=SEED):
    cols = [
        "userId","mlogId","impressTime","dt","impressPosition",
        "isClick","isLike","isComment","isShare","isViewComment","mlogViewTime"
    ]
    taken = 0
    parts = []
    for i, chunk in enumerate(pd.read_csv(path, usecols=cols, chunksize=chunksize)):
        need = n - taken
        if need <= 0:
            break
        k = min(need, len(chunk))
        parts.append(chunk.sample(n=k, random_state=seed + i))
        taken += k
    if not parts:
        return pd.DataFrame(columns=cols)
    return pd.concat(parts, ignore_index=True)

def read_filtered(path, key_col, keep_keys, usecols, chunksize=CHUNKSIZE):
    keep_keys = set(str(x) for x in keep_keys)
    out_parts = []
    for chunk in pd.read_csv(path, usecols=usecols, chunksize=chunksize):
        chunk[key_col] = chunk[key_col].astype(str)
        out = chunk[chunk[key_col].isin(keep_keys)]
        if not out.empty:
            out_parts.append(out)
    if out_parts:
        return pd.concat(out_parts, ignore_index=True)
    return pd.DataFrame(columns=usecols)

def main():
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # 1) Base sample
    imp = sample_impressions(IMP_PATH)

    # 2) Unique keys
    user_keys = imp["userId"].astype(str).unique()
    mlog_keys = imp["mlogId"].astype(str).unique()

    # 3) Filter dimension tables
    users = read_filtered(
        USR_PATH, "userId", user_keys,
        ["userId","age","gender","province","level","registeredMonthCnt","followCnt"]
    )
    cards = read_filtered(
        CRD_PATH, "mlogId", mlog_keys,
        ["mlogId","type","contentId","talkId","publishTime","creatorId"]
    )
    creator_keys = cards["creatorId"].dropna().astype(str).unique()
    creators = read_filtered(
        CRT_PATH, "creatorId", creator_keys,
        ["creatorId","creatorType","level"]
    )

    # 4) Merge
    df = (imp
          .merge(users, on="userId", how="left")
          .merge(cards, on="mlogId", how="left")
          .merge(creators, on="creatorId", how="left"))

    # 5) Save CSV
    out_csv = OUT_DIR / "imp_sample_100k_merged.csv"
    df = df.rename(columns={"level_x":"user_level","level_y":"creator_level"})
    df.to_csv(out_csv, index=False)
    print(f"Saved: {out_csv}")

if __name__ == "__main__":
    main()

path = Path("/mnt/data/build_eda_sample_simple.py")



Saved: ../data/imp_sample_100k_merged.csv


#### Sanity Checks

In [9]:
df = pd.read_csv("../data/imp_sample_100k_merged.csv")
# 1) Should be ~100,000 rows
length=len(df)
print("Rows: ",length)

# 2) How many unique users/cards/creators in the sample?
print("Users: ",df["userId"].nunique())
print("Cards: ",df["mlogId"].nunique())
print("Creators: ",df["creatorId"].nunique())

# 3) Null rates on key columns
print("Null rates: ",df[["age","gender","province","type","contentId","talkId","creatorType"]].isna().mean().round(3))

# 4) Spot-check that clicks exist
print("Clicks: ",df["isClick"].sum(), df["isShare"].sum())

Rows:  100000
Users:  9096
Cards:  21264
Creators:  10781
Null rates:  age            0.373
gender         0.373
province       0.000
type           0.000
contentId      0.172
talkId         0.000
creatorType    0.000
dtype: float64
Clicks:  4805 23
