registeredMonthCnt = 0 oder 1
Nur User, die eine Action getätigt haben


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory the input CSV files are read from.
DATA_DIR = Path("../csv_data")
# Directory the result is written to, and the result file itself.
OUT_DIR = Path("../data")
OUT_CSV = OUT_DIR.joinpath("users_with_actions.csv")

# Default number of rows per chunk when streaming a CSV with pandas.
CHUNKSIZE = 250_000

# Input files, all located directly under DATA_DIR.
IMP_PATH = DATA_DIR.joinpath("impression_data.csv")
USR_PATH = DATA_DIR.joinpath("user_demographics.csv")
CRD_PATH = DATA_DIR.joinpath("mlog_demographics.csv")
CRT_PATH = DATA_DIR.joinpath("creator_demographics.csv")
MLOG_STATS_PATH = DATA_DIR.joinpath("mlog_stats.csv")
CREATOR_STATS_PATH = DATA_DIR.joinpath("creator_stats.csv")

# Flag columns of the impression data; a value of 1 in any of them marks
# the impression as one on which the user performed an action.
ACTION_COLS = [
    "isClick",
    "isLike",
    "isComment",
    "isShare",
    "isViewComment",
    "isIntoPersonalHomepage",
]

# methods
def users_with_registered_month_in(user_csv, months=(0, 1), chunksize=CHUNKSIZE):
    """Return the ids of users whose ``registeredMonthCnt`` is in *months*.

    Only the ``userId`` and ``registeredMonthCnt`` columns are read, and the
    file is streamed in chunks of *chunksize* rows.

    Args:
        user_csv: Path of the user CSV file.
        months: Iterable of accepted ``registeredMonthCnt`` values.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.

    Returns:
        A set of ``userId`` strings; rows with a missing ``userId`` are
        skipped. The set is empty when no row matches.
    """
    wanted_months = set(months)
    reader = pd.read_csv(
        user_csv,
        usecols=["userId", "registeredMonthCnt"],
        chunksize=chunksize,
        dtype={"userId": "string", "registeredMonthCnt": "Int64"},
    )

    matched_ids = set()
    for frame in reader:
        in_window = frame["registeredMonthCnt"].isin(wanted_months)
        ids = frame.loc[in_window, "userId"].dropna()
        # A set de-duplicates across chunks as the ids are added.
        matched_ids.update(ids.astype("string").tolist())
    return matched_ids

def read_filtered(path, key_col, keep_keys, usecols, chunksize=CHUNKSIZE):
    """Stream a CSV and keep only the rows whose key is in *keep_keys*.

    Args:
        path: CSV file to read.
        key_col: Name of the column to filter on; it is read as a string.
        keep_keys: Iterable of accepted key values. Missing values are
            ignored and the remaining ones are compared as strings.
        usecols: Columns to load from the file.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.

    Returns:
        A DataFrame with the matching rows and a fresh index, or an empty
        DataFrame with *usecols* as columns when nothing matches.
    """
    wanted = set(map(str, pd.Series(list(keep_keys)).dropna().unique()))

    reader = pd.read_csv(
        path, usecols=usecols, chunksize=chunksize, dtype={key_col: "string"}
    )
    matches = []
    for frame in reader:
        frame[key_col] = frame[key_col].astype("string")
        hits = frame.loc[frame[key_col].isin(wanted)]
        if len(hits) > 0:
            matches.append(hits)

    if not matches:
        return pd.DataFrame(columns=usecols)
    return pd.concat(matches, ignore_index=True)

def read_filtered_two_keys(path, key_cols, keep_pairs, usecols, chunksize=CHUNKSIZE, dtypes=None):
    """Stream a CSV and keep the rows whose two-column key is in *keep_pairs*.

    Both the requested pairs and the file rows are reduced to a composite
    string key of the form ``"<first>|<second>"``, where the first key is cast
    to string and the second is coerced to a nullable integer before being
    rendered as a string.

    Args:
        path: CSV file to read.
        key_cols: Pair ``(k1, k2)`` of column names forming the key.
        keep_pairs: Iterable of ``(k1_value, k2_value)`` pairs to keep. Pairs
            with a missing component, or whose second component cannot be
            coerced to a number, are ignored.
        usecols: Columns to load from the file; must contain both key columns.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.
        dtypes: Optional dtype mapping passed to ``pd.read_csv``.

    Returns:
        A DataFrame with the matching rows and a fresh index, or an empty
        DataFrame with *usecols* as columns when nothing matches.
    """
    k1, k2 = key_cols

    def _composite(first, second):
        # One normalisation shared by the wanted pairs and the file rows, so
        # both sides always render a key the same way.
        as_text = first.astype("string")
        as_int_text = pd.to_numeric(second, errors="coerce").astype("Int64").astype("string")
        return as_text + "|" + as_int_text

    pairs_df = pd.DataFrame(keep_pairs, columns=[k1, k2]).dropna()
    if pairs_df.empty:
        keep_set = set()
    else:
        # A second key that fails numeric coercion yields a missing composite.
        # Drop it: a missing value inside keep_set could otherwise make `isin`
        # select file rows whose own key is missing.
        keep_set = set(_composite(pairs_df[k1], pairs_df[k2]).dropna().tolist())

    if dtypes is None:
        dtypes = {}

    parts = []
    for chunk in pd.read_csv(path, usecols=usecols, chunksize=chunksize, dtype=dtypes):
        mask = _composite(chunk[k1], chunk[k2]).isin(keep_set)
        sel = chunk.loc[mask]
        if not sel.empty:
            parts.append(sel)

    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=usecols)

# Build impressions for selected users 

def collect_impressions_for_users(imp_path, selected_users, chunksize=CHUNKSIZE, action_only=True):
    """Collect the impression rows of the selected users from a large CSV.

    The file is streamed in chunks of *chunksize* rows. The combined result is
    sorted once, after concatenation, by ``userId`` and then ``impressTime``.
    Sorting each chunk on its own would leave the rows of a user that span a
    chunk boundary out of order.

    Args:
        imp_path: Path of the impression CSV file.
        selected_users: Iterable of user ids to keep; missing values are
            ignored and ids are compared as strings.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.
        action_only: When true, keep only rows in which at least one of the
            ``ACTION_COLS`` flags equals 1 (a missing flag counts as 0).

    Returns:
        A DataFrame with the matching rows ordered by ``userId`` and
        ``impressTime`` with a fresh index, or an empty DataFrame with the
        loaded column names when nothing matches.
    """
    usecols = [
        "userId", "mlogId", "impressTime", "dt", "impressPosition",
        "isClick", "isLike", "isComment", "isShare", "isViewComment",
        "isIntoPersonalHomepage", "mlogViewTime",
    ]
    dtypes = {
        "userId": "string",
        "mlogId": "string",
        "dt": "Int16",
        "impressPosition": "Int16",
        "isClick": "Int8",
        "isLike": "Int8",
        "isComment": "Int8",
        "isShare": "Int8",
        "isViewComment": "Int8",
        "isIntoPersonalHomepage": "Int8",
        "mlogViewTime": "float32",
    }
    selected_users = set(pd.Series(list(selected_users)).dropna().astype("string").unique())

    parts = []
    for chunk in pd.read_csv(imp_path, usecols=usecols, chunksize=chunksize, dtype=dtypes):
        # keep only selected users
        sel = chunk[chunk["userId"].isin(selected_users)]
        if action_only and not sel.empty:
            # keep rows where ANY action flag == 1
            action_mask = sel[ACTION_COLS].fillna(0).astype("int8").eq(1).any(axis=1)
            sel = sel.loc[action_mask]
        if not sel.empty:
            parts.append(sel)

    if not parts:
        return pd.DataFrame(columns=usecols)

    combined = pd.concat(parts, ignore_index=True)
    # One stable sort over the whole result: rows that tie on
    # (userId, impressTime) keep the order they had in the file.
    combined = combined.sort_values(["userId", "impressTime"], kind="mergesort")
    return combined.reset_index(drop=True)

# hygiene & dtypes after merges
def cast_post_merge(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate the merged frame and normalise its column dtypes.

    The frame is modified in place and also returned. Every step is applied
    only to the columns that are actually present:

    * rows duplicated on (``userId``, ``mlogId``, ``impressTime``) are dropped
      when all three columns exist;
    * action flags get missing values replaced by 0 and become ``uint8``;
    * ``impressPosition`` becomes ``Int16`` and ``mlogViewTime`` a ``float32``
      (values that cannot be parsed as numbers become NaN);
    * selected descriptive columns become ``category``.

    Args:
        df: The merged DataFrame to clean up.

    Returns:
        The same DataFrame object, after the in-place changes.
    """
    dedup_key = ["userId", "mlogId", "impressTime"]
    if set(dedup_key) <= set(df.columns):
        df.drop_duplicates(dedup_key, inplace=True)

    present_flags = [name for name in ACTION_COLS if name in df]
    for flag in present_flags:
        df[flag] = df[flag].fillna(0).astype("uint8")

    if "impressPosition" in df:
        df["impressPosition"] = df["impressPosition"].astype("Int16")
    if "mlogViewTime" in df:
        view_time = pd.to_numeric(df["mlogViewTime"], errors="coerce")
        df["mlogViewTime"] = view_time.astype("float32")

    categorical = ("gender", "province", "type", "contentId", "talkId", "creatorType")
    for name in categorical:
        if name in df:
            df[name] = df[name].astype("category")

    return df

# main
def main():
    """Build and save the action-impression table for recently registered users.

    Steps: select the users whose ``registeredMonthCnt`` is 0 or 1, collect
    their impressions that carry at least one action, join the user, card and
    creator attributes, derive the ``y_active`` label, join the per-``dt``
    mlog and creator statistics, normalise dtypes and write ``OUT_CSV``.
    When no impression matches, the empty frame is written and the function
    returns early.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    def _attach_daily_stats(frame, stats_path, id_col, stat_cols, suffix):
        # Left-join the statistics keyed by (id_col, dt); the frame is returned
        # unchanged when no statistics row matches any of its key pairs.
        pairs = frame[[id_col, "dt"]].dropna().drop_duplicates().values.tolist()
        stats = read_filtered_two_keys(
            stats_path,
            key_cols=(id_col, "dt"),
            keep_pairs=pairs,
            usecols=[id_col, "dt", *stat_cols],
            dtypes={id_col: "string", "dt": "Int64"},
        )
        if stats.empty:
            return frame
        # Cast dt to Int16 before merging on it.
        stats["dt"] = stats["dt"].astype("Int16")
        return frame.merge(stats, on=[id_col, "dt"], how="left", suffixes=("", suffix))

    # 1) Users with registeredMonthCnt in {0, 1}
    selected_users = users_with_registered_month_in(USR_PATH, months=(0, 1), chunksize=CHUNKSIZE)
    print(f"Users with registeredMonthCnt in {{0,1}}: {len(selected_users):,}")

    # 2) Their impressions, restricted to rows with an action
    imp = collect_impressions_for_users(IMP_PATH, selected_users, CHUNKSIZE, action_only=True)
    print(f"Collected action-only impressions: {len(imp):,}")

    if imp.empty:
        imp.to_csv(OUT_CSV, index=False)
        print(f"No matching impressions with actions; saved empty schema to: {OUT_CSV}")
        return

    # 3) Join keys present in the collected impressions
    user_keys = imp["userId"].astype("string").unique()
    mlog_keys = imp["mlogId"].astype("string").unique()

    # 4) Dimension tables, restricted to those keys
    user_cols = ["userId", "age", "gender", "province", "level", "registeredMonthCnt", "followCnt"]
    card_cols = ["mlogId", "type", "contentId", "talkId", "publishTime", "creatorId"]
    creator_cols = ["creatorId", "creatorType", "level"]

    users = read_filtered(USR_PATH, "userId", user_keys, user_cols)
    cards = read_filtered(CRD_PATH, "mlogId", mlog_keys, card_cols)
    if cards.empty:
        creator_keys = []
    else:
        creator_keys = cards["creatorId"].dropna().astype("string").unique()
    creators = read_filtered(CRT_PATH, "creatorId", creator_keys, creator_cols)

    # 5) Left-join the dimensions onto the impressions
    df = imp.merge(users, on="userId", how="left", suffixes=("", "_user"))
    df = df.merge(cards, on="mlogId", how="left", suffixes=("", "_card"))
    df = df.merge(creators, on="creatorId", how="left", suffixes=("", "_creator"))

    # 6) Disambiguate the two "level" columns, then derive the label
    if "level" in df.columns:
        df = df.rename(columns={"level": "user_level"})
    if "level_creator" in df.columns:
        df = df.rename(columns={"level_creator": "creator_level"})
    elif "level_y" in df.columns and "creator_level" not in df.columns:
        df = df.rename(columns={"level_y": "creator_level"})
    if "level_x" in df.columns and "user_level" not in df.columns:
        df = df.rename(columns={"level_x": "user_level"})

    if "user_level" in df.columns:
        numeric_level = pd.to_numeric(df["user_level"], errors="coerce")
        df["y_active"] = (numeric_level >= 2).astype("uint8")
    else:
        df["y_active"] = np.nan

    # 7) mlog statistics keyed by (mlogId, dt)
    if not df.empty:
        df = _attach_daily_stats(
            df,
            MLOG_STATS_PATH,
            "mlogId",
            [
                "userImprssionCount", "userClickCount", "userLikeCount",
                "userCommentCount", "userShareCount", "userViewCommentCount",
                "userIntoPersonalHomepageCount", "userFollowCreatorCount",
            ],
            "_mlogstats",
        )

    # 8) creator statistics keyed by (creatorId, dt)
    if not df.empty and "creatorId" in df.columns:
        df = _attach_daily_stats(
            df, CREATOR_STATS_PATH, "creatorId", ["PushlishMlogCnt"], "_creatorstats"
        )

    # 9) dtypes & hygiene
    df = cast_post_merge(df)

    # 10) Save
    df.to_csv(OUT_CSV, index=False)
    print(f"Saved: {OUT_CSV}")

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


Users with registeredMonthCnt in {0,1}: 116,843
Collected action-only impressions: 117,386
Saved: ../data/users_with_actions.csv
