In [1]:
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
# Load data/sessions.csv into a DataFrame; the path is resolved relative to
# the notebook's working directory.
sessions = pd.read_csv(filepath_or_buffer="data/sessions.csv")

In [3]:
# Prepare the session rows for per-user counting.
#
# * `secs_elapsed` is not used downstream in this notebook, so it is dropped
#   up front instead of being null-filled first and discarded afterwards.
#   `errors="ignore"` means an already-missing column is not an error.
# * Missing or empty-string values in the three action descriptor columns are
#   mapped to the explicit label "BLANK" so they become a category of their
#   own rather than disappearing when the columns are one-hot encoded.
# * The descriptor columns and `device_type` are stored as `category`, which
#   fixes the full set of labels on the frame as a whole.
# * `count` is a constant 1 per row, so summing it per user gives the number
#   of session rows for that user.
descriptor_columns = ["action", "action_type", "action_detail"]

sessions = (
    sessions.drop(columns=["secs_elapsed"], errors="ignore")
    .assign(
        **{
            column: sessions[column].fillna("BLANK").replace("", "BLANK")
            for column in descriptor_columns
        }
    )
    .astype(
        {
            "action": "category",
            "action_type": "category",
            "action_detail": "category",
            "device_type": "category",
        }
    )
    .assign(count=1)
)

In [4]:
# Collapse the session rows into one row per user: the summed `count` column
# plus one summed indicator column per category of every categorical column.
# The frame is one-hot encoded in row batches so the wide dummy matrix only
# ever exists for `batch_size` rows at a time.
batch_size = 50000
imax = len(sessions)

# Per-user sums of each batch. They are combined once after the loop, instead
# of concatenating and re-grouping the growing accumulated frame on every
# iteration.
partial_sums = []

for i1 in tqdm(range(0, imax, batch_size), desc="Processing batches"):
    i2 = min(i1 + batch_size, imax)

    batch = sessions.iloc[i1:i2]
    # Columns with `category` dtype produce one dummy column per category,
    # including categories absent from this batch, so all batches share the
    # same column layout.
    batch_dummies = pd.get_dummies(batch.drop(columns=["user_id"]), drop_first=False)
    # Scale every column by the row's `count`; this also makes the indicator
    # columns numeric so that summing them yields counts.
    batch_dummies = batch[["user_id"]].join(batch_dummies.mul(batch["count"], axis=0))

    # NOTE(review): groupby drops rows whose user_id is missing (pandas
    # default `dropna=True`) — confirm that excluding them is intended.
    partial_sums.append(batch_dummies.groupby("user_id").sum())

if not partial_sums:
    # Without this guard an empty input would only surface later as an
    # attribute error on `None`.
    raise ValueError("sessions is empty; nothing to aggregate")

# A user whose rows span several batches has several partial rows; summing
# by the index (user_id) merges them and sorts the result by user_id.
sessions_grouped = pd.concat(partial_sums).groupby(level=0).sum().reset_index()

sessions_grouped.to_feather("data/preprocessed/sessions_grouped")

Processing batches:   0%|          | 0/212 [00:00<?, ?it/s]