In [1]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [2]:
# Load the raw session log; the remaining cells all operate on this frame.
sessions = pd.read_csv(filepath_or_buffer="data/sessions.csv")

In [None]:
# Number each user's rows in file order: `seq` counts up from 1 at the
# user's first row, `seq_rev` counts up from 1 at the user's last row.
rows_by_user = sessions.groupby("user_id")
forward_position = rows_by_user.cumcount() + 1
backward_position = rows_by_user.cumcount(ascending=False) + 1
sessions["seq"] = forward_position
sessions["seq_rev"] = backward_position

# Build `action2`: the string forms of action, action_type, action_detail
# and device_type joined with underscores, in that order.
action2_key = sessions["action"].astype(str)
for component in ("action_type", "action_detail", "device_type"):
    action2_key = action2_key + "_" + sessions[component].astype(str)
sessions["action2"] = action2_key

In [None]:
# Total secs_elapsed per (user_id, action), stored in long format with the
# columns id / feature / value.  groupby().sum() already yields exactly one
# value column, so the columns are renamed directly; no melt round trip
# (melt, then drop the constant `variable` column) is needed.
sessions_action_se_sum = (
    sessions.groupby(["user_id", "action"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(columns={"user_id": "id", "action": "feature", "secs_elapsed": "value"})
)

# Prefix every action name so the feature key records which aggregate it is.
sessions_action_se_sum["feature"] = (
    "action_se_sum_" + sessions_action_se_sum["feature"]
)

# Report how many distinct feature names were produced.
n_distinct_features = sessions_action_se_sum["feature"].nunique()
print(f"Number of distinct features: {n_distinct_features}")

# Persist the result, creating the cache directory first if it does not exist.
action_se_sum_path = Path("cache/sessions_action_se_sum.pkl")
action_se_sum_path.parent.mkdir(parents=True, exist_ok=True)
sessions_action_se_sum.to_pickle(action_se_sum_path)

In [None]:
# **************************************
# sessions_action_type_se_sum
# **************************************
# Total secs_elapsed per (user_id, action_type), stored in long format with
# the columns id / feature / value.  groupby().sum() already yields exactly
# one value column, so the columns are renamed directly; no melt round trip
# (melt, then drop the constant `variable` column) is needed.
sessions_action_type_se_sum = (
    sessions.groupby(["user_id", "action_type"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix every action_type name so the feature key records which aggregate it is.
sessions_action_type_se_sum["feature"] = (
    "action_type_se_sum_" + sessions_action_type_se_sum["feature"]
)

# Report how many distinct feature names were produced.
n_distinct_features_action_type = sessions_action_type_se_sum["feature"].nunique()
print(f"Number of distinct features (action_type): {n_distinct_features_action_type}")

# Persist the result, creating the cache directory first if it does not exist.
action_type_se_sum_path = Path("cache/sessions_action_type_se_sum.pkl")
action_type_se_sum_path.parent.mkdir(parents=True, exist_ok=True)
sessions_action_type_se_sum.to_pickle(action_type_se_sum_path)

In [None]:
# **************************************
# sessions_device_type_se_sum
# **************************************
# Total secs_elapsed per (user_id, device_type), stored in long format with
# the columns id / feature / value.  groupby().sum() already yields exactly
# one value column, so the columns are renamed directly; no melt round trip
# (melt, then drop the constant `variable` column) is needed.
sessions_device_type_se_sum = (
    sessions.groupby(["user_id", "device_type"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(
        columns={"user_id": "id", "device_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix every device_type name so the feature key records which aggregate it is.
sessions_device_type_se_sum["feature"] = (
    "device_type_se_sum_" + sessions_device_type_se_sum["feature"]
)

# Report how many distinct feature names were produced.
n_distinct_features_device_type = sessions_device_type_se_sum["feature"].nunique()
print(f"Number of distinct features (device_type): {n_distinct_features_device_type}")

# Persist the result, creating the cache directory first if it does not exist.
device_type_se_sum_path = Path("cache/sessions_device_type_se_sum.pkl")
device_type_se_sum_path.parent.mkdir(parents=True, exist_ok=True)
sessions_device_type_se_sum.to_pickle(device_type_se_sum_path)

In [3]:
# In the three action columns, missing values and empty strings both become
# the explicit "BLANK" level.
for action_column in ("action", "action_type", "action_detail"):
    sessions[action_column] = (
        sessions[action_column].fillna("BLANK").replace("", "BLANK")
    )

# Store the four descriptor columns as categoricals.
# Presumably this also keeps the one-hot columns identical across the slices
# encoded by the batched pd.get_dummies cell -- confirm.
sessions = sessions.astype(
    {
        "action": "category",
        "action_type": "category",
        "action_detail": "category",
        "device_type": "category",
    }
)

# secs_elapsed is discarded here, so the fillna(0) that used to precede this
# cell's other steps had no effect on the result and has been removed.
# NOTE(review): if the elapsed-time totals were meant to be carried into the
# grouped output, keep the column instead of dropping it -- confirm intent.
sessions = sessions.drop(columns=["secs_elapsed"])

# Constant 1 per row; summing it per user gives that user's row count.
sessions["count"] = 1

In [4]:
# One-hot encode the session rows in fixed-size slices and sum the encoded
# columns per user_id.  Encoding slice by slice keeps each one-hot frame
# small, instead of one-hot encoding the whole `sessions` frame at once.
batch_size = 50000
imax = len(sessions)

if imax == 0:
    # Without this guard the cell would fail later with an unrelated-looking
    # error when trying to combine or write a result that was never built.
    raise ValueError("sessions is empty; nothing to aggregate")

# Per-batch user totals.  They are combined once after the loop rather than
# re-concatenating and re-grouping the running result on every iteration,
# which made each step more expensive as the result grew.
partial_sums = []

for i1 in tqdm(range(0, imax, batch_size), desc="Processing batches"):
    i2 = min(i1 + batch_size, imax)

    batch = sessions.iloc[i1:i2]
    batch_dummies = pd.get_dummies(batch.drop(columns=["user_id"]), drop_first=False)
    # Multiplying by the all-ones `count` column turns the indicator columns
    # into plain numbers so they can be summed.
    batch_dummies = batch[["user_id"]].join(batch_dummies.mul(batch["count"], axis=0))

    partial_sums.append(batch_dummies.groupby("user_id").sum())

# A user whose rows straddle a batch boundary appears in two partial frames;
# the final groupby over the index merges those rows into one.
sessions_grouped = pd.concat(partial_sums).groupby(level=0).sum().reset_index()

# Create the output directory first so a missing folder cannot fail the
# write after the whole loop has already run.
sessions_grouped_path = Path("data/preprocessed/sessions_grouped")
sessions_grouped_path.parent.mkdir(parents=True, exist_ok=True)
sessions_grouped.to_feather(sessions_grouped_path)

Processing batches: 100%|██████████| 212/212 [04:53<00:00,  1.38s/it]
