In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw session log. Later cells in this notebook read its user_id,
# action, action_type, action_detail, device_type and secs_elapsed columns.
sessions = pd.read_csv("data/sessions.csv")

In [3]:
# Constant indicator column; summing it per group gives a row count.
sessions["flg"] = 1

# Number each user's rows in file order: seq counts up from 1 at the user's
# first row, seq_rev counts up from 1 at the user's last row.
by_user = sessions.groupby("user_id")
sessions["seq"] = by_user.cumcount() + 1
sessions["seq_rev"] = by_user.cumcount(ascending=False) + 1

# action2 is the "_"-joined text of action, action_type, action_detail and
# device_type (missing values are stringified by astype(str)).
action2 = sessions["action"].astype(str)
for part in ("action_type", "action_detail", "device_type"):
    action2 = action2 + "_" + sessions[part].astype(str)
sessions["action2"] = action2

In [4]:
# **************************************
# sessions_action_se_sum
# **************************************
# Total secs_elapsed per (user_id, action), kept in long form with one row per
# id/feature pair. The grouped sum ignores missing secs_elapsed values.
sessions_action_se_sum = (
    sessions.groupby(["user_id", "action"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(columns={"user_id": "id", "action": "feature", "secs_elapsed": "value"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_sum["feature"] = "action_se_sum_" + sessions_action_se_sum["feature"]

# Report how many distinct feature labels the table holds.
n_distinct_features = sessions_action_se_sum["feature"].nunique()
print(f"Number of distinct features: {n_distinct_features}")

# Cache the long table; the merge cell reads it back from this path.
sessions_action_se_sum.to_pickle("cache/sessions_action_se_sum.pkl")

Number of distinct features: 359


In [5]:
# **************************************
# sessions_action_type_se_sum
# **************************************
# Total secs_elapsed per (user_id, action_type), kept in long form with one
# row per id/feature pair. The grouped sum ignores missing secs_elapsed values.
sessions_action_type_se_sum = (
    sessions.groupby(["user_id", "action_type"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the action_type name so the feature label identifies this statistic.
sessions_action_type_se_sum["feature"] = (
    "action_type_se_sum_" + sessions_action_type_se_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_type = sessions_action_type_se_sum["feature"].nunique()
print(f"Number of distinct features (action_type): {n_distinct_features_action_type}")

# Cache the long table; the merge cell reads it back from this path.
sessions_action_type_se_sum.to_pickle("cache/sessions_action_type_se_sum.pkl")

Number of distinct features (action_type): 10


In [6]:
# **************************************
# sessions_action_detail_se_sum
# **************************************
# Total secs_elapsed per (user_id, action_detail).
totals = (
    sessions.groupby(["user_id", "action_detail"])["secs_elapsed"].sum().reset_index()
)

# Assemble the long table. Column order here is id, value, feature; the label
# is the stringified action_detail with the statistic prefix in front.
sessions_action_detail_se_sum = pd.DataFrame(
    {
        "id": totals["user_id"],
        "value": totals["secs_elapsed"],
        "feature": "action_detail_se_sum_" + totals["action_detail"].astype(str),
    }
)

# Count the distinct feature labels.
unique_features_count = sessions_action_detail_se_sum["feature"].nunique()

# Cache the long table; the merge cell reads it back from this path.
sessions_action_detail_se_sum.to_pickle("cache/sessions_action_detail_se_sum.pkl")

print(f"Number of distinct features: {unique_features_count}")

Number of distinct features: 155


In [7]:
# **************************************
# sessions_device_type_se_sum
# **************************************
# Total secs_elapsed per (user_id, device_type), kept in long form with one
# row per id/feature pair. The grouped sum ignores missing secs_elapsed values.
sessions_device_type_se_sum = (
    sessions.groupby(["user_id", "device_type"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(
        columns={"user_id": "id", "device_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_sum["feature"] = (
    "device_type_se_sum_" + sessions_device_type_se_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type = sessions_device_type_se_sum["feature"].nunique()
print(f"Number of distinct features (device_type): {n_distinct_features_device_type}")

# Cache the long table; the merge cell reads it back from this path.
sessions_device_type_se_sum.to_pickle("cache/sessions_device_type_se_sum.pkl")

Number of distinct features (device_type): 14


In [8]:
# **************************************
# sessions_device_type_se_sum
# **************************************
# NOTE(review): the preceding cell already builds sessions_device_type_se_sum
# and writes the same pickle; this cell recomputes and overwrites it. Confirm
# whether a different grouping was intended here.
#
# Total secs_elapsed per (user_id, device_type), kept in long form with one
# row per id/feature pair. The grouped sum ignores missing secs_elapsed values.
sessions_device_type_se_sum = (
    sessions.groupby(["user_id", "device_type"])["secs_elapsed"]
    .sum()
    .reset_index()
    .rename(
        columns={"user_id": "id", "device_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_sum["feature"] = (
    "device_type_se_sum_" + sessions_device_type_se_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type = sessions_device_type_se_sum["feature"].nunique()
print(f"Number of distinct features (device_type): {n_distinct_features_device_type}")

# Cache the long table; the merge cell reads it back from this path.
sessions_device_type_se_sum.to_pickle("cache/sessions_device_type_se_sum.pkl")

Number of distinct features (device_type): 14


In [9]:
# **************************************
# sessions_action_flg_sum
# **************************************
# Sum of the constant flg column per (user_id, action), i.e. how many rows
# each user has for each action. min_count=1 makes a group with no non-null
# values produce NaN rather than 0.
sessions_action_flg_sum = (
    sessions.groupby(["user_id", "action"])["flg"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"user_id": "id", "action": "feature", "flg": "value"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_flg_sum["feature"] = (
    "action_flg_sum_" + sessions_action_flg_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_flg = sessions_action_flg_sum["feature"].nunique()
print(f"Number of distinct features (action_flg): {n_distinct_features_action_flg}")

# Cache the long table; the merge cell reads it back from this path.
sessions_action_flg_sum.to_pickle("cache/sessions_action_flg_sum.pkl")

Number of distinct features (action_flg): 359


In [10]:
# **************************************
# sessions_action_type_flg_sum
# **************************************
# Sum of the constant flg column per (user_id, action_type), i.e. how many
# rows each user has for each action_type. min_count=1 makes a group with no
# non-null values produce NaN rather than 0.
sessions_action_type_flg_sum = (
    sessions.groupby(["user_id", "action_type"])["flg"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"user_id": "id", "action_type": "feature", "flg": "value"})
)

# Prefix the action_type name so the feature label identifies this statistic.
sessions_action_type_flg_sum["feature"] = (
    "action_type_flg_sum_" + sessions_action_type_flg_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_type_flg = sessions_action_type_flg_sum["feature"].nunique()
print(
    f"Number of distinct features (action_type_flg): {n_distinct_features_action_type_flg}"
)

# Cache the long table; the merge cell reads it back from this path.
sessions_action_type_flg_sum.to_pickle("cache/sessions_action_type_flg_sum.pkl")

Number of distinct features (action_type_flg): 10


In [11]:
# **************************************
# sessions_action_detail_flg_sum
# **************************************
# Sum of the constant flg column per (user_id, action_detail), i.e. how many
# rows each user has for each action_detail. min_count=1 makes a group with no
# non-null values produce NaN rather than 0.
sessions_action_detail_flg_sum = (
    sessions.groupby(["user_id", "action_detail"])["flg"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"user_id": "id", "action_detail": "feature", "flg": "value"})
)

# Prefix the action_detail name so the feature label identifies this statistic.
sessions_action_detail_flg_sum["feature"] = (
    "action_detail_flg_sum_" + sessions_action_detail_flg_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_detail_flg = sessions_action_detail_flg_sum[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_detail_flg): {n_distinct_features_action_detail_flg}"
)

# Cache the long table; the merge cell reads it back from this path.
sessions_action_detail_flg_sum.to_pickle("cache/sessions_action_detail_flg_sum.pkl")

Number of distinct features (action_detail_flg): 155


In [12]:
# **************************************
# sessions_device_type_flg_sum
# **************************************
# Sum of the constant flg column per (user_id, device_type), i.e. how many
# rows each user has for each device_type. min_count=1 makes a group with no
# non-null values produce NaN rather than 0.
sessions_device_type_flg_sum = (
    sessions.groupby(["user_id", "device_type"])["flg"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"user_id": "id", "device_type": "feature", "flg": "value"})
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_flg_sum["feature"] = (
    "device_type_flg_sum_" + sessions_device_type_flg_sum["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type_flg = sessions_device_type_flg_sum["feature"].nunique()
print(
    f"Number of distinct features (device_type_flg): {n_distinct_features_device_type_flg}"
)

# Cache the long table; the merge cell reads it back from this path.
sessions_device_type_flg_sum.to_pickle("cache/sessions_device_type_flg_sum.pkl")

Number of distinct features (device_type_flg): 14


In [13]:
# **************************************
# sessions_action_se_mean
# **************************************
# Mean secs_elapsed per (user_id, action), kept in long form with one row per
# id/feature pair. The grouped mean ignores missing secs_elapsed values.
sessions_action_se_mean = (
    sessions.groupby(["user_id", "action"])["secs_elapsed"]
    .mean()
    .reset_index()
    .rename(columns={"user_id": "id", "action": "feature", "secs_elapsed": "value"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_mean["feature"] = (
    "action_se_mean_" + sessions_action_se_mean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_se_mean = sessions_action_se_mean["feature"].nunique()
print(
    f"Number of distinct features (action_se_mean): {n_distinct_features_action_se_mean}"
)

# Save the long table to the cache directory.
sessions_action_se_mean.to_pickle("cache/sessions_action_se_mean.pkl")

Number of distinct features (action_se_mean): 359


In [14]:
# **************************************
# sessions_action_type_se_mean
# **************************************
# Mean secs_elapsed per (user_id, action_type), kept in long form with one row
# per id/feature pair. The grouped mean ignores missing secs_elapsed values.
sessions_action_type_se_mean = (
    sessions.groupby(["user_id", "action_type"])["secs_elapsed"]
    .mean()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the action_type name so the feature label identifies this statistic.
sessions_action_type_se_mean["feature"] = (
    "action_type_se_mean_" + sessions_action_type_se_mean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_type_se_mean = sessions_action_type_se_mean[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_type_se_mean): {n_distinct_features_action_type_se_mean}"
)

# Save the long table to the cache directory.
sessions_action_type_se_mean.to_pickle("cache/sessions_action_type_se_mean.pkl")

Number of distinct features (action_type_se_mean): 10


In [15]:
# **************************************
# sessions_action_detail_se_mean
# **************************************
# Mean secs_elapsed per (user_id, action_detail), kept in long form with one
# row per id/feature pair. The grouped mean ignores missing secs_elapsed values.
sessions_action_detail_se_mean = (
    sessions.groupby(["user_id", "action_detail"])["secs_elapsed"]
    .mean()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_detail": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the action_detail name so the feature label identifies this statistic.
sessions_action_detail_se_mean["feature"] = (
    "action_detail_se_mean_" + sessions_action_detail_se_mean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_detail_se_mean = sessions_action_detail_se_mean[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_detail_se_mean): {n_distinct_features_action_detail_se_mean}"
)

# Save the long table to the cache directory.
sessions_action_detail_se_mean.to_pickle("cache/sessions_action_detail_se_mean.pkl")

Number of distinct features (action_detail_se_mean): 155


In [16]:
# **************************************
# sessions_device_type_se_mean
# **************************************
# Mean secs_elapsed per (user_id, device_type), kept in long form with one row
# per id/feature pair. The grouped mean ignores missing secs_elapsed values.
sessions_device_type_se_mean = (
    sessions.groupby(["user_id", "device_type"])["secs_elapsed"]
    .mean()
    .reset_index()
    .rename(
        columns={"user_id": "id", "device_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_mean["feature"] = (
    "device_type_se_mean_" + sessions_device_type_se_mean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type_se_mean = sessions_device_type_se_mean[
    "feature"
].nunique()
print(
    f"Number of distinct features (device_type_se_mean): {n_distinct_features_device_type_se_mean}"
)

# Save the long table to the cache directory.
sessions_device_type_se_mean.to_pickle("cache/sessions_device_type_se_mean.pkl")

Number of distinct features (device_type_se_mean): 14


In [17]:
# **************************************
# sessions_action_se_sd
# **************************************
# Standard deviation of secs_elapsed per (user_id, action), kept in long form
# with one row per id/feature pair. The grouped std ignores missing values and
# uses the pandas default (sample) estimator, so a group with a single
# observation yields NaN.
sessions_action_se_sd = (
    sessions.groupby(["user_id", "action"])["secs_elapsed"]
    .std()
    .reset_index()
    .rename(columns={"user_id": "id", "action": "feature", "secs_elapsed": "value"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_sd["feature"] = "action_se_sd_" + sessions_action_se_sd["feature"]

# Report how many distinct feature labels the table holds.
n_distinct_features_action_se_sd = sessions_action_se_sd["feature"].nunique()
print(f"Number of distinct features (action_se_sd): {n_distinct_features_action_se_sd}")

# Save the long table to the cache directory.
sessions_action_se_sd.to_pickle("cache/sessions_action_se_sd.pkl")

Number of distinct features (action_se_sd): 359


In [18]:
# **************************************
# sessions_action_type_se_sd
# **************************************
# Standard deviation of secs_elapsed per (user_id, action_type), kept in long
# form with one row per id/feature pair. The grouped std ignores missing
# values and uses the pandas default (sample) estimator, so a group with a
# single observation yields NaN.
sessions_action_type_se_sd = (
    sessions.groupby(["user_id", "action_type"])["secs_elapsed"]
    .std()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the action_type name so the feature label identifies this statistic.
sessions_action_type_se_sd["feature"] = (
    "action_type_se_sd_" + sessions_action_type_se_sd["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_type_se_sd = sessions_action_type_se_sd["feature"].nunique()
print(
    f"Number of distinct features (action_type_se_sd): {n_distinct_features_action_type_se_sd}"
)

# Save the long table to the cache directory.
sessions_action_type_se_sd.to_pickle("cache/sessions_action_type_se_sd.pkl")

Number of distinct features (action_type_se_sd): 10


In [19]:
# **************************************
# sessions_action_detail_se_sd
# **************************************
# Standard deviation of secs_elapsed per (user_id, action_detail), kept in
# long form with one row per id/feature pair. The grouped std ignores missing
# values and uses the pandas default (sample) estimator, so a group with a
# single observation yields NaN.
sessions_action_detail_se_sd = (
    sessions.groupby(["user_id", "action_detail"])["secs_elapsed"]
    .std()
    .reset_index()
    .rename(
        columns={"user_id": "id", "action_detail": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the action_detail name so the feature label identifies this statistic.
sessions_action_detail_se_sd["feature"] = (
    "action_detail_se_sd_" + sessions_action_detail_se_sd["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_detail_se_sd = sessions_action_detail_se_sd[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_detail_se_sd): {n_distinct_features_action_detail_se_sd}"
)

# Save the long table to the cache directory.
sessions_action_detail_se_sd.to_pickle("cache/sessions_action_detail_se_sd.pkl")

Number of distinct features (action_detail_se_sd): 155


In [20]:
# **************************************
# sessions_device_type_se_sd
# **************************************
# Standard deviation of secs_elapsed per (user_id, device_type), kept in long
# form with one row per id/feature pair. The grouped std ignores missing
# values and uses the pandas default (sample) estimator, so a group with a
# single observation yields NaN.
sessions_device_type_se_sd = (
    sessions.groupby(["user_id", "device_type"])["secs_elapsed"]
    .std()
    .reset_index()
    .rename(
        columns={"user_id": "id", "device_type": "feature", "secs_elapsed": "value"}
    )
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_sd["feature"] = (
    "device_type_se_sd_" + sessions_device_type_se_sd["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type_se_sd = sessions_device_type_se_sd["feature"].nunique()
print(
    f"Number of distinct features (device_type_se_sd): {n_distinct_features_device_type_se_sd}"
)

# Save the long table to the cache directory.
sessions_device_type_se_sd.to_pickle("cache/sessions_device_type_se_sd.pkl")

Number of distinct features (device_type_se_sd): 14


In [21]:
# **************************************
# sessions_action_se_wrmean
# **************************************
# Weighted mean of secs_elapsed per (user_id, action) with weight 1/seq_rev,
# so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
sessions["weight"] = 1 / sessions["seq_rev"]  # kept on the frame: another cell reads it

# Blank the weight wherever the value is missing so both sums skip that row.
recency_weight = sessions["weight"].where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action": "feature"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_wrmean["feature"] = (
    "action_se_wrmean_" + sessions_action_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_se_wrmean = sessions_action_se_wrmean["feature"].nunique()
print(
    f"Number of distinct features (action_se_wrmean): {n_distinct_features_action_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_action_se_wrmean.to_pickle("cache/sessions_action_se_wrmean.pkl")

Number of distinct features (action_se_wrmean): 359


In [22]:
# **************************************
# sessions_action_type_se_wrmean
# **************************************
# Weighted mean of secs_elapsed per (user_id, action_type) with weight
# 1/seq_rev, so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
# The weight is derived from seq_rev directly, so this cell does not rely on a
# "weight" column having been added to sessions beforehand.
recency_weight = (1 / sessions["seq_rev"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action_type"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_type_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action_type": "feature"})
)

# Prefix the action_type name so the feature label identifies this statistic.
sessions_action_type_se_wrmean["feature"] = (
    "action_type_se_wrmean_" + sessions_action_type_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_type_se_wrmean = sessions_action_type_se_wrmean[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_type_se_wrmean): {n_distinct_features_action_type_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_action_type_se_wrmean.to_pickle("cache/sessions_action_type_se_wrmean.pkl")

Number of distinct features (action_type_se_wrmean): 10


In [23]:
# **************************************
# sessions_action_detail_se_wrmean
# **************************************
# Weighted mean of secs_elapsed per (user_id, action_detail) with weight
# 1/seq_rev, so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
recency_weight = (1 / sessions["seq_rev"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action_detail"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_detail_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action_detail": "feature"})
)

# Prefix the action_detail name so the feature label identifies this statistic.
sessions_action_detail_se_wrmean["feature"] = (
    "action_detail_se_wrmean_" + sessions_action_detail_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_detail_se_wrmean = sessions_action_detail_se_wrmean[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_detail_se_wrmean): {n_distinct_features_action_detail_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_action_detail_se_wrmean.to_pickle("cache/sessions_action_detail_se_wrmean.pkl")

Number of distinct features (action_detail_se_wrmean): 155


In [24]:
# **************************************
# sessions_device_type_se_wrmean
# **************************************
# Weighted mean of secs_elapsed per (user_id, device_type) with weight
# 1/seq_rev, so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
recency_weight = (1 / sessions["seq_rev"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["device_type"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_device_type_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "device_type": "feature"})
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_wrmean["feature"] = (
    "device_type_se_wrmean_" + sessions_device_type_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type_se_wrmean = sessions_device_type_se_wrmean[
    "feature"
].nunique()
print(
    f"Number of distinct features (device_type_se_wrmean): {n_distinct_features_device_type_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_device_type_se_wrmean.to_pickle("cache/sessions_device_type_se_wrmean.pkl")

Number of distinct features (device_type_se_wrmean): 14


In [25]:
# **************************************
# sessions_action_se_wmean
# **************************************
# Weighted mean of secs_elapsed per (user_id, action) with weight 1/seq, so
# rows nearer the start of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
order_weight = (1 / sessions["seq"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action"]]

weighted_total = (
    (sessions["secs_elapsed"] * order_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = order_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_se_wmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action": "feature"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_wmean["feature"] = (
    "action_se_wmean_" + sessions_action_se_wmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_se_wmean = sessions_action_se_wmean["feature"].nunique()
print(
    f"Number of distinct features (action_se_wmean): {n_distinct_features_action_se_wmean}"
)

# Save the long table to the cache directory.
sessions_action_se_wmean.to_pickle("cache/sessions_action_se_wmean.pkl")

Number of distinct features (action_se_wmean): 359


In [26]:
# **************************************
# sessions_action_detail_se_wrmean
# **************************************
# NOTE(review): an earlier cell in this notebook already builds
# sessions_action_detail_se_wrmean and writes the same pickle; this cell
# recomputes and overwrites it. Possibly a 1/seq-weighted ("wmean") variant
# was intended here - confirm.
#
# Weighted mean of secs_elapsed per (user_id, action_detail) with weight
# 1/seq_rev, so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
recency_weight = (1 / sessions["seq_rev"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action_detail"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_detail_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action_detail": "feature"})
)

# Prefix the action_detail name so the feature label identifies this statistic.
sessions_action_detail_se_wrmean["feature"] = (
    "action_detail_se_wrmean_" + sessions_action_detail_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_detail_se_wrmean = sessions_action_detail_se_wrmean[
    "feature"
].nunique()
print(
    f"Number of distinct features (action_detail_se_wrmean): {n_distinct_features_action_detail_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_action_detail_se_wrmean.to_pickle("cache/sessions_action_detail_se_wrmean.pkl")

Number of distinct features (action_detail_se_wrmean): 155


In [27]:
# **************************************
# sessions_device_type_se_wrmean
# **************************************
# NOTE(review): an earlier cell in this notebook already builds
# sessions_device_type_se_wrmean and writes the same pickle; this cell
# recomputes and overwrites it. Possibly a 1/seq-weighted ("wmean") variant
# was intended here - confirm.
#
# Weighted mean of secs_elapsed per (user_id, device_type) with weight
# 1/seq_rev, so rows nearer the end of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
recency_weight = (1 / sessions["seq_rev"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["device_type"]]

weighted_total = (
    (sessions["secs_elapsed"] * recency_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = recency_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_device_type_se_wrmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "device_type": "feature"})
)

# Prefix the device_type name so the feature label identifies this statistic.
sessions_device_type_se_wrmean["feature"] = (
    "device_type_se_wrmean_" + sessions_device_type_se_wrmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_device_type_se_wrmean = sessions_device_type_se_wrmean[
    "feature"
].nunique()
print(
    f"Number of distinct features (device_type_se_wrmean): {n_distinct_features_device_type_se_wrmean}"
)

# Save the long table to the cache directory.
sessions_device_type_se_wrmean.to_pickle("cache/sessions_device_type_se_wrmean.pkl")

Number of distinct features (device_type_se_wrmean): 14


In [28]:
# **************************************
# sessions_action_se_wmean
# **************************************
# NOTE(review): an earlier cell in this notebook already builds
# sessions_action_se_wmean and writes the same pickle; this cell recomputes
# and overwrites it. Possibly another grouping key was intended here - confirm.
#
# Weighted mean of secs_elapsed per (user_id, action) with weight 1/seq, so
# rows nearer the start of a user's sequence weigh more.
#
# np.average does not skip NaN: a single missing secs_elapsed would turn the
# whole group's feature into NaN, unlike the NaN-skipping sum/mean/std cells.
# The mean is therefore computed as sum(w * x) / sum(w) over rows where
# secs_elapsed is present. A group with no valid rows still yields NaN.
order_weight = (1 / sessions["seq"]).where(sessions["secs_elapsed"].notna())
group_keys = [sessions["user_id"], sessions["action"]]

weighted_total = (
    (sessions["secs_elapsed"] * order_weight).groupby(group_keys).sum(min_count=1)
)
weight_total = order_weight.groupby(group_keys).sum(min_count=1)

# Long form: one row per id/feature pair.
sessions_action_se_wmean = (
    (weighted_total / weight_total)
    .reset_index(name="value")
    .rename(columns={"user_id": "id", "action": "feature"})
)

# Prefix the action name so the feature label identifies this statistic.
sessions_action_se_wmean["feature"] = (
    "action_se_wmean_" + sessions_action_se_wmean["feature"]
)

# Report how many distinct feature labels the table holds.
n_distinct_features_action_se_wmean = sessions_action_se_wmean["feature"].nunique()
print(
    f"Number of distinct features (action_se_wmean): {n_distinct_features_action_se_wmean}"
)

# Save the long table to the cache directory.
sessions_action_se_wmean.to_pickle("cache/sessions_action_se_wmean.pkl")

Number of distinct features (action_se_wmean): 359


In [29]:
# Cached long-format tables to combine. Only the secs_elapsed-sum and flg-sum
# tables are listed; the other cached statistics are not merged here.
files = [
    "cache/sessions_action_se_sum.pkl",
    "cache/sessions_action_type_se_sum.pkl",
    "cache/sessions_action_detail_se_sum.pkl",
    "cache/sessions_device_type_se_sum.pkl",
    "cache/sessions_action_flg_sum.pkl",
    "cache/sessions_action_type_flg_sum.pkl",
    "cache/sessions_action_detail_flg_sum.pkl",
    "cache/sessions_device_type_flg_sum.pkl",
]

# Widen each table to one row per id and one column per feature, then
# outer-join the wide tables on id. Ids absent from a table get NaN in that
# table's columns.
merged_data = pd.DataFrame()

for file in files:
    data = pd.read_pickle(file)
    data_pivot = data.pivot(index="id", columns="feature", values="value")
    # The first non-empty table seeds the result; later ones are joined onto it.
    merged_data = (
        data_pivot if merged_data.empty else merged_data.join(data_pivot, how="outer")
    )

merged_data

feature,action_se_sum_10,action_se_sum_11,action_se_sum_12,action_se_sum_15,action_se_sum_about_us,action_se_sum_accept_decline,action_se_sum_account,action_se_sum_acculynk_bin_check_failed,action_se_sum_acculynk_bin_check_success,action_se_sum_acculynk_load_pin_pad,...,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Opera Phone,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00023iyk9l,,,,,,,,,,,...,,,36.0,,,,,,4.0,
0010k6l0om,,,,,,,,,,,...,,,63.0,,,,,,,
001wyh0pz8,,,,,,,,,,,...,,,,,,,,,,
0028jgx1x1,,,,,,,,,,,...,,,,,,,,,,
002qnbzfs5,301482.0,,,,,,,,,,...,,,,,,,,,775.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzxox7jnrx,,,,,,,,,,,...,,,,,,89.0,,,,
zzy7t0y9cm,,,,,,,,,,,...,,,,,,8.0,,,,
zzysuoqg6x,,,,,,,,,,,...,,,,,,3.0,,,,
zzywmcn0jv,,,,,,,,,,,...,,,,,,51.0,,,,


In [30]:
# Write the wide feature matrix to disk in Feather format.
merged_data.to_feather("data/preprocessed/sessions_data")

# Displayed cell value: number of rows (ids) and columns (features).
f"{len(merged_data.index)} ids, {len(merged_data.columns)} features"

'135483 ids, 1076 features'