In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
import os
# Every relative path used below resolves against the dataset directory.
# The location can be overridden with the MOVIELENS_DATA_DIR environment
# variable; when it is unset the original hard-coded path is used.
DATA_DIR = os.environ.get(
    "MOVIELENS_DATA_DIR",
    "/data/projects/kwangeun/CVAR/datahub/WWW_data/movielens",
)
os.chdir(DATA_DIR)

In [3]:
# Object array holding one (item_id, embedding) pair per row.
text_features_org = np.load("./text_features.npy", allow_pickle=True)
sample_row = text_features_org[0]
print(text_features_org.shape) # num_ids, 2
print(sample_row.shape) # id, feature
print(sample_row[1].shape) # feature_size
# Width of a single text embedding, taken from the first row.
text_emb_size = sample_row[1].shape[0]
# Lookup table: item id -> text embedding.
text_features = {item_id: embedding for item_id, embedding in text_features_org}

(4953, 2)
(2,)
(768,)


In [4]:
# Load the pickled video-feature container; later cells index it by item id.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this must only be pointed at a trusted file.
with open("./video_features.pkl", "rb") as pf:
    video_features = pickle.load(pf)
# The embedding width is read from the entry stored under key/index 1.
# Presumably every entry has the same width and an entry 1 always exists —
# TODO confirm.
video_emb_size = video_features[1].shape[0]

In [5]:
# Source column names -> the names used throughout the rest of the notebook.
rename_dict = {
    "userId": "user_id",
    "movielens_id": "item_id",
}

# For every split: read the CSV, normalise the column names and keep only the
# (user, item, rating) triple. Each chain yields a fresh frame, so no in-place
# mutation is needed.
warm_train = pd.read_csv("./warm_train.csv").rename(columns=rename_dict)[["user_id", "item_id", "rating"]]
warm_val = pd.read_csv("./warm_val.csv").rename(columns=rename_dict)[["user_id", "item_id", "rating"]]
cold_val = pd.read_csv("./cold_val.csv").rename(columns=rename_dict)[["user_id", "item_id", "rating"]]
cold_test = pd.read_csv("./cold_test.csv").rename(columns=rename_dict)[["user_id", "item_id", "rating"]]

In [6]:
# implicit feedback with rating
threshold = 3.5
# Ratings strictly below the threshold become 0 and everything else becomes 1.
# The comparison is written as "not (rating < threshold)" so that a missing
# rating still maps to 1, exactly as the element-wise version did.
warm_train["rating"] = (~warm_train["rating"].lt(threshold)).astype("int64")
warm_val["rating"] = (~warm_val["rating"].lt(threshold)).astype("int64")
cold_val["rating"] = (~cold_val["rating"].lt(threshold)).astype("int64")
cold_test["rating"] = (~cold_test["rating"].lt(threshold)).astype("int64")

In [8]:
def generate_neg_ratings(df, seed=None):
    """Append randomly sampled negative (rating 0) rows for every user.

    For each user, the items that user already appears with in ``df``
    (regardless of their rating) are excluded, and as many negatives as the
    user has rows are drawn without replacement from the remaining items.
    Only items that occur somewhere in ``df`` are eligible.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``item_id`` and ``rating``.
        The input frame is not modified.
    seed : int, optional
        When given, sampling uses a private ``np.random.RandomState(seed)``
        so the result is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by the sampled negatives, grouped by user
        in order of first appearance. The index is not reset, so index labels
        repeat.

    Notes
    -----
    If a user has fewer unseen items than rows, only the available unseen
    items are sampled instead of raising ``ValueError``; a user with no
    unseen item gets no negatives.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    seen_by_user = df.groupby("user_id")["item_id"].apply(list).to_dict()
    all_items = df["item_id"].unique()

    # Collect the per-user frames and concatenate once at the end; growing
    # ``df`` inside the loop copies the whole frame on every iteration.
    neg_frames = []
    for user_id in tqdm(df["user_id"].unique()):
        seen_items = seen_by_user[user_id]
        # Set membership keeps candidate filtering linear in the item count.
        seen_set = set(seen_items)
        candidate = [item for item in all_items if item not in seen_set]

        # Sampling without replacement cannot exceed the candidate pool.
        num_neg = min(len(seen_items), len(candidate))
        if num_neg == 0:
            continue

        neg_frames.append(pd.DataFrame({
            "user_id": np.repeat(user_id, num_neg),
            "item_id": rng.choice(candidate, num_neg, replace=False),
            "rating": np.repeat(0, num_neg),
        }))

    return pd.concat([df, *neg_frames], axis=0)

In [9]:
# Negative sampling is random. Seeding NumPy's global generator first makes a
# re-run of this cell produce the same negatives for every split.
np.random.seed(42)
warm_train = generate_neg_ratings(warm_train)
warm_val = generate_neg_ratings(warm_val)
cold_val = generate_neg_ratings(cold_val)
cold_test = generate_neg_ratings(cold_test)

100%|██████████| 27993/27993 [02:17<00:00, 204.03it/s]


In [10]:
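# One-off export of the negative-sampled splits; the next cell reloads these
# files. Uncomment to regenerate the CSVs after re-running the sampling cell.
# warm_train.to_csv("./warm_train_with_negative.csv", index=False)
# warm_val.to_csv("./warm_val_with_negative.csv", index=False)
# cold_val.to_csv("./cold_val_with_negative.csv", index=False)
# cold_test.to_csv("./cold_test_with_negative.csv", index=False)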

In [13]:
# Replace the in-memory splits with the versions stored on disk that already
# contain the sampled negatives.
warm_train, warm_val, cold_val, cold_test = map(
    pd.read_csv,
    (
        "./warm_train_with_negative.csv",
        "./warm_val_with_negative.csv",
        "./cold_val_with_negative.csv",
        "./cold_test_with_negative.csv",
    ),
)

In [21]:
# Distinct item ids per split, in order of first appearance.
warm_items = pd.unique(warm_train["item_id"])
cold_val_items = pd.unique(cold_val["item_id"])
cold_test_items = pd.unique(cold_test["item_id"])
# Ids of the three splits laid end to end (warm first, then cold val/test).
all_items = np.concatenate((warm_items, cold_val_items, cold_test_items))

print(f"warm_items: {len(warm_items)}")
print(f"cold_val_items: {len(cold_val_items)}")
print(f"cold_test_items: {len(cold_test_items)}")

warm_items: 4198
cold_val_items: 378
cold_test_items: 377


In [15]:
# Label balance of every split.
for split_df in (warm_train, warm_val, cold_val, cold_test):
    print(split_df["rating"].value_counts())

# Number of repeated (user, item) pairs in every split.
for split_df in (warm_train, warm_val, cold_val, cold_test):
    print(split_df.duplicated(subset=["user_id", "item_id"]).sum())

1    1619534
0    1619534
Name: rating, dtype: int64
1    202400
0    202400
Name: rating, dtype: int64
1    496130
0    496130
Name: rating, dtype: int64
1    659783
0    659783
Name: rating, dtype: int64


In [25]:
# Map every item id to its pretrained text and video embeddings.
# The dict is built directly instead of filling a DataFrame and converting it
# back, so `content_features` is only ever bound to one kind of object.
# `.tolist()` turns NumPy scalars into native Python values, which keeps the
# dict keys the same type the DataFrame round-trip produced.
# A KeyError here means an item id has no text or no video feature.
content_features = {
    item_id: {"text": text_features[item_id], "video": video_features[item_id]}
    for item_id in all_items.tolist()
}

In [35]:
# Stack all splits so the id ranges below cover every user and item.
all_ratings = pd.concat([warm_train, warm_val, cold_test, cold_val])
orders = ["user_id", "item_id", "rating"]

# Largest id + 1 for each id column.
num_user_ids = all_ratings["user_id"].max() + 1
num_item_ids = all_ratings["item_id"].max() + 1

# One (feature name, size, feature kind) triple per feature; the kind tags
# are consumed by downstream code that is not part of this notebook.
description = [
    ('user_id', num_user_ids, 'spr'),
    ('item_id', num_item_ids, 'spr'),
    ('text', text_emb_size, 'pretrained'),
    ('video', video_emb_size, 'pretrained'),
    ('rating', 2, 'label'),
    ('count', -1, 'ctn'),
]

In [36]:
def add_count(df):
    """Attach a min-max normalised per-item row count as column ``count``.

    Every row receives the number of rows in ``df`` that share its
    ``item_id``, rescaled to ``[0, 1]`` using the smallest and largest count
    found in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``item_id`` column. The input frame is not modified.
        An existing ``count`` column is discarded and recomputed, so calling
        the function again on its own output is safe.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index plus a float ``count``
        column as the last column. When every item has the same count the
        column is ``0.0`` rather than NaN.
    """
    # Drop a stale 'count' column; otherwise the join below raises on overlap.
    base = df.drop(columns="count", errors="ignore")

    item_counts = base.groupby("item_id").size().rename("count")
    out = base.join(item_counts, on="item_id")

    min_count = out["count"].min()
    max_count = out["count"].max()
    span = max_count - min_count
    if span == 0:
        # All counts are equal: avoid 0/0. Multiplying (instead of assigning
        # a constant) leaves rows without a count as NaN.
        out["count"] = out["count"] * 0.0
    else:
        out["count"] = (out["count"] - min_count) / span
    return out

In [39]:
# Add the normalised item-count feature to every split; counts are computed
# within each split separately.
warm_train, warm_val, cold_val, cold_test = map(
    add_count, (warm_train, warm_val, cold_val, cold_test)
)

In [42]:
# Bundle the four splits, the feature description and the per-item content
# embeddings into one dict so they can be pickled as a single file.
save_dic = {
    "warm_train": warm_train,
    "warm_val": warm_val,
    "cold_val": cold_val,
    "cold_test": cold_test,
    "description": description,
    "content_features": content_features,
}
# Report the length of every entry: row count for the splits, element count
# for the description list and the content-feature dict. Note that the loop
# variable `df` is therefore not always a DataFrame.
for name, df in save_dic.items():
    print("{} size: {}".format(name, len(df)))
# 'bw+' opens the file in binary read/write mode and truncates existing content.
with open('./movielens_data.pkl', 'bw+') as f:
    pickle.dump(save_dic, f)

warm_train size: 3239068
warm_val size: 404800
cold_val size: 992260
cold_test size: 1319566
description size: 6
content_features size: 4953
