In [1]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

# Seed NumPy's global RNG so every np.random.* draw in this notebook is reproducible.
# Note: `seed` must be *called*; assigning `np.random.seed = 42` only rebinds the
# function name to an int and leaves the generator unseeded.
np.random.seed(42)

In [2]:
# Column layouts for the raw files under movielens/raw/. The names are supplied
# explicitly to read_csv, so pandas treats every row of each file as data.
rating_columns = ["user_id", "item_id", "rating", "timestamp"]
user_columns = ["user_id", "age", "gender", "occupation", "zip"]
item_columns = [
    "item_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
    # Remaining columns are the per-genre columns of u.item.
    "unknown",
    "action",
    "adventure",
    "animation",
    "children",
    "comedy",
    "crime",
    "documentary",
    "drama",
    "fantasy",
    "film-noir",
    "horror",
    "musical",
    "mystery",
    "romance",
    "sci-Fi",
    "thriller",
    "war",
    "western",
]

# Ratings are tab-separated; the user and item tables are pipe-separated latin-1 text.
ratings = pd.read_csv("movielens/raw/u.data", sep="\t", names=rating_columns)
users = pd.read_csv("movielens/raw/u.user", sep="|", encoding="latin-1", names=user_columns)
items = pd.read_csv("movielens/raw/u.item", sep="|", encoding="latin-1", names=item_columns)

# For each sparse id feature: the table that defines its set of valid ids.
id_sources = {"user_id": users, "item_id": items}

# Number of distinct ids per sparse feature.
num_sparse_features = {
    feature: frame[feature].nunique()
    for feature, frame in id_sources.items()
}

# Sorted array of the distinct raw ids per sparse feature.
vocabulary = {
    feature: np.unique(frame[feature])
    for feature, frame in id_sources.items()
}

In [3]:
# Preprocessing
processed = ratings.copy()

# Keep only the ratings whose user_id and item_id both appear in the vocabulary.
user_is_known = processed["user_id"].isin(vocabulary["user_id"])
item_is_known = processed["item_id"].isin(vocabulary["item_id"])
processed = processed[user_is_known & item_is_known]
unknown = len(ratings) - len(processed)
print("삭제된 알 수 없는 데이터:", unknown)


# Replace each raw user_id / item_id with its 0-based position in the vocabulary.
for id_column in ("user_id", "item_id"):
    position_of = {
        raw_id: position
        for position, raw_id in enumerate(vocabulary[id_column])
    }
    processed[id_column] = processed[id_column].map(position_of)

삭제된 알 수 없는 데이터: 0


In [4]:
# Dataset hyperparameters
maximum_positive_sample = 100  # cap on positives kept per user (the most recent ones)
num_negative_sample = 5  # negatives drawn per kept positive
all_item_indices = np.arange(num_sparse_features["item_id"])

# Per user, with interactions in timestamp order:
#   1. Leave-one-out split: the latest interaction becomes the user's test row.
#   2. Downsampling: keep at most `maximum_positive_sample` of the remaining positives.
#   3. Random negative sampling: draw `num_negative_sample` unseen items per positive.
train, test = [], []
for u, group in tqdm(processed.sort_values(by="timestamp").groupby("user_id")):
    # Negative candidates must exclude EVERY item the user interacted with, so they
    # are computed before the group is trimmed. Computing them afterwards would let
    # the held-out test item, and the positives removed by downsampling, be drawn
    # as label-0 training rows.
    positive_item_indices = group["item_id"].unique()
    negative_item_indices = np.setdiff1d(all_item_indices, positive_item_indices)

    # Leave one out: (user_id, item_id) of the most recent interaction, label 1.
    last = group.iloc[-1, :2].values
    test.append([last[0], last[1], 1])

    # Downsampling: drop the held-out row, then keep only the latest positives.
    group = group.iloc[:-1]
    num_positive_sample = min(maximum_positive_sample, len(group))
    group = group.tail(num_positive_sample)

    # Positive samples: rows of (user_id, item_id, 1).
    positive = group[["user_id", "item_id"]].values
    positive = np.column_stack([positive, np.ones(len(positive), dtype=int)])

    # Negative sampling: sample with replacement only when there are fewer
    # candidates than requested draws.
    size = num_positive_sample * num_negative_sample
    replace_flag = size > len(negative_item_indices)
    negative_item_indices = np.random.choice(
        negative_item_indices,
        size=size,
        replace=replace_flag,
    )

    # Negative samples: rows of (user_id, item_id, 0).
    negative = np.column_stack([
        np.full(size, u, dtype=int),
        negative_item_indices,
        np.zeros(size, dtype=int),
    ])
    train.append(positive)
    train.append(negative)

# Stack the per-user pieces into single (n_rows, 3) arrays.
train = np.vstack(train)
test = np.vstack(test)

100%|███████████████████████████████████████| 943/943 [00:00<00:00, 2467.13it/s]


In [5]:
# Baseline (Popular): item indices ordered by interaction count, counted after
# removing each user's most recent interaction (the leave-one-out test row).
test_data = processed.sort_values(by="timestamp").groupby("user_id").tail(1)
train_data = processed.drop(test_data.index)
popular_recommendations = train_data["item_id"].value_counts().index.values

# Baseline (Random): a random ordering of all items.
# Sample from `all_item_indices` (the mapped 0-based indices), not from
# vocabulary["item_id"] (the raw ids): `processed["item_id"]`, and therefore the
# popular baseline and the train/test rows, are all expressed as mapped indices.
random_recommendations = np.random.choice(all_item_indices, size=len(all_item_indices), replace=False)

In [6]:
# Write the preprocessing artifacts to movielens/processed/.
experiment_group = {
    "random": random_recommendations.tolist(),
    "popular": popular_recommendations.tolist(),
}

# JSON artifacts: feature cardinalities and the two baseline recommendation lists.
json_artifacts = [
    ("movielens/processed/num_sparse_features.json", num_sparse_features),
    ("movielens/processed/experiment_group.json", experiment_group),
]
for json_path, payload in json_artifacts:
    with open(json_path, "w") as f:
        json.dump(payload, f)

# NumPy artifacts: the stacked train and test arrays.
array_artifacts = [
    ("movielens/processed/train.npy", train),
    ("movielens/processed/test.npy", test),
]
for array_path, array in array_artifacts:
    np.save(array_path, array)