In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Dataset sub-directory under ../../data and the split/variant suffix used in
# every input and output file name below.
folder, mode = "industrial_and_scientific", "all"

In [3]:
# Load the preprocessed inputs for the selected dataset / mode.
# NOTE(security): unpickling can execute arbitrary code; only load files that
# were produced by a trusted pipeline.
reviews = pd.read_pickle(f"../../data/{folder}/reviews_{mode}.p")
onehot = pd.read_pickle(f"../../data/{folder}/onehot_{mode}.p")
# Open through a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(f"../../data/{folder}/sbert_{mode}.p", "rb") as f:
    sbert = pickle.load(f)

In [4]:
# Distinct raw user ids and item ids (asin), in order of first appearance.
uuids = reviews["user_id"].unique()
uiids = reviews["asin"].unique()

In [5]:
# Map every raw id to a dense integer id. Numbering starts at 1, so 0 is never
# assigned to a real user or item.
user_map = {raw_user: idx for idx, raw_user in enumerate(uuids, start=1)}
item_map = {raw_item: idx for idx, raw_item in enumerate(uiids, start=1)}

In [6]:
# Persist the raw-id -> integer-id mappings.
# The files are opened with context managers so that the write buffers are
# flushed and the handles closed as soon as each dump finishes.
with open(f"../../data/{folder}/usermap_{mode}.p", "wb") as f:
    pickle.dump(user_map, f)
with open(f"../../data/{folder}/itemmap_{mode}.p", "wb") as f:
    pickle.dump(item_map, f)

In [7]:
# Reviews in chronological order, with the dense integer ids attached as the
# "iid" (item) and "uid" (user) columns.
r_sorted = reviews.sort_values("timestamp").assign(
    iid=lambda frame: frame["asin"].map(item_map),
    uid=lambda frame: frame["user_id"].map(user_map),
)

In [8]:
# Derive calendar features from the review timestamp (interpreted as seconds
# since the Unix epoch) and min-max scale each of them to [0, 1].
r_sorted["date"] = pd.to_datetime(r_sorted["timestamp"], unit="s")

norm_cols = ["year", "month", "day", "dayofweek", "dayofyear", "week"]

# Vectorised `.dt` accessors replace the per-row Python lambda; the values are
# the same (`isocalendar().week` is the ISO week number, like `Timestamp.week`).
# Casting to float64 turns the nullable UInt32 week column into a plain float
# column and keeps missing dates as NaN instead of raising.
date_parts = pd.DataFrame(
    {
        "year": r_sorted["date"].dt.year,
        "month": r_sorted["date"].dt.month,
        "day": r_sorted["date"].dt.day,
        "dayofweek": r_sorted["date"].dt.dayofweek,
        "dayofyear": r_sorted["date"].dt.dayofyear,
        "week": r_sorted["date"].dt.isocalendar().week,
    }
).astype("float64")

col_min = date_parts.min()
# A constant column has max == min; dividing by 1 instead of 0 makes it scale
# to 0.0 rather than filling the whole column with NaN.
col_range = (date_parts.max() - col_min).replace(0, 1)
date_parts = (date_parts - col_min) / col_range

for col in norm_cols:
    r_sorted[col] = date_parts[col]

In [9]:
# Context vector (normalised calendar features) keyed by (uid, iid).
# Rows are visited in r_sorted order, so if a (uid, iid) pair occurs more than
# once the last occurrence wins.
ctx = {
    (row.uid, row.iid): [row.year, row.month, row.day, row.dayofweek, row.dayofyear, row.week]
    for row in r_sorted.itertuples()
}

In [10]:
# Attach the integer item id, order rows by it, and keep only the feature
# columns as a NumPy matrix.
# NOTE(review): item ids start at 1, so row 0 would hold iid 1 — assuming one
# row per mapped item; confirm against the consumer of this file.
onehot["iid"] = onehot["asin"].map(item_map)
onehot_by_iid = onehot.sort_values("iid")
onehot_np = onehot_by_iid.drop(columns=["asin", "iid"]).to_numpy()

In [11]:
# Turn the {asin: embedding} mapping into a frame, order it by integer item id
# and stack the per-item embeddings into a single array.
sbert_t = list(sbert.items())
sbert_df = pd.DataFrame(sbert_t, columns=["asin", "embedding"])
sbert_df["iid"] = sbert_df["asin"].map(item_map)
embeddings_by_iid = sbert_df.sort_values("iid")["embedding"]
sbert_np = np.stack(embeddings_by_iid.to_numpy())

In [12]:
# All-zero single-column placeholder with one row per row of sbert_np.
# (The variable name is spelled "dummpy_np"; it is kept because the cell that
# pickles it refers to this exact name.)
n_rows = sbert_np.shape[0]
dummpy_np = np.zeros((n_rows, 1), dtype=np.float32)

In [13]:
# Total number of review rows loaded.
reviews.shape[0]

1711934

In [14]:
# Write one "<uid> <iid> <timestamp>" line per review, in chronological order.
profiles_path = f"../../data/{folder}/profiles_{mode}.txt"
with open(profiles_path, "w") as dst:
    dst.writelines(
        f"{user_map[row.user_id]} {item_map[row.asin]} {row.timestamp}\n"
        for row in r_sorted.itertuples()
    )

In [15]:
# Persist the (uid, iid) -> context-vector dictionary.
ctx_path = f"../../data/{folder}/ctx_{mode}.dat"
with open(ctx_path, "wb") as out:
    pickle.dump(ctx, out)

In [16]:
# Persist the one-hot item feature matrix.
onehot_path = f"../../data/{folder}/onehot_{mode}.dat"
with open(onehot_path, "wb") as out:
    pickle.dump(onehot_np, out)

In [17]:
# Persist the stacked SBERT item embeddings.
sbert_path = f"../../data/{folder}/sbert_{mode}.dat"
with open(sbert_path, "wb") as out:
    pickle.dump(sbert_np, out)

In [18]:
# Persist the all-zero placeholder feature matrix.
dummy_path = f"../../data/{folder}/dummy_{mode}.dat"
with open(dummy_path, "wb") as out:
    pickle.dump(dummpy_np, out)