In [168]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [169]:
def preprocessing(df, user_core=20, threshold=3.5):
    profile_size = df.groupby("user_id").size()
    uids = profile_size[profile_size >= user_core].index
    modified_df = df[df["user_id"].isin(uids)].copy()
    modified_df = modified_df[modified_df["rating"].astype(float) >= threshold]
    return modified_df

# LFM 100k

In [102]:
df = pd.read_csv("LFM2b-100k-subset/sampled_100000_items_inter.txt", sep="\t", header=None, names=["user_id", "item_id", "listening_count"])
df.head()

Unnamed: 0,user_id,item_id,listening_count
0,0,0,2
1,0,55,3
2,0,54,3
3,0,53,19
4,0,52,20


In [103]:
ratings_df = pd.DataFrame()
for user_id, data in df.groupby("user_id"):
    ratings = MinMaxScaler(feature_range=(1, 5)).fit_transform(data["listening_count"].values.reshape(-1, 1).astype(float))
    new_rows = data[["user_id", "item_id"]].copy()
    new_rows["rating"] = ratings
    ratings_df = pd.concat([ratings_df, new_rows])
ratings_df.columns = ["user_id", "item_id", "rating"]

print(ratings_df["user_id"].nunique(), ratings_df["item_id"].nunique(), len(ratings_df), ratings_df.groupby("user_id").size().mean())



final_df = preprocessing(ratings_df, user_core=20, threshold=3.5)
print(final_df["user_id"].nunique(), final_df["item_id"].nunique(), len(final_df), final_df.groupby("user_id").size().mean())

19972 99831 2830541 141.72546565191269


(16427, 23050, 57241, 3.4845680891215682)

# Amazon Sports and Outdoors

In [172]:
df = pd.read_csv("amazon/Sports_and_Outdoors.csv", sep=",", header=None, names=["item_id", "user_id", "rating", "timestamp"])
df = df[["user_id", "item_id", "rating"]]
df["user_id"] = df["user_id"].map({b: a for a, b in enumerate(df["user_id"].unique())})
df["item_id"] = df["item_id"].map({b: a for a, b in enumerate(df["item_id"].unique())})
df.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,5.0
1,1,0,4.0
2,2,0,1.0
3,3,0,5.0
4,4,0,1.0


In [173]:
ratings_df = df
print(ratings_df["user_id"].nunique(), ratings_df["item_id"].nunique(), len(ratings_df), ratings_df.groupby("user_id").size().mean())

final_df = preprocessing(ratings_df, user_core=20, threshold=3.5)
print(final_df["user_id"].nunique(), final_df["item_id"].nunique(), len(final_df), final_df.groupby("user_id").size().mean())

6703391 957764 12980837 1.9364582791008311
22703 172060 602952 26.558252213363872


In [174]:
(final_df.groupby("item_id").size() > 1).sum()

71609

In [175]:
final_df[["user_id", "item_id"]].to_csv("sportsoutdoors_preprocessed.csv", index=False, header=None)

# LFM 3k User Groups

In [164]:
df = pd.read_csv("LFM-3k/artist_ratings.csv", sep="\;", header=None, names=["user_id", "item_id", "listening_count"])
df.head()

  df = pd.read_csv("LFM-3k/artist_ratings.csv", sep="\;", header=None, names=["user_id", "item_id", "listening_count"])


Unnamed: 0,user_id,item_id,listening_count
0,1021445,12,184.222707
1,1021445,16,1.0
2,1021445,28,27.174672
3,1021445,29,1.0
4,1021445,46,1.0


In [165]:
ratings_df = pd.DataFrame()
for user_id, data in df.groupby("user_id"):
    ratings = MinMaxScaler(feature_range=(1, 5)).fit_transform(data["listening_count"].values.reshape(-1, 1).astype(float))
    new_rows = data[["user_id", "item_id"]].copy()
    new_rows["rating"] = ratings
    ratings_df = pd.concat([ratings_df, new_rows])
ratings_df.columns = ["user_id", "item_id", "rating"]

print(ratings_df["user_id"].nunique(), ratings_df["item_id"].nunique(), len(ratings_df), ratings_df.groupby("user_id").size().mean())

final_df = preprocessing(ratings_df, user_core=20, threshold=1.5)
print(final_df["user_id"].nunique(), final_df["item_id"].nunique(), len(final_df), final_df.groupby("user_id").size().mean(), final_df.groupby("item_id").size().mean(), 100 * (len(final_df) / (final_df["user_id"].nunique() * final_df["item_id"].nunique())))

3000 352805 1755361 585.1203333333333
2999 32129 116108 38.71557185728576 3.6138068411715274 0.12050039483733001


In [146]:
(final_df.groupby("item_id").size() > 1).sum()

12311

In [147]:
final_df[["user_id", "item_id"]].to_csv("lfm3k_preprocessed.csv", index=False, header=None)

# ML 1M

In [157]:
df = pd.read_csv("ml-1m/ml-1m.inter", sep="\t")
df.columns = ["user_id", "item_id", "rating", "timestamp"]
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [163]:
ratings_df = df
print(ratings_df["user_id"].nunique(), ratings_df["item_id"].nunique(), len(ratings_df), ratings_df.groupby("user_id").size().mean(), ratings_df.groupby("item_id").size().mean(), 100 * (len(ratings_df) / (ratings_df["user_id"].nunique() * ratings_df["item_id"].nunique())))

final_df = preprocessing(ratings_df, user_core=20, threshold=3.5)
print(final_df["user_id"].nunique(), final_df["item_id"].nunique(), len(final_df), final_df.groupby("user_id").size().mean(), final_df.groupby("item_id").size().mean(), 100 * (len(final_df) / (final_df["user_id"].nunique() * final_df["item_id"].nunique())))

6040 3706 1000209 165.5975165562914 269.88909875876953 4.468362562231285
6038 3533 575281 95.27674726730706 162.8307387489386 2.6967661270112386


In [142]:
(final_df.groupby("item_id").size() > 1).sum()

5974

In [127]:
final_df[["user_id", "item_id"]].to_csv("ml1m_preprocessed.csv", index=False, header=None)