In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [1]:
def preprocessing(df, user_core=20, threshold=3.5):
    """Keep the positive feedback of sufficiently active users.

    Two filters are applied in this order:

    1. user-core: drop every user with fewer than ``user_core`` rows in ``df``;
    2. rating threshold: drop every remaining row whose ``rating`` (cast to
       float) is below ``threshold``.

    Because the threshold runs after the user-core filter, a returned user
    may end up with fewer than ``user_core`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least the columns ``user_id`` and ``rating``.
    user_core : int, default 20
        Minimum number of rows a user needs in ``df`` to be kept.
    threshold : float, default 3.5
        Minimum rating (inclusive) for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index labels; ``df`` itself is
        not modified.
    """
    rows_per_user = df.groupby("user_id").size()
    active_users = rows_per_user.loc[rows_per_user.ge(user_core)].index

    # Cast ratings only after the user filter so rows of dropped users are never converted.
    dense_df = df.loc[df["user_id"].isin(active_users)].copy()
    is_positive = dense_df["rating"].astype(float).ge(threshold)
    return dense_df.loc[is_positive]

# Amazon Grocery and Gourmet Food

In [275]:
# The file has no header row; columns are named here and the timestamp is dropped.
raw_columns = ["item_id", "user_id", "rating", "timestamp"]
df = pd.read_csv("Grocery_and_Gourmet_Food.csv", sep=",", header=None, names=raw_columns)
df = df[["user_id", "item_id", "rating"]]

# Replace the raw ids by consecutive integers, numbered in order of first appearance.
for id_column in ("user_id", "item_id"):
    id_lookup = {raw_id: new_id for new_id, raw_id in enumerate(df[id_column].unique())}
    df[id_column] = df[id_column].map(id_lookup)
df.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,5.0
1,1,0,4.0
2,2,0,4.0
3,3,0,5.0
4,4,0,4.0


In [276]:
# Iterative core pruning: alternately drop items with fewer than `item_core` interactions
# and users with fewer than `user_core` interactions until the number of interactions stops
# changing. Only then is the rating threshold (mean rating of the unpruned data) applied.
thresh = df["rating"].mean()
item_core = 5
user_core = 20

modified_df = df.copy()
n_prev_interactions = -1
while True:
    # item core
    interactions_per_item = modified_df.groupby("item_id").size()
    kept_items = interactions_per_item[interactions_per_item >= item_core].index
    modified_df = modified_df[modified_df["item_id"].isin(kept_items)].copy()

    # user core
    interactions_per_user = modified_df.groupby("user_id").size()
    kept_users = interactions_per_user[interactions_per_user >= user_core].index
    modified_df = modified_df[modified_df["user_id"].isin(kept_users)].copy()

    # converged when a full item+user pass removed nothing
    converged = n_prev_interactions == len(modified_df)
    if converged:
        print()
        print("Converged ... apply rating threshold")
        modified_df = modified_df[modified_df["rating"].astype(float) >= thresh].copy()
    else:
        n_prev_interactions = len(modified_df)

    # users, items, interactions, mean user profile size, mean item profile size, density in %
    n_users = modified_df["user_id"].nunique()
    n_items = modified_df["item_id"].nunique()
    n_interactions = len(modified_df)
    mean_user_profile = modified_df.groupby("user_id").size().mean()
    mean_item_profile = modified_df.groupby("item_id").size().mean()
    print(n_users, n_items, n_interactions, mean_user_profile, mean_item_profile, 100 * (n_interactions / (n_users * n_items)))

    if converged:
        break

final_df = modified_df.copy()

8454 58532 288374 34.11095339484268 4.926775097382628 0.05827744378261922
4482 12658 147676 32.94868362338242 11.666613999051982 0.2602992860118693
3672 8565 120435 32.798202614379086 14.061295971978984 0.3829328968403863
3412 7503 111681 32.73182883939039 14.88484606157537 0.4362498845713766
3318 7131 108496 32.69921639541893 15.21469639601739 0.4585502229058887
3276 6994 107161 32.71092796092796 15.321847297683728 0.4676998564616523
3260 6945 106666 32.71963190184049 15.358675305975522 0.4711250093857522
3255 6921 106476 32.71152073732719 15.384482011270048 0.47264153644454826
3252 6908 106367 32.70817958179582 15.397654892877823 0.4734826227822209
3250 6905 106319 32.71353846153846 15.39739319333816 0.4737659444104049
3249 6899 106276 32.71037242228378 15.404551384258589 0.47413208323356687
3247 6895 106222 32.71388974437943 15.405656272661348 0.47445815437823674
3244 6890 106145 32.72040690505548 15.40566037735849 0.47489705232301144
3244 6885 106125 32.714241676942045 15.413943355

In [279]:
# Write only the (user_id, item_id) pairs, without index or header row.
# header=False is the documented way to omit the header (header=None only works because it is falsy).
final_df[["user_id", "item_id"]].to_csv("grocery_preprocessed.csv", index=False, header=False)

# LastFM User Groups

In [22]:
# The file is ';'-separated without a header row. A plain ";" replaces the former "\;", which
# is an invalid string escape and, being two characters long, was interpreted by pandas as a
# regular expression and parsed with the slower python engine.
df = pd.read_csv("LFM-3k/artist_ratings.csv", sep=";", header=None, names=["user_id", "item_id", "listening_count"])
df.head()

  df = pd.read_csv("LFM-3k/artist_ratings.csv", sep="\;", header=None, names=["user_id", "item_id", "listening_count"])


Unnamed: 0,user_id,item_id,listening_count
0,1021445,12,184.222707
1,1021445,16,1.0
2,1021445,28,27.174672
3,1021445,29,1.0
4,1021445,46,1.0


In [23]:
# Scale each user's listening counts to the range (1, 5), one scaler per user.
# The per-user frames are collected in a list and concatenated once; concatenating inside the
# loop would re-copy the growing result on every iteration.
scaled_frames = []
for _, data in df.groupby("user_id"):
    listening_counts = data["listening_count"].values.reshape(-1, 1).astype(float)
    ratings = MinMaxScaler(feature_range=(1, 5)).fit_transform(listening_counts)
    new_rows = data[["user_id", "item_id"]].copy()
    new_rows["rating"] = ratings.ravel()  # fit_transform returns an (n, 1) array
    scaled_frames.append(new_rows)
ratings_df = pd.concat(scaled_frames)

# users, items, interactions, mean user profile size
print(ratings_df["user_id"].nunique(), ratings_df["item_id"].nunique(), len(ratings_df), ratings_df.groupby("user_id").size().mean())
mean_rating = ratings_df["rating"].mean()
print(mean_rating)

# apply user core pruning and filter positive feedback (ratings at or above the mean rating)
final_df = preprocessing(ratings_df, user_core=20, threshold=mean_rating)
# users, items, interactions, mean user profile size, mean item profile size, density in %
print(final_df["user_id"].nunique(), final_df["item_id"].nunique(), len(final_df), final_df.groupby("user_id").size().mean(), final_df.groupby("item_id").size().mean(), 100 * (len(final_df) / (final_df["user_id"].nunique() * final_df["item_id"].nunique())))

3000 352805 1755361 585.1203333333333
1.1292309186701142
2999 78799 348437 116.18439479826608 4.421845454891559 0.14744399649521706


In [24]:
# Write only the (user_id, item_id) pairs, without index or header row.
# header=False is the documented way to omit the header (header=None only works because it is falsy).
final_df[["user_id", "item_id"]].to_csv("lfm3k_preprocessed.csv", index=False, header=False)

# MovieLens 1M

In [265]:
# "::" is a multi-character separator, which only the python parsing engine supports.
# Requesting that engine explicitly avoids the fallback warning pandas emits otherwise.
df = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
    engine="python",
)
df.head()

  df = pd.read_csv("ml-1m/ratings.dat", sep="::", header=None)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [266]:
# Only positive feedback is kept here (ratings at or above the mean rating); user core pruning
# has already been performed on this dataset.
ratings_df = df

# users, items, interactions, mean user profile size, mean item profile size, density in %
n_users = ratings_df["user_id"].nunique()
n_items = ratings_df["item_id"].nunique()
n_interactions = len(ratings_df)
print(n_users, n_items, n_interactions, ratings_df.groupby("user_id").size().mean(), ratings_df.groupby("item_id").size().mean(), 100 * (n_interactions / (n_users * n_items)))
mean_rating = ratings_df["rating"].mean()
print(mean_rating)

final_df = preprocessing(ratings_df, user_core=20, threshold=mean_rating)
n_users_kept = final_df["user_id"].nunique()
n_items_kept = final_df["item_id"].nunique()
n_interactions_kept = len(final_df)
print(n_users_kept, n_items_kept, n_interactions_kept, final_df.groupby("user_id").size().mean(), final_df.groupby("item_id").size().mean(), 100 * (n_interactions_kept / (n_users_kept * n_items_kept)))

6040 3706 1000209 165.5975165562914 269.88909875876953 4.468362562231285
3.581564453029317
6038 3533 575281 95.27674726730706 162.8307387489386 2.6967661270112386


In [267]:
# Distribution of user profile sizes after filtering. The rating threshold is applied after
# the user-core filter, so profiles smaller than user_core can occur here.
profile_sizes = final_df.groupby("user_id").size()
profile_sizes.describe()

count    6038.000000
mean       95.276747
std       105.005005
min         1.000000
25%        27.000000
50%        58.000000
75%       124.000000
max      1435.000000
dtype: float64

In [30]:
# Write only the (user_id, item_id) pairs, without index or header row.
# header=False is the documented way to omit the header (header=None only works because it is falsy).
final_df[["user_id", "item_id"]].to_csv("ml1m_preprocessed.csv", index=False, header=False)