In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Anime Small

In [None]:
df = pd.read_csv("data/anime_small/rating.csv")
df.columns = ["user_id", "item_id", "rating"]
df = df[df["rating"] != -1]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
profile_sizes = df.groupby("user_id").size()
relevant_users = profile_sizes[profile_sizes >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
user_sample = np.random.choice(df["user_id"].unique(), size=5000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/anime_small/sample.csv", sep=";", header=False, index=None)

# Jester

In [None]:
df1 = pd.read_excel("data/jester/jester-data-1.xls", header=None)
df1.drop(columns=[0], inplace=True)
df1 = df1.stack().reset_index()
df1.columns = ["user_id", "item_id", "rating"]

df2 = pd.read_excel("data/jester/jester-data-2.xls", header=None)
df2.drop(columns=[0], inplace=True)
df2 = df2.stack().reset_index()
df2.columns = ["user_id", "item_id", "rating"]

df3 = pd.read_excel("data/jester/jester-data-3.xls", header=None)
df3.drop(columns=[0], inplace=True)
df3 = df3.stack().reset_index()
df3.columns = ["user_id", "item_id", "rating"]

n_users_1 = df1["user_id"].nunique()
n_users_2 = df2["user_id"].nunique()
n_users_3 = df3["user_id"].nunique()

user_mapping_1 = {b: a for a, b in enumerate(range(n_users_1))}
user_mapping_2 = {b: a+n_users_1 for a, b in enumerate(range(n_users_2))}
user_mapping_3 = {b: a+n_users_1+n_users_2 for a, b in enumerate(range(n_users_3))}

df1["user_id"] = df1["user_id"].map(user_mapping_1)
df2["user_id"] = df2["user_id"].map(user_mapping_2)
df3["user_id"] = df3["user_id"].map(user_mapping_3)
df = df1.append(df2).append(df3)

df["item_id"] = df["item_id"].apply(lambda v: v-1)
df.astype({"user_id": int, "item_id": int});

df = df[df["rating"] != 99]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

del df1, df2, df3

In [None]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
user_sample = np.random.choice(df["user_id"].unique(), size=5000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/jester/sample.csv", sep=";", header=None, index=False)

# Douban

In [31]:
db_train_df = pd.read_csv("data/douban/db.train.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
db_val_df = pd.read_csv("data/douban/db.valid.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
db_test_df = pd.read_csv("data/douban/db.test.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])

df = db_train_df.append(db_val_df).append(db_test_df)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())
df.to_csv("data/douban/douban.csv", sep=";", header=False, index=False)

del db_train_df, db_val_df, db_test_df

Nr. of ratings: 893575
Nr. of users: 2509
Nr. of items: 39576
Density: 0.008999
Avg. nr. of ratings per user: 356.147868
Avg. nr. of ratings per item: 22.578709


In [None]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

user_sample = np.random.choice(df["user_id"].unique(), size=1000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/douban/sample.csv", sep=";", header=None, index=False)

# Ciao

In [None]:
ciao_train_df = pd.read_csv("data/ciao/ciao.train.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
ciao_val_df = pd.read_csv("data/ciao/ciao.valid.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
ciao_test_df = pd.read_csv("data/ciao/ciao.test.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])

df = ciao_train_df.append(ciao_val_df).append(ciao_test_df)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

df.to_csv("data/ciao/ciao.csv", sep=";", header=False, index=False)

del ciao_train_df, ciao_test_df, ciao_val_df

In [None]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
user_sample = np.random.choice(df["user_id"].unique(), size=1000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/ciao/sample.csv", sep=";", header=None, index=False)

# Goodreads

In [24]:
df = pd.read_csv("data/goodreads/goodreads_interactions.csv", sep=",", usecols=["user_id", "book_id", "rating"])
df.columns = ["user_id", "item_id", "rating"]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 228648342
Nr. of users: 876145
Nr. of items: 2360650
Density: 0.000111
Avg. nr. of ratings per user: 260.970892
Avg. nr. of ratings per item: 96.858214


In [25]:
df[df["rating"] == 0] = 3
#df[df["rating"] == 0] = np.mean(df[df["rating"] != 0])
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 228648342
Nr. of users: 816371
Nr. of items: 2325541
Density: 0.000120
Avg. nr. of ratings per user: 280.078962
Avg. nr. of ratings per item: 98.320495


In [26]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
idxs = df[~df["user_id"].isin(relevant_users)].index
df.drop(idxs, inplace=True)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 227293625
Nr. of users: 639453
Nr. of items: 2314387
Density: 0.000154
Avg. nr. of ratings per user: 355.450088
Avg. nr. of ratings per item: 98.208997


In [27]:
user_sample = np.random.choice(df["user_id"].unique(), size=1000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 153297
Nr. of users: 1000
Nr. of items: 75266
Density: 0.002037
Avg. nr. of ratings per user: 153.297000
Avg. nr. of ratings per item: 2.036736


In [28]:
df.to_csv("data/goodreads/sample.csv", sep=";", header=None, index=False)

# Bookcrossing

In [None]:
df = pd.read_csv("data/BX-CSV-Dump/BX-Book-Ratings.csv", sep=";", encoding="latin1")
df.columns = ["user_id", "item_id", "rating"]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

df.to_csv("data/bookcrossing/full.csv", sep=";", header=None, index=False)

In [None]:
user_sample = np.random.choice(df["user_id"].unique(), size=1000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/bookcrossing/sample.csv", sep=";", header=None, index=False)

# Foursquare

In [None]:
df = pd.read_csv("data/foursquare/ratings.dat", sep="|")
df.columns = ["user_id", "item_id", "rating"]
df.dropna(inplace=True)
df["item_id"] = df["item_id"].astype(int)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
profile_size = df.groupby("user_id").size()
relevant_users = profile_size[profile_size >= 20].index.tolist()
df = df[df["user_id"].isin(relevant_users)]

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
user_sample = np.random.choice(df["user_id"].unique(), size=1000, replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

In [None]:
df.to_csv("data/foursquare/sample.csv", sep=";", header=None, index=False)

# MovieLens 100k

In [32]:
df = pd.read_csv("data/ml-100k/ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 100000
Nr. of users: 943
Nr. of items: 1682
Density: 0.063047
Avg. nr. of ratings per user: 106.044539
Avg. nr. of ratings per item: 59.453032


In [38]:
np.sum(df.groupby("user_id").size() < 30)

199

# MovieLens 1M

In [None]:
df = pd.read_csv("data/ml-1m/ratings.dat", sep="::", names=["user_id", "item_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

# LFM User Groups

In [29]:
df = pd.read_csv("data/lfm/user_events.txt", sep="\t", header=None, names=["user_id", "artist_id", "album_id", "track_id", "timestamp"])
df.head()

Unnamed: 0,user_id,artist_id,album_id,track_id,timestamp
0,31435741,2,4,4,1385212958
1,31435741,2,4,4,1385212642
2,31435741,2,4,4,1385212325
3,31435741,2,4,4,1385209508
4,31435741,2,4,4,1385209191


In [30]:
df_ = df.groupby(["user_id", "artist_id"]).size().reset_index(name='count')

scaled_df_events = pd.DataFrame()
for user_id, group in df_.groupby("user_id"):
    min_rating = group['count'].min()
    max_rating = group['count'].max()
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaled_ratings = scaler.fit_transform(group['count'].values.reshape(-1, 1).astype(float))
    new_rows = group.copy()
    new_rows['count'] = scaled_ratings
    scaled_df_events = scaled_df_events.append(new_rows)

print("Nr. of ratings: %d" % len(scaled_df_events))
print("Nr. of users: %d" % scaled_df_events["user_id"].nunique())
print("Nr. of items: %d" % scaled_df_events["artist_id"].nunique())
print("Density: %f" % (len(scaled_df_events) / (scaled_df_events["user_id"].nunique() * scaled_df_events["artist_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % scaled_df_events.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % scaled_df_events.groupby("artist_id").size().mean())

scaled_df_events.columns = ["user_id", "item_id", "rating"]
scaled_df_events.to_csv("data/lfm/artist_ratings.csv", sep=";", index=False, header=None)

Nr. of ratings: 1755361
Nr. of users: 3000
Nr. of items: 352805
Density: 0.001658
Avg. nr. of ratings per user: 585.120333
Avg. nr. of ratings per item: 4.975443


In [None]:
df_ = df.groupby(["user_id", "album_id"]).size().reset_index(name='count')

scaled_df_events = pd.DataFrame()
for user_id, group in df_.groupby("user_id"):
    min_rating = group['count'].min()
    max_rating = group['count'].max()
    scaler = MinMaxScaler(feature_range=(1, 1000))
    scaled_ratings = scaler.fit_transform(group['count'].values.reshape(-1, 1).astype(float))
    new_rows = group.copy()
    new_rows['count'] = scaled_ratings
    scaled_df_events = scaled_df_events.append(new_rows)

print("Nr. of ratings: %d" % len(scaled_df_events))
print("Nr. of users: %d" % scaled_df_events["user_id"].nunique())
print("Nr. of items: %d" % scaled_df_events["album_id"].nunique())
print("Density: %f" % (len(scaled_df_events) / (scaled_df_events["user_id"].nunique() * scaled_df_events["album_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % scaled_df_events.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % scaled_df_events.groupby("album_id").size().mean())

scaled_df_events.columns = ["user_id", "item_id", "rating"]
scaled_df_events.to_csv("data/lfm/album_ratings.csv", sep=";", index=False, header=None)

In [None]:
df_ = df.groupby(["user_id", "track_id"]).size().reset_index(name='count')

scaled_df_events = pd.DataFrame()
for user_id, group in df_.groupby("user_id"):
    min_rating = group['count'].min()
    max_rating = group['count'].max()
    scaler = MinMaxScaler(feature_range=(1, 1000))
    scaled_ratings = scaler.fit_transform(group['count'].values.reshape(-1, 1).astype(float))
    new_rows = group.copy()
    new_rows['count'] = scaled_ratings
    scaled_df_events = scaled_df_events.append(new_rows)

print("Nr. of ratings: %d" % len(scaled_df_events))
print("Nr. of users: %d" % scaled_df_events["user_id"].nunique())
print("Nr. of items: %d" % scaled_df_events["track_id"].nunique())
print("Density: %f" % (len(scaled_df_events) / (scaled_df_events["user_id"].nunique() * scaled_df_events["track_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % scaled_df_events.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % scaled_df_events.groupby("track_id").size().mean())

scaled_df_events.columns = ["user_id", "item_id", "rating"]
scaled_df_events.to_csv("data/lfm/track_ratings.csv", sep=";", index=False, header=None)