In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from sklearn.preprocessing import MinMaxScaler

# Generation of Rating Dataset

In [2]:
# Read the "user_id" column of each user-list CSV into a plain Python list.
# `beyms` holds the ids from beyms.csv, `ms` the ids from ms.csv.
beyms, ms = (
    pd.read_csv(csv_path)["user_id"].tolist()
    for csv_path in ("../data/beyms.csv", "../data/ms.csv")
)

In [20]:
# Load the raw events and count the rows per (user_id, track_id) pair.
# The resulting frame has one row per pair and a "playcount" column holding
# the number of matching rows in events.csv.
events_df = pd.read_csv("../data/events.csv")
playcounts_df = (
    events_df
    .groupby(["user_id", "track_id"])
    .size()
    .reset_index(name="playcount")
)
playcounts_df.head()

Unnamed: 0,user_id,track_id,playcount
0,1002693,542,1
1,1002693,580,2
2,1002693,2553,1
3,1002693,6399,1
4,1002693,7462,1


In [4]:
# Turn playcounts into ratings: every user's playcounts are min-max scaled
# independently into the range [1, 1000], so a user's least-played track
# gets 1 and their most-played track gets 1000.
#
# The per-user frames are collected in a list and concatenated once at the
# end. Growing a DataFrame inside the loop copies all previous rows on every
# iteration (quadratic), and DataFrame.append no longer exists in pandas 2.x.
rating_frames = []
for user_id, data in playcounts_df.groupby("user_id"):
    # MinMaxScaler expects a 2-D float array: one column, one row per track.
    ratings = MinMaxScaler(feature_range=(1, 1000)).fit_transform(
        data["playcount"].values.reshape(-1, 1).astype(float)
    )
    new_rows = data[["user_id", "track_id"]].copy()
    # Flatten the (n, 1) scaler output so it is assigned as a plain column.
    new_rows["rating"] = ratings.ravel()
    rating_frames.append(new_rows)

if rating_frames:
    # The original row index of playcounts_df is kept, as before.
    ratings_df = pd.concat(rating_frames)
else:
    # No users at all: keep the expected schema instead of failing.
    ratings_df = pd.DataFrame(columns=["user_id", "track_id", "rating"])
ratings_df.columns = ["user_id", "item_id", "rating"]
ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,1002693,542,1.0
1,1002693,580,67.6
2,1002693,2553,1.0
3,1002693,6399,1.0
4,1002693,7462,1.0


In [25]:
# Persist the rating triples (user_id, item_id, rating); the row index is
# not written to the file.
ratings_df.to_csv(path_or_buf="../data/ratings.csv", index=False)

# Basic Statistics of Rating Data

In [27]:
# Load the user-group assignment and build one list of user ids per group.
# The "user group" column holds the labels 0..3, which map to U1..U4 in order.
user_groups_df = pd.read_csv("../data/user_groups.csv")
U1, U2, U3, U4 = (
    user_groups_df.loc[user_groups_df["user group"] == group_label, "user_id"].tolist()
    for group_label in range(4)
)

In [28]:
# Slice the rating table once per user collection: the four user groups
# U1..U4 plus the beyms and ms user lists.
(
    ratings_U1_df,
    ratings_U2_df,
    ratings_U3_df,
    ratings_U4_df,
    ratings_beyms_df,
    ratings_ms_df,
) = (
    ratings_df.loc[ratings_df["user_id"].isin(member_ids)]
    for member_ids in (U1, U2, U3, U4, beyms, ms)
)

In [29]:
def rating_sparsity(df):
    """Return the sparsity of the user-item matrix spanned by `df`.

    Sparsity is 1 - (#ratings / (#distinct users * #distinct items)), where
    the number of ratings is the number of rows in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating table with "user_id" and "item_id" columns.

    Returns
    -------
    float
        The sparsity, or NaN when `df` has no users or no items (the matrix
        would have zero cells and the ratio is undefined).
    """
    n_cells = df["user_id"].nunique() * df["item_id"].nunique()
    if n_cells == 0:
        return float("nan")
    return 1 - len(df) / n_cells


# One sparsity value per user collection, plus the value for all ratings.
sparsity_U1 = rating_sparsity(ratings_U1_df)
sparsity_U2 = rating_sparsity(ratings_U2_df)
sparsity_U3 = rating_sparsity(ratings_U3_df)
sparsity_U4 = rating_sparsity(ratings_U4_df)
sparsity_beyms = rating_sparsity(ratings_beyms_df)
sparsity_ms = rating_sparsity(ratings_ms_df)
sparsity_all = rating_sparsity(ratings_df)

In [30]:
# Rating subsets in the order used by the placeholders of the format strings:
# U_1, U_2, U_3, U_4, beyms, ms, all.
rating_subsets = (
    ratings_U1_df,
    ratings_U2_df,
    ratings_U3_df,
    ratings_U4_df,
    ratings_beyms_df,
    ratings_ms_df,
    ratings_df,
)

# Distinct users, distinct tracks and number of rating rows per subset.
print("[Users] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(subset["user_id"].nunique() for subset in rating_subsets))
print("[Tracks] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(subset["item_id"].nunique() for subset in rating_subsets))
print("[Ratings] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(len(subset) for subset in rating_subsets))
# Sparsity values were computed in the previous cell.
print("[Sparsity] U_1: %f, U_2: %f, U_3: %f, U_4: %f, beyms: %f, ms: %f, all: %f" % (sparsity_U1, sparsity_U2, sparsity_U3, sparsity_U4, sparsity_beyms, sparsity_ms, sparsity_all))

[Users] U_1: 369, U_2: 919, U_3: 143, U_4: 642, beyms: 2074, ms: 2074, all: 4148
[Tracks] U_1: 257013, U_2: 396557, U_3: 119524, U_4: 377768, beyms: 799659, ms: 707005, all: 1084922
[Ratings] U_1: 433060, U_2: 935411, U_3: 164203, U_4: 848247, beyms: 2380922, ms: 2948512, all: 5329434
[Sparsity] U_1: 0.995434, U_2: 0.997433, U_3: 0.990393, U_4: 0.996502, beyms: 0.998564, ms: 0.997989, all: 0.998816


In [29]:
# Descriptive statistics for the users listed in `beyms`.

# Filter the events once and reuse the result for every statistic below.
events_beyms_df = events_df[events_df["user_id"].isin(beyms)]
n_artists = events_beyms_df["artist_id"].nunique()
n_tracks = events_beyms_df["track_id"].nunique()

# Users with a known country, indexed by user_id. Rows are selected by
# membership instead of `.loc[beyms]`, because a label lookup raises KeyError
# on current pandas as soon as one listed user has no country entry.
users_country_df = pd.read_csv("../data/LFM-1b_users.txt", sep="\t")[["user_id", "country"]].set_index("user_id").dropna()
beyms_countries_df = users_country_df[users_country_df.index.isin(beyms)]
users_per_country = beyms_countries_df["country"].value_counts()
mean_users_per_country = users_per_country.mean()
std_users_per_country = users_per_country.std()

# Listening events (LEs) per user: compute the summary once and read the
# individual statistics from it.
n_LEs = len(events_beyms_df)
groupby_beyms = events_beyms_df.groupby("user_id")
LEs_per_user_summary = groupby_beyms.size().describe()
min_LEs_per_user = LEs_per_user_summary.loc["min"]
q1_LEs_per_user = LEs_per_user_summary.loc["25%"]
median_LEs_per_user = LEs_per_user_summary.loc["50%"]
q3_LEs_per_user = LEs_per_user_summary.loc["75%"]
max_LEs_per_user = LEs_per_user_summary.loc["max"]
mean_LEs_per_user = LEs_per_user_summary.loc["mean"]
std_LEs_per_user = LEs_per_user_summary.loc["std"]

print("=== BeyMS ===")
print("Users: %d" % len(beyms))
print("Tracks: %d" % n_tracks)
print("Artists: %d" % n_artists)
print("Listening Events (LEs): %d" % n_LEs)
print("Min. LEs per user: %d" % min_LEs_per_user)
print("Q1 LEs per user: %d" % q1_LEs_per_user)
print("Median LEs per user: %d" % median_LEs_per_user)
print("Q3 LEs per user: %d" % q3_LEs_per_user)
print("Max. LEs per user: %d" % max_LEs_per_user)
print("Mean (Std) LEs per user: %f (%f)" % (mean_LEs_per_user, std_LEs_per_user))
print("Mean (Std) Users per country: %f (%f)" % (mean_users_per_country, std_users_per_country))

=== BeyMS ===
Users: 2074
Tracks: 799659
Artists: 83467
Listening Events (LEs): 7941793
Min. LEs per user: 9
Q1 LEs per user: 2398
Median LEs per user: 3558
Q3 LEs per user: 5067
Max. LEs per user: 11177
Mean (Std) LEs per user: 3829.215526 (1940.383957)
Mean (Std) Users per country: 44.127660 (77.669483)


In [30]:
# Descriptive statistics for the users listed in `ms`.

# Filter the events once and reuse the result for every statistic below.
events_ms_df = events_df[events_df["user_id"].isin(ms)]
n_artists = events_ms_df["artist_id"].nunique()
n_tracks = events_ms_df["track_id"].nunique()

# Users with a known country, indexed by user_id. Rows are selected by
# membership instead of `.loc[ms]`, because a label lookup raises KeyError
# on current pandas as soon as one listed user has no country entry.
users_country_df = pd.read_csv("../data/LFM-1b_users.txt", sep="\t")[["user_id", "country"]].set_index("user_id").dropna()
ms_countries_df = users_country_df[users_country_df.index.isin(ms)]
users_per_country = ms_countries_df["country"].value_counts()
mean_users_per_country = users_per_country.mean()
std_users_per_country = users_per_country.std()

# Listening events (LEs) per user: compute the summary once and read the
# individual statistics from it.
n_LEs = len(events_ms_df)
groupby_ms = events_ms_df.groupby("user_id")
LEs_per_user_summary = groupby_ms.size().describe()
min_LEs_per_user = LEs_per_user_summary.loc["min"]
q1_LEs_per_user = LEs_per_user_summary.loc["25%"]
median_LEs_per_user = LEs_per_user_summary.loc["50%"]
q3_LEs_per_user = LEs_per_user_summary.loc["75%"]
max_LEs_per_user = LEs_per_user_summary.loc["max"]
mean_LEs_per_user = LEs_per_user_summary.loc["mean"]
std_LEs_per_user = LEs_per_user_summary.loc["std"]

print("=== MS ===")
print("Users: %d" % len(ms))
print("Tracks: %d" % n_tracks)
print("Artists: %d" % n_artists)
print("Listening Events (LEs): %d" % n_LEs)
print("Min. LEs per user: %d" % min_LEs_per_user)
print("Q1 LEs per user: %d" % q1_LEs_per_user)
print("Median LEs per user: %d" % median_LEs_per_user)
print("Q3 LEs per user: %d" % q3_LEs_per_user)
print("Max. LEs per user: %d" % max_LEs_per_user)
print("Mean (Std) LEs per user: %f (%f)" % (mean_LEs_per_user, std_LEs_per_user))
print("Mean (Std) Users per country: %f (%f)" % (mean_users_per_country, std_users_per_country))

=== MS ===
Users: 2074
Tracks: 707005
Artists: 75514
Listening Events (LEs): 8745570
Min. LEs per user: 137
Q1 LEs per user: 2835
Median LEs per user: 4018
Q3 LEs per user: 5407
Max. LEs per user: 10781
Mean (Std) LEs per user: 4216.764706 (1834.878873)
Mean (Std) Users per country: 44.127660 (73.199138)


In [33]:
# Descriptive statistics over all events and over both user lists together.
n_artists = events_df["artist_id"].nunique()
n_tracks = events_df["track_id"].nunique()

# Users with a known country, indexed by user_id. Rows are selected by
# membership instead of `.loc[beyms+ms]`, because a label lookup raises
# KeyError on current pandas as soon as one listed user has no country entry.
users_country_df = pd.read_csv("../data/LFM-1b_users.txt", sep="\t")[["user_id", "country"]].set_index("user_id").dropna()
all_countries_df = users_country_df[users_country_df.index.isin(beyms+ms)]
users_per_country = all_countries_df["country"].value_counts()
mean_users_per_country = users_per_country.mean()
std_users_per_country = users_per_country.std()

# Listening events (LEs) per user: compute the summary once and read the
# individual statistics from it.
n_LEs = len(events_df)
groupby = events_df.groupby("user_id")
LEs_per_user_summary = groupby.size().describe()
min_LEs_per_user = LEs_per_user_summary.loc["min"]
q1_LEs_per_user = LEs_per_user_summary.loc["25%"]
median_LEs_per_user = LEs_per_user_summary.loc["50%"]
q3_LEs_per_user = LEs_per_user_summary.loc["75%"]
max_LEs_per_user = LEs_per_user_summary.loc["max"]
mean_LEs_per_user = LEs_per_user_summary.loc["mean"]
std_LEs_per_user = LEs_per_user_summary.loc["std"]

print("=== Overall ===")
print("Users: %d" % (len(beyms)+len(ms)))
print("Tracks: %d" % n_tracks)
print("Artists: %d" % n_artists)
print("Listening Events (LEs): %d" % n_LEs)
print("Min. LEs per user: %d" % min_LEs_per_user)
print("Q1 LEs per user: %d" % q1_LEs_per_user)
print("Median LEs per user: %d" % median_LEs_per_user)
print("Q3 LEs per user: %d" % q3_LEs_per_user)
print("Max. LEs per user: %d" % max_LEs_per_user)
print("Mean (Std) LEs per user: %f (%f)" % (mean_LEs_per_user, std_LEs_per_user))
print("Mean (Std) Users per country: %f (%f)" % (mean_users_per_country, std_users_per_country))

=== Overall ===
Users: 4148
Tracks: 1084922
Artists: 110898
Listening Events (LEs): 16687363
Min. LEs per user: 9
Q1 LEs per user: 2604
Median LEs per user: 3766
Q3 LEs per user: 5252
Max. LEs per user: 11177
Mean (Std) LEs per user: 4022.990116 (1898.060313)
Mean (Std) Users per country: 88.255319 (150.124074)
