In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from sklearn.preprocessing import MinMaxScaler

# Generation of Rating Dataset

In [2]:
# Read the `user_id` column of each user-list CSV into a plain Python list.
beyms, ms = (
    pd.read_csv(csv_path)["user_id"].tolist()
    for csv_path in ("data/beyms.csv", "data/ms.csv")
)

In [3]:
# Parse the `KEY = VALUE` lines of the credentials file into a two-column frame.
db_options_df = pd.read_csv("db_credentials.txt", header=None, sep="=")
db_options_df.columns = ["variable", "value"]

# Trim the whitespace around both keys and values, then index the frame by key.
db_options_df = db_options_df.apply(lambda column: column.str.strip()).set_index("variable")

# Plain {key: value} mapping used when building the connection URL.
db_options = db_options_df["value"].to_dict()

In [4]:
# Build the engine URL from the parsed credentials.
# NOTE(review): DB_PATH is appended verbatim, so it presumably carries the
# "@host/database" part of the URL -- confirm against db_credentials.txt.
# NOTE(review): username/password are not URL-escaped; a password containing
# characters such as "@" or "/" would need quoting -- confirm.
connection = sqlalchemy.create_engine(
    "mysql+pymysql://" + db_options["DB_USERNAME"] + ":" + db_options["DB_PW"] + db_options["DB_PATH"]
)

# Render the IN (...) list explicitly instead of relying on str(tuple(...)):
# the tuple repr yields "(x,)" for a single id and "()" for none, both of which
# are invalid SQL. Coercing every id to int also guarantees that only numeric
# literals are spliced into the statement.
# Assumes user ids are integers, as in the ratings output below -- TODO confirm.
user_ids = [int(user_id) for user_id in beyms + ms]
if user_ids:
    in_list = ", ".join(str(user_id) for user_id in user_ids)
    events_df = pd.read_sql(
        con=connection,
        sql="SELECT user_id, track_id FROM events WHERE user_id IN (" + in_list + ")",
    )
else:
    # Nothing to query: keep the downstream schema without hitting the database.
    events_df = pd.DataFrame(columns=["user_id", "track_id"])

# One row per (user, track) pair with the number of events for that pair.
playcounts_df = (
    events_df.groupby(["user_id", "track_id"]).size().reset_index(name="playcount")
)

  result = self._query(query)


In [18]:
# Turn each user's play counts into ratings by min-max scaling them into
# [1, 1000] independently per user.
# The per-user frames are collected in a list and concatenated once:
# DataFrame.append inside a loop copies the accumulated frame on every
# iteration (quadratic) and was removed in pandas 2.0.
per_user_frames = []
for user_id, data in playcounts_df.groupby("user_id"):
    scaler = MinMaxScaler(feature_range=(1, 1000))
    # The scaler expects a 2-D float array; one column holding this user's counts.
    ratings = scaler.fit_transform(data["playcount"].values.reshape(-1, 1).astype(float))
    new_rows = data[["user_id", "track_id"]].copy()
    new_rows["rating"] = ratings.ravel()
    per_user_frames.append(new_rows)

if per_user_frames:
    ratings_df = pd.concat(per_user_frames)
else:
    # No users at all: still produce a frame with the expected columns.
    ratings_df = pd.DataFrame(columns=["user_id", "track_id", "rating"])

# Rename by column name rather than by position so the labels cannot be misassigned.
ratings_df = ratings_df.rename(columns={"track_id": "item_id"})
ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,1002693,542,1.0
1,1002693,580,67.6
2,1002693,2553,1.0
3,1002693,6399,1.0
4,1002693,7462,1.0


In [19]:
# Write the ratings to disk; the DataFrame index is not written.
ratings_df.to_csv(path_or_buf="data/ratings.csv", index=False)

# Basic Statistics of Rating Data

In [6]:
user_groups_df = pd.read_csv("data/user_groups.csv")

# One list of user ids per value of the "user group" column (labels 0..3 -> U1..U4).
U1, U2, U3, U4 = (
    user_groups_df.loc[user_groups_df["user group"] == group_label, "user_id"].tolist()
    for group_label in (0, 1, 2, 3)
)

In [9]:
# Restrict the ratings to the users of each group; one frame per user list.
(
    ratings_U1_df,
    ratings_U2_df,
    ratings_U3_df,
    ratings_U4_df,
    ratings_beyms_df,
    ratings_ms_df,
) = (
    ratings_df.loc[ratings_df["user_id"].isin(group_user_ids)]
    for group_user_ids in (U1, U2, U3, U4, beyms, ms)
)

In [12]:
def _sparsity(ratings):
    """Return 1 - (#rows / (#distinct users * #distinct items)) for a ratings frame."""
    n_users = ratings["user_id"].nunique()
    n_items = ratings["item_id"].nunique()
    return 1 - len(ratings) / (n_users * n_items)


# Sparsity of the user-item matrix of every subset and of the full ratings frame.
sparsity_U1 = _sparsity(ratings_U1_df)
sparsity_U2 = _sparsity(ratings_U2_df)
sparsity_U3 = _sparsity(ratings_U3_df)
sparsity_U4 = _sparsity(ratings_U4_df)
sparsity_beyms = _sparsity(ratings_beyms_df)
sparsity_ms = _sparsity(ratings_ms_df)
sparsity_all = _sparsity(ratings_df)

In [17]:
# Frames in the order in which they appear in every report line below.
report_frames = (
    ratings_U1_df,
    ratings_U2_df,
    ratings_U3_df,
    ratings_U4_df,
    ratings_beyms_df,
    ratings_ms_df,
    ratings_df,
)

# Number of distinct users, distinct tracks and rating rows per frame.
print("[Users] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(frame["user_id"].nunique() for frame in report_frames))
print("[Tracks] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(frame["item_id"].nunique() for frame in report_frames))
print("[Ratings] U_1: %d, U_2: %d, U_3: %d, U_4: %d, beyms: %d, ms: %d, all: %d" % tuple(len(frame) for frame in report_frames))

# Sparsity values computed in the previous cell.
sparsity_values = (sparsity_U1, sparsity_U2, sparsity_U3, sparsity_U4, sparsity_beyms, sparsity_ms, sparsity_all)
print("[Sparsity] U_1: %f, U_2: %f, U_3: %f, U_4: %f, beyms: %f, ms: %f, all: %f" % sparsity_values)

[Users] U_1: 369, U_2: 919, U_3: 143, U_4: 642, beyms: 2074, ms: 2074, all: 4148
[Tracks] U_1: 257013, U_2: 396557, U_3: 119524, U_4: 377768, beyms: 799659, ms: 710447, all: 1087295
[Ratings] U_1: 433060, U_2: 935411, U_3: 164203, U_4: 848247, beyms: 2380922, ms: 3001305, all: 5382227
[Sparsity] U_1: 0.995434, U_2: 0.997433, U_3: 0.990393, U_4: 0.996502, beyms: 0.998564, ms: 0.997963, all: 0.998807
