In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlalchemy
import seaborn as sns
import ast
from collections import Counter

# Prepare User Data

In [None]:
# Per-user country, age and gender, keyed by user_id.
user_data_df = (
    pd.read_csv("data/LFM-1b_users.txt", sep="\t")
    .loc[:, ["user_id", "country", "age", "gender"]]
    .set_index("user_id")
)
user_data_df.head()

In [None]:
# Listening-event count per user; this file names its key column "user-id" (hyphen).
user_data_add_df = (
    pd.read_csv("data/LFM-1b_users_additional.txt", sep="\t")
    .set_index("user-id")
    .loc[:, ["cnt_listeningevents"]]
)
user_data_add_df.head()

In [None]:
# Keep only the M_global_R_APC mainstreaminess column, keyed by user_id.
user_mainstreaminess_df = (
    pd.read_csv("data/user_mainstreaminess.txt", sep="\t")
    .set_index("user_id")
    .loc[:, ["M_global_R_APC"]]
)
user_mainstreaminess_df.head()

In [None]:
# Join the three per-user tables on their index (default inner join) and
# keep only rows without any missing value.
data_df = (
    user_data_df
    .merge(user_data_add_df, left_index=True, right_index=True)
    .merge(user_mainstreaminess_df, left_index=True, right_index=True)
    .dropna()
)
data_df.head()

# Listening Event Kernel Density Estimation

In [None]:
# Only users with at most 25,000 listening events enter the density estimate.
le_counts = data_df['cnt_listeningevents']
listening_events = le_counts[le_counts <= 25000]

# The (x, y) samples of the first line drawn on the axes are read back;
# presumably this is the KDE curve — verify against the rendered figure.
# NOTE(review): distplot is deprecated in newer seaborn versions — confirm the
# installed version still provides it.
le_ax = sns.distplot(listening_events)
LE_kde = le_ax.get_lines()[0].get_data()

plt.xlabel('Listening Events', fontsize=20)
plt.ylabel('Density', fontsize=20)

In [None]:
# Turn the (x, y) pair read from the plotted line into a two-column frame.
le_kde_x, le_kde_y = LE_kde
LE_kde_df = pd.DataFrame({"x": le_kde_x, "y": le_kde_y})
LE_kde_df.head()

# Listening Events Threshold

In [None]:
# Finite-difference slope of the density curve along its sample index.
le_density = LE_kde_df["y"].values
LE_gradient = np.gradient(le_density)

In [None]:
def _longest_consecutive_run(sorted_indices):
    """Return the longest run of consecutive integers in an ascending sequence.

    Parameters
    ----------
    sorted_indices : iterable of int
        Strictly ascending positions (as produced by ``np.where``).

    Returns
    -------
    list
        The first run of maximal length; empty if ``sorted_indices`` is empty.
    """
    best_run, current_run = [], []
    for position in sorted_indices:
        if current_run and position == current_run[-1] + 1:
            current_run.append(position)
            continue
        # The run ended: record it if it beats the best so far, and ALWAYS
        # start a fresh run, otherwise every later run would be ignored.
        if len(current_run) > len(best_run):
            best_run = current_run
        current_run = [position]
    # The last run is never followed by a break, so compare it explicitly.
    if len(current_run) > len(best_run):
        best_run = current_run
    return best_run


# Positions where the density curve is flat (|gradient| below the tolerance).
indices = np.where(np.abs(LE_gradient) < 0.000001)[0]
longest_seq = _longest_consecutive_run(indices)

if len(longest_seq) > 0:
    print("The longest consecutive sequence of points with a gradient < 0.000001 goes from %d to %d" % (LE_kde_df.iloc[longest_seq[0]]["x"], LE_kde_df.iloc[longest_seq[-1]]["x"]))
else:
    # Avoid an IndexError when no point satisfies the tolerance.
    print("No points with a gradient < 0.000001 were found")

In [None]:
# Two stacked panels sharing the x-axis: the density on top, its gradient below.
fig, axes = plt.subplots(2, 1, sharex=True)
le_panels = [
    (axes[0], LE_kde_df["y"], "Density"),
    (axes[1], LE_gradient, "Gradient"),
]
for panel_ax, curve, y_label in le_panels:
    panel_ax.plot(LE_kde_df["x"], curve, label="")
    # Zero reference line plus the two listening-event bounds.
    panel_ax.axhline(y=0, linestyle="--", color="grey", linewidth=1)
    panel_ax.axvline(x=4688, linestyle="--", c="black", label="Lower bound (4,688)")
    panel_ax.axvline(x=14787, linestyle="dotted", c="black", label="Upper bound (14,787)")
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(False)
    panel_ax.legend(loc="upper right")

plt.xlabel("No. of listening events per user")

# Mainstreaminess Kernel Density Estimation

In [None]:
# Mainstreaminess of the users whose listening-event count lies in [4688, 14787].
at_least_lower = data_df["cnt_listeningevents"] >= 4688
at_most_upper = data_df["cnt_listeningevents"] <= 14787
within_LE_thresh_df = data_df.loc[at_least_lower & at_most_upper, 'M_global_R_APC']

# Read back the (x, y) samples of the first line on the axes — presumably the
# KDE curve; verify against the rendered figure.
m_ax = sns.distplot(within_LE_thresh_df, kde=True)
M_kde = m_ax.get_lines()[0].get_data()

plt.xlabel('Mainstreaminess', fontsize=20)
plt.ylabel('Density', fontsize=20)

In [None]:
# Turn the (x, y) pair read from the plotted line into a two-column frame.
m_kde_x, m_kde_y = M_kde
M_kde_df = pd.DataFrame({"x": m_kde_x, "y": m_kde_y})
M_kde_df.head()

# Mainstreaminess Threshold

In [None]:
# Slope of the mainstreaminess density, and the x position where it is steepest.
M_gradient = np.gradient(M_kde_df["y"].values)
steepest_pos = int(np.argmax(M_gradient))
max_gradient = M_kde_df["x"].iloc[steepest_pos]
print("The point with the maximal gradient is %f" % max_gradient)

In [None]:
# Two stacked panels sharing the x-axis: the density on top, its gradient below.
fig, axes = plt.subplots(2, 1, sharex=True)
density_ax, gradient_ax = axes

# Vertical marker for the mainstreaminess bound, drawn identically on both panels.
bound_line = dict(x=0.097732, linestyle="--", c="black", label="Lower bound (0.097732)", linewidth=1)

# NOTE(review): the density is divided by the number of KDE sample points here,
# unlike the listening-events figure — confirm this rescaling is intended.
density_ax.plot(M_kde_df["x"], M_kde_df["y"] / len(M_kde_df), label="", linewidth=1)
density_ax.axvline(**bound_line)
density_ax.set_ylabel("Density")
density_ax.grid(False)
density_ax.legend(loc="upper right")

gradient_ax.plot(M_kde_df["x"], M_gradient, linewidth=1)
gradient_ax.axhline(y=0, linestyle="-", c="grey", linewidth=1)
gradient_ax.axvline(**bound_line)
gradient_ax.set_ylabel("Gradient")
gradient_ax.grid(False)

# pyplot calls act on the current axes.
plt.xlabel("Mainstreaminess")
plt.legend(loc="upper right")

# Construction of BeyMS and MS

In [None]:
# Fixed seed so the sampled MS group (and the file saved from it) is the same
# on every Restart & Run All.
MS_SAMPLE_SEED = 42

# Users whose listening-event count lies within the [4688, 14787] bounds.
within_LE_thresh_df = data_df[(data_df["cnt_listeningevents"] >= 4688) & (data_df["cnt_listeningevents"] <= 14787)]

# BeyMS: users below the mainstreaminess bound.
beyms = within_LE_thresh_df[within_LE_thresh_df["M_global_R_APC"] < 0.097732].index.tolist()

# MS: an equally sized random sample, drawn without replacement, from the
# remaining users. choice() raises ValueError if fewer candidates than BeyMS
# users are available.
ms_candidates = within_LE_thresh_df[within_LE_thresh_df["M_global_R_APC"] >= 0.097732].index.tolist()
ms = np.random.RandomState(MS_SAMPLE_SEED).choice(ms_candidates, size=len(beyms), replace=False).tolist()

# Identification of too-general Genres

In [None]:
# Acoustic features per track; tracks missing any of the eight features are dropped.
acoustic_feature_columns = ["danceability", "energy", "speechiness", "acousticness", "instrumentalness", "tempo", "valence", "liveness"]
acoustic_features_df = (
    pd.read_csv("data/acoustic_features_lfm_id.tsv", sep="\t")
    .set_index("track_id")
    .loc[:, acoustic_feature_columns]
    .dropna()
)
# Track ids that have a complete acoustic-feature row.
af_annotations = acoustic_features_df.index.tolist()

In [None]:
# Parse "variable = value" lines from the credentials file.
# Splitting on the FIRST "=" only keeps values that themselves contain "="
# (e.g. a password) intact.
db_options = {}
with open("db_credentials.txt") as credentials_file:
    for raw_line in credentials_file:
        variable, separator, value = raw_line.partition("=")
        if not separator:
            continue  # blank line or line without "=": nothing to record
        db_options[variable.strip()] = value.strip()

# Tabular view of the settings: index "variable", single column "value".
db_options_df = pd.DataFrame.from_dict(db_options, orient="index", columns=["value"]).rename_axis("variable")

# Display only WHICH settings were found; echoing the dict would write the
# database password into the saved notebook output.
sorted(db_options)

In [None]:
# DB_PATH is appended directly after the password — presumably it carries the
# "@host/database" part of the URL; verify against db_credentials.txt.
db_url = "".join(['mysql+pymysql://', db_options["DB_USERNAME"], ":", db_options["DB_PW"], db_options["DB_PATH"]])
connection = sqlalchemy.create_engine(db_url)

In [None]:
# Distinct tracks listened to by any BeyMS user.
# The ids are passed as an expanding bound parameter instead of being pasted
# into the SQL text: str(tuple(...)) yields "(5,)" for a single id and "()"
# for none, both of which are invalid SQL.
tracks_query = sqlalchemy.text(
    "SELECT DISTINCT(track_id) FROM events WHERE user_id IN :user_ids"
).bindparams(sqlalchemy.bindparam("user_ids", expanding=True))
tracks = pd.read_sql(con=connection, sql=tracks_query, params={"user_ids": beyms})["track_id"].tolist()

In [None]:
# Load the genre annotations and name the two columns.
raw_genre_annotations_df = pd.read_csv("data/genre_annotations.csv")
raw_genre_annotations_df.columns = ["track_id", "genres"]
genre_annotations_df = raw_genre_annotations_df.set_index("track_id")

# Each "genres" cell is stored as a Python literal in text form; parse it back.
genre_annotations_df["genres"] = genre_annotations_df["genres"].map(ast.literal_eval)

# Discard tracks whose genre collection is empty.
has_genres = genre_annotations_df["genres"].map(len) > 0
genre_annotations_df = genre_annotations_df[has_genres]

In [None]:
# TODO: remove this cell.
# Disabled step, kept for reference only: it would drop the last entry of
# every track's genre list.
# genre_annotations_df["genres"] = genre_annotations_df["genres"].apply(lambda l: l[:-1])

In [None]:
# Number of tracks that have at least one genre annotation.
genre_annotations_df.shape[0]

In [None]:
# Tracks listened to by BeyMS users that have both genre annotations and acoustic features.
relevant_tracks = set(tracks) & set(genre_annotations_df.index) & set(af_annotations)

# Independent copy, so later column assignments do not touch genre_annotations_df.
is_relevant = genre_annotations_df.index.isin(relevant_tracks)
beyms_genre_annotations_df = genre_annotations_df.loc[is_relevant].copy()

In [None]:
# Number of relevant tracks; used as the document count in the IDF computation.
n_documents = len(relevant_tracks)
n_documents

In [None]:
# Every distinct genre that occurs in at least one relevant track's annotation.
unique_genres = {
    genre
    for track_genres in beyms_genre_annotations_df["genres"]
    for genre in track_genres
}
len(unique_genres)

In [None]:
# Reduce each track's annotation to a Counter holding every distinct genre once,
# so a genre repeated within one track counts as a single document.
beyms_genre_annotations_df["genres"] = beyms_genre_annotations_df["genres"].apply(lambda r: Counter(set(r)))

# Document frequency per genre: number of tracks annotated with it.
# update() adds in place; building a new Counter with "+" for every row copies
# the whole accumulator each time and is needlessly slow on large track sets.
n_rel_docs = Counter()
for track_genre_counts in beyms_genre_annotations_df["genres"]:
    n_rel_docs.update(track_genre_counts)
n_rel_docs

In [None]:
# IDF per genre: log10(total documents / documents annotated with the genre).
idf_scores = {}
for genre, n_rel in n_rel_docs.items():
    idf_scores[genre] = np.log10(n_documents / n_rel)

# One row per genre, lowest IDF first.
idf_scores_df = pd.DataFrame.from_dict(idf_scores, orient="index", columns=["idf"])
idf_scores_df = idf_scores_df.sort_values(by="idf")
idf_scores_df.head(16)

In [None]:
# Plot the (up to) 100 genres with the lowest IDF score.
# The x positions are derived from the number of rows actually available, so
# the plot also works when fewer than 100 genres exist (a fixed range(0, 100)
# would raise a shape-mismatch error in that case).
top_idf = idf_scores_df["idf"].iloc[:100]
plt.plot(list(range(len(top_idf))), top_idf.values, "-o")
plt.xlabel("Top-100 genres")
plt.ylabel("Genre IDF-score")
plt.axhline(y=0.9, linestyle="--", color="black", label="Lower bound (0.90)")
plt.legend()
plt.grid(False)
plt.show()

# Basic statistics of BeyMS

In [None]:
# Genres whose IDF score is below 0.9 are treated as too general.
below_idf_bound = idf_scores_df["idf"] < 0.9
too_general_genres = idf_scores_df.loc[below_idf_bound].index.unique().tolist()
too_general_genres

In [None]:
def _without_too_general(genres):
    """Return the genres of one track that are not in too_general_genres, as a list."""
    return [genre for genre in genres if genre not in too_general_genres]


df = beyms_genre_annotations_df.copy()
df["genres"] = df["genres"].map(_without_too_general)

# Keep only tracks that still have at least one genre left.
more_specific_tracks_df = df[df["genres"].map(len) > 0]

In [None]:
# Distinct track ids that kept at least one more-specific genre.
beyms_tracks = more_specific_tracks_df.index.drop_duplicates().tolist()

In [None]:
# Number of distinct tracks in beyms_tracks.
len(beyms_tracks)

In [None]:
# Listening events of BeyMS users, restricted to the tracks in beyms_tracks.
# Both id lists are passed as expanding bound parameters instead of being
# pasted into the SQL text: str(tuple(...)) yields "(5,)" for a single id and
# "()" for none, both of which are invalid SQL.
beyms_events_query = sqlalchemy.text(
    "SELECT * FROM events WHERE user_id IN :user_ids AND track_id IN :track_ids"
).bindparams(
    sqlalchemy.bindparam("user_ids", expanding=True),
    sqlalchemy.bindparam("track_ids", expanding=True),
)
beyms_events_df = pd.read_sql(
    con=connection,
    sql=beyms_events_query,
    params={"user_ids": beyms, "track_ids": beyms_tracks},
)
beyms_events_df.head()

In [None]:
# Headline counts for the BeyMS dataset.
n_users, n_tracks = len(beyms), len(beyms_tracks)
n_artists = beyms_events_df["artist_id"].nunique()
n_LEs = beyms_events_df.shape[0]

In [None]:
groupby_beyms = beyms_events_df.groupby("user_id")

# Summary statistics of the number of listening events per user, computed once.
les_per_user_summary = groupby_beyms.size().describe()
(
    min_LEs_per_user,
    q1_LEs_per_user,
    median_LEs_per_user,
    q3_LEs_per_user,
    max_LEs_per_user,
    mean_LEs_per_user,
    std_LEs_per_user,
) = (les_per_user_summary[stat] for stat in ("min", "25%", "50%", "75%", "max", "mean", "std"))

In [None]:
# Country of every BeyMS user; users without a country are dropped before the
# lookup, so .loc raises KeyError if a BeyMS user has none.
beyms_countries_df = (
    pd.read_csv("data/LFM-1b_users.txt", sep="\t")
    .loc[:, ["user_id", "country"]]
    .set_index("user_id")
    .dropna()
    .loc[beyms]
)
beyms_countries_df.head()

In [None]:
# Number of BeyMS users per country, then its mean and standard deviation.
users_per_country = beyms_countries_df["country"].value_counts()
mean_users_per_country = users_per_country.mean()
std_users_per_country = users_per_country.std()

In [None]:
# Integer-formatted statistics, printed in a fixed order.
integer_lines = [
    ("Users: %d", n_users),
    ("Tracks: %d", n_tracks),
    ("Artists: %d", n_artists),
    ("Listening Events (LEs): %d", n_LEs),
    ("Min. LEs per user: %d", min_LEs_per_user),
    ("Q1 LEs per user: %d", q1_LEs_per_user),
    ("Median LEs per user: %d", median_LEs_per_user),
    ("Q3 LEs per user: %d", q3_LEs_per_user),
    ("Max. LEs per user: %d", max_LEs_per_user),
]
for template, value in integer_lines:
    print(template % value)

# Mean with standard deviation in parentheses.
mean_std_lines = [
    ("Mean (Std) LEs per user: %f (%f)", (mean_LEs_per_user, std_LEs_per_user)),
    ("Mean (Std) Users per country: %f (%f)", (mean_users_per_country, std_users_per_country)),
]
for template, mean_and_std in mean_std_lines:
    print(template % mean_and_std)

# Save dataset for BeyMS and MS

In [None]:
# The dataset comprises tracks that have been listened to by at least one user in BeyMS.
# Each track is annotated with more-specific genres, and its acoustic-feature representation is available.

In [None]:
# Write the tracks with their more-specific genres; the index is written as the first column.
more_specific_tracks_df.to_csv(path_or_buf="data/beyms_more_specific_tracks.csv", index=True)

In [None]:
# Write one row per genre with its IDF score; the genre index becomes a "genre" column.
genre_idf_out_df = idf_scores_df.reset_index()
genre_idf_out_df = genre_idf_out_df.rename(columns={"index": "genre"})
genre_idf_out_df.to_csv("data/genre_idf_scores.csv", index=False)

In [None]:
# Write the BeyMS user ids as a single "user_id" column, without a row index.
beyms_df = pd.DataFrame({"user_id": beyms})
beyms_df.to_csv("data/beyms.csv", index=False)

In [None]:
# Write the MS user ids as a single "user_id" column, without a row index.
ms_df = pd.DataFrame({"user_id": ms})
ms_df.to_csv("data/ms.csv", index=False)