In [1]:
import pandas as pd
import ast
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib qt

# Global plot styling for the whole notebook.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the old names, so use whichever name this
# installation actually provides.
SEABORN_STYLE = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(SEABORN_STYLE)

# Font sizes (in points) shared by every figure below.
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 21

plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)  # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [2]:
# Genre tags that are stripped from every track's genre list when the
# track/genre table is loaded below.
dominating_genres = [
    "rock",
    "pop",
    "electronic",
    "metal",
    "alternativerock",
    "indierock",
]

In [3]:
def _parse_genres(raw_genres):
    """Parse a stringified genre list and drop the dominating genres."""
    return [genre for genre in ast.literal_eval(raw_genres) if genre not in dominating_genres]


# The file has no header row: first column is the track id, second the
# stringified list of genre tags for that track.
track_genres_df = (
    pd.read_csv("feature_engineering/data/track_genres.csv", sep=";", header=None)
    .set_axis(["track_id", "genres"], axis=1)
    .assign(genres=lambda frame: frame["genres"].apply(_parse_genres))
    .set_index("track_id")
)
track_genres_df.head()

Unnamed: 0_level_0,genres
track_id,Unnamed: 1_level_1
4868,"[soul, singersongwriter, blues, jazz, pianoroc..."
2900,"[indiepop, shoegaze, dreampop, postpunk, elect..."
572665,"[soul, singersongwriter, blues, drama]"
2897,"[indiepop, postpunk, ambient, dreampop, shoega..."
15100,"[folk, indiefolk, banjo, folkrock, bluegrass, ..."


In [4]:
# Number of tracks (rows) in the track/genre table.
track_genres_df.shape[0]

147156

In [5]:
# Listening events; only the user and track ids are needed for this analysis.
all_event_columns = pd.read_csv("feature_engineering/data/lowms_events_nondominating.csv", sep=";")
events_df = all_event_columns.loc[:, ["user_id", "track_id"]]
events_df.head()

Unnamed: 0,user_id,track_id
0,31435741,53
1,31435741,53
2,31435741,53
3,31435741,53
4,31435741,53


In [6]:
# Attach each event's (filtered) genre list by matching the event's track_id
# against the track_id index of the genre table (default inner join).
events_with_genres_df = pd.merge(
    events_df,
    track_genres_df,
    left_on="track_id",
    right_index=True,
)
events_with_genres_df.head()

Unnamed: 0,user_id,track_id,genres
0,31435741,53,"[soul, poprock, poprock, rap, singersongwriter..."
1,31435741,53,"[soul, poprock, poprock, rap, singersongwriter..."
2,31435741,53,"[soul, poprock, poprock, rap, singersongwriter..."
3,31435741,53,"[soul, poprock, poprock, rap, singersongwriter..."
4,31435741,53,"[soul, poprock, poprock, rap, singersongwriter..."


# Genre distribution per user group

In [7]:
# User -> cluster assignment, indexed by user_id.
classification_df = (
    pd.read_csv("clustering/classification_clean.csv", sep=";")
    .set_index("user_id")
)
classification_df.head()

Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
10883488,1
35212267,3
38189090,3
22113634,3
3704198,3


In [8]:
# Genre occurrences per user cluster: a Series indexed by cluster whose values
# are Counters mapping genre -> number of occurrences over all events of the
# users in that cluster.
events_with_user_cluster = events_with_genres_df.merge(
    classification_df, left_on="user_id", right_index=True
)[["genres", "cluster"]]

# Count every cluster's genre tags in a single pass. Summing one Counter per
# event (groupby(...).sum()) builds a brand-new Counter for every row, which
# scales with events x distinct genres; a single Counter per cluster yields
# the same counts in the same first-seen key order.
genres_per_cluster_df = pd.Series(
    {
        cluster: Counter(genre for genre_list in genre_lists for genre in genre_list)
        for cluster, genre_lists in events_with_user_cluster.groupby("cluster")["genres"]
    },
    name="genres",
    dtype=object,
).rename_axis("cluster")
genres_per_cluster_df.head()

cluster
1    {'soul': 91367, 'poprock': 131041, 'rap': 1862...
2    {'soul': 134582, 'poprock': 330144, 'rap': 115...
3    {'postpunk': 8859, 'indiepop': 24682, 'experim...
4    {'soul': 55444, 'poprock': 74077, 'rap': 49874...
Name: genres, dtype: object

In [9]:
# Genres of cluster 4, most frequent first.
ranked_genres = sorted(genres_per_cluster_df.loc[4].items(), key=lambda genre_count: genre_count[1])
ranked_genres.reverse()
ranked_genres

[('experimental', 313467),
 ('ambient', 249720),
 ('electronica', 247655),
 ('deathmetal', 237602),
 ('hardrock', 208089),
 ('progressivemetal', 200126),
 ('hardcore', 194467),
 ('triphop', 173374),
 ('punk', 167509),
 ('postrock', 167393),
 ('electro', 162737),
 ('downtempo', 153073),
 ('progressiverock', 152423),
 ('metalcore', 142238),
 ('hiphop', 142235),
 ('blackmetal', 138083),
 ('industrial', 133577),
 ('synthpop', 131294),
 ('melodicdeathmetal', 125662),
 ('postpunk', 122001),
 ('thrashmetal', 120441),
 ('indiepop', 119598),
 ('classicrock', 113646),
 ('singersongwriter', 105238),
 ('folk', 104456),
 ('electropop', 97564),
 ('posthardcore', 95250),
 ('alternativemetal', 92701),
 ('emo', 89933),
 ('doommetal', 87549),
 ('numetal', 83329),
 ('techno', 81044),
 ('lounge', 80128),
 ('newwave', 79704),
 ('darkwave', 78253),
 ('avantgarde', 78212),
 ('gothicmetal', 74241),
 ('poprock', 74077),
 ('jazz', 73905),
 ('technicaldeathmetal', 72912),
 ('soundtrack', 72552),
 ('easylistening

In [10]:
# Number of listening events per user cluster; used below to normalise the
# genre counts of each cluster.
events_with_user_cluster = pd.merge(
    events_df,
    classification_df,
    left_on="user_id",
    right_index=True,
)
n_events_df = events_with_user_cluster.groupby("cluster").size()
n_events_df

cluster
1     783090
2    2094082
3     186921
4    1618048
dtype: int64

In [12]:
%matplotlib qt
c = 1
for count in genres_per_cluster_df.values:
    sorted_occs = sorted(dict(count).items(), key=lambda tup: tup[1])[::-1][:30]
    ys = list(zip(*sorted_occs))[1]
    ys = np.array(ys) / n_events_df.loc[c]
    plt.plot(list(range(30)), ys, "-o", label=r"$U_{C_" + str(c) + "}$")
    c += 1
plt.legend()
plt.ylabel("relative genre importance")
plt.xlabel("Top 30 genres")
plt.grid(False)
plt.show()

In [14]:
# Mean slope of the normalised top-10 genre curve for every user cluster
# (one entry per cluster, in the index order of genres_per_cluster_df).
gradients = []
# Iterate (cluster id, genre counts) pairs so each curve is normalised with
# its own cluster's event count, even when cluster ids are not exactly 1..k.
for cluster, genre_counts in genres_per_cluster_df.items():
    top_counts = sorted(dict(genre_counts).values(), reverse=True)[:10]
    ys = np.array(top_counts) / n_events_df.loc[cluster]
    # np.gradient needs at least two points; record NaN for degenerate clusters
    # so the list stays aligned with the clusters.
    gradients.append(np.mean(np.gradient(ys)) if len(ys) > 1 else np.nan)

In [15]:
# Mean np.gradient of the normalised top-10 genre counts, one value per user
# cluster, in the order the clusters appear in genres_per_cluster_df.
gradients

[-0.03545294921401117,
 -0.01207357209507555,
 -0.04215497456144574,
 -0.011001249653903962]

# Genre distribution per track cluster

In [13]:
# Track -> cluster assignment, indexed by track_id. Rows labelled -1 are
# dropped and the remaining labels are shifted up by one.
track_classification_df = (
    pd.read_csv("clustering/track_to_cluster.csv", sep=";")
    .set_index("track_id")
    .loc[lambda frame: frame["cluster"] != -1]
    .assign(cluster=lambda frame: frame["cluster"] + 1)
)
track_classification_df.head()

Unnamed: 0_level_0,cluster
track_id,Unnamed: 1_level_1
53,2
127,2
182,2
219,2
289,2


In [14]:
# Distinct track-cluster labels, in order of first appearance.
pd.unique(track_classification_df["cluster"])

array([2, 4, 1, 3], dtype=int64)

In [15]:
# Genre occurrences per track cluster: a Series indexed by cluster whose
# values are Counters mapping genre -> number of occurrences over all events
# on tracks of that cluster.
events_with_track_cluster = events_with_genres_df.merge(
    track_classification_df, left_on="track_id", right_index=True
)[["genres", "cluster"]]

# Count every cluster's genre tags in a single pass. Summing one Counter per
# event (groupby(...).sum()) builds a brand-new Counter for every row, which
# scales with events x distinct genres; a single Counter per cluster yields
# the same counts in the same first-seen key order.
genres_per_cluster_df = pd.Series(
    {
        cluster: Counter(genre for genre_list in genre_lists for genre in genre_list)
        for cluster, genre_lists in events_with_track_cluster.groupby("cluster")["genres"]
    },
    name="genres",
    dtype=object,
).rename_axis("cluster")
genres_per_cluster_df.head()

cluster
1    {'synthpop': 8211, 'newwave': 9237, 'electro':...
2    {'soul': 206755, 'poprock': 463608, 'rap': 170...
3    {'progressiverock': 14460, 'experimental': 561...
4    {'bluesrock': 12383, 'blues': 13823, 'garagero...
Name: genres, dtype: object

In [16]:
# Genres of track cluster 4, most frequent first.
ranked_genres = sorted(genres_per_cluster_df.loc[4].items(), key=lambda genre_count: genre_count[1])
ranked_genres.reverse()
ranked_genres

[('experimental', 159261),
 ('electronica', 152556),
 ('ambient', 133745),
 ('deathmetal', 117630),
 ('postrock', 110616),
 ('electro', 106561),
 ('triphop', 102768),
 ('downtempo', 86755),
 ('progressivemetal', 86385),
 ('hardrock', 80644),
 ('hardcore', 77933),
 ('blackmetal', 70682),
 ('industrial', 69987),
 ('progressiverock', 69557),
 ('postpunk', 68073),
 ('punk', 67945),
 ('synthpop', 65585),
 ('techno', 61147),
 ('indiepop', 59399),
 ('metalcore', 59063),
 ('melodicdeathmetal', 58228),
 ('electropop', 50651),
 ('trance', 49517),
 ('lounge', 48932),
 ('thrashmetal', 47104),
 ('house', 46883),
 ('shoegaze', 44083),
 ('hiphop', 41663),
 ('doommetal', 39688),
 ('classicrock', 38356),
 ('brutaldeathmetal', 38264),
 ('newwave', 38229),
 ('posthardcore', 37767),
 ('nujazz', 36670),
 ('avantgarde', 36470),
 ('alternativemetal', 35360),
 ('grindcore', 35063),
 ('technicaldeathmetal', 34950),
 ('noise', 34753),
 ('darkwave', 34226),
 ('deathcore', 33981),
 ('emo', 33814),
 ('jazz', 33787

In [17]:
# genres_per_cluster_df.loc[2] is a genre -> count mapping, not a frame with a
# "cluster" column, so the previous `["cluster"].values` lookup raised
# AttributeError. Show the ten most frequent genres of cluster 2 instead.
sorted(genres_per_cluster_df.loc[2].items(), key=lambda t: t[1], reverse=True)[:10]

In [None]:
# genres_per_cluster_df.loc[3] is a genre -> count mapping, not a frame with a
# "cluster" column, so a `["cluster"].values` lookup cannot work on it.
# Show the ten most frequent genres of cluster 3 instead.
sorted(genres_per_cluster_df.loc[3].items(), key=lambda t: t[1], reverse=True)[:10]

In [None]:
# genres_per_cluster_df.loc[4] is a genre -> count mapping, not a frame with a
# "cluster" column, so a `["cluster"].values` lookup cannot work on it.
# Show the ten most frequent genres of cluster 4 instead.
sorted(genres_per_cluster_df.loc[4].items(), key=lambda t: t[1], reverse=True)[:10]

In [19]:
# Number of listening events per track cluster (events whose track has a
# cluster assignment); used below to normalise the genre counts.
events_with_track_cluster = pd.merge(
    events_df,
    track_classification_df,
    left_on="track_id",
    right_index=True,
)
n_tracks_df = events_with_track_cluster.groupby("cluster").size()
n_tracks_df

cluster
1     328962
2    2988736
3     171468
4     769203
dtype: int64

In [20]:
%matplotlib qt
c = 1
for count in genres_per_cluster_df.values:
    sorted_occs = sorted(dict(count).items(), key=lambda tup: tup[1])[::-1][:30]
    ys = list(zip(*sorted_occs))[1]
    ys = np.array(ys) / n_tracks_df.loc[c]
    plt.plot(list(range(30)), ys, "-o", label=r"$C_" + str(c) + "$")
    c += 1
plt.legend()
plt.ylabel("relative genre importance")
plt.xlabel("Top 30 genres")
plt.grid(False)
plt.show()

In [22]:
# Mean slope of the normalised top-10 genre curve for every track cluster
# (one entry per cluster, in the index order of genres_per_cluster_df).
gradients = []
# Iterate (cluster id, genre counts) pairs so each curve is normalised with
# its own cluster's event count, even when cluster ids are not exactly 1..k.
for cluster, genre_counts in genres_per_cluster_df.items():
    top_counts = sorted(dict(genre_counts).values(), reverse=True)[:10]
    ys = np.array(top_counts) / n_tracks_df.loc[cluster]
    # np.gradient needs at least two points; record NaN for degenerate clusters
    # so the list stays aligned with the clusters.
    gradients.append(np.mean(np.gradient(ys)) if len(ys) > 1 else np.nan)

In [23]:
# Mean np.gradient of the normalised top-10 genre counts, one value per track
# cluster, in the order the clusters appear in genres_per_cluster_df.
gradients

[-0.052009057867709506,
 -0.010682634400516079,
 -0.05582437942096445,
 -0.01066137238341366]