In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

%matplotlib qt
plt.style.use("seaborn")

In [2]:
events_df = pd.read_csv("../feature_engineering/data/lowms_events_nondominating.csv", sep=";")
events_df.head()

Unnamed: 0,user_id,track_id,artist_id,timestamp
0,31435741,53,21,1370977938
1,31435741,53,21,1370977728
2,31435741,53,21,1370977518
3,31435741,53,21,1370977308
4,31435741,53,21,1370977098


# artist heterogenity usergroup-based

In [3]:
predictions_df = pd.read_csv("classification_clean.csv", sep=";", index_col="user_id")
predictions_df.head()

Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
10883488,1
35212267,3
38189090,3
22113634,3
3704198,3


In [5]:
df = predictions_df.merge(events_df[["user_id", "track_id", "artist_id"]], left_index=True, right_on="user_id").sort_index()
df.head()

Unnamed: 0,cluster,user_id,track_id,artist_id
0,2,31435741,53,21
1,2,31435741,53,21
2,2,31435741,53,21
3,2,31435741,53,21
4,2,31435741,53,21


In [188]:
n_dist_tracks = df.groupby(by=["cluster", "user_id"])["track_id"].nunique()
n_dist_tracks.head()

cluster  user_id
1        1072752     442
         2095434     861
         2109450     343
         2246769    1242
         2246867     545
Name: track_id, dtype: int64

In [190]:
n_dist_artists = df.groupby(by=["cluster", "user_id"])["artist_id"].nunique()
n_dist_artists.head()

cluster  user_id
1        1072752    123
         2095434    298
         2109450    131
         2246769    179
         2246867    149
Name: artist_id, dtype: int64

In [209]:
artist_heterogenity = np.log2(n_dist_tracks / n_dist_artists)
artist_heterogenity_df = pd.DataFrame(artist_heterogenity)
artist_heterogenity_df.columns = ["score"]
artist_heterogenity_df.reset_index(inplace=True)
artist_heterogenity_df = artist_heterogenity_df[["cluster", "score"]]
artist_heterogenity_df["score"] = artist_heterogenity_df["score"]
artist_heterogenity_df.groupby("cluster").mean()

Unnamed: 0_level_0,score
cluster,Unnamed: 1_level_1
1,1.806829
2,2.011049
3,1.805321
4,2.153887


# artist heterogenity trackcluster-based

In [14]:
track_clusters_df = pd.read_csv("track_to_cluster.csv", sep=";", index_col="track_id")
track_clusters_df = track_clusters_df[track_clusters_df["cluster"] != -1]
track_clusters_df["cluster"] += 1
track_clusters_df.head()

Unnamed: 0_level_0,cluster
track_id,Unnamed: 1_level_1
53,2
127,2
182,2
219,2
289,2


In [15]:
track_clusters_df["cluster"].unique()

array([2, 4, 1, 3], dtype=int64)

In [16]:
df = events_df[["track_id", "artist_id"]].merge(track_clusters_df, left_on="track_id", right_index=True)
df.head()

Unnamed: 0,track_id,artist_id,cluster
0,53,21,2
1,53,21,2
2,53,21,2
3,53,21,2
4,53,21,2


In [17]:
n_dist_tracks = df.groupby(by="cluster")["track_id"].nunique()
n_dist_tracks.head()

cluster
1    11588
2    85663
3     6446
4    27446
Name: track_id, dtype: int64

In [18]:
n_dist_artists = df.groupby(by="cluster")["artist_id"].nunique()
n_dist_artists.head()

cluster
1     3325
2    11172
3     2209
4     5443
Name: artist_id, dtype: int64

In [19]:
artist_heterogenity = np.log2(n_dist_tracks / n_dist_artists)
artist_heterogenity_df = pd.DataFrame(artist_heterogenity)
artist_heterogenity_df.reset_index(inplace=True)
artist_heterogenity_df.columns = ["track_cluster", "artist_heterogenity"]
artist_heterogenity_df.set_index("track_cluster", inplace=True)
artist_heterogenity_df.head()

Unnamed: 0_level_0,artist_heterogenity
track_cluster,Unnamed: 1_level_1
1,1.801205
2,2.938785
3,1.545011
4,2.334122


# genre heterogenity usergroup-based

In [5]:
genre_idfs_df = pd.read_csv("track_genres_idf_dist.csv", sep=";")
genre_idfs_df.columns = ["genre", "idf"]
genre_idfs_df.set_index("genre", inplace=True)
genre_idfs_df.head()

Unnamed: 0_level_0,idf
genre,Unnamed: 1_level_1
rock,0.3912
pop,0.658374
electronic,0.71756
metal,0.726397
alternativerock,0.744237


In [6]:
track_genres_df = pd.read_csv("track_genres.csv", sep=";", header=None)
track_genres_df.columns = ["track_id", "genres"]
track_genres_df.set_index("track_id", inplace=True)
track_genres_df["genres"] = track_genres_df["genres"].apply(ast.literal_eval)
track_genres_df.head()

Unnamed: 0_level_0,genres
track_id,Unnamed: 1_level_1
4868,"[soul, pop, singersongwriter, blues, jazz, ind..."
2900,"[electronic, indiepop, shoegaze, dreampop, pos..."
572665,"[soul, pop, singersongwriter, blues, drama]"
2897,"[indierock, electronic, indiepop, postpunk, ro..."
15100,"[folk, indiefolk, banjo, folkrock, bluegrass, ..."


In [12]:
dominant_genres = ["rock", "pop", "electronic", "metal", "alternativerock", "indierock"]
top_genres_df = pd.DataFrame()
for track_id, data in track_genres_df.iterrows():
    scores = [(genre, genre_idfs_df.loc[genre]["idf"]) for genre in data["genres"] if genre not in dominant_genres]
    if len(scores) > 0:
        max_genre = sorted(scores, key=lambda t: t[1])[:-2:-1][0][0]
        top_genres_df = top_genres_df.append(pd.DataFrame(data={"track_id": [track_id], "top_genre": [max_genre]}))

top_genres_df.set_index("track_id", inplace=True)
top_genres_df.head()

Unnamed: 0_level_0,top_genre
track_id,Unnamed: 1_level_1
4868,pianorock
2900,minimal
572665,drama
2897,chillgroove
15100,contemporaryfolk


In [284]:
df = events_df[["user_id", "track_id"]].merge(top_genres_df, left_on="track_id", right_index=True)
df.head()

Unnamed: 0,user_id,track_id,top_genre
0,31435741,53,rhythmandblues
1,31435741,53,rhythmandblues
2,31435741,53,rhythmandblues
3,31435741,53,rhythmandblues
4,31435741,53,rhythmandblues


In [285]:
df = df.merge(predictions_df, left_on="user_id", right_on="user_id")
df.head()

Unnamed: 0,user_id,track_id,top_genre,cluster
0,31435741,53,rhythmandblues,2
1,31435741,53,rhythmandblues,2
2,31435741,53,rhythmandblues,2
3,31435741,53,rhythmandblues,2
4,31435741,53,rhythmandblues,2


In [286]:
n_dist_tracks = df.groupby(by=["cluster", "user_id"])["track_id"].nunique()
n_dist_tracks.head()

cluster  user_id
1        1072752     435
         2095434     838
         2109450     335
         2246769    1192
         2246867     540
Name: track_id, dtype: int64

In [315]:
df = events_df[["user_id", "track_id"]].merge(track_genres_df, left_on="track_id", right_index=True)
df.head()

Unnamed: 0,user_id,track_id,genres,cluster
0,31435741,53,"[pop, rock, soul, poprock, poprock, rap, singe...",2
1,31435741,53,"[pop, rock, soul, poprock, poprock, rap, singe...",2
2,31435741,53,"[pop, rock, soul, poprock, poprock, rap, singe...",2
3,31435741,53,"[pop, rock, soul, poprock, poprock, rap, singe...",2
4,31435741,53,"[pop, rock, soul, poprock, poprock, rap, singe...",2


In [316]:
df.drop_duplicates(subset=["user_id", "track_id"], inplace=True)

In [327]:
n_dist_genres = df.groupby("user_id")["genres"].sum().apply(set)
n_dist_genres_df = pd.DataFrame(n_dist_genres)
#n_dist_tracks_df["genres"] = n_dist_tracks_df["genres"].apply(len)
n_dist_genres_df = n_dist_genres_df.merge(predictions_df, left_index=True, right_on="user_id")
n_dist_genres_df.reset_index(inplace=True)
n_dist_genres = n_dist_genres_df.groupby(by=["cluster", "user_id"]).apply(len)
n_dist_genres.head()
#n_dist_tracks_df.head()

cluster  user_id
1        1072752    1
         2095434    1
         2109450    1
         2246769    1
         2246867    1
dtype: int64

In [293]:
n_dist_genres = df.groupby(by=["cluster", "user_id"])["top_genre"].nunique()
n_dist_genres.head()

KeyError: 'cluster'

In [328]:
genre_heterogenity = np.log2(n_dist_tracks / n_dist_genres)
genre_heterogenity_df = pd.DataFrame(genre_heterogenity)
genre_heterogenity_df.columns = ["score"]
genre_heterogenity_df.reset_index(inplace=True)
genre_heterogenity_df = genre_heterogenity_df[["cluster", "score"]]
genre_heterogenity_df.groupby("cluster").mean()

Unnamed: 0_level_0,score
cluster,Unnamed: 1_level_1
1,13.500344
2,16.386385
3,12.654188
4,14.744308


# genre heterogenity trackcluster-based

In [7]:
track_clusters_df = pd.read_csv("track_to_cluster.csv", sep=";", index_col="track_id")
track_clusters_df = track_clusters_df[track_clusters_df["cluster"] != -1]
track_clusters_df["cluster"] += 1
track_clusters_df.head()

Unnamed: 0_level_0,cluster
track_id,Unnamed: 1_level_1
53,2
127,2
182,2
219,2
289,2


In [8]:
df = events_df[["user_id", "track_id"]].merge(top_genres_df, left_on="track_id", right_index=True)
df.head()

NameError: name 'top_genres_df' is not defined

In [13]:
df = df[["track_id", "top_genre"]].merge(track_clusters_df, left_on="track_id", right_index=True)
df.head()

NameError: name 'df' is not defined

In [None]:
n_dist_tracks = df.groupby(by="cluster")["track_id"].nunique()
n_dist_tracks.head()

In [11]:
n_dist_genres = df.groupby("cluster")["top_genre"].nunique()
n_dist_genres.head()

NameError: name 'df' is not defined

In [None]:
n_dist_genres_df = track_clusters_df.merge(track_genres_df, left_index=True, right_index=True)
n_dist_genres_df["genres"] = n_dist_genres_df.groupby("cluster")["genres"].sum().apply(set)
n_dist_genres_df.reset_index(inplace=True)
n_dist_genres = n_dist_genres_df.groupby(by=["cluster"])["genres"].apply(len)
n_dist_genres.head()

In [255]:
genre_heterogenity = np.log2(n_dist_tracks / n_dist_genres)
genre_heterogenity_df = pd.DataFrame(genre_heterogenity)
genre_heterogenity_df.reset_index(inplace=True)
genre_heterogenity_df.columns = ["track_cluster", "genre_heterogenity"]
genre_heterogenity_df.set_index("track_cluster", inplace=True)
genre_heterogenity_df.head()

Unnamed: 0_level_0,genre_heterogenity
track_cluster,Unnamed: 1_level_1
1,4.174069
2,6.277829
3,3.603359
4,5.076911


In [None]:
d