In [1]:
import pandas as pd
import ast
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
import seaborn as sns

In [2]:
track_genres_df = pd.read_csv("feature_engineering/data/track_genres.csv", sep=";", header=None)
track_genres_df.columns = ["track_id", "genres"]
track_genres_df.head()

Unnamed: 0,track_id,genres
0,4868,"['soul', 'pop', 'singersongwriter', 'blues', '..."
1,2900,"['electronic', 'indiepop', 'shoegaze', 'dreamp..."
2,572665,"['soul', 'pop', 'singersongwriter', 'blues', '..."
3,2897,"['indierock', 'electronic', 'indiepop', 'postp..."
4,15100,"['folk', 'indiefolk', 'banjo', 'folkrock', 'bl..."


In [3]:
events_df = pd.read_csv("feature_engineering/data/lowms_events_nondominating.csv", sep=";")
events_df.head()

Unnamed: 0,user_id,track_id,artist_id,timestamp
0,31435741,53,21,1370977938
1,31435741,53,21,1370977728
2,31435741,53,21,1370977518
3,31435741,53,21,1370977308
4,31435741,53,21,1370977098


In [4]:
list_of_users = list(set(events_df["user_id"]))

In [5]:
all_genres_df = pd.read_csv("feature_engineering/unique_spotify_microgenres.txt", sep="\n", header=None)
all_genres_df.columns = ["genre"]
all_genres_df.head()

Unnamed: 0,genre
0,djent
1,tribalhouse
2,nasheed
3,ambientidm
4,sleazerock


In [6]:
list_of_genres = list(all_genres_df["genre"].values)
list_of_genres

['djent',
 'tribalhouse',
 'nasheed',
 'ambientidm',
 'sleazerock',
 'pixie',
 'filthstep',
 'noisepop',
 'germanpunkrock',
 'electroindustrial',
 'colombianpop',
 'swedishhiphop',
 'moombahton',
 'nashvillesound',
 'russianfolk',
 'veganstraightedge',
 'dixieland',
 'brazilianblackmetal',
 'germanpoprock',
 'numetal',
 'detroithiphop',
 'sega',
 'bemani',
 'banjo',
 'viola',
 'neworleansrap',
 'hardcoretechno',
 'electricblues',
 'boogiewoogie',
 'israelimetal',
 'irishsingersongwriter',
 'ukrainianrock',
 'oi',
 'nudisco',
 'canadianpunk',
 'aggrotech',
 'avantgardejazz',
 'nica',
 'blackenedcrust',
 'ccm',
 'wave',
 'lebanesepop',
 'technicaldeathmetal',
 'flamencoguitar',
 'finnishclassical',
 'finnishhiphop',
 'minimalwave',
 'germanindie',
 'rhythmandblues',
 'easycore',
 'iskelma',
 'fidgethouse',
 'heartlandrock',
 'folkmusik',
 'synthpop',
 'chicanorap',
 'cumbia',
 'experimentalrock',
 'japaneseexperimental',
 'redneck',
 'disney',
 'latinpop',
 'modernhardrock',
 'manguebit'

In [7]:
dominant_genres = ["rock", "pop", "electronic", "metal", "alternativerock", "indierock"]

In [8]:
user_track_df = pd.read_csv("feature_engineering/data/user_track.csv", sep=";")
genres_per_user_df = user_track_df.merge(track_genres_df, left_on="track_id", right_on="track_id")
genres_per_user_df.set_index("user_id", inplace=True)
genres_per_user_df = genres_per_user_df["genres"].apply(lambda row: ast.literal_eval(row)).to_frame()
genres_per_user_df = genres_per_user_df["genres"].apply(lambda l: [g for g in l if g not in dominant_genres]).to_frame()
genres_per_user_df.head()

Unnamed: 0_level_0,genres
user_id,Unnamed: 1_level_1
31435741,"[soul, poprock, poprock, rap, singersongwriter..."
4664425,"[soul, poprock, poprock, rap, singersongwriter..."
36371774,"[soul, poprock, poprock, rap, singersongwriter..."
19058646,"[soul, poprock, poprock, rap, singersongwriter..."
4649427,"[soul, poprock, poprock, rap, singersongwriter..."


In [9]:
user_track_df["track_id"].nunique()

147156

In [10]:
classification_df = pd.read_csv("clustering/classification_clean.csv", sep=";", index_col="user_id")
classification_df.head()

Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
10883488,1
35212267,3
38189090,3
22113634,3
3704198,3


In [11]:
genre_counts_df = genres_per_user_df["genres"].apply(Counter).to_frame()
genre_counts_df.head()

Unnamed: 0_level_0,genres
user_id,Unnamed: 1_level_1
31435741,"{'soul': 1, 'poprock': 2, 'rap': 1, 'singerson..."
4664425,"{'soul': 1, 'poprock': 2, 'rap': 1, 'singerson..."
36371774,"{'soul': 1, 'poprock': 2, 'rap': 1, 'singerson..."
19058646,"{'soul': 1, 'poprock': 2, 'rap': 1, 'singerson..."
4649427,"{'soul': 1, 'poprock': 2, 'rap': 1, 'singerson..."


In [12]:
#genre_counts_df = genres_per_user_df.merge(classification_df, left_index=True, right_index=True)["genres"].apply(Counter).to_frame()
#genre_counts_df.head()

In [13]:
genre_counts_df.reset_index(inplace=True)
genre_counts_df = genre_counts_df.groupby(by="user_id").sum()
genre_counts_df.head()

Unnamed: 0_level_0,genres
user_id,Unnamed: 1_level_1
1049656,"{'southernrock': 5, 'poprock': 77, 'rb': 23, '..."
1055118,"{'progressiverock': 56, 'progressivemetal': 48..."
1056935,"{'punk': 143, 'classicrock': 74, 'newwave': 96..."
1070023,"{'house': 40, 'electronica': 129, 'techno': 46..."
1072752,"{'classicrock': 104, 'progressiverock': 44, 'p..."


In [14]:
representation_df = pd.DataFrame()
for uid, row in genre_counts_df.iterrows():
    counts = dict(row[0])
    new_row = pd.DataFrame(index=[uid], data=counts)
    representation_df = representation_df.append(new_row)
representation_df = representation_df.fillna(0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [15]:
representation_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worldfusion,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco
1049656,36.0,0.0,7.0,0.0,2.0,1.0,2.0,6.0,0.0,35.0,...,2.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1055118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1056935,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,6.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1070023,1.0,0.0,6.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1072752,8.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,9.0,...,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0


In [16]:
n_genres_per_user = (representation_df != 0).sum(axis=1)

In [17]:
representation_normed_df = pd.DataFrame()
for cname in representation_df:
    representation_normed_df[cname] = representation_df[cname] - np.mean(representation_df[cname])
    #representation_df[cname] -= np.mean(representation_df[cname])
#representation_normed_df = (representation_df.T - representation_df.mean(axis=1)).T

In [18]:
representation_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worldfusion,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco
1049656,36.0,0.0,7.0,0.0,2.0,1.0,2.0,6.0,0.0,35.0,...,2.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1055118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1056935,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,6.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1070023,1.0,0.0,6.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1072752,8.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,9.0,...,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0


In [19]:
representation_normed_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worldfusion,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco
1049656,33.420646,-0.062711,5.460203,-0.000482,1.471298,0.900627,0.942595,5.407139,-0.006271,26.492041,...,-0.339604,-0.766522,-0.50603,-0.009648,2.652677,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025
1055118,-2.579354,-0.062711,-1.539797,-0.000482,-0.528702,-0.099373,-1.057405,-0.592861,-0.006271,-8.507959,...,-2.339604,-0.766522,-0.50603,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025
1056935,-1.579354,-0.062711,-1.539797,-0.000482,0.471298,-0.099373,2.942595,0.407139,-0.006271,-2.507959,...,-1.339604,-1.766522,-0.50603,-0.009648,-0.347323,0.914616,-0.007718,-0.000482,-0.011095,-0.013025
1070023,-1.579354,-0.062711,4.460203,-0.000482,-0.528702,-0.099373,-1.057405,1.407139,-0.006271,-1.507959,...,-1.339604,-1.766522,0.49397,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025
1072752,5.420646,-0.062711,-1.539797,-0.000482,1.471298,-0.099373,-1.057405,0.407139,-0.006271,0.492041,...,-1.339604,-1.766522,-0.50603,-0.009648,2.652677,2.914616,-0.007718,-0.000482,-0.011095,-0.013025


In [20]:
representation_df = representation_df.merge(classification_df, left_index=True, right_index=True)
representation_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco,cluster
1049656,36.0,0.0,7.0,0.0,2.0,1.0,2.0,6.0,0.0,35.0,...,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3
1055118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1056935,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
1070023,1.0,0.0,6.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1072752,8.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,9.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,1


In [21]:
representation_normed_df = representation_normed_df.merge(classification_df, left_index=True, right_index=True)
representation_normed_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco,cluster
1049656,33.420646,-0.062711,5.460203,-0.000482,1.471298,0.900627,0.942595,5.407139,-0.006271,26.492041,...,-0.766522,-0.50603,-0.009648,2.652677,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,3
1055118,-2.579354,-0.062711,-1.539797,-0.000482,-0.528702,-0.099373,-1.057405,-0.592861,-0.006271,-8.507959,...,-0.766522,-0.50603,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1056935,-1.579354,-0.062711,-1.539797,-0.000482,0.471298,-0.099373,2.942595,0.407139,-0.006271,-2.507959,...,-1.766522,-0.50603,-0.009648,-0.347323,0.914616,-0.007718,-0.000482,-0.011095,-0.013025,2
1070023,-1.579354,-0.062711,4.460203,-0.000482,-0.528702,-0.099373,-1.057405,1.407139,-0.006271,-1.507959,...,-1.766522,0.49397,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1072752,5.420646,-0.062711,-1.539797,-0.000482,1.471298,-0.099373,-1.057405,0.407139,-0.006271,0.492041,...,-1.766522,-0.50603,-0.009648,2.652677,2.914616,-0.007718,-0.000482,-0.011095,-0.013025,1


In [22]:
representation_normed_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco,cluster
1049656,33.420646,-0.062711,5.460203,-0.000482,1.471298,0.900627,0.942595,5.407139,-0.006271,26.492041,...,-0.766522,-0.50603,-0.009648,2.652677,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,3
1055118,-2.579354,-0.062711,-1.539797,-0.000482,-0.528702,-0.099373,-1.057405,-0.592861,-0.006271,-8.507959,...,-0.766522,-0.50603,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1056935,-1.579354,-0.062711,-1.539797,-0.000482,0.471298,-0.099373,2.942595,0.407139,-0.006271,-2.507959,...,-1.766522,-0.50603,-0.009648,-0.347323,0.914616,-0.007718,-0.000482,-0.011095,-0.013025,2
1070023,-1.579354,-0.062711,4.460203,-0.000482,-0.528702,-0.099373,-1.057405,1.407139,-0.006271,-1.507959,...,-1.766522,0.49397,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1072752,5.420646,-0.062711,-1.539797,-0.000482,1.471298,-0.099373,-1.057405,0.407139,-0.006271,0.492041,...,-1.766522,-0.50603,-0.009648,2.652677,2.914616,-0.007718,-0.000482,-0.011095,-0.013025,1


In [23]:
representation_normed_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco,cluster
1049656,33.420646,-0.062711,5.460203,-0.000482,1.471298,0.900627,0.942595,5.407139,-0.006271,26.492041,...,-0.766522,-0.50603,-0.009648,2.652677,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,3
1055118,-2.579354,-0.062711,-1.539797,-0.000482,-0.528702,-0.099373,-1.057405,-0.592861,-0.006271,-8.507959,...,-0.766522,-0.50603,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1056935,-1.579354,-0.062711,-1.539797,-0.000482,0.471298,-0.099373,2.942595,0.407139,-0.006271,-2.507959,...,-1.766522,-0.50603,-0.009648,-0.347323,0.914616,-0.007718,-0.000482,-0.011095,-0.013025,2
1070023,-1.579354,-0.062711,4.460203,-0.000482,-0.528702,-0.099373,-1.057405,1.407139,-0.006271,-1.507959,...,-1.766522,0.49397,-0.009648,-0.347323,-0.085384,-0.007718,-0.000482,-0.011095,-0.013025,2
1072752,5.420646,-0.062711,-1.539797,-0.000482,1.471298,-0.099373,-1.057405,0.407139,-0.006271,0.492041,...,-1.766522,-0.50603,-0.009648,2.652677,2.914616,-0.007718,-0.000482,-0.011095,-0.013025,1


In [24]:
dominating_genres = ["rock", "pop", "electronic", "metal", "alternativerock", "indierock"]

In [38]:
avg_pairwise_jaccard_df = pd.DataFrame()
for group, data in representation_df.groupby(by="cluster"):
    indicators = data != 0
    sims = 1 - pairwise_distances(indicators.to_numpy(), metric="jaccard")
    
    # exclude sim of 1 when comparing u_x to u_x
    avg_sim = (np.sum(sims, axis=0) - 1) / (len(sims) - 1)
    #avg_sim = np.mean(sims, axis=0)
    
    new_row = pd.DataFrame(data={"similarity":avg_sim, "cluster": [group] * len(avg_sim)})
    avg_pairwise_jaccard_df = avg_pairwise_jaccard_df.append(new_row)

In [39]:
avg_pairwise_jaccard_df.head()

Unnamed: 0,similarity,cluster
0,0.469932,1
1,0.474288,1
2,0.437926,1
3,0.477697,1
4,0.469394,1


In [26]:
len(representation_normed_df.columns)

1186

In [25]:
intra_group_variances = []
avg_pairwise_similarities_df = pd.DataFrame()
for group, data in representation_normed_df.groupby(by="cluster"):
    sims = cosine_similarity(data, data)
    intra_group_variances.append(np.var(sims))
    # exclude sim of 1 when comparing u_x to u_x
    avg_sim = (np.sum(sims, axis=0) - 1) / (len(sims) - 1)
    #avg_sim = np.mean(sims, axis=0)
    #print(group, np.var(sims))
    
    new_row = pd.DataFrame(data={"similarity":avg_sim, "cluster": [group] * len(avg_sim)})
    avg_pairwise_similarities_df = avg_pairwise_similarities_df.append(new_row)

In [26]:
intra_group_variances

[0.13291214471504376,
 0.1350055511530832,
 0.11457129759360819,
 0.10989395143311106]

In [35]:
avg_pairwise_similarities_df.groupby("cluster").mean()

Unnamed: 0_level_0,similarity
cluster,Unnamed: 1_level_1
1,0.255211
2,0.130614
3,0.292039
4,0.067837


In [30]:
%matplotlib inline
avg_pairwise_jaccard_df.boxplot(by="cluster")
plt.suptitle("")
plt.title("")
plt.xticks([1, 2, 3, 4], [r"$U_{C1}$", r"$U_{C2}$", r"$U_{C3}$", r"$U_{C4}$"])
plt.xlabel("")
plt.ylabel("Average pairwise user similarity (jaccard)")

NameError: name 'avg_pairwise_jaccard_df' is not defined

In [None]:
%matplotlib qt
avg_pairwise_similarities_df.boxplot(by="cluster")
plt.suptitle("")
plt.title("")
plt.xticks([1, 2, 3, 4], [r"$U_{C_1}$", r"$U_{C_2}$", r"$U_{C_3}$", r"$U_{C_4}$"])
plt.xlabel("")
plt.ylabel("Average pairwise user similarity")

  x2 = take(ap, indices_above, axis=axis) * weights_above
  wiskhi = x[x <= hival]
  wisklo = x[x >= loval]
  x[x < stats['whislo']],
  x[x > stats['whishi']],


In [None]:
cosine_similarity(representation_df, representation_df)

In [30]:
n_genres_per_user = n_genres_per_user.to_frame()
n_genres_per_user.index.name = "user_id"
n_genres_per_user.columns = ["n_genres"]

In [60]:
representation_df.head()

Unnamed: 0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco,cluster
1049656,36.0,0.0,7.0,0.0,2.0,1.0,2.0,6.0,0.0,35.0,...,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3
1055118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1056935,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
1070023,1.0,0.0,6.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1072752,8.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,9.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,1


In [29]:
representation_df.drop("cluster", axis=1, inplace=True)

In [30]:
genres_per_cluster_df = representation_df.merge(classification_df, left_index=True, right_index=True).groupby(by="cluster").sum()

In [31]:
genres_per_cluster_df.head()

Unnamed: 0_level_0,abstract,abstractbeats,abstracthiphop,abstractidm,acappella,accordeon,accordion,acidhouse,acididm,acidjazz,...,worldfusion,worship,wrestling,wrock,yachtrock,yeye,zeuhl,zouglou,zouk,zydeco
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,619.0,14.0,312.0,0.0,359.0,77.0,926.0,112.0,2.0,4036.0,...,1460.0,1114.0,49.0,1.0,291.0,76.0,3.0,0.0,9.0,4.0
2,737.0,22.0,821.0,0.0,427.0,58.0,558.0,257.0,0.0,3332.0,...,1123.0,2232.0,717.0,9.0,251.0,45.0,1.0,1.0,12.0,8.0
3,774.0,11.0,360.0,1.0,42.0,17.0,117.0,84.0,0.0,1449.0,...,516.0,113.0,7.0,0.0,40.0,11.0,0.0,0.0,1.0,0.0
4,3217.0,83.0,1699.0,0.0,268.0,54.0,591.0,776.0,11.0,8820.0,...,1751.0,203.0,276.0,10.0,138.0,45.0,12.0,0.0,1.0,15.0


In [32]:
(genres_per_cluster_df != 0).sum(axis=1)

cluster
1    1085
2    1095
3     918
4    1128
dtype: int64

In [33]:
avg_pairwise_similarities_df

Unnamed: 0,similarity,cluster
0,0.222162,1
1,0.210844,1
2,0.325082,1
3,0.308836,1
4,0.371306,1
...,...,...
670,0.035255,4
671,0.090361,4
672,0.136734,4
673,0.086760,4


In [27]:
similarity = cosine_similarity(representation_normed_df, representation_normed_df)
avg_pairwise_similarities_all = (np.sum(similarity, axis=0) - 1) / (len(similarity) - 1)

In [28]:
np.var(similarity)

0.12522518355709683

In [31]:
avg_pairwise_similarities_all_df = pd.DataFrame(avg_pairwise_similarities_all)
avg_pairwise_similarities_all_df.columns = ["similarity"]
avg_pairwise_similarities_all_df.head()

Unnamed: 0,similarity
0,-0.099249
1,0.152976
2,-0.103268
3,0.029684
4,-0.041838


In [62]:
fig, (ax_groups, ax_all) = plt.subplots(ncols=2, sharey=True, gridspec_kw={"width_ratios": (0.8, 0.2)})
#sns.set_context(font_scale=1.3)
sns.set_context("talk")
sns.boxplot(data=avg_pairwise_similarities_df, x="cluster", y="similarity", ax=ax_groups, orient="v", showfliers=False)
sns.boxplot(data=avg_pairwise_similarities_all_df, ax=ax_all, color="grey", showfliers=False)
ax_groups.set_ylabel("Average pairwise user similarity")
ax_groups.set_xticklabels([r"$U_{C_1}$", r"$U_{C_2}$", r"$U_{C_3}$", r"$U_{C_4}$"])
ax_all.set_xticklabels([r"$U_{all}$"])
ax_groups.grid(False)
ax_all.grid(False)
ax_groups.set_xlabel("")

Text(0.5, 0, '')

In [44]:
similarity = cosine_similarity(representation_normed_df, representation_normed_df)
avg_pairwise_similarities = (np.sum(similarity, axis=0) - 1) / (len(similarity) - 1)

In [64]:
plt.boxplot(avg_pairwise_similarities, showmeans=True)

{'whiskers': [<matplotlib.lines.Line2D at 0x22a12960ef0>,
  <matplotlib.lines.Line2D at 0x22a12960fd0>],
 'caps': [<matplotlib.lines.Line2D at 0x22a1296b5c0>,
  <matplotlib.lines.Line2D at 0x22a1296b908>],
 'boxes': [<matplotlib.lines.Line2D at 0x22a12960b00>],
 'medians': [<matplotlib.lines.Line2D at 0x22a1296bc50>],
 'fliers': [<matplotlib.lines.Line2D at 0x22a12f75358>],
 'means': [<matplotlib.lines.Line2D at 0x22a1296bc18>]}

In [23]:
avg_pairwise_jaccard_df.head()

Unnamed: 0,similarity,cluster
0,0.472579,1
1,0.477419,1
2,0.484106,1
3,0.472435,1
4,0.479199,1


In [24]:
plt.boxplot(avg_pairwise_jaccard_df["similarity"], showmeans=True)
plt.show()

In [44]:
((representation_df != 0).sum(axis=0) != 0).sum()

1191