In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [17]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

In [2]:
weights_df = pd.read_csv("weights.csv", sep=";", usecols=["user_id", "weight", "cluster"])
weights_df = weights_df[weights_df["cluster"] != -1]
weights_df.set_index("user_id", inplace=True)
weights_df.head()

Unnamed: 0_level_0,cluster,weight
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1056935,0,0.155086
1056935,1,0.64149
1056935,2,0.046324
1056935,3,0.1571
1070023,0,0.093506


In [3]:
tracks_df = pd.read_csv("lowms_tracks.csv", sep=";")
tracks_df.set_index("track_id", inplace=True)
tracks_df.head()

Unnamed: 0_level_0,danceability,energy,speechiness,acousticness,instrumentalness,tempo,valence,liveness
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.465795,0.876998,0.009626,0.000111,6e-06,0.650347,0.697997,0.520183
2097156,0.56338,0.782996,0.027819,0.179719,0.0355,0.572305,0.551996,0.081508
6,0.459759,0.386988,0.010366,0.730924,0.127,0.408689,0.04099,0.108768
4194311,0.263581,0.701994,0.022319,0.157631,0.527,0.456066,0.248992,0.335097
4194312,0.480885,0.635993,0.009731,0.295181,7e-06,0.350923,0.243992,0.140957


In [4]:
track_to_cluster_df = pd.read_csv("track_to_cluster.csv", sep=";")
track_to_cluster_df = track_to_cluster_df[track_to_cluster_df["cluster"] != -1]
track_to_cluster_df.head()

Unnamed: 0,track_id,cluster
0,4868,1
1,2900,2
2,572665,1
3,2897,1
4,15100,1


In [5]:
track_with_cluster_df = track_to_cluster_df.merge(tracks_df, left_on="track_id", right_index=True)
track_with_cluster_df.set_index("track_id", inplace=True)
track_with_cluster_df.head()

Unnamed: 0_level_0,cluster,danceability,energy,speechiness,acousticness,instrumentalness,tempo,valence,liveness
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4868,1,0.734406,0.769995,0.008674,0.138554,0.0,0.340169,0.521995,0.041675
2900,2,0.610664,0.788996,0.006558,0.461847,0.907,0.319257,0.157992,0.122851
572665,1,0.60664,0.669993,0.003491,0.004096,2e-06,0.354029,0.450995,0.106757
2897,1,0.868209,0.473989,0.052465,0.752008,0.183,0.427008,0.674997,0.141963
15100,1,0.520121,0.49199,0.005923,0.02761,3.2e-05,0.493154,0.445994,0.081911


In [6]:
cluster_means_df = track_with_cluster_df.groupby(by="cluster").mean()
cluster_means_df

Unnamed: 0_level_0,danceability,energy,speechiness,acousticness,instrumentalness,tempo,valence,liveness
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.487092,0.360137,0.027126,0.688367,0.01537,0.391598,0.295463,0.175495
1,0.509569,0.786385,0.070238,0.090201,0.013236,0.437227,0.488453,0.212733
2,0.382037,0.364176,0.033356,0.775108,0.854743,0.398694,0.167278,0.177616
3,0.445931,0.800953,0.057729,0.028128,0.69151,0.439331,0.356867,0.212297


In [42]:
cluster_features_df = track_with_cluster_df.groupby(by="cluster").agg(["mean", "std"])
cluster_features_df.columns = [' '.join(col).strip() for col in cluster_features_df.columns.values]
cluster_features_df.head()

Unnamed: 0_level_0,danceability mean,danceability std,energy mean,energy std,speechiness mean,speechiness std,acousticness mean,acousticness std,instrumentalness mean,instrumentalness std,tempo mean,tempo std,valence mean,valence std,liveness mean,liveness std
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.487092,0.13902,0.360137,0.180225,0.027126,0.053139,0.688367,0.202805,0.01537,0.037454,0.391598,0.136083,0.295463,0.1573,0.175495,0.16317
1,0.509569,0.165427,0.786385,0.171779,0.070238,0.08921,0.090201,0.159074,0.013236,0.032713,0.437227,0.134246,0.488453,0.237296,0.212733,0.169406
2,0.382037,0.172455,0.364176,0.224917,0.033356,0.062276,0.775108,0.19471,0.854743,0.100009,0.398694,0.141985,0.167278,0.137952,0.177616,0.162532
3,0.445931,0.186514,0.800953,0.176655,0.057729,0.064755,0.028128,0.056149,0.69151,0.215513,0.439331,0.125686,0.356867,0.242937,0.212297,0.167963


In [43]:
weighted_features_df = pd.DataFrame()
for user_id, row in weights_df.iterrows():
    c, weight = int(row[0]), float(row[1])
    """features = cluster_means_df.loc[c] * weight
    new_row = pd.DataFrame(data={"user_id": [user_id], "danceability": [features[0]], "energy": [features[1]],
                                "speechiness": [features[2]], "acousticness": [features[3]], "instrumentalness": [features[4]],
                                "tempo": [features[5]], "valence": [features[6]], "liveness": [features[7]]})"""
    
    features = cluster_features_df.loc[c] * weight
    #new_row = pd.DataFrame(data={"user_id": [user_id]})
    new_row = features
    new_row["user_id"] = user_id

    
    weighted_features_df = weighted_features_df.append(new_row)
    weighted_features_df["user_id"] = weighted_features_df["user_id"].astype("int")
weighted_features_df = weighted_features_df.groupby(by="user_id").sum()

In [44]:
weighted_features_df.head()

Unnamed: 0_level_0,acousticness mean,acousticness std,danceability mean,danceability std,energy mean,energy std,instrumentalness mean,instrumentalness std,liveness mean,liveness std,speechiness mean,speechiness std,tempo mean,tempo std,valence mean,valence std
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1056935,0.204944,0.151338,0.490178,0.16497,0.70301,0.176317,0.159106,0.065284,0.205263,0.167893,0.059878,0.078527,0.428696,0.133545,0.422972,0.221174
1070023,0.15798,0.142805,0.489441,0.16765,0.73424,0.175527,0.186738,0.074063,0.207882,0.168269,0.062234,0.079713,0.432002,0.132898,0.431044,0.22739
1072752,0.277748,0.150129,0.47866,0.164626,0.652516,0.179764,0.23975,0.0797,0.200834,0.167023,0.054219,0.073149,0.423642,0.133541,0.385693,0.210909
2052756,0.106471,0.146067,0.497826,0.167961,0.771739,0.173585,0.124141,0.059903,0.211242,0.168944,0.066876,0.084484,0.435853,0.133209,0.459962,0.234607
2095434,0.261692,0.150391,0.483615,0.163796,0.661809,0.177885,0.202589,0.075612,0.201597,0.1672,0.055073,0.073848,0.424386,0.133361,0.395953,0.213366


In [49]:
similarity_df = pd.DataFrame()
for user_id, description in weighted_features_df.iterrows():
    sim = []
    mse = []
    #for _, cluster_description in cluster_means_df.iterrows():
    for _, cluster_description in cluster_features_df.iterrows():
        similarity = cosine_similarity(description.values.reshape(1, -1), cluster_description.values.reshape(1, -1))
        sim.append(similarity[0][0])
        
        error = mean_squared_error(description.values.reshape(1, -1), cluster_description.values.reshape(1, -1))
        mse.append(error)
    
    preferred_cluster = np.argmax(sim)
    #preferred_cluster = np.argmin(mse)
    new_row = pd.DataFrame(data={"user_id": [user_id], "cluster": [preferred_cluster], "similarity": [sim[preferred_cluster]]})
    
    similarity_df = similarity_df.append(new_row)

In [50]:
similarity_df.set_index("user_id", inplace=True)

In [51]:
similarity_df.groupby("cluster").count()

Unnamed: 0_level_0,similarity
cluster,Unnamed: 1_level_1
0,177
1,1309
