In [3]:
import pandas as pd
import streamlit as st
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from scipy import spatial
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Same preparation as the modeling notebook, with the variables adjusted.
music = pd.read_csv('../data/music_data_w_genre.csv')
music['release_date'] = pd.to_datetime(music['release_date'])

# Length of a day / Gregorian-average year in seconds.
day = 60*60*24
year = 365.2425*day

# Convert each release date to epoch seconds, project it onto a yearly cosine
# cycle, then discard the helper column together with the columns the
# feature set does not use.
music = (
    music
    .assign(Seconds=lambda frame: frame['release_date'].map(pd.Timestamp.timestamp))
    .assign(year=lambda frame: np.cos(frame['Seconds'] * (2*np.pi/year)))
    .drop(columns=['duration_ms_y', 'Seconds', 'mode'])
)

# Column names grouped by dtype; these drive the per-group scaling later on.
column_dtypes = music.dtypes
float_cols = column_dtypes.index[(column_dtypes == 'float64').to_numpy()].values
int_cols = column_dtypes.index[(column_dtypes == 'int64').to_numpy()].values

def create_feature_set(df, float_cols, int_cols=None):
    """Build the per-track feature matrix: scaled numeric columns plus TF-IDF terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table. It must contain a text column ``'Values'`` (fed to the
        TF-IDF vectorizer; presumably the genre strings -- confirm), a
        ``'popularity'`` column, an ``'id'`` column and every column listed in
        ``float_cols`` / ``int_cols``.
    float_cols : sequence of str
        Columns that are min-max scaled as the float feature group.
    int_cols : sequence of str, optional
        Columns that are min-max scaled as the integer feature group. When
        omitted, the ``int64`` columns of ``df`` are used, which is how the
        notebook-level ``int_cols`` is derived from the same frame.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with, in order: scaled float columns, scaled
        popularity, raw TF-IDF weights (one column per vocabulary term), scaled
        integer columns, and a trailing ``'id'`` column.
    """
    if int_cols is None:
        # Derive the integer group from the frame itself instead of silently
        # depending on a notebook global.
        int_cols = df.dtypes[df.dtypes == 'int64'].index.values

    def _min_max(frame):
        # Scale every column of `frame` to [0, 1]; an empty column group is
        # passed through because MinMaxScaler rejects zero-feature input.
        if frame.shape[1] == 0:
            return frame
        scaled = MinMaxScaler().fit_transform(frame)
        return pd.DataFrame(scaled, columns=frame.columns)

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['Values'])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and only fall back to the old name on releases that predate it.
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    # TF-IDF weights are kept unscaled, exactly as the original concat did
    # (the separately scaled copy it computed was never used).
    genre_df = pd.DataFrame(tfidf_matrix.toarray(), columns=list(vocabulary))

    floats_scaled = _min_max(df[float_cols].reset_index(drop=True))
    pop_scaled = _min_max(df[['popularity']].reset_index(drop=True))
    ints_scaled = _min_max(df[int_cols].reset_index(drop=True))

    # NOTE(review): if 'popularity' is also listed in float_cols or int_cols it
    # appears twice in the result and is weighted double -- confirm intent.
    final = pd.concat([floats_scaled, pop_scaled, genre_df, ints_scaled], axis=1)

    # .values bypasses index alignment: ids are attached purely by position.
    final['id'] = df['id'].values

    return final

# Model-ready feature matrix, indexed by track id.
features = create_feature_set(music, float_cols=float_cols).set_index('id')

# Standardise every feature before PCA so no column dominates by scale alone.
pca = PCA(n_components=5, random_state = 42)
sc = StandardScaler()
features_sc = sc.fit_transform(features)

# Fit and project in two explicit steps.
pca.fit(features_sc)
features_pca = pca.transform(features_sc)

# Name the components PCA_1 .. PCA_n and re-attach the track ids as the index.
columns = [f'PCA_{n}' for n in range(1, features_pca.shape[1] + 1)]
features_df = pd.DataFrame(data=features_pca, columns=columns)
features_df['id'] = music['id']
features_df = features_df.set_index('id')

2022-02-07 13:28:10.689 INFO    numexpr.utils: NumExpr defaulting to 4 threads.


In [4]:
# Clustering input: the PCA components only. A later cell adds a 'cluster'
# column to features_df, so drop it here if present; otherwise re-running this
# cell would feed the previous labels back into the clustering input.
# drop() returns a new frame, so features_df itself is left untouched.
X1 = features_df.drop(columns=['cluster'], errors='ignore')

sc = StandardScaler()

# Standardise each component to zero mean / unit variance before KMeans.
X1_sc = sc.fit_transform(X1)

In [5]:
# Number of clusters used by every KMeans fit in this notebook.
k = 3

# Seeded so the cluster assignment is reproducible across runs.
km = KMeans(n_clusters=k, random_state=42).fit(X1_sc)
km

KMeans(n_clusters=3, random_state=42)

In [6]:
# Display the fitted centroids (one row per cluster, one column per scaled PCA component).
# NOTE(review): in the recorded output two centroids sit hundreds of standard
# deviations out on a single component, which suggests those clusters capture
# a few extreme points rather than real groups -- confirm.
km.cluster_centers_

array([[ 1.87592312e-04, -4.90079168e-03,  1.05513643e-04,
        -3.45501514e-03, -2.20761908e-04],
       [-6.98926447e+00,  2.03080705e+02,  3.11475821e-01,
         2.97907725e-01,  2.06930120e+00],
       [-1.53310411e+00, -9.24748735e-01, -9.34766379e+00,
         2.85092477e+02,  1.41157582e+01]])

In [7]:
# Display the cluster label assigned to each row of X1_sc during fitting.
km.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [8]:
# Predict clusters for the same matrix the model was fitted on.
# NOTE(review): `pred` is not used in the visible cells; km.labels_ is used instead.
pred = km.predict(X1_sc)

In [9]:
# Attach the KMeans labels to features_df by position.
# This mutates features_df in place: from here on it carries a non-feature
# 'cluster' column that must be dropped before any distance computation.
features_df['cluster'] = km.labels_

In [10]:
# Preview the first rows: the PCA components plus the assigned cluster label.
features_df.head()

Unnamed: 0_level_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3egj44N2tiuW5F2MrqmBjW,-0.238289,-0.016405,0.204194,0.023664,-0.421243,0
1WAWbKpQe3XMp4b96bjazD,-0.265159,-0.019199,0.375159,0.007361,-0.13662,0
1HFIgvDlSCjC95Ch6h6gF7,-0.425057,-0.018486,0.662985,0.030375,-0.204053,0
0mEYJIIBcUt2GH4ftYKJ5Q,-0.183932,-0.010588,0.18951,0.019291,-0.47359,0
0vQ5VW753vuqUdi4mipjVl,-0.285411,-0.008498,0.321663,0.032669,-0.492147,0


In [11]:
def similarity(id1, id2, features=None):
    """Return the cosine distance between the feature vectors of two tracks.

    Despite the name, the result is a *distance*: 0 means the vectors point in
    the same direction, larger values mean less similar.

    Parameters
    ----------
    id1, id2 : hashable
        Index labels (track ids) of the two rows to compare.
    features : pandas.DataFrame, optional
        Frame indexed by track id whose columns are the feature vector. A
        ``'cluster'`` column, if present, is excluded. Defaults to the
        notebook-level ``features_df``.

    Returns
    -------
    float
        ``scipy.spatial.distance.cosine`` of the two feature vectors.

    Raises
    ------
    KeyError
        If either id is not in the index.
    """
    if features is None:
        features = features_df

    def _vector(track_id):
        # Select the single row first and only then drop 'cluster', so the
        # whole frame is not copied on every call. [0] flattens to the 1-D
        # vector SciPy requires and takes the first match for a duplicated id.
        row = features.loc[[track_id]].drop(columns=['cluster'], errors='ignore')
        return row.to_numpy(dtype=float)[0]

    return spatial.distance.cosine(_vector(id1), _vector(id2))

In [12]:
def songs2(id, features=None):
    """Cosine distance from one track to every track in the same cluster.

    Parameters
    ----------
    id : hashable
        Index label (track id) of the query track. The name shadows the
        built-in ``id`` but is kept so existing keyword callers still work.
    features : pandas.DataFrame, optional
        Frame indexed by track id holding the feature columns plus a
        ``'cluster'`` column. Defaults to the notebook-level ``features_df``.

    Returns
    -------
    dict
        ``{track_id: cosine_distance}`` for every track that shares the query
        track's cluster, in frame order. The query itself is included with a
        distance of (numerically) zero.

    Raises
    ------
    KeyError
        If ``id`` is not in the index or there is no ``'cluster'`` column.
    """
    if features is None:
        features = features_df

    # .loc[[id]].iloc[0] yields a scalar label even if the id is duplicated.
    cluster_label = features['cluster'].loc[[id]].iloc[0]
    group = features.loc[features['cluster'] == cluster_label].drop(columns=['cluster'])

    # Keep the query 2-D (1 x n_features) as cdist expects, and compute all
    # distances in one call instead of one Python-level call per candidate.
    query = features.loc[[id]].drop(columns=['cluster']).to_numpy(dtype=float)[:1]
    distances = spatial.distance.cdist(
        query, group.to_numpy(dtype=float), metric='cosine'
    )[0]

    return dict(zip(group.index, distances.tolist()))

In [13]:
# Distances from the query track (hard-coded id) to every track in its cluster.
feature_recs = songs2('08mG3Y1vljYA6bvDt4Wqkj')

In [14]:
# Two-column frame from the {id: distance} mapping: column 0 holds the track
# id and column 1 its distance. Ascending sort puts the closest tracks first.
feature_rec_df = pd.DataFrame(feature_recs.items()).sort_values(by=1)

# The 20 nearest tracks (the query itself is among them).
final = feature_rec_df.head(20)

In [15]:
# Silhouette score of the fitted clustering on the scaled PCA components.
# NOTE(review): the recorded value is close to 1 while the label preview shows
# mostly cluster 0; a near-perfect score can come from a few far-away outlier
# clusters rather than a meaningful partition -- check the cluster sizes.
silhouette_score(X1_sc, km.labels_)

0.9906444134824692

In [16]:
# Refit KMeans to read off the inertia (within-cluster sum of squares).
# Seeded with the same random_state as `km` so the reported value is
# reproducible across Restart & Run All and describes the same clustering.
cl = KMeans(n_clusters=k, random_state=42)
cl.fit(X1_sc)
inertia = cl.inertia_
inertia

256750.22686941

In [17]:
# Print artist and title for each recommended track id, nearest first.
for song_id in final[0]:
    matching_rows = music.loc[music['id'] == song_id, ['artist', 'name']]
    print(matching_rows)

      artist           name
80009  AC/DC  Back In Black
        artist        name
36169  Chromeo  Call Me Up
        artist                                  name
36063  Chromeo  Lost on the Way Home (feat. Solange)
        artist                                  name
36087  Chromeo  Lost on the Way Home (feat. Solange)
          artist        name
80737  blink-182  First Date
      artist               name
79868  AC/DC  Rock N Roll Train
             artist            name
79382  Fall Out Boy  Alone Together
         artist                                        name
82238  The 1975  Girls - Live From The O2, London. 16.12.16
              artist                name
82684  Kings of Leon  California Waiting
              artist        name
82658  Kings of Leon  The Bucket
            artist      name
54946  cleopatrick  WHY JULY
              artist  name
82575  Kings of Leon  Pyro
              artist name
82622  Kings of Leon   17
          artist                  name
80718  blink-