In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from pandas_profiling import ProfileReport

import pickle

from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift

import time

In [2]:
# a = pd.read_csv('a1.csv')
# prof = ProfileReport(a)
# prof.to_file(output_file='outputOld.html')

In [3]:
df = pd.read_csv('a0_650.csv')
# display(a.head())
df.drop(["Unnamed: 0", "Unnamed: 0.1"], axis = 1, inplace = True)
display(df.head())
# display(df.head())

Unnamed: 0,album,id,name,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,artist
0,Dark Lane Demo Tapes,3IvMYBE7A3c7to1aEcfFJk,Deep Pockets,spotify:track:3IvMYBE7A3c7to1aEcfFJk,0.482,0.473,0.824,0.0,0.605,-3.68,0.163,77.888,0.374,60.0,Drake
1,Dark Lane Demo Tapes,5TCBWmEBrin7etRa4Lswr1,When To Say When,spotify:track:5TCBWmEBrin7etRa4Lswr1,0.252,0.41,0.82,0.0,0.538,-6.808,0.533,170.718,0.526,62.0,Drake
2,Dark Lane Demo Tapes,4wVOKKEHUJxHCFFNUWDn0B,Chicago Freestyle (feat. Giveon),spotify:track:4wVOKKEHUJxHCFFNUWDn0B,0.629,0.735,0.449,0.0,0.113,-7.507,0.347,122.947,0.0397,82.0,Drake
3,Dark Lane Demo Tapes,3Q4gttWQ6hxqWOa3tHoTNi,Not You Too (feat. Chris Brown),spotify:track:3Q4gttWQ6hxqWOa3tHoTNi,0.342,0.458,0.452,1.9e-05,0.0703,-9.299,0.047,86.318,0.316,66.0,Drake
4,Dark Lane Demo Tapes,466cKvZn1j45IpxDdYZqdA,Toosie Slide,spotify:track:466cKvZn1j45IpxDdYZqdA,0.289,0.83,0.49,3e-06,0.113,-8.82,0.209,81.604,0.845,75.0,Drake


In [4]:
# column2Cluster = ['acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
## Removed all the exponentail distribution
column2Cluster = ['danceability', 'energy', 'liveness' , 'tempo', 'valence']
df2Cluster = df[column2Cluster]
df2Cluster.shape

(64818, 5)

# Finind number of Clusters
Using **DBSCAN** to find the number of clusters to read more about the same visit: [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)

In [25]:
# dbscan = DBSCAN(min_samples = 6).fit(df2Cluster)
dbscan = DBSCAN(min_samples = 10).fit(df2Cluster)

In [26]:
uniLabels = np.unique(dbscan.labels_)
labels = dbscan.labels_
print(uniLabels)
print("Number of Noisy point:",len(labels[labels == -1]))
noClusters = len(uniLabels)-1
print("Number of Clusters to GMM:", noClusters)

[-1  0  1  2  3  4  5  6]
Number of Noisy point: 459
Number of Clusters to GMM: 7


# Fitting GMM 
Using GMM to get the probabilistic distribution of each point towards the Cluster Center.

In [27]:
t1 = time.time()
gmm = GaussianMixture(n_components = noClusters, verbose  = 1)
gmm.fit(df2Cluster)
t2 = time.time() -t1
print(t2)

Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
6.705728054046631


# Save and Load Model

In [5]:
def saveModel(modelClass, fileName):
	fileName+='.sav'
	pickle.dump(modelClass, open('models/'+fileName, 'wb'))
	return 'models/'+fileName

def loadModel(fileName):
	model = pickle.load(open(fileName, 'rb'))
	return model

In [7]:
# saveModel(dbscan, 'dbscanForCluster')
# saveModel(gmm, 'GMM')
dbscan = loadModel('models/dbscanForCluster.sav')
gmmUni = loadModel('models/GMM.sav')

In [8]:
uniLabels = np.unique(dbscan.labels_)
labels = dbscan.labels_
print(uniLabels)
print("Number of Noisy point:",len(labels[labels == -1]))
noClusters = len(uniLabels)-1
print("Number of Clusters to GMM:", noClusters)

[-1  0  1  2  3  4  5  6]
Number of Noisy point: 459
Number of Clusters to GMM: 7


# The probability density function for multivariate_normal is
$f(x) = \frac{1}{\sqrt{{2\pi}^k det(\sum)}}exp(\frac{-1}{2}{(x-\mu)}^T)\sum^{-1}(x-\mu)$

## So from the above equation we can safetly store $det(\sum)$ and $\sum^{-1}$

In [31]:
!mkdir savedData

mkdir: cannot create directory ‘savedData’: File exists


In [32]:
det = [] 
for i in gmmUni.covariances_:
    det.append(np.linalg.det(i))
inv = []
for i in gmmUni.covariances_:
    inv.append(np.linalg.inv(i))
det = np.array(det)
inv = np.array(inv)

In [33]:
np.save('savedData/CovDeterminant.npy', det)
np.save('savedData/CovInverse.npy', inv)

# Making Cluster related Data

In [34]:
X = df2Cluster.values

pro = gmmUni.predict_proba(X)

newCols = ['album', 'name', 'artist']
futureDf = {i: df[i].values for i in newCols}
for i in range(noClusters):
    futureDf['Cluster'+str(i)] = 0
futureDf['maxClass'] =  0
futureDf['sumProbabs'] = 0


In [35]:
for i in range(noClusters): 
    futureDf['Cluster'+str(i)] = pro[:, i]
futureDf['maxClass'] = np.argmax(pro, axis = 1)
futureDf['sum'] = np.sum(pro, axis = 1)
futureDf = pd.DataFrame(futureDf)
print("Columns =",str(futureDf.columns))

Columns = Index(['album', 'name', 'artist', 'Cluster0', 'Cluster1', 'Cluster2',
       'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6', 'maxClass',
       'sumProbabs', 'sum'],
      dtype='object')


## Saving the above data: 

In [36]:
futureDf.to_csv('savedData/clusteredUniverse.csv', index = False)

In [37]:

for i in range(noClusters):
    temp = df[futureDf['maxClass']==i]
    temp.to_csv('savedData/Clustered_Songs/Cluster'+str(i)+'.csv', index = False)

# Extra-Polating songs

In [21]:
clusters = []
for i in range(noClusters):
    clusters.append(pd.read_csv('savedData/Clustered_Songs/Cluster'+str(i)+'.csv'))

In [48]:
def getSongsFromQuery(name, album,  Artist):
    totalSongs = len(album)
    sampleDf = pd.DataFrame(columns = df.columns)
    featureSpace = pd.DataFrame(columns = column2Cluster)
    for i in range(totalSongs):
        f1 = df['name']==name[i]
        f2 = df['album']==album[i]
        temp = df.loc[f1&f2]
        if len(temp) == 1:
            featureSpace = pd.concat([featureSpace, temp[column2Cluster]], ignore_index = True)
            sampleDf = pd.concat([sampleDf, temp], ignore_index = False)
        else:
            f3 = df['Artist']==Artist[i]
            temp = temp.loc[f3]
            featureSpace = pd.concat([featureSpace, temp[column2Cluster]], ignore_index = True)
            sampleDf = pd.concat([sampleDf, temp], ignore_index = False)
    return featureSpace.values, sampleDf
def findNumberCluster(x):
    ms = MeanShift(bandwidth=5).fit(x)
    print(ms.predict(x))
    return len(np.unique(ms.fit_predict(x)))

def naivePlaylist(sampleNoClusters, x):
    sampleGmm = GaussianMixture(n_components = sampleNoClusters).fit(x)
    probabilities = sampleGmm.predict_proba(x)
    gmm_labels = sampleGmm.predict(x)
    finalPlaylist = []
    
    for _ in range(sampleNoClusters):
        finalPlaylist.append([])
        
    for i in range(len(x)):  
        for j in range(sampleNoClusters):
            if probabilities[i][j]>=1/sampleNoClusters:
                finalPlaylist[j].append(i)
#                 print("I: ", i)
#         print(finalPlaylist)
    return finalPlaylist

def playLsit2Song(toChoose, finalPlaylist):
    toRet = []
    for i in range(len(finalPlaylist)):
        toRet.append([])
        for j in range(len(finalPlaylist[i])):
            list(toChoose.iloc[finalPlaylist[i][j],:][['name', 'album', 'artist']])
            toRet[-1].append(list(toChoose.iloc[finalPlaylist[i][j],:][['name', 'album', 'artist']]))
    return toRet


def extrapolation(finalPlaylist, maxSongsPerPlaylist, localSongsData, songsPlayList): 
    for playlistNo in range(len(finalPlaylist)):
        if len(finalPlaylist[playlistNo])>=maxSongsPerPlaylist:
            continue
        currentSongs = len(finalPlaylist[playlistNo])
        moreSongsReq = maxSongsPerPlaylist - currentSongs
        newSongsPerCS = moreSongsReq/currentSongs
        
        predictions = gmmUni.predict(localSongsData[finalPlaylist[playlistNo]]) ## A prediction for each plalist
        
        countOfPrediction = {i: len(predictions[predictions==i]) for i in np.unique(predictions)}
        songs = []
        tupples = []
        s1 = len(finalPlaylist[playlistNo])
        forPlaylist = [] 
        for i in range(len(np.unique(predictions))): 
            temp = clusters[predictions[i]][column2Cluster].values - localSongsData[finalPlaylist[playlistNo]][i]
    
            temp = np.linalg.norm(temp,axis = 1)
            sortedArg = np.argsort(temp)[:moreSongsReq]
#             sortedArg = np.randomnp.argsort(temp)[::-1]
            indices = np.random.choice(sortedArg, 1+int(moreSongsReq*countOfPrediction[predictions[i]]/len(predictions)))
            
#             songsPlayList[playlistNo]+=clusters[predictions[i]].iloc[indices,:][['name', 'album', 'artist']].values
#             songsPlayList[playlistNo].append(list(clusters[predictions[i]].iloc[indices,:][['name', 'album', 'artist']].values.tolist()))
            songsPlayList[playlistNo]+=clusters[predictions[i]].iloc[indices,:][['name', 'album', 'artist']].values.tolist()
            s1+=len(indices)
#          if s1<maxSongsPerPlaylist:
#                 songsPlayList
    return songsPlayList
# def reformattor(songsPlayList):
#     new = []
#     for i in range(len(songsPlayList)):
        

In [54]:
# toChoose = np.random.randint(0,len(df), (10))
# g = df.iloc[toChoose, :]
# gdf = pd.DataFrame(df.iloc[toChoose, :])
# # display(g)
# data = g[column2Cluster].values
# g = g[['album', 'name', 'artist']]
# g = g.values
# maxSongsPerPlaylist = 10
# x = getSongsFromQuery(g[:,0], g[:, 1], g[:,2])
# sampleClusters = findNumberCluster(x)
# # print(sampleClusters)
# clusteredPlaylist = naivePlaylist(sampleClusters, x)
# print(clusteredPlaylist)
# songsPlayList = playLsit2Song(gdf, clusteredPlaylist)

In [50]:
# newSongs = [['Different World', 'Lily', 'Alan Walker'], ['Different World', 'Darkside', 'Alan Walker'], ['My House','Wobble','Flo Rida'], ['Nu-mixx Klazzics', 'Heartz Of Men', '2Pac']]
newSongs = [['For Those About to Rock (We Salute You)', 'Evil Walks', 'AC/DC'],
['Highway to Hell', 'Get It Hot', 'AC/DC'],
['Different World', 'Faded', 'Alan Walker'],
['Revolution Radio', 'Too Dumb to Die', 'Green Day'],
['House Of Balloons (Original)', 'Coming Down - Original', 'Daft Punk'],
['Beauty Behind The Madness', "Can't Feel My Face - Martin Garrix Remix",'Daft Punk']]

newSongs = np.array(newSongs)
maxSongsPerPlaylist = 5
x,g = getSongsFromQuery(newSongs[:,0], newSongs[:,1], newSongs[:,2])
sampleClusters = findNumberCluster(x)
clusteredPlaylist = naivePlaylist(sampleClusters, x)
s = playLsit2Song(g, clusteredPlaylist)

[2 0 3 4 1 0]


In [51]:
s = extrapolation(clusteredPlaylist, maxSongsPerPlaylist, x, s)

In [56]:
s

[[['Get It Hot', 'Highway to Hell', 'AC/DC'],
  ["Can't Feel My Face - Martin Garrix Remix",
   'Beauty Behind The Madness',
   'Daft Punk'],
  ['Proud Mary - Remastered / Live At The Oakland Coliseum, Oakland, CA / January 31, 1970',
   'The Concert',
   'Creedence Clearwater Revival'],
  ['You Shook Me All Night Long - Live at River Plate Stadium, Buenos Aires, Argentina - December 2009',
   'Live at River Plate',
   'AC/DC'],
  ['Weight Of The World', 'Peachtree Road', 'Elton John'],
  ['Cuando Más Te Amaba - Live',
   'Rompiendo Fronteras (Deluxe)',
   'Alejandro FernÃ¡ndez']],
 [['Faded', 'Different World', 'Alan Walker'],
  ['Dead Friends', 'The World Is Yours', 'Rich The Kid'],
  ['Dead Friends', 'The World Is Yours', 'Rich The Kid'],
  ['Dead Friends', 'The World Is Yours', 'Rich The Kid'],
  ['Kobe Bryant', 'Rich Rich', 'Ufo361'],
  ['777', 'Nectar', 'Joji']],
 [['Evil Walks', 'For Those About to Rock (We Salute You)', 'AC/DC'],
  ['Rock Me Baby - Live',
   'Sticky Fingers Liv

In [59]:
s = extrapolation(clusteredPlaylist, maxSongsPerPlaylist, data, songsPlayList)

In [60]:
# for i in songsPlayList:
#     print(len(i))
for i in s:
    print(len(i))

12
12
11
11
12


In [56]:
s[0]

[['Esa Nena Mueve el Culo', 'Vol. 4', 'Cartel De Santa'],
 ['My Type', 'Memories...Do Not Open', 'Daya'],
 ['Jubilee',
  'Satchmo: The Decca And Verve Years 1924-1967',
  'Louis Armstrong'],
 ['Wasted', 'Some Hearts', 'Carrie Underwood'],
 ['El Calimete', 'Dembow Exitos Vol. 2', 'El Alfa'],
 ['If No One Will Listen', 'All I Ever Wanted', 'Kelly Clarkson'],
 ['Pure Water (with Migos)', 'Perfect Ten', 'Mustard'],
 ['Alive', 'Blossom (Deluxe)', 'Milky Chance']]

In [33]:
def formattor(s):
    for i in range(len())

['Tu y yo', 'Kisses', 'Anitta']

In [38]:
isinstance(s[0][0], list)

True

In [46]:
isinstance(s[0][4], )

TypeError: 'type' object is not iterable

In [47]:
b = np.array(s[0])

  """Entry point for launching an IPython kernel.


In [51]:
len(b.flatten())

7