In [None]:
import logging
import math
import os
import re
import time
from datetime import datetime
from multiprocessing import Process, Queue

import hdbscan
import html2text
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import scrapy
import seaborn as sn
from dateutil import parser
from scrapy.crawler import CrawlerProcess
from sklearn.cluster import KMeans, OPTICS, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from twisted.internet import reactor

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# `display` is imported from the public `IPython.display` module; pulling it from
# `IPython.core.display` is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
# Inject a CSS rule that widens the notebook's `.container` element to 95% of the page.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Getting the data

In [None]:
# Connection settings are read from environment variables so that credentials do not have to be
# committed with the notebook. Each fallback is the value that used to be hard-coded here, so the
# cell behaves exactly as before when none of the variables are set.
conn = mysql.connector.connect(
    host=os.environ.get('DISCOGS_DB_HOST', 'localhost'),
    database=os.environ.get('DISCOGS_DB_NAME', 'discogs3'),
    user=os.environ.get('DISCOGS_DB_USER', 'root'),
    password=os.environ.get('DISCOGS_DB_PASSWORD', 'root'),
)
# Shared cursor used by the query helper below and closed in the last cell.
cursor = conn.cursor(buffered=True)

In [None]:
def get_data_for_k_means():
    """Load one row per album into a DataFrame using the module-level ``cursor``.

    The query joins ``album`` with ``album_genre`` and ``album_style`` and groups by album id.
    No column names are assigned, so the returned frame is addressed positionally:
    0 = versions, 1 = released, 2 = concatenated distinct genres,
    3 = concatenated distinct styles, 4 = album name.
    """
    query = """SELECT a.versions, a.released, GROUP_CONCAT(DISTINCT g.genre), GROUP_CONCAT(DISTINCT s.style), a.album_name FROM album a, album_genre g, album_style s 
                        WHERE a.id = g.album_id AND a.id = s.album_id GROUP BY a.id"""
    cursor.execute(query)
    rows = cursor.fetchall()
    return pd.DataFrame(rows)


data = get_data_for_k_means()

In [None]:
# Preview the first rows of the loaded frame (its columns are the positional integers 0-4).
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

# Extracting features

In [None]:
# Building one-hot encoded features for styles
#
# Column 3 of `data` holds the styles of each album as one comma-separated string. This cell
#   1. normalises every style (lower-cased, non-word characters removed),
#   2. stores the normalised list back into column 3 (later cells print it),
#   3. builds one vector per album with a 1 for every style the album has.
# Each vector has one extra trailing slot that is set to 1 when the album has no style.

style_tokens_per_album = []
for raw_styles in data[3]:
    if not raw_styles:
        # No style information for this album; handled by the trailing slot below.
        style_tokens_per_album.append(None)
        continue
    # On a re-run of this cell the column already contains lists, so only split real strings.
    parts = raw_styles.split(',') if isinstance(raw_styles, str) else raw_styles
    # Raw string for the pattern: '\W' in a normal string literal is an invalid escape sequence.
    style_tokens_per_album.append([re.sub(r'\W+', '', part.lower()) for part in parts])

# Write the cleaned lists back using the frame's own index labels rather than loop positions.
for row_label, tokens in zip(data.index, style_tokens_per_album):
    if tokens is not None:
        data.at[row_label, 3] = tokens

# Sorted vocabulary so that the column order of the encoding is the same on every run
# (iteration order of a set of strings is not stable between interpreter sessions).
all_style_features_lst = sorted(
    {token for tokens in style_tokens_per_album if tokens for token in tokens}
)
all_style_features = {style: column for column, style in enumerate(all_style_features_lst)}

style_feature_vector = []
for tokens in style_tokens_per_album:
    feature_v = np.zeros(len(all_style_features) + 1)
    if tokens:
        feature_v[[all_style_features[token] for token in tokens]] = 1
    else:
        feature_v[-1] = 1
    style_feature_vector.append(feature_v)

print('Features from styles have the following shape (one-hot encoded): ', np.shape(style_feature_vector))

In [None]:
# Building one-hot encoded features for genres
#
# Column 2 of `data` holds the genres of each album as one comma-separated string. This cell
#   1. normalises every genre (lower-cased, non-word characters removed),
#   2. stores the normalised list back into column 2 (later cells print it),
#   3. builds one vector per album with a 1 for every genre the album has.
# Each vector has one extra trailing slot that is set to 1 when the album has no genre.

genre_tokens_per_album = []
for raw_genres in data[2]:
    if not raw_genres:
        # No genre information for this album; handled by the trailing slot below.
        genre_tokens_per_album.append(None)
        continue
    # On a re-run of this cell the column already contains lists, so only split real strings.
    parts = raw_genres.split(',') if isinstance(raw_genres, str) else raw_genres
    # Raw string for the pattern: '\W' in a normal string literal is an invalid escape sequence.
    genre_tokens_per_album.append([re.sub(r'\W+', '', part.lower()) for part in parts])

# Write the cleaned lists back using the frame's own index labels rather than loop positions.
for row_label, tokens in zip(data.index, genre_tokens_per_album):
    if tokens is not None:
        data.at[row_label, 2] = tokens

# Flat list of every genre occurrence. Albums without a genre contribute nothing here: a `None`
# vocabulary entry would only create a column that is never set, because the "missing" case is
# already encoded by the trailing slot of each vector.
all_genre_features_lst = [
    token for tokens in genre_tokens_per_album if tokens for token in tokens
]

# Sorted vocabulary so that the column order of the encoding is the same on every run
# (iteration order of a set of strings is not stable between interpreter sessions).
all_genre_features_lst_unq = sorted(set(all_genre_features_lst))
all_genre_features = {genre: column for column, genre in enumerate(all_genre_features_lst_unq)}

genre_feature_vector = []
for tokens in genre_tokens_per_album:
    feature_v = np.zeros(len(all_genre_features) + 1)
    if tokens:
        feature_v[[all_genre_features[token] for token in tokens]] = 1
    else:
        feature_v[-1] = 1
    genre_feature_vector.append(feature_v)

print('Features from genres have the following shape (one-hot encoded): ', np.shape(genre_feature_vector))

In [None]:
# Building features for release year
#
# Produces one single-element row per album: [year], or [-1] when the release date is missing.
# Column 1 of `data` is overwritten with the plain year so that later cells print it.

all_year_features_lst = []

for row_label, released in zip(data.index, data[1]):
    # NULLs may surface as None or as a pandas NaN/NaT placeholder; treat all of them as missing.
    if pd.isna(released) or not released:
        all_year_features_lst.append([-1])
        continue
    # First run: `released` is a date-like object with a `.year` attribute.
    # Re-run: the column already holds the plain year written below, which is used as-is.
    year = getattr(released, 'year', released)
    all_year_features_lst.append([year])
    data.at[row_label, 1] = year

print('Features from release year have the following shape: ', np.shape(all_year_features_lst))

In [None]:
# Building features for number of released versions
#
# Produces one single-element row per album: [versions], or [-1] when the value is missing.
# `pd.isna` is checked explicitly because a NULL in a numeric column is represented by pandas as
# NaN, and NaN is truthy, so a plain truthiness test would let it through into the features.
# A value of 0 is still mapped to -1, as before.

all_version_feature_lst = [
    [-1] if (pd.isna(versions) or not versions) else [versions]
    for versions in data[0]
]

print('Features from versions have the following shape: ', np.shape(all_version_feature_lst))

# Clustering algorithms

In [None]:
# Number of clusters requested from KMeans.
K = 25

# Features used to build the clustering examples. Valid entries are the strings "year", "versions", "style" and "genre". List the ones to combine, then run the following cells to regenerate the examples and the clusterings.
FEATURES = ["genre"]

In [None]:
# Making input examples and running PCA, as both KMeans and HDBSCAN do not handle categorical data
#
# The per-album rows of every feature named in FEATURES are placed side by side, in the order
# given. When the result has more than two columns it is reduced with PCA to round(sqrt(columns))
# components.

def _feature_rows(feature_name):
    """Return the per-album rows built by the feature cells for one entry of FEATURES."""
    if feature_name == 'year':
        return all_year_features_lst
    if feature_name == 'versions':
        return all_version_feature_lst
    if feature_name == 'style':
        return style_feature_vector
    if feature_name == 'genre':
        return genre_feature_vector
    # A misspelt name used to be skipped silently, leaving fewer (or zero) columns than intended.
    raise ValueError(
        "Unknown feature %r in FEATURES; expected 'year', 'versions', 'style' or 'genre'" % (feature_name,)
    )

selected_blocks = [np.asarray(_feature_rows(feature_name)) for feature_name in FEATURES]

if selected_blocks:
    examples = np.concatenate(selected_blocks, axis=1)
else:
    # No feature selected: keep the (rows, 0) shape the cell has always produced in this case.
    # The builtin `int` is used because the `np.int` alias no longer exists in current NumPy.
    examples = np.empty((len(data), 0), dtype=int)

n_columns = np.shape(examples)[1]
if n_columns > 2:
    # PCA cannot return more components than there are samples, so cap the target size.
    n_components = min(round(math.sqrt(n_columns)), np.shape(examples)[0])
    pca = PCA(n_components=n_components)
    examples = pca.fit_transform(examples)
    print('PCA done')

print(np.shape(examples))

In [None]:
# Kmeans clustering with the above examples and K value

# Fixed seed: KMeans initialises its centroids randomly, so without it the cluster ids printed
# below change every time the cell is run.
KMEANS_RANDOM_STATE = 42

kmeans = KMeans(n_clusters=K, random_state=KMEANS_RANDOM_STATE)
# Fit and label the same examples in one call.
y_kmeans = kmeans.fit_predict(examples)

# One line per album: name, genres, styles, year, versions, followed by its cluster id.
for row_pos, cluster_id in enumerate(y_kmeans):
    print(data.iloc[[row_pos], [4, 2, 3, 1, 0]].to_string(index=False, header=False), ' -- cluster --> ', cluster_id)

In [None]:
# HDBSCAN clustering

# Fit HDBSCAN on the same examples, measuring distances with the Chebyshev metric.
cluster = hdbscan.HDBSCAN(metric="chebyshev").fit(examples)

# One line per album: name, genres, styles, year, versions, followed by its HDBSCAN label.
for row_pos, label in enumerate(cluster.labels_):
    album_row = data.iloc[[row_pos], [4, 2, 3, 1, 0]]
    print(album_row.to_string(index=False, header=False), ' -- cluster --> ', label)

In [None]:
# Release the database resources: the cursor first, then the connection it was created from.
cursor.close()
conn.close()