In [1]:
# The four lists below are parallel: position i in each one describes the same clustering.

# Identifier of each clustering (also used to build column names further down).
TYPE_CLUSTER = [
  "popularity_type",
  "quality_of_song",
  "type_of_running",
  "quantity_of_fade",
  "song_style",
  "estimated_region",
]

# Number of clusters (k) requested for each clustering.
NUM_CLUSTER = [6, 4, 4, 3, 5, 10]

# Input columns used by each clustering.
FEATURES = [
  ["artist_hotttnesss", "artist_familiarity", "song_hotttnesss"],  # popularity_type
  ["variability", "song_hotttnesss"],                              # quality_of_song
  ["duration", "tempo"],                                           # type_of_running
  ["fadiness"],                                                    # quantity_of_fade
  ["density", "tempo"],                                            # song_style
  ["artist_latitude", "artist_longitude"],                         # estimated_region
]

# Human-readable label for every cluster index of each clustering.
NAMES = [
  # popularity_type
  ["Average", "Popular", "Undefined", "One-hit Wonder", "Superstar", "Underdog Song"],
  # quality_of_song
  ["Average", "Complex", "Unpopular", "Popular"],
  # type_of_running
  ["Jogging", "Cardio", "Marathon", "Sprint"],
  # quantity_of_fade
  ["Long", "Short", "Average"],
  # song_style
  ["Rock/Hip-Hop", "Virtuous", "Dub/Instrumental", "Pop", "Ambient"],
  # estimated_region
  [
    "South America",
    "Easten America ",
    "Europe",
    "Middle East",
    "Oceania",
    "North, Central America",
    "North Europe",
    "Africa",
    "Western America",
    "No data",
  ],
]

# Columns needed by the tempo / duration / fade / density / location clusterings.
GENERAL_QUERY = """ 
  SELECT track_id, artist_location, artist_latitude, artist_longitude,
  duration, tempo, density, fadiness
  FROM songs
 """

# Columns needed by the popularity clusterings; rows with a zero familiarity/hotness value are excluded.
QUERY_HOTNESS = """ 
  SELECT track_id,
  artist_familiarity, artist_hotttnesss, song_hotttnesss, variability
  FROM songs WHERE artist_familiarity!=0 AND artist_hotttnesss !=0 AND song_hotttnesss!=0
 """

In [2]:
# PIPELINE --> 1. DATA INGESTION
from os.path import abspath
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,when
from pyspark.sql.types import IntegerType
  
# Hive-enabled Spark session shared by every cell below.
# getOrCreate() reuses an already running session when there is one.
spark = (
  SparkSession.builder
  .appName("Milion Songs Dataset")
  .config("spark.sql.warehouse.dir", abspath('/user/hive/warehouse/songs'))
  .enableHiveSupport()
  .getOrCreate()
)

def load_spark_dataset(QUERY_SQL):
  """Run a SQL query on the module-level ``spark`` session.

  Args:
    QUERY_SQL: SQL text passed unchanged to ``spark.sql``.

  Returns:
    Whatever ``spark.sql`` returns for the query (a Spark DataFrame).
  """
  return spark.sql(QUERY_SQL)

# DATA INGESTION: load the two source dataframes (general features / popularity features)
spark_dataset = load_spark_dataset(GENERAL_QUERY)
spark_dataset_hotnesses = load_spark_dataset(QUERY_HOTNESS)

# DATA PREPARATION: turn "NaN" coordinates into 0 so the rows can be fed to the clustering model
for coordinate in ("artist_longitude", "artist_latitude"):
  is_nan = col(coordinate) == "NaN"
  spark_dataset = spark_dataset.withColumn(coordinate, when(is_nan, 0).otherwise(col(coordinate)))

# DATA PREPARATION: one VectorAssembler per clustering; assembler N reads FEATURES[N-1]
# and writes its vector to the column "features_C<N>".
assembler_stages_hotness = [
  VectorAssembler(inputCols=FEATURES[position], outputCol="features_C{0}".format(position + 1))
  for position in (0, 1)
]
assembler_c1, assembler_c2 = assembler_stages_hotness

assembler_stages = [
  VectorAssembler(inputCols=FEATURES[position], outputCol="features_C{0}".format(position + 1))
  for position in (2, 3, 4, 5)
]
assembler_c3, assembler_c4, assembler_c5, assembler_c6 = assembler_stages

In [3]:
# PIPELINE --> 2. CLUSTERING (GaussianMixture)
from pyspark.ml import Pipeline
from pyspark.ml.clustering import GaussianMixture

# DEFINE MODEL: Gaussian Mixture Model (GMM), one estimator per clustering
def _gaussian_mixture(position):
  """Build the GaussianMixture estimator for the clustering at ``position`` in TYPE_CLUSTER.

  The estimator reads "features_C<position+1>", writes the predicted cluster index to a
  column named after the clustering and the membership probabilities to
  "probability_<clustering>". The seed is fixed to 1.
  """
  cluster_type = TYPE_CLUSTER[position]
  return GaussianMixture(
    k=NUM_CLUSTER[position],
    featuresCol="features_C{0}".format(position + 1),
    predictionCol=cluster_type,
    probabilityCol="probability_" + cluster_type,
    seed=1,
  )

# Estimators fitted on the popularity dataframe
clustering_stages_hotness = [_gaussian_mixture(position) for position in (0, 1)]
gmm_c1, gmm_c2 = clustering_stages_hotness

# Estimators fitted on the general dataframe
clustering_stages = [_gaussian_mixture(position) for position in (2, 3, 4, 5)]
gmm_c3, gmm_c4, gmm_c5, gmm_c6 = clustering_stages

# TRAINING: each pipeline runs its assemblers first, then its mixture models
hotness_stages = assembler_stages_hotness + clustering_stages_hotness
general_stages = assembler_stages + clustering_stages
pipeline_hotnesses = Pipeline(stages=hotness_stages)
pipeline = Pipeline(stages=general_stages)

model_hotness = pipeline_hotnesses.fit(spark_dataset_hotnesses)
model = pipeline.fit(spark_dataset)
# Order matters for the reporting cell: popularity models first, general models second.
models = [model_hotness, model]

# PREDICT CLUSTERS on the same dataframes the models were fitted on
song_labelled_hotnesses = model_hotness.transform(spark_dataset_hotnesses)
song_labelled_general = model.transform(spark_dataset)

# Inner join: only tracks present in both dataframes are kept.
song_labelled = song_labelled_general.join(song_labelled_hotnesses, on="track_id", how="inner")

# DISPLAY OUTCOMES (clustering models)
# The fitted stages are visited in the order [popularity pipeline, general pipeline]; inside
# each pipeline the mixture models come in clustering order, so a running counter is enough
# to line them up with TYPE_CLUSTER / NUM_CLUSTER / FEATURES.
index_cluster = 0
for item in models:
  for stage in item.stages:
    # Only the fitted clustering stages are reported; the assemblers are skipped.
    if isinstance(stage, VectorAssembler):
      continue

    print("--------------------CLUSTER (type:{0})----------------------------------".format(TYPE_CLUSTER[index_cluster]))
    print("NUMER OF CLUSTER: {0} \nCLUSTERING_FEATURES: {1}".format(NUM_CLUSTER[index_cluster], FEATURES[index_cluster]))

    # Centroid: the mean of every Gaussian component.
    # show() only prints (it returns None), so its result is not stored.
    stage.gaussiansDF.select(col("mean").alias("Centroid")).show(truncate=False)

    # Count occurrences (computed on the joined dataframe)
    print("Number of occurrences for each cluster")
    song_labelled.groupBy(TYPE_CLUSTER[index_cluster]).count().orderBy(TYPE_CLUSTER[index_cluster]).show()

    index_cluster += 1

In [4]:
#PIPELINE ---> 3. CLUSTERS LABELLING 
from pyspark.sql.functions import lit,udf
from pyspark.sql.types import FloatType

# Mapper --> pick entry i of the probability vector and return it as a float
map_array = udf(lambda prob,i: float(prob[i]), FloatType())

for i in range(len(TYPE_CLUSTER)):
  cluster_type = TYPE_CLUSTER[i]
  print("\n",cluster_type)

  # Predicted cluster index; at this point the column still holds the model's integer output.
  cluster_index = col(cluster_type)

  # Create "confidence column": probability of the predicted cluster
  song_labelled = song_labelled.withColumn("confidence_"+cluster_type, map_array(col("probability_"+cluster_type), cluster_index))

  # Labelling: build ONE chained when(...) expression and apply it once.
  # Rewriting the column once per label would turn it into a string column after the first
  # pass and make every later comparison depend on implicit string -> int casts.
  label_expr = None
  for cluster, name in enumerate(NAMES[i]):
    condition = cluster_index == cluster
    label_expr = when(condition, name) if label_expr is None else label_expr.when(condition, name)
    print("LABELLED --> cluster {0} = {1}".format(cluster, name))

  # Indices without a name keep their number, rendered as a string.
  # With no names at all the column is left untouched.
  if label_expr is not None:
    song_labelled = song_labelled.withColumn(cluster_type, label_expr.otherwise(cluster_index.cast("string")))

In [5]:
# PIPELINE --> 4. Display raw outcomes of model(table way)

# Build the column lists for the select below
confidence_labels = ["confidence_{0}".format(cluster_type) for cluster_type in TYPE_CLUSTER]

# Every feature used by at least one clustering, without duplicates, in first-seen order
all_features = [feature for group in FEATURES for feature in group]
feature_labels = sorted(set(all_features), key=all_features.index)

# Display: raw features, then cluster labels, then confidences
selected_columns = feature_labels + TYPE_CLUSTER + confidence_labels
songs = song_labelled.select(selected_columns)
display(songs)

artist_hotttnesss,artist_familiarity,song_hotttnesss,variability,duration,tempo,fadiness,density,artist_latitude,artist_longitude,popularity_type,quality_of_song,type_of_running,quantity_of_fade,song_style,estimated_region,confidence_popularity_type,confidence_quality_of_song,confidence_type_of_running,confidence_quantity_of_fade,confidence_song_style,confidence_estimated_region
0.5666228875581625,0.9379647503973342,0.685642423804993,10,275.3824,155.038,4.05740000000003,3.021253355334255,0.0,0.0,Popular,Popular,Sprint,Short,Rock/Hip-Hop,No data,0.93947047,0.9940802,0.9121802,0.8821467,0.48187315,1.0
0.4090277445700956,0.58400755313893,0.3759843015004421,7,135.18322,121.371,4.53922,3.9280023067951775,29.18752,-82.14039,Average,Average,Sprint,Short,Virtuous,"North, Central America",0.89266664,0.6323276,0.58188075,0.87567014,0.49349067,0.88058984
0.5124556425769484,0.8063641247652552,0.2150803185092279,7,253.67465,87.033,3.067650000000014,3.9775357924018016,0.0,0.0,Underdog Song,Unpopular,Cardio,Short,Dub/Instrumental,No data,0.8668233,0.8799447,0.6029878,0.89179075,0.8329793,1.0
0.6017297557100237,0.8216973167394691,0.7155021691230083,10,203.59791,178.339,9.333910000000005,2.5786119317236604,0.0,0.0,Popular,Popular,Sprint,Short,Ambient,No data,0.77559495,0.9949004,0.99315524,0.7019136,0.7064056,1.0
0.4724954766649901,0.658016591872584,0.3458022339653294,8,170.84036,138.471,10.062359999999984,3.482783576433578,0.0,0.0,Average,Average,Sprint,Short,Pop,No data,0.65142417,0.7159334,0.8419952,0.6485843,0.5833124,1.0
0.3801092082188088,0.4420305132891076,0.3027240513818609,8,179.06893,111.199,9.501929999999987,2.647025366153693,38.8991,-77.029,Average,Average,Cardio,Short,Pop,Easten America,0.6820597,0.6402286,0.558064,0.69040996,0.45026106,0.9879655
0.4260093004837602,0.595645308494379,0.212045405483719,9,162.35057,146.474,14.568569999999994,4.293178644214184,0.0,0.0,Underdog Song,Unpopular,Sprint,Average,Pop,No data,0.82125676,0.89422625,0.92755,0.77495575,0.7230485,1.0
0.3493663611090005,0.5889320858529523,0.26586104921065,9,320.31302,120.032,4.841020000000015,2.7348248285380343,0.0,0.0,Underdog Song,Unpopular,Jogging,Short,Pop,No data,0.7887402,0.7746611,0.39107782,0.8709395,0.37360683,1.0
0.5121972430113413,0.5771581238137459,0.3829350900750192,10,233.32526,151.051,5.28225999999998,3.651554915228639,0.0,0.0,Undefined,Average,Sprint,Short,Pop,No data,0.49581197,0.655262,0.92806745,0.8629934,0.66925114,1.0
0.3604457658067229,0.5856831498303938,0.3795233244221881,8,289.64526,137.746,9.444259999999986,4.194786408726316,18.11526,-77.27348,Average,Average,Sprint,Short,Pop,"North, Central America",0.5925545,0.63555765,0.7142116,0.6944118,0.5963888,0.98672724


In [6]:
# Columns and region label for manually inspecting one estimated region
# (the inspection itself is kept commented out below).
COLUMNS = ["artist_location", "estimated_region", *FEATURES[5]]
TO_FILTER = "-"
#display(song_labelled.select(COLUMNS).filter(song_labelled.estimated_region == TO_FILTER))

In [7]:
# PIPELINE --> 5. Display outcomes (graphical way)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Style goes first: seaborn styles only affect axes created AFTER they are set, so setting
# it after plt.subplots() would leave this figure unstyled on a fresh kernel.
sns.set_style("whitegrid")
# Font is set after the style because seaborn styles may carry their own font settings.
# NOTE(review): assumes DejaVu Serif is the font wanted for these panels -- confirm.
plt.rcParams["font.family"] = "DejaVu Serif"

df = songs.toPandas()

fig, axes = plt.subplots(3,2,figsize=(20, 18))

def _panel_title(cluster_type):
  """Turn a clustering identifier such as "song_style" into the panel title "SONG STYLE"."""
  return cluster_type.upper().replace("_", " ")

#Plot a scatterplot for each CLUSTER
# popularity: artist hotness vs song hotness, marker size = artist familiarity
popolarity = sns.scatterplot(x= df[FEATURES[0][0]], y=df[FEATURES[0][2]], size = df[FEATURES[0][1]], hue=df[TYPE_CLUSTER[0]], ax=axes[0][0])
popolarity.set_title(_panel_title(TYPE_CLUSTER[0]), fontsize=25, pad=10)

quality = sns.scatterplot(x= df[FEATURES[1][0]], y=df[FEATURES[1][1]], hue=df[TYPE_CLUSTER[1]], ax=axes[0][1])
quality.set_title(_panel_title(TYPE_CLUSTER[1]), fontsize=25, pad=10)

running = sns.scatterplot(x= df[FEATURES[2][0]], y=df[FEATURES[2][1]], hue=df[TYPE_CLUSTER[2]], ax=axes[1][0])
running.set_title(_panel_title(TYPE_CLUSTER[2]), fontsize=25, pad=10)

# fading has a single feature: the x axis is just an evenly spaced spread of the rows over [0, 10]
fading = sns.scatterplot(x= np.linspace(0, 10, num=len(df[FEATURES[3][0]])), y=df[FEATURES[3][0]], hue=df[TYPE_CLUSTER[3]], ax=axes[1][1])
fading.set_title(_panel_title(TYPE_CLUSTER[3]), fontsize=25, pad=10)

# song style: tempo on x, density on y
genre = sns.scatterplot(x= df[FEATURES[4][1]], y=df[FEATURES[4][0]], hue=df[TYPE_CLUSTER[4]], ax=axes[2][0])
genre.set_title(_panel_title(TYPE_CLUSTER[4]), fontsize=25, pad=10)

# region: longitude on x, latitude on y
region = sns.scatterplot(x= df[FEATURES[5][1]], y=df[FEATURES[5][0]], hue=df[TYPE_CLUSTER[5]], ax=axes[2][1])
region.set_title(_panel_title(TYPE_CLUSTER[5]), fontsize=25, pad=10)

#fig.suptitle("Clusters", fontsize=60, fontweight="regular",x = 0.5)
plt.tight_layout(pad = 3, rect=[0, 0, 1, 0.96]) #(left, bottom, right, top)
plt.show()

In [8]:
#PIPELINE --> 6. Show goodness (confidence of our clusters)

# Collected again here so this cell can run without the scatter-plot cell above.
df = songs.toPandas()

# Theme is configured in ONE call and BEFORE the figure is created:
#  - seaborn styles only affect axes created after they are set;
#  - sns.set() resets the style to its default unless `style` is passed, so a separate
#    set_style("whitegrid") call placed before it would be discarded.
# NOTE(review): sns.set(font=...) also sets the font family, so a separate rcParams font
# assignment in this cell would be overridden -- confirm DejaVu Sans is the intended font.
sns.set(style="whitegrid", font='DejaVu Sans')
fig, axes = plt.subplots(nrows = 6, figsize=(20, 32))

# No `col`/`row` scratch variables are defined here: a module-level `col` would shadow
# pyspark.sql.functions.col and break any re-run of the Spark cells.

# Plot boxplots: one row per clustering, confidence distribution of every cluster label
for i, cluster in enumerate(TYPE_CLUSTER):
  bx = sns.boxplot(x = df["confidence_" + cluster], y = df[cluster],ax = axes[i])

  bx.set_title(cluster.upper().replace("_", " "), fontsize=30, pad=10)
  bx.set_xlabel("Confidence",fontsize = 20)
  bx.set_ylabel("CLUSTERS",fontsize = 30, color = "tomato",labelpad =30)
  bx.tick_params(axis = "x",labelsize = 17)
  bx.tick_params(axis = "y",labelsize = 20)
  bx.set_xlim(0.2,1)

#fig.suptitle("Confidence of the clusters", fontsize=40, fontweight="bold",x = 0.6)
plt.tight_layout(pad = 3, rect=[0, 0, 1, 0.96]) #(left, bottom, right, top)
plt.show()

In [9]:
# PIPELINE --> 7. Save outcomes
# Keep only the track id, the cluster labels and their confidences
output_columns = ["track_id"] + TYPE_CLUSTER + confidence_labels
clusters = song_labelled.select(output_columns)

# Persist as a parquet table named "clusters_", replacing any previous content
(clusters.write
  .mode('overwrite')
  .format("parquet")
  .saveAsTable("clusters_"))

display(clusters)

track_id,popularity_type,quality_of_song,type_of_running,quantity_of_fade,song_style,estimated_region,confidence_popularity_type,confidence_quality_of_song,confidence_type_of_running,confidence_quantity_of_fade,confidence_song_style,confidence_estimated_region
TRBCKSF128F932FA70,Popular,Popular,Sprint,Short,Rock/Hip-Hop,No data,0.93947047,0.9940802,0.9121802,0.8821467,0.48187315,1.0
TRBCLGH12903CCA1D0,Average,Average,Sprint,Short,Virtuous,"North, Central America",0.89266664,0.6323276,0.58188075,0.87567014,0.49349067,0.88058984
TRBCMHI128F148B17F,Underdog Song,Unpopular,Cardio,Short,Dub/Instrumental,No data,0.8668233,0.8799447,0.6029878,0.89179075,0.8329793,1.0
TRBCMJT128F14B12CC,Popular,Popular,Sprint,Short,Ambient,No data,0.77559495,0.9949004,0.99315524,0.7019136,0.7064056,1.0
TRBCMNO128F92D58E5,Average,Average,Sprint,Short,Pop,No data,0.65142417,0.7159334,0.8419952,0.6485843,0.5833124,1.0
TRBCNAJ128F934266F,Average,Average,Cardio,Short,Pop,Easten America,0.6820597,0.6402286,0.558064,0.69040996,0.45026106,0.9879655
TRBCNSM128F933D958,Underdog Song,Unpopular,Sprint,Average,Pop,No data,0.82125676,0.89422625,0.92755,0.77495575,0.7230485,1.0
TRBCNUB12903CA79C7,Underdog Song,Unpopular,Jogging,Short,Pop,No data,0.7887402,0.7746611,0.39107782,0.8709395,0.37360683,1.0
TRBCOBZ128F428005D,Undefined,Average,Sprint,Short,Pop,No data,0.49581197,0.655262,0.92806745,0.8629934,0.66925114,1.0
TRBCOKA128F4280D91,Average,Average,Sprint,Short,Pop,"North, Central America",0.5925545,0.63555765,0.7142116,0.6944118,0.5963888,0.98672724
