In [1]:
# STATISTICAL ANALYSIS 0: Retrieve dataset from HIVE
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Build (or reuse) a Hive-enabled Spark session whose warehouse directory
# points at the songs table location.
warehouse_dir = abspath('/user/hive/warehouse/songs')
spark = (
    SparkSession.builder
    .appName("Milion Songs Dataset")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
    .getOrCreate()
)

# Sanity check: total number of rows in the songs table.
display(spark.sql("SELECT count(*) as NUM_SONGS FROM songs"))

NUM_SONGS
9900


In [2]:
# STATISTICAL ANALYSIS 1: Missing values
import missingno as msno
from pyspark.sql.functions import when, col
from pyspark.sql.types import NumericType
import numpy as np
import pandas as pd

miss_values = spark.sql("SELECT * FROM songs")

# In this analysis a value of 0 is treated as "missing". Turn every 0 into NaN
# so that missingno renders it as a gap (for visualization only).
# Only numeric columns are rewritten: the 0 -> NaN replacement is only
# meaningful there, and selecting them from the schema removes the need for a
# blanket `except: pass`, which would also have hidden genuine failures.
numeric_features = [
    field.name
    for field in miss_values.schema.fields
    if isinstance(field.dataType, NumericType)
]
for feature in numeric_features:
    miss_values = miss_values.withColumn(
        feature,
        when(col(feature) == 0, np.nan).otherwise(col(feature)),
    )

# Visualize matrix of missing values (NaN or 0)
df = miss_values.toPandas()
msno.matrix(df)

In [3]:
# STATISTICAL ANALYSIS 3: Average duration per year
# Rows whose year is 0 (no valid year) are dropped after aggregation.
baseQuery = spark.sql("select avg(duration) as duration, year from songs group by year")
has_valid_year = baseQuery["year"] != 0
df_filtered = baseQuery.where(has_valid_year)
display(df_filtered)

duration,year
173.50485,1950
192.80934,1936
160.44798166666666,1958
237.46428063829785,1983
234.02261739130435,1972
249.2144480565372,2007
226.0566886111111,1979
221.76072409836064,1988
235.3493777142857,1986
259.225669375,1969


In [4]:
# STATISTICAL ANALYSIS 4: Average tempo per year
# Rows whose year is 0 (no valid year) are dropped after aggregation.
baseQuery2 = spark.sql("select avg(tempo) as tempo, year from songs group by year")
has_valid_year = baseQuery2["year"] != 0
df_filtered = baseQuery2.where(has_valid_year)
display(df_filtered)

tempo,year
93.707,1950
38.443,1936
131.442,1958
126.14368085106383,1983
129.9973913043478,1972
125.96294699646644,2007
137.51694444444445,1979
124.96954098360656,1988
124.96834285714286,1986
133.5631875,1969


In [5]:
# STATISTICAL ANALYSIS 5: Average loudness per year
# Rows whose year is 0 (no valid year) are dropped after aggregation.
baseQuery3 = spark.sql("select avg(loudness) as loudness, year from songs group by year")
has_valid_year = baseQuery3["year"] != 0
df_filtered = baseQuery3.where(has_valid_year)
display(df_filtered)

loudness,year
-16.472,1950
-10.697,1936
-12.376,1958
-12.30287234042553,1983
-11.709739130434784,1972
-8.033597173144877,2007
-11.879083333333332,1979
-10.131918032786883,1988
-11.537228571428573,1986
-9.866375,1969


In [6]:
# STATISTICAL ANALYSIS 6: Display distribution and correlations
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["font.family"] = "DejaVu Serif"

# The plotting context must be configured BEFORE the figure is drawn to have
# any effect on it. set_context() only accepts (context, font_scale, rc), so
# the axis label size is passed through `rc` rather than as a keyword of its own.
sns.set_context("notebook", font_scale=1, rc={"axes.labelsize": 10})

# Coordinates (lat/long) are excluded since correlating them makes no sense.
Query = spark.sql("select duration, loudness, tempo, year, density, fadiness, variability, song_hotttnesss, artist_hotttnesss, artist_familiarity from songs")

# Keep only rows where year, song hotness, tempo and artist hotness are all non-zero.
df_filtered = (
    Query
    .filter(Query.year != 0)
    .filter(Query.song_hotttnesss != 0)
    .filter(Query.tempo != 0)
    .filter(Query.artist_hotttnesss != 0)
)

# Transform into pandas so the plotting libraries can consume it.
df = df_filtered.toPandas()

# Pair plot
pp = sns.pairplot(df)

plt.suptitle("Distributions and Correlations", fontsize=50, fontweight="regular", x=0.5, y=1.03)
plt.show()

In [7]:
# STATISTICAL ANALYSIS 7: Display correlation through "Heatmap"
import numpy as np

# Pairwise correlation of the columns of the pandas frame built for the pair plot.
corr_matrix = df.corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask)
plt.suptitle("Correlation Heatmap", fontsize=35, fontweight="normal", x=0.45, y=1.1)
sns.set_context("notebook", font_scale=1)

In [8]:
# STATISTICAL ANALYSIS 8: Distribution of duration
import datetime  # required for datetime.timedelta below (previously missing)
import time
from matplotlib import ticker

# Zero durations are excluded by the query.
dur = spark.sql("SELECT duration FROM SONGS WHERE duration != 0").toPandas()
# NOTE: despite the name, these values are handled as seconds (they are fed to
# timedelta(seconds=...) and the axis ticks are placed every 90 units = 01:30).
minutes = dur["duration"]

# DURATION - Distribution (boxplot)
fig, ax = plt.subplots(figsize=(30, 3))
ax.boxplot(minutes, vert=False)

# str(timedelta) looks like "H:MM:SS"; dropping the first two characters
# leaves "MM:SS", so the label states that unit instead of "seconds".
median_label = str(datetime.timedelta(seconds=round(minutes.median(), 0)))[2:]
ax.text(2.5, 1.2, "Median: %s (mm:ss)" % median_label, fontsize=20, color="black")


def _seconds_to_mmss(seconds, _pos):
    """Format a tick position given in seconds as MM:SS (blank for negatives)."""
    if seconds < 0:
        return ""
    return "%02d:%02d" % divmod(int(seconds), 60)


# Personalize some parameters (for visualization only)
ax.set_title("Distribution of song duration", fontsize=40, fontweight="regular", pad=20)
ax.set_xlabel("Minutes", fontsize=25, labelpad=10)
ax.axes.yaxis.set_visible(False)
ax.tick_params(labelsize=15)
# One tick every 90 seconds. Labels are computed from the tick value, so they
# stay correct for any data range instead of relying on a fixed label list.
ax.xaxis.set_major_locator(ticker.MultipleLocator(90))
ax.xaxis.set_major_formatter(ticker.FuncFormatter(_seconds_to_mmss))
ax.set_xlim(left=0)

display(fig)

In [9]:
# STATISTICAL ANALYSIS 9: Word cloud for artist genres
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import matplotlib.pyplot as plt

# Use the `spark` session created at the top of the notebook, like every other
# cell, instead of relying on a separately provided `sqlContext` global.
genres = spark.sql('SELECT artist_genre FROM SONGS').toPandas()

# Count occurrences per genre. Missing genres are dropped so that a None key
# never reaches the word cloud renderer.
c = Counter(genres['artist_genre'].dropna().tolist())

# No `stopwords` argument: WordCloud ignores it when the cloud is built with
# generate_from_frequencies().
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='white',
).generate_from_frequencies(c)

fig = plt.figure(
    figsize=(40, 30),
    facecolor='w',
    edgecolor='w')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()