In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# Any results you write to the current directory are saved as output.

# Imports/Data reading

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Always draw the edge of patches (e.g. histogram bars) so neighbouring bars stay visually separated.
plt.rcParams["patch.force_edgecolor"] = True

In [None]:
# Load the Top 50 Spotify 2019 CSV: the file is decoded as ISO-8859-1 and its
# first column is used as the row index.
top50_df = pd.read_csv(
    "/kaggle/input/top50spotify2019/top50.csv",
    index_col=0,
    encoding='ISO-8859-1',
)
top50_df.head()

In [None]:
# info on null fields in data
# (the per-column non-null counts printed by info() reveal any missing values)
top50_df.info()

In [None]:
# Text columns are listed by hand; the integer columns are discovered from the
# frame's dtypes so they follow whatever the CSV actually contains.
cat_cols = ['Track.Name', 'Artist.Name', 'Genre']
int_cols = [
    column
    for column, dtype in top50_df.dtypes.items()
    if dtype in ['int64']
]

### Data cleaning

In [None]:
# Rescale every int64 column with min-max scaling so that all numeric features
# share the same scale (MinMaxScaler's default output range is [0, 1]).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(top50_df[int_cols])
# Wrap the raw array back into a frame that keeps the original index and column names.
transformed = pd.DataFrame(data=scaled_values, index=top50_df.index, columns=int_cols)

In [None]:
# Re-attach the untouched categorical columns to the rescaled numeric ones,
# matching rows on the shared index.
categorical_part = top50_df.loc[:, cat_cols]
top50_scaled = categorical_part.join(transformed, how='left')

In [None]:
# Preview the first rows of the combined frame (categorical + rescaled numeric columns).
top50_scaled.head()

## Descriptive info

In [None]:
# Summary of the categorical columns only (track name, artist name, genre).
top50_scaled[cat_cols].describe()

<div class='alert alert-info'>
    
In 2019, `Ed Sheeran` was the most popular artist, with 4 of his songs in the top 50, while the most popular genre turned out to be `dance pop`.

</div>

In [None]:
# Summary statistics of the frame; on a mixed frame pandas reports only the
# numeric columns by default, i.e. the rescaled features.
top50_scaled.describe()

<div class='alert alert-info'>

In general, the top 50 songs have high `Energy`, `Danceability` and `Loudness`. They have a neutral vibe, since `Valence` hovers around 0.5. The length of each song is a little over 3 minutes. Surprisingly, the majority of the popular songs do not have a high `Beats Per Minute`.
<br><br>
    
However, they do not have much `Acousticness`: its mean sits far below the max value, which indicates a skewed distribution. They also tend not to contain many words (as shown by the low mean of `Speechiness`).
</div>

# Relationship between most popular songs

In [None]:
# Rank every track from most to least popular.
top50_sorted = top50_scaled.sort_values(by='Popularity', ascending=False)

# Show the ten highest-ranked tracks.
top50_sorted.iloc[:10]

### Categorical feature relationships (Top 50)

In [None]:
# Song count per genre, restricted to genres with more than one song in the Top 50.
genre_counts = top50_scaled['Genre'].value_counts()
genre_counts[genre_counts > 1]

<div class='alert alert-info'>

`dance pop`, `pop` and `latin` are some of the most popular genres in the Top 50 songs.
</div>

In [None]:
# Song count per artist, restricted to artists with more than one song in the Top 50.
artist_counts = top50_scaled['Artist.Name'].value_counts()
artist_counts[artist_counts > 1]

<div class='alert alert-info'>

`Ed Sheeran` is the most popular artist, with the most songs in the Top 50, while the remaining artists have either 1 or 2 songs in this list.
</div>

### Numeric feature relationships (Top 50)

In [None]:
# Pair grid over the numeric features of the Top 50: a regression scatter
# (no confidence band) for every pair of features, and a 10-bin distribution
# plot of each feature on the diagonal.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
g2 = sns.PairGrid(top50_scaled[int_cols])
g2.map_offdiag(sns.regplot, ci=None)
g2.map_diag(sns.distplot, bins=10)

# Enlarge both axis labels on every subplot of the grid.
for subplot in g2.axes.ravel():
    for axis in (subplot.xaxis, subplot.yaxis):
        axis.label.set_size(15)

<div class='alert alert-info'>

From the pair grid, we can see the pairwise relationships between features and their distributions. We see that `Energy`, `Loudness`, `Danceability`, `Valence` and `Length` have a roughly normal distribution, while `Liveness`, `Acousticness` and `Speechiness` are right skewed. Although `Popularity` seems to be normally distributed, it is slightly left skewed, with more songs having a popularity in the 0.7 ~ 0.9 range. This is expected to even out to a normal distribution if we expand the list of top songs to a larger number.
</div>

In [None]:
# Pairwise correlation matrix of the numeric Top 50 features, drawn as an
# annotated heatmap on an explicitly created 14x7 axes.
correlations2 = top50_scaled[int_cols].corr()
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(correlations2, annot=True, ax=ax)

<div class='alert alert-info'>

There are a few features that are positively correlated with each other. These include `Loudness`, `Energy`, `Speechiness`, `Beats Per Minute` and `Valence`.

However, these features do not have a noticeable correlation with the popularity of a song. Features that have a weak positive correlation with popularity include `Beats Per Minute` and `Speechiness`, while `Valence` has a weak negative correlation.
</div>

### Categorical feature relationships (Top 10)

In [None]:
# Keep the ten highest-ranked rows of the popularity-sorted frame.
top10_songs = top50_sorted.head(10)

In [None]:
# Distinct artists and distinct genres among the top 10 songs, each as a plain
# list in order of first appearance.
top10_art, top10_gen = (
    top10_songs[column].unique().tolist()
    for column in ('Artist.Name', 'Genre')
)

In [None]:
# Numbered list (starting at 1) of the artists found in the top 10 songs.
print("Artists that appeared in Top 10 songs:")
for rank, artist in enumerate(top10_art, start=1):
    print("{}. {}".format(rank, artist))

# Blank line between the two lists.
print()

# Numbered list (starting at 1) of the genres found in the top 10 songs.
print("Genres that appeared in Top 10 songs:")
for rank, genre in enumerate(top10_gen, start=1):
    print("{}. {}".format(rank, genre))

<div class='alert alert-info'>

Surprisingly, the most popular artist (`Ed Sheeran`) is not in the top 10 songs. The same is true for genres.
</div>

### Numeric feature relationships (Top 10)

In [None]:
# Pair grid over the numeric features of the top 10 songs: a regression scatter
# (no confidence band) for every pair of features, and a 10-bin distribution
# plot of each feature on the diagonal.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
g = sns.PairGrid(top10_songs[int_cols])
g.map_offdiag(sns.regplot, ci=None)
g.map_diag(sns.distplot, bins=10)

# Enlarge both axis labels on every subplot of the grid.
for subplot in g.axes.ravel():
    for axis in (subplot.xaxis, subplot.yaxis):
        axis.label.set_size(15)

<div class='alert alert-info'>

At a glance, there seems to be no obvious relationship between any of the features. However, we can see that `Energy`, `Danceability` and `Length` have a normal distribution for the top 10 songs, `Popularity`, `Acousticness` and `Liveness` are right skewed, and `Loudness` is slightly left skewed.
</div>

In [None]:
# Pairwise correlation matrix of the numeric features for the top 10 songs,
# drawn as an annotated heatmap on an explicitly created 14x7 axes.
correlations = top10_songs[int_cols].corr()
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(correlations, annot=True, ax=ax)

<div class='alert alert-info'>

Using the pandas `corr()` function, we see that most feature pairs have a weak (positive or negative) correlation. `Acousticness` and `Danceability` have a strong negative correlation, while `Acousticness` and `Liveness` have a strong positive correlation.
<br><br>
We also see that a few features are correlated with popularity, for example `Energy`, `Danceability`, `Loudness`, `Acousticness` and `Speechiness`. However, these features have only a moderate correlation with popularity. The rest have either a weak correlation or no linear correlation at all.
</div>

## Sentiment Analysis on Top 50 song titles

In this section, we will see some of the more prominent words used in titles of the top 50 popular songs. 

In [None]:
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
import string

In [None]:
# English and Spanish stop-word sets taken from the NLTK stopwords corpus
stop_words_en, stop_words_es = (
    set(stopwords.words(language)) for language in ("english", "spanish")
)

# every character of string.punctuation, as a list of one-character strings
punctuations = [char for char in string.punctuation]

In [None]:
## wrap every track title in a TextBlob (gives access to its words and sentiment)
titles = top50_scaled['Track.Name'].map(TextBlob)

# show the word tokens of the first 5 titles
for position in range(5):
    blob = titles.iloc[position]
    print(blob.words)

In [None]:
# Sentiment polarity of every title, one row per track.
# The rows are built positionally instead of through a dict keyed by title:
# a dict would merge tracks that share a name, leaving fewer values than index
# entries and making the DataFrame constructor fail.
# Result: default integer index with the columns 'Track.Name' and 'sentiment'.
sentiments = pd.DataFrame({
    'Track.Name': top50_scaled['Track.Name'].tolist(),
    'sentiment': [blob.sentiment.polarity for blob in titles],
})
sentiments.head()

In [None]:
# descriptive stats on sentiments
# (covers the numeric 'sentiment' column holding each title's polarity)
sentiments.describe()

In [None]:
# Histogram only (no KDE curve) of the title polarity scores, in 6 bins.
sentiment_ax = sns.distplot(sentiments['sentiment'], bins=6, kde=False)
sentiment_ax.set_title("Distribution of sentiment of top 50 song titles")

<div class='alert alert-info'>

While titles usually express some kind of sentiment about the song, we can see that most songs in the top 50 have neutral titles. This could be because most of the emotions are expressed through the song lyrics instead, while titles are only an indication of what is to be expected from the song.
</div>

In [None]:
# Tokens that carry no meaning for the word cloud: punctuation marks, English
# and Spanish stop words, and the collaboration markers "feat" / "ft" (with or
# without the trailing dot).
ignored_tokens = (
    set(punctuations) | stop_words_en | stop_words_es | {'feat', 'feat.', 'ft', 'ft.'}
)


def clean_title_tokens(title):
    """Tokenise one track title and return its lower-cased, meaningful tokens.

    Parameters
    ----------
    title : str
        Raw track title.

    Returns
    -------
    list of str
        Lower-cased tokens of ``title`` with every member of
        ``ignored_tokens`` removed.
    """
    lowered = (token.lower() for token in nltk.word_tokenize(title))
    return [token for token in lowered if token not in ignored_tokens]


# One cleaned token list per title. Building the lists with map() avoids
# assigning Python lists element by element through .iloc.
title_str = top50_scaled['Track.Name'].map(clean_title_tokens)

# Single space-separated string for the word cloud. Collaboration markers are
# removed as whole tokens (see ignored_tokens) rather than by substring
# replacement on the joined text, which would also cut "feat" out of longer words.
long_titles = " ".join(" ".join(tokens) for tokens in title_str if tokens)
print(long_titles)

In [None]:
# Build a 1000x1000 word cloud on a white background from the cleaned title
# words, additionally filtering the wordcloud package's built-in STOPWORDS.
wordcloud = WordCloud(
    width=1000,
    height=1000,
    background_color='white',
    stopwords=STOPWORDS,
)
wordcloud.generate(long_titles)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

<div class='alert alert-info'>

First, we observe that `remix` was used the most in song titles. This makes sense if the remixed song turns out to be better than the original. <br><br>

Second, hypothetically speaking, songs that we can relate to turn out to be more popular. Songs that deal with emotions such as love contain words such as `boyfriend` and `guy`. Singers such as `justin bieber` tend to make more songs about relationships, which is why they are featured in certain song collaborations.
</div>

# Conclusion

<div class='alert alert-success'>

Having separated songs into top 50 and top 10, we observed that `Speechiness` has a positive correlation with popularity, while `Beats Per Minute` does not correlate with popularity. While the top 10 songs showed that `Energy`, `Danceability`, `Loudness` and `Acousticness` have correlations with popularity, they are mostly songs about relationships, such as love. <br><br>

This is further supplemented by the results of the sentiment analysis of song titles. Song titles that contain words relating to relationships, or songs sung by singers associated with emotional songs, tend to be more popular. Furthermore, **remixed** songs are also popular with audiences.
</div>