In [1]:
#Project Overview
#I will be analyzing Spotify data from a dataset on Kaggle. This dataset was made using Spotify's Web API. The dataset has many variables that describe the songs, such as popularity, genre, duration, key, mode, time signature, acousticness, danceability, energy, instrumentalness, liveness, speechiness, valence, and tempo. These variables will be described in further detail later. The goal of the project is to see what features affect the popularity of a song on Spotify. This information would be useful for a music producer or artist who wants to see how to make a hit song.

In [2]:
#Importing Data/ Libraries
import numpy as np 
import pandas as pd # for working with dataframes
import seaborn as sns # for data visualization 
from matplotlib import pyplot as plt # for plotting
# Where the raw CSV lives and which reader format to use
file_location = "/FileStore/tables/SpotifyFeatures.csv"
file_type = "csv"

# Reader settings (only meaningful for CSV input; other formats ignore them).
# With schema inference off, every column is loaded as a string.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Apply the settings one by one to the Spark reader, then load the file
reader = spark.read.format(file_type)
for option_name, option_value in (
    ("inferSchema", infer_schema),
    ("header", first_row_is_header),
    ("sep", delimiter),
):
    reader = reader.option(option_name, option_value)
df = reader.load(file_location)

# Bring the full Spark dataframe into pandas for the rest of the analysis
spotify = df.toPandas()

In [3]:
#Data Cleaning

# Columns that were loaded as strings (the CSV is read with inferSchema = "false")
# and have to be numeric before they can be used in plots and models.
NUMERIC_COLUMNS = [
    'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
]

#A time signature of "0/4" was included in a very small amount of the dataset. I believe this was a mistake in the dataset because this time signature does not exist.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original frame (avoids SettingWithCopyWarning).
spotify = spotify[spotify.time_signature != "0/4"].copy()

# Convert the string columns to numbers; values that cannot be parsed become NaN
# (errors='coerce') and are removed by dropna() further down.
for column in NUMERIC_COLUMNS:
    spotify[column] = pd.to_numeric(spotify[column], errors='coerce')

#Duration was given in milliseconds; convert it to minutes so it is easier to understand.
# The column keeps the name 'duration_ms' because later cells refer to it by that name.
# NOTE: re-running this cell without re-running the load cell converts the duration twice.
spotify['duration_ms'] = spotify['duration_ms'] / 60000

# Number of missing values per column before anything is dropped
missing_per_column = spotify.isnull().sum()

#Getting rid of duplicate and missing data
spotify = spotify.drop_duplicates(subset=['track_id'], keep='first')
spotify = spotify.dropna()

# Show the missing-value counts as the cell output
missing_per_column

In [4]:
# Row count of the cleaned dataset (one row per unique track_id)
print("Total number of songs:")
len(spotify.index)

In [5]:
#Definition of Variables that will be used in analysis
#Source for definitions, https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/ 

#genre = genre of music; there are a total of 26 genres in this dataset. It should be noted that there were some duplicate songs in the dataset because these songs were listed in two different genres. For example, Ariana Grande's song "7 Rings" was listed as both pop and dance. For the sake of simplicity I deleted the duplicates and kept just one of the rows.

#popularity = The response variable,the popularity of a track is a value between 0 and 100, with 100 being the most popular. Popularity is based mainly on the total number of playbacks.

#acousticness = A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.

#danceability = describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

#duration_ms = Length of a song in minutes (originally in milliseconds; converted during data cleaning, the column name was kept)

#energy = a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy. 

#instrumentalness = the closer the instrumentalness value is to 1.0, the greater the likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.

#key = the key the song is written in

#liveness = Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live. 

#loudness = The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.

#mode = the mode of the song,it is either major or minor

#speechiness = detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

#tempo = The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

#time signature = the beats per measure in a song

#valence = A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry). 

# Preview the first five rows of the cleaned dataset
spotify.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0.0,0.611,0.389,1.656217,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1.0,0.246,0.59,2.28955,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3.0,0.952,0.663,2.837783,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0.0,0.703,0.24,2.54045,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4.0,0.95,0.331,1.377083,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [6]:
#Variable Analysis

In [7]:
# Distribution of the popularity variable across the whole dataset.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; confirm the
# environment's seaborn version before switching to histplot/displot.
popularity_values = spotify.popularity
pop_dist = sns.distplot(popularity_values)
display(pop_dist)

In [8]:
# Average popularity over all songs
spotify.popularity.mean()

In [9]:
# 80th percentile of popularity: the lower bound of the top 20% of songs
spotify.popularity.quantile(q=0.80)

In [10]:
# 99.7th percentile of popularity: the lower bound of the top 0.3% of songs
spotify.popularity.quantile(q=0.997)

In [11]:
#As you can see in the graph above, the distribution is not normal. There are more songs with a popularity below 50 than songs with a popularity above 50. The average popularity is about 36, and the 80th percentile is 51, i.e. the top 20% of songs on Spotify have a popularity of 51 or more. Only the top 0.3% of songs in the dataset have a popularity greater than 80!

In [12]:
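#Next I will look at how popularity correlates with the independent variables that are numeric.
#In this heat map, variables that are dark blue are positively correlated with each other and variables that are dark red are negatively correlated with each other. A value of 1 means a perfect positive correlation, a value of -1 a perfect negative correlation, and a value of 0 (the lightest colour) no linear correlation.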


In [13]:
# Pairwise correlations between the numeric columns only. The frame still contains
# text columns (artist, track name, genre, key, ...) at this point; selecting the
# numeric columns explicitly keeps corr() from failing on pandas versions that no
# longer drop non-numeric columns silently.
corr = spotify.select_dtypes(include='number').corr()

# Diverging palette centred on 0 over the full [-1, 1] range of a correlation:
# one hue for negative values, another for positive values.
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so the long variable names do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
display(ax)


In [14]:
#None of the variables are strongly correlated with popularity. If they were strongly correlated, they would have a correlation of roughly 0.8 or more in absolute value (close to 1 or -1).
#The following variables are positively correlated with popularity: danceability, duration, energy, loudness, tempo, and valence (note: the correlations of duration and valence with popularity are only slightly above 0)
#The following variables are negatively correlated with popularity: acousticness, instrumentalness, liveness, and speechiness
#So it seems that people like to stream songs that are fast, happy, loud, and easy to dance to.

In [15]:
#For this project I will mostly be concerned with which variables are correlated with popularity. But it is interesting to note that acousticness and energy are highly negatively correlated. Loudness and acousticness are also highly negatively correlated. This makes sense, because acoustic songs are usually slow and not loud.

In [16]:
#Next I will see how the non-numeric explanatory variables are related to popularity.

In [17]:
#This graph shows the median popularity of each genre
sns.set(rc={'figure.figsize':(30,9)})

# Median popularity per genre, sorted ascending; used to order the bars
genre_medians = spotify.groupby("genre")['popularity'].aggregate("median").reset_index().sort_values('popularity')

# The data contains the same genre spelled two ways: "Children's Music" (straight
# apostrophe) and "Children’s Music" (curly apostrophe). Leave the straight-apostrophe
# spelling out of the plot by name instead of by positional index label.
result = genre_medians[genre_medians['genre'] != "Children's Music"]

# estimator=np.median makes the bar heights medians (seaborn's default is the mean),
# so the plot matches both the bar ordering and the description above.
# The axes object is stored as genre_plot so the pyplot import `plt` is not overwritten.
genre_plot = sns.barplot(x='genre', y='popularity', data=spotify, order=result['genre'], estimator=np.median)
display(genre_plot)

In [18]:
#The least popular genre is A Capella. The most popular genre is Pop; this should be no surprise, considering that the variables that were positively correlated with popularity are most commonly seen in pop music. The only surprising part of this graph is that rock is the second most popular genre, because I thought that rock was a "dying" genre in the mainstream.

In [19]:
#The median popularity of each time signature
# Median popularity per time signature, sorted ascending; used to order the bars
result2 = spotify.groupby("time_signature")['popularity'].aggregate("median").reset_index().sort_values('popularity')

# estimator=np.median makes the bar heights medians (seaborn's default is the mean),
# so the plot shows the statistic the title and the ordering refer to.
pop_time_sig = sns.barplot(
    x='time_signature',
    y='popularity',
    order=result2['time_signature'],
    data=spotify,
    estimator=np.median,
)
display(pop_time_sig)

In [20]:
#The median popularity of 1/4, 5/4 and 3/4 are around the same but the popularity of songs with 4/4 time signature is about 6 points higher than the rest.

In [21]:
#The median popularity of each key divided into major and minor mode
# One group of bars per mode, one bar per key within each group.
# estimator=np.median makes the bar heights medians (seaborn's default is the mean),
# so the plot shows the statistic described above.
key_mode = sns.barplot(x='mode', y='popularity', hue='key', data=spotify, estimator=np.median)
display(key_mode)


In [22]:
#Overall the popularity of each key is around the same,but songs written in minor seem to be more popular than those in major.

In [23]:
#Predictive Analysis: Data Preparation

In [24]:
#There are 4 categorical variables (genre, key, mode, and time signature) that need to be converted from text to numbers using dummy variables. I also define popularity as a binary variable. For our purposes, I will define a popularity of 51 or above as "popular", since 51 is the border of the top 20% of songs, and encode that as 1; a popularity below 51 is "not popular" and is encoded as 0.
#Note: I turned popularity into a binary variable because this makes the results easier to understand and turns the prediction into a classification problem.

In [25]:
# Binarise the response: popularity of 51 or more becomes 1, everything below becomes 0
popular_rows = spotify['popularity'] >= 51
spotify.loc[~popular_rows, 'popularity'] = 0
spotify.loc[popular_rows, 'popularity'] = 1

# One-hot encode the four categorical columns in a single call; the new indicator
# columns are appended in the order genre, key, mode, time_signature.
spotify = pd.get_dummies(spotify, columns=['genre', 'key', 'mode', 'time_signature'])
# Note: get_dummies creates one indicator column per category, so within each group one
# column is redundant (it is implied by the others) and is not used in the analysis.

In [26]:
# First five rows after dummy encoding and after popularity was turned into a 0/1 label
spotify.head(n=5)

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre_A Capella,genre_Alternative,genre_Anime,genre_Blues,genre_Children's Music,genre_Children’s Music,genre_Classical,genre_Comedy,genre_Country,genre_Dance,genre_Electronic,genre_Folk,genre_Hip-Hop,genre_Indie,genre_Jazz,genre_Movie,genre_Opera,genre_Pop,genre_R&B,genre_Rap,genre_Reggae,genre_Reggaeton,genre_Rock,genre_Ska,genre_Soul,genre_Soundtrack,genre_World,key_A,key_A#,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0.0,0.611,0.389,1.656217,0.91,0.0,0.346,-1.828,0.0525,166.969,0.814,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,0.0,0.246,0.59,2.28955,0.737,0.0,0.151,-5.559,0.0868,174.003,0.816,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,0.0,0.952,0.663,2.837783,0.131,0.0,0.103,-13.879,0.0362,99.488,0.368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0.0,0.703,0.24,2.54045,0.326,0.0,0.0985,-12.178,0.0395,171.758,0.227,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,0.0,0.95,0.331,1.377083,0.225,0.123,0.202,-21.15,0.0456,140.576,0.39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0


In [27]:
#I will run multiple machine learning models and see which one can predict popularity the best.

In [28]:
#Importing Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [29]:
# features are the explanatory (predictor) variables used to predict popularity; there are 50 of them.
# One level of each categorical variable is left out of the list
# (genre_World, key_G#, mode_Minor, time_signature_5/4). The straight-apostrophe
# "genre_Children's Music" column is not part of the list either.
_audio_features = [
    "acousticness", "danceability", "duration_ms", "energy", "instrumentalness",
    "liveness", "loudness", "speechiness", "tempo", "valence",
]
_genre_levels = [
    "A Capella", "Alternative", "Anime", "Blues", "Children’s Music", "Classical",
    "Comedy", "Country", "Dance", "Electronic", "Folk", "Hip-Hop", "Indie", "Jazz",
    "Movie", "Opera", "Pop", "R&B", "Rap", "Reggae", "Reggaeton", "Rock", "Ska",
    "Soul", "Soundtrack",
]
_key_levels = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G"]
_time_signature_levels = ["1/4", "3/4", "4/4"]

features = (
    _audio_features
    + ["genre_" + level for level in _genre_levels]
    + ["key_" + level for level in _key_levels]
    + ["mode_Major"]
    + ["time_signature_" + level for level in _time_signature_levels]
)

In [30]:
#Dividing the dataset into train (80% of the dataset) and test (20% of the dataset)
training = spotify.sample(frac = 0.80, random_state = 0)
# Every row that was not sampled into the training set forms the hold-out test set
testing = spotify.drop(training.index)
X_test = testing[features]
# Labels for the hold-out set, so the final model can actually be scored on it
y_test = testing['popularity']

# The training portion is split again: 80% to fit the models and 20% for validation
# (i.e. 64% / 16% of the full dataset), with a fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    training[features],
    training['popularity'],
    test_size = 0.20,
    random_state = 0,
)

In [31]:
#Logistic Regression
# Fit on the training split, predict the validation split and report the accuracy
LR_Model = LogisticRegression().fit(X_train, y_train)
LR_Predict = LR_Model.predict(X_valid)
LR_Accuracy = accuracy_score(y_true=y_valid, y_pred=LR_Predict)
print("Accuracy: {}".format(LR_Accuracy))

In [32]:
#Random Forest
# random_state fixes the forest's internal randomness so the reported accuracy is the
# same on every run (0 matches the seed used for the data split).
RFC_Model = RandomForestClassifier(random_state=0)
RFC_Model.fit(X_train, y_train)
# Score on the validation split
RFC_Predict = RFC_Model.predict(X_valid)
RFC_Accuracy = accuracy_score(y_valid, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

In [33]:
#K-Nearest Neighbors
# Fit on the training split, predict the validation split and report the accuracy
KNN_Model = KNeighborsClassifier().fit(X_train, y_train)
KNN_Predict = KNN_Model.predict(X_valid)
KNN_Accuracy = accuracy_score(y_true=y_valid, y_pred=KNN_Predict)
print("Accuracy: {}".format(KNN_Accuracy))


In [34]:
#Decision Tree
# random_state fixes the tree's internal randomness so the reported accuracy is the
# same on every run (0 matches the seed used for the data split).
DT_Model = DecisionTreeClassifier(random_state=0)
DT_Model.fit(X_train, y_train)
# Score on the validation split
DT_Predict = DT_Model.predict(X_valid)
DT_Accuracy = accuracy_score(y_valid, DT_Predict)
print("Accuracy: " + str(DT_Accuracy))

In [35]:
# Collect the validation accuracy of every model in one table (one row per model)
_accuracy_by_model = {
    'LogisticRegression': LR_Accuracy,
    'RandomForestClassifier': RFC_Accuracy,
    'KNeighborsClassifier': KNN_Accuracy,
    'DecisionTreeClassifier': DT_Accuracy,
}
model_performance_accuracy = pd.DataFrame({
    'Model': list(_accuracy_by_model.keys()),
    'Accuracy': list(_accuracy_by_model.values()),
})

In [36]:
# Rank the models from most to least accurate
model_performance_accuracy.sort_values("Accuracy", ascending=False)

In [37]:
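#Conclusion
#The Logistic Regression model performed best and was able to predict popularity with 87.89% accuracy on the validation set. Note that only about 20% of the songs are labelled popular, so a model that always predicts "not popular" would already be roughly 80% accurate; the result should be judged against that baseline.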
