In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
pd.plotting.register_matplotlib_converters()
%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import chardet
# Reading the file as UTF-8 did not work, so chardet guesses the encoding from the first 10000 bytes.
top10s_path = "../input/top-spotify-songs-from-20102019-by-year/top10s.csv"
with open(top10s_path, 'rb') as raw_data:
    sample = raw_data.read(10000)
    result = chardet.detect(sample)
# `result` is not printed: the encoding was already found, so it is kept out of the output.

# chardet guessed Windows-1252; load the file with that encoding.
sp_data = pd.read_csv(top10s_path, encoding='Windows-1252')

# Look at the first 10 rows of data.
sp_data.head(10)

In [None]:
# Count the missing values in every column.
missing_values_count = sp_data.isnull().sum()

# Show the counts for the first 15 columns. Only the last expression of a cell is
# rendered, so a summary in the middle of the cell has to be printed explicitly.
print(missing_values_count[0:15])

# Earlier work on this data set showed that 'pop' contains some 0 values, which do not
# make sense. Count the rows per popularity value to find them.
# Bracket indexing is used because `pop` is also the name of a DataFrame method.
pop_0 = sp_data.groupby('pop')['pop'].count()
print(pop_0)

# Rows whose popularity is exactly 0.
zero_pop = sp_data.loc[sp_data['pop'] == 0]
zero_pop


In [None]:
# The rows with a popularity of 0 look as if the value was simply left off, so they are dropped by
# building a new dataframe without them. Rows from 2019 are removed as well: that is the year
# predicted with the new data set, and keeping it would leak data.
keep_rows = (sp_data['year'] != 2019) & (sp_data['pop'] != 0)
sp_clean = sp_data.loc[keep_rows].copy()
print(sp_clean.groupby('pop')['pop'].count())

# Look at the remaining low popularity values: is there a reason why some are so low?
sp_clean.loc[sp_clean['pop'] <= 30].head(200)

In [None]:
# There are still some low values, but are they outside of the norm?
# Only the last expression of a cell is rendered, so the summaries are printed explicitly.
print(sp_clean['pop'].describe())
print(sp_clean['top genre'].unique())
# Draw a boxplot of the popularity, with the individual points overlaid, to spot outliers.
sns.boxplot(x=sp_clean['pop'])
sns.stripplot(x=sp_clean['pop'], color='black', alpha=0.3)

In [None]:
# The boxplot shows some outliers. Compute the quartiles and the interquartile range of 'pop'
# on a copy of the cleaned data so they can be located and removed.
sp_out = sp_clean.copy()
Q1, Q3 = (sp_out['pop'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(IQR)

In [None]:
# Keep only the rows whose popularity lies within 1.5 * IQR of the quartiles (both bounds
# inclusive); everything outside those fences is treated as an outlier and removed.
# The mask is deliberately not called `filter`, which would shadow the Python builtin.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
pop_in_range = sp_out['pop'].between(lower_fence, upper_fence)
sp_out = sp_out.loc[pop_in_range]
print(sp_out.groupby('pop')['pop'].count())


In [None]:
# Work on a fresh copy of the outlier-free data, without the unnamed column.
is_unnamed = sp_out.columns.str.contains('^Unnamed')
sp = sp_out.copy().loc[:, ~is_unnamed]

# Break the data into numerical and categorical frames; 'pop' belongs to neither.
non_numeric = ['artist', 'title', 'top genre', 'pop', 'year']
non_categorical = ['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop']
sp_num = sp.drop(columns=non_numeric)
sp_cat = sp.drop(columns=non_categorical)

# Prediction variable.
y = sp['pop']


In [None]:
# Plot the numerical data to check the distributions.
fig, axarr = plt.subplots(2, 5, figsize=(50, 15), squeeze=False)

# Fill the 2x5 grid row by row, one panel per numerical column.
for ax, col in zip(axarr.flat, sp_num.columns):
    sns.distplot(a=sp_num[col], kde=True, ax=ax)
    ax.set_title("Distribution", fontsize=20)
    ax.set_xlabel(col, fontsize=20)
    ax.set_ylabel('Frequency', fontsize=20)

plt.show()

In [None]:
#lets standardize some of our columns
from scipy import stats


#for min_max scaling
from mlxtend.preprocessing import minmax_scaling
fig, axarr = plt.subplots(2, 5, figsize=(50, 15), squeeze=False)
# Panels are handed out in row-major order, one per plotted column.
panels = iter(axarr.flat)
sp_num_scal = sp_num.copy()
for col in sp_num.columns:
    # 'dB' is left unscaled (and so would 'year' be, if it were present).
    if col in ('year', 'dB'):
        continue
    # Min-max scale the column to [0, 1] with plain pandas arithmetic. This avoids handing a
    # 1-D Series to minmax_scaling, which is documented for DataFrames and 2-D arrays.
    col_min = sp_num[col].min()
    col_max = sp_num[col].max()
    sp_num_s = (sp_num[col].astype(float) - col_min) / (col_max - col_min)

    ax = next(panels)
    sns.distplot(a=sp_num_s, kde=True, ax=ax)
    ax.set_title("Distribution", fontsize=20)
    ax.set_xlabel(col, fontsize=20)
    ax.set_ylabel('Frequency', fontsize=20)
    sp_num_scal[col] = sp_num_s

# 'dB' is plotted unscaled in the next free panel instead of a hard-coded grid position.
sns.distplot(a=sp_num['dB'], kde=True, ax=next(panels))
        
        






In [None]:
# Normalize the numerical columns with a Box-Cox transformation and replot the distributions.
fig, axarr = plt.subplots(2, 5, figsize=(50, 15), squeeze=False)
sp_num_norm = sp_num.copy()
boxcox_cols = [name for name in sp_num.columns if name not in ('year', 'dB')]
for ax, col in zip(axarr.flat, boxcox_cols):
    # Every value is shifted by 1 before the transformation (presumably so zeros stay valid
    # input for Box-Cox). boxcox returns the transformed values first; only those are kept.
    sp_num_n = stats.boxcox(sp_num[col] + 1)[0]
    sns.distplot(a=sp_num_n, kde=True, ax=ax)
    ax.set_title("Distribution", fontsize=20)
    ax.set_xlabel(col, fontsize=20)
    ax.set_ylabel('Frequency', fontsize=20)
    sp_num_norm[col] = sp_num_n

# 'dB' is plotted untransformed in the fixed panel axarr[1][3].
sns.distplot(a=sp_num['dB'], kde=True, ax=axarr[1][3])


In [None]:
# Convert the year column into a string value so it can be handled as a category.
sp_cat['year'] = sp_cat['year'].apply(str)
sp['year'] = sp['year'].apply(str)

# Join the categorical columns with the scaled and with the normalized numerical columns,
# giving one data frame per preprocessing variant.
sp_scaled = sp_cat.join(sp_num_scal)
# Copy taken before any label encoding, so the original genre strings stay available.
sp_scaled2 = sp_scaled.copy()
sp_norm = sp_cat.join(sp_num_norm)
# Copy instead of alias: sp_original is label-encoded in place later on, and that
# must not rewrite sp as a side effect.
sp_original = sp.copy()


#create a function that encodes the categorical label
from sklearn.preprocessing import LabelEncoder

def label_encode(df,col):
    """Replace categorical columns of ``df`` with integer label codes, in place.

    Every column is encoded independently with its own scikit-learn
    ``LabelEncoder``; the encoders themselves are not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The listed columns are overwritten; nothing is returned.
    col : str or list of str
        Name of the column to encode, or a list of such names.
    """
    from sklearn.preprocessing import LabelEncoder

    # A single name is treated as a one-element list; the previous
    # DataFrame.apply-based code only handled a list of names.
    columns = [col] if isinstance(col, str) else list(col)
    for name in columns:
        df[name] = LabelEncoder().fit_transform(df[name])

    

# Columns that hold categorical labels.
sp_cat_names = ['title', 'artist', 'top genre', 'year']

# Label-encode those columns, in place, in each of the three data sets.
for frame in (sp_scaled, sp_norm, sp_original):
    label_encode(frame, sp_cat_names)

# Feature matrices: title and artist are not used as predictors, and the original
# data additionally still carries the target column 'pop'.
not_features = ['title', 'artist']
X_scal = sp_scaled.drop(columns=not_features)
X_norm = sp_norm.drop(columns=not_features)
X_orig = sp_original.drop(columns=not_features + ['pop'])

# Save the encoding info for top genre: the encoder is fitted on the copy that was
# taken before encoding.
encoder = LabelEncoder()
encoder.fit(sp_scaled2['top genre'])
encoder.transform(sp_scaled2['top genre'])  # the returned codes are not stored
# Target divided by 100.
y_fin = y / 100


In [None]:
#Time to separate the data into training and validation sets and to build the models, using a function that does both

def model_creation(X,y,data):
    """Fit and score a random forest, a decision tree and an XGBoost regressor.

    The data is split 80/20 (``random_state=0``) into a training and a validation
    part. Each model is fitted on the training part and scored twice: mean absolute
    error (MAE) on the validation part, and the mean MAE of a 5-fold cross-validation
    over all of ``X``/``y``. Both numbers are printed multiplied by 100.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (the notebook passes DataFrames).
    y : target values aligned with ``X``.
    data : str
        Label put at the start of every printed line.

    Returns
    -------
    The ``RandomForestRegressor`` fitted on the training part.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from xgboost import XGBRegressor

    #data splitting
    train_X, val_X, train_y, val_y = train_test_split(X, y, train_size = .8, test_size = .2,random_state = 0)

    #model creation
    model_RF = RandomForestRegressor(n_estimators = 100,max_leaf_nodes = 3, random_state = 1)
    model_DT = DecisionTreeRegressor(max_leaf_nodes = 2,random_state = 1)
    model_XGB = XGBRegressor(n_estimators = 500, learning_rate = 0.05, n_jobs = 6)

    #model fitting
    model_RF.fit(train_X, train_y)
    model_DT.fit(train_X, train_y)
    # NOTE(review): early stopping watches the same validation split that the MAE below
    # is reported on, so the XGBoost validation MAE is likely optimistic.
    model_XGB.fit(train_X, train_y,
                  early_stopping_rounds = 5,
                  eval_set = [(val_X,val_y)],
                  verbose = False)

    # (printed name, fitted model, text appended after the scores)
    reports = [
        ("Random Forest Regression Model", model_RF, ""),
        ("Decision Tree Model", model_DT, ""),
        ("Gradient Boost Model", model_XGB, "\n"),
    ]
    for label, model, tail in reports:
        val_mae = mean_absolute_error(val_y, model.predict(val_X))
        # cross_val_score refits clones of the estimator, so the fitted model is left
        # untouched; the scorer returns negated MAE, hence the sign flip.
        cv_mae = -1 * cross_val_score(model, X, y,
                                      cv=5,
                                      scoring='neg_mean_absolute_error')
        print(data + " Model - " + label + ": MAE = " + str(val_mae*100) + "\nAverage Cross Validation MAE Score: " + str(cv_mae.mean()*100) + tail)
    return model_RF
    
def KNN_model(X, y, data):
    """Cross-validate KNN classifiers for k = 3, 5, ..., 13 and print the best k.

    For every k a ``KNeighborsClassifier`` is scored with a 5-fold cross-validation
    using mean absolute error (MAE). The k with the lowest mean MAE is reported.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : class labels aligned with ``X``.
    data : str
        Label put at the start of the printed line.

    Returns
    -------
    None. The result is only printed.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier

    k_best = None
    score_mean_best = None
    for k in range(3, 15, 2):
        model_KNN = KNeighborsClassifier(n_neighbors = k)
        # The scorer returns negated MAE, hence the sign flip.
        scores = -1 * cross_val_score(model_KNN, X, y,
                                      cv = 5,
                                      scoring = 'neg_mean_absolute_error')
        # NOTE(review): the factor 100 mirrors the regression reports, where the target
        # is popularity / 100; for class labels it only rescales the printed number.
        score_mean = scores.mean()*100
        # MAE is an error, so the best k is the one with the LOWEST score. The earlier
        # comparison kept the highest MAE and reported the worst k as the best.
        if score_mean_best is None or score_mean < score_mean_best:
            k_best = k
            score_mean_best = score_mean

    print(data + " Model - KNN: K = " + str(k_best) + " Average Cross Validation MAE Score: " + str(score_mean_best))











    





In [None]:
#Call the function and create 3 models for each data set
# Every call gets y_fin (the target divided by 100) and a label for the printed report.
model_creation(X_orig,y_fin,'Original')
# The random forest returned for the scaled data is kept; it is reused for the final predictions.
model_RF = model_creation(X_scal,y_fin,'Scaled')
model_creation(X_norm,y_fin,'Normalized')


In [None]:
#Let's try KNN modeling and create categories for pop: 90 and above is tier 1, 80 to 89 is tier 2, and so on down to tier 6 for anything under 50
from matplotlib import pyplot as plt
pd.plotting.register_matplotlib_converters()
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
    



# Attach the rescaled popularity (y_fin) to a copy of the scaled data. Moving 'pop' into the
# index and back puts it in the first column and leaves a fresh 0..n-1 row index.
sp_scaled_knn = sp_scaled.copy().join(y_fin).set_index('pop').reset_index()

# Assign the tiers for the whole column at once instead of looping over a hard-coded number
# of rows. The conditions are checked in order, so each row gets the first tier whose floor
# it reaches; anything below the last floor falls into tier 6.
tier_floors = [.90, .80, .70, .60, .50]
pop_scaled = sp_scaled_knn['pop']
sp_scaled_knn['tier'] = np.select(
    [pop_scaled >= floor for floor in tier_floors],
    [1, 2, 3, 4, 5],
    default=6,
)

y_tier = sp_scaled_knn['tier'].apply(int)
X = sp_scaled_knn.drop(columns = ['pop','title','artist','tier','year','dB'])
# NOTE(review): the label-encoded genre is divided by 100, presumably to bring it closer to
# the range of the min-max scaled features - confirm.
X['top genre'] = X['top genre']/100

KNN_model(X,y_tier,'Scaled')


#dive deeper into this later
        

        

   
    

In [None]:
def nn_model(X,y, data, y_test = None):
    """Train four dense networks of increasing width and return the best one.

    The data is split 80/20 (``random_state=0``). For k = 16, 32, 64 and 128 a
    network with two hidden ReLU layers (``4*k`` and ``2*k`` units, each followed by
    30% dropout) and one linear output unit is trained with Adam on an MAE loss,
    using early stopping that restores the best weights. The model with the lowest
    validation loss is returned; the loss curves of every model that improved on the
    previous best are plotted.

    Parameters
    ----------
    X : 2-D feature matrix (the notebook passes a DataFrame).
    y : target values aligned with ``X``.
    data : str
        Not used by the function.
    y_test : optional
        Not used by the function.

    Returns
    -------
    The Keras model with the lowest validation loss.

    Raises
    ------
    RuntimeError
        If no run produced a usable validation loss.
    """
    from tensorflow import keras
    from tensorflow.keras import layers, callbacks
    from sklearn.model_selection import train_test_split

    train_X, val_X, train_y, val_y = train_test_split(X, y, train_size = .8, test_size = .2,random_state = 0)
    # Size the input layer from the data instead of hard-coding the number of features.
    n_features = train_X.shape[1]

    early_stopping = callbacks.EarlyStopping(
        min_delta = .001,
        patience = 10,
        restore_best_weights = True,
    )

    best_model = None
    best_MAE = float('inf')
    best_k = 0
    for i, k in enumerate((16, 32, 64, 128), start = 1):
        model = keras.Sequential([
            layers.Dense(k*4, activation='relu', input_shape=[n_features]),
            layers.Dropout(0.3),
            layers.Dense(k*2, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(1)
        ])

        model.compile(
            optimizer = 'adam',
            loss = 'mae',
        )

        history = model.fit(
            train_X, train_y,
            validation_data = (val_X, val_y),
            batch_size = 512,
            epochs = 500,
            callbacks = [early_stopping],
            verbose = 0) #turn off training log

        history_df = pd.DataFrame(history.history)
        # The loss is MAE, so the smallest validation loss is the best validation MAE of this run.
        MAE = history_df['val_loss'].min()
        print("MAE " + str(MAE) + " K " + str(k) + " i " + str(i))
        if best_MAE > MAE:
            best_MAE = MAE
            best_k = k
            history_df.loc[:,['loss','val_loss']].plot()
            best_model = model

    if best_model is None:
        # Only reachable when every validation loss was NaN or infinite.
        raise RuntimeError("no network produced a usable validation loss")

    print("Minimum validation loss: " + str(best_MAE) + " and best k value was " + str(best_k))
    return best_model
    











In [None]:
#obtain best NN model
# Trained on the scaled features and on y, i.e. the target that was not divided by 100.
model_NN = nn_model(X_scal, y, 'Scaled')


In [None]:
# Load the data set used for the predictions; its first column is renamed to 'ranking'.
top50_raw = pd.read_csv("../input/top50spotify2019/top50.csv", encoding = 'Windows-1252')
top50_raw = top50_raw.rename(columns = {top50_raw.columns[0]: 'ranking'})

# Drop the identifying columns and add the year as the code 9.
# NOTE(review): 9 is presumably the label that follows the label-encoded training years - confirm.
new_data_PredData = top50_raw.drop(columns = ['Track.Name', 'Artist.Name', 'ranking']).assign(year = 9)

# Put the columns in the order of the training features, with the target last. The copy
# makes new_data an independent frame rather than a slice of new_data_PredData.
new_data = new_data_PredData[['Genre',
 'year',
 'Beats.Per.Minute',
 'Energy',
 'Danceability',
 'Loudness..dB..',
 'Liveness',
 'Valence.',
 'Length.',
 'Acousticness..',
 'Speechiness.',
 'Popularity']].copy()

# Take over the training column names by position. Assigning the whole list at once fails
# loudly if the number of training features does not match.
new_data.columns = list(X_scal.columns) + ['pop']

new_data.head()


In [None]:
#preprocessing on new data
num_cols = ['bpm','nrgy','dnce','live','val','dur','acous','spch']
cat_cols = ['top genre', 'year']

# Keep only the songs whose genre occurs in the training data: the fitted encoder cannot
# transform labels it has not seen. The copy makes the assignments below act on an
# independent frame.
unique_sp = list(sp_scaled2['top genre'].unique())
new_data = new_data.loc[new_data['top genre'].isin(unique_sp)].copy()

new_data['top genre'] = encoder.transform(new_data['top genre'])
val_y_2019 = new_data['pop']
new_data = new_data.drop(columns = 'pop')

# Scale with the minimum and maximum of the TRAINING columns (sp_num), so the new data ends
# up on the scale the models were fitted on. Scaling the new data by its own range would
# map the same raw value to a different feature value.
for col in num_cols:
    train_min = sp_num[col].min()
    train_max = sp_num[col].max()
    new_data[col] = (new_data[col].astype(float) - train_min) / (train_max - train_min)

pred_x_NN = model_NN.predict(new_data)
pred_x_RF = model_RF.predict(new_data)
# The network was trained on the raw target; the random forest on the target divided by 100.
print("Model Final - Neural Network: MAE = " + str(mean_absolute_error(val_y_2019,pred_x_NN)))
print("Model Final - Random Forest: MAE = " + str((mean_absolute_error(val_y_2019/100,pred_x_RF))*100))




In [None]:
#Going to attempt to explain model with SHAP values
#import shap  # package used to calculate Shap values

# Create object that can calculate shap values
#explainer = shap.DeepExplainer(model_NN)

# Calculate Shap values
#shap_values = explainer.shap_values(data_for_prediction)


In [None]:
# EDA of the original data: look at the genres of music and see if there is anything informative.
# Start from a new copy of the outlier-free data, without the unnamed column.
sp_EDA = sp_out.copy()
sp_EDA = sp_EDA.loc[:, ~sp_EDA.columns.str.contains('^Unnamed')]

fig, axarr = plt.subplots(3, 3, figsize=(50, 25), squeeze = False)
fig.subplots_adjust(hspace = 0.5)
fig.suptitle('Music Popularity by Year', fontsize = 50)

# One panel per year, filled row by row: number of songs per genre, largest first.
for ax, i in zip(axarr.flat, range(2010, 2019)):
    year_genre = (
        sp_EDA.loc[sp_EDA['year'] == i]
        .groupby('top genre')['pop'].count()
        .to_frame()
        .sort_values(by = 'pop', ascending = False)
        .reset_index()
    )
    # The count axis keeps its own numeric tick labels; overwriting them with the sorted
    # counts would attach wrong numbers to the ticks.
    sns.barplot(y = year_genre['top genre'], x = year_genre['pop'], ax = ax)
    ax.set_title(str(i))
    


In [None]:
# Number of songs per year and genre.
by_year = sp_EDA.groupby(['year','top genre']).count().reset_index()
genres = list(by_year['top genre'].unique())

fig, axarr = plt.subplots(3, 5, figsize=(50,15), squeeze = False)
fig.subplots_adjust(wspace = .5)
fig.suptitle('Music Popularity by Year', fontsize = 50)

# Panels are handed out in row-major order to the genres that qualify.
panels = iter(axarr.flat)
for genre in genres:
    year_plot = by_year.loc[by_year['top genre'] == genre]
    # Only genres with more than 3 rows (years) get a trend line.
    if year_plot['pop'].count() <= 3:
        continue
    ax = next(panels, None)
    if ax is None:
        print("More genres qualify than there are panels; the remaining ones are not plotted.")
        break
    sns.lineplot(x = year_plot['year'], y = year_plot['pop'], ax = ax)
    # Limits and title are set on the panel itself. The pyplot calls used before acted on
    # whichever axes was current, i.e. on the previously drawn panel.
    ax.set_xlim(2010, 2018)
    ax.set_ylim(0, 60)
    ax.set_title("Popularity Trend for " + str(genre))

sp_EDA.head()
            



In [None]:
import plotly.express as px

columns = ['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']

# One polar chart per feature: the mean of that feature for every genre.
for col in columns:
    averages = sp_EDA.groupby('top genre')[col].mean().reset_index()
    fig = px.line_polar(
        r=averages[col],
        theta=list(averages['top genre']),
        line_close=True,
        # Without a title the ten figures cannot be told apart.
        title="Average " + str(col) + " for Each Category",
    )
    fig.update_traces(fill='toself')
    fig.show()

        
        


In [None]:
import plotly.graph_objects as go

# Genres that have at least 12 songs.
genrecount = (
    sp_EDA.groupby('top genre')['top genre'].count()
    .rename('count')
    .reset_index()
)
genrecount = genrecount.loc[genrecount['count'] >= 12]
genres = list(genrecount['top genre'].unique())

# Mean of every numeric column per genre. numeric_only=True keeps the text columns
# (title, artist) out of the mean, which recent pandas versions otherwise reject.
averages = sp_EDA.groupby('top genre').mean(numeric_only=True).reset_index()

# The five genres compared in the radar chart, reduced to the plotted features.
top5_genres = ['barbadian pop','boy band','canadian pop', 'dance pop', 'pop']
top5 = (
    averages.loc[averages['top genre'].isin(top5_genres)]
    .drop(columns = ['year','bpm','dur','dB'])
    .set_index('top genre')
)

categories = genres  # not used by the figure below

fig = go.Figure()

# One filled trace per genre. The trace name is derived from the row label, so a trace
# can never be paired with the wrong genre's values.
for genre, genre_means in top5.iterrows():
    fig.add_trace(go.Scatterpolar(
          r=genre_means,
          theta=top5.columns,
          fill='toself',
          name=genre.title()
    ))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 100]
    )),
  showlegend=True
)

fig.show()

In [None]:
# Inspect the songs whose top genre is 'latin'.
is_latin = sp_EDA['top genre'] == 'latin'
sp_EDA2 = sp_EDA.loc[is_latin]
sp_EDA2