In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used on purpose: a short pattern such as "max_columns" is resolved by
# pattern matching and raises OptionError as soon as it matches more than one
# registered option.
pd.set_option("display.max_columns", None)

In [None]:
# Load the tracks table from the Kaggle input directory into a DataFrame.
tracks = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data.csv")

In [None]:
# Preview the first rows.
tracks.head()

In [None]:
# Column dtypes and non-null counts.
tracks.info()

In [None]:
# Bucket every release year into its decade (e.g. 1987 -> 1980) with
# vectorised arithmetic instead of a per-row Python lambda.
tracks['decade'] = tracks['year'] - tracks['year'] % 10

# The seaborn style has to be active *before* the axes are created; setting it
# afterwards leaves this first figure in the default matplotlib look.
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(18, 7))
ax = sns.distplot(tracks['decade'], ax=ax)
ax.set_title("Distribution of tracks by decade (all years)")

In [None]:
# Keep only the tracks released from 1950 up to, but not including, 2000.
released_1950_to_1999 = tracks['year'].ge(1950) & tracks['year'].lt(2000)
tracks_sample = tracks.loc[released_1950_to_1999]

# Distribution of the decade column for that sample.
f, ax = plt.subplots(figsize=(18, 7))
sns.set_style('darkgrid')
ax = sns.distplot(tracks_sample['decade'])

In [None]:
# Columns left out of the feature table that is summarised and scaled later.
excluded_columns = ['year', 'key', 'artists', 'release_date', 'name', 'explicit', 'mode', 'id']
tracks_eda = tracks_sample.drop(columns=excluded_columns)
tracks_eda.describe()

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

df = tracks_sample

# One panel per flag column; each bar is the share (in percent) of the sample's
# tracks that carry that value.
fig, ax = plt.subplots(1, 2, figsize=(16, 5))
for axis, column in zip(ax, ("mode", "explicit")):
    sns.barplot(x=column, y=column, data=df,
                estimator=lambda x: len(x) / len(df) * 100, ax=axis)
    # The bar height is a percentage, not the raw column value, so label it as such.
    axis.set_ylabel("% of tracks")
    axis.set_title("Share of tracks by '{}'".format(column))

# plt.show() rather than fig.show(): with the notebook's non-GUI inline backend
# Figure.show() cannot display anything and only emits a warning.
plt.show()

In [None]:
# Number of tracks per distinct value of the 'key' column in the sample.
tracks_sample['key'].value_counts()

In [None]:
# Number of tracks per distinct value of the 'artists' column.
tracks_sample['artists'].value_counts()

In [None]:
# dtypes and non-null counts of the feature table.
tracks_eda.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range.
# The row index and the column names are taken from tracks_eda itself:
#  * tracks_eda keeps the row labels of the filtered tracks table, so a frame
#    with a fresh 0..n-1 index would be paired with the wrong rows by any
#    index-based merge against tracks_eda;
#  * naming columns from the input avoids a position -> name table that breaks
#    silently when the column order changes.
tracks_eda_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(tracks_eda),
    index=tracks_eda.index,
    columns=tracks_eda.columns,
)

In [None]:
# Box plot of every scaled feature on one shared axis; pd.melt reshapes the
# frame to long format (columns "variable" and "value") for seaborn.
plt.figure(figsize=(15, 8))
scaled_long = pd.melt(tracks_eda_scaled)
ax = sns.boxplot(data=scaled_long, x="variable", y="value")

In [None]:
# Put the original and the scaled feature values side by side in one table.
# NOTE(review): this is an index-on-index left join, so rows are only paired
# correctly if tracks_eda and tracks_eda_scaled carry the same index labels - confirm.
# Column names present on both sides get pandas' default _x / _y suffixes.
tracks_scaled_w_orig = pd.merge(tracks_eda,tracks_eda_scaled,how = 'left',left_index = True, right_index = True)
tracks_scaled_w_orig.head()

In [None]:
# Write the combined table to the working directory.
tracks_scaled_w_orig.to_csv('tracks_scaled_w_orig.csv')

In [None]:
# Long-format version of the combined table (pd.melt with default arguments),
# also written to the working directory.
tracks_scaled_w_orig_melt=pd.melt(tracks_scaled_w_orig)
tracks_scaled_w_orig_melt.to_csv('tracks_scaled_w_orig_melt.csv')

In [None]:
corr=tracks_eda_scaled.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(10, 8))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
tracks_eda_scaled.head()

In [None]:
# Translate the min-max scaled decade back into a readable label.
# Scaled decade value -> decade label.
scaled_decade_to_label = {0.00: '1950', 0.25: '1960', 0.50: '1970', 0.75: '1980', 1.00: '1990'}

# The scaled values are the result of floating-point arithmetic, so they are
# matched with np.isclose instead of an exact == comparison.
decade_conditions = [
    np.isclose(tracks_eda_scaled['decade'], scaled_value)
    for scaled_value in scaled_decade_to_label
]

# Labels to assign, in the same order as the conditions.
decade_values = list(scaled_decade_to_label.values())

# Create the new column with np.select. default='0' spells out the fallback
# for rows that match no condition as a string: np.select's implicit default
# is the integer 0, which has no common dtype with string choices.
tracks_eda_scaled['decade2'] = np.select(decade_conditions, decade_values, default='0')
tracks_eda_scaled.info()

In [None]:
# Median of every numeric feature per (scaled) decade, computed once.
# Only numeric columns are aggregated: string columns such as 'decade2' have
# no quantile and make the aggregation fail on recent pandas.
decade_medians = (
    tracks_eda_scaled
    .select_dtypes(include='number')
    .groupby('decade')
    .quantile(0.50)
)
columns = decade_medians.columns  # name kept for cells that still reference it

fig, ax = plt.subplots(figsize=(20, 5))
# NOTE(review): iloc[:, 1:20] skips the first feature column; kept exactly as
# before - confirm that leaving that column out is intended.
sns.heatmap(decade_medians.iloc[:, 1:20], ax=ax)
plt.ylabel("Release Decade")
# quantile(0.50) is the median, so the axis is labelled accordingly.
plt.xlabel("Features (Median)")
plt.show()

In [None]:
def boxplot(query, y):
    """Show a box plot of column ``y`` per decade for a subset of ``tracks``.

    Parameters
    ----------
    query : str
        Expression handed to ``tracks.query`` to select the rows to plot.
    y : str
        Name of the column drawn on the y axis.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    f, ax = plt.subplots(figsize=(10, 5))
    selected_rows = tracks.query(query)
    ax = sns.boxplot(data=selected_rows, x="decade", y=y)
    # Tilt the decade labels so that neighbouring ticks do not overlap.
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    plt.show()
    
boxplot('year >= 1950 & year <= 2000', "acousticness")

In [None]:
boxplot('year >= 1950 & year <= 2000', "loudness")

In [None]:
boxplot('year >= 1950 & year <= 2000', "energy")

In [None]:
boxplot('year >= 1950 & year <= 2000', "danceability")

In [None]:
f, ax = plt.subplots(figsize=(7, 3))
ax=sns.set_style('darkgrid')
ax=sns.distplot(tracks_eda_scaled['acousticness'])

In [None]:
print("25th percentile: "+tracks_eda_scaled.acousticness.quantile(0.25).astype(str)) 
print("50th percentile: "+tracks_eda_scaled.acousticness.quantile(0.50).astype(str)) 
print("75th percentile: "+tracks_eda_scaled.acousticness.quantile(0.75).astype(str)) 

In [None]:
# Bin scaled acousticness into four levels, '1' (lowest) to '4' (highest).
# NOTE(review): the cut points look like the 25th/50th/75th percentiles of this
# column rounded to three decimals - confirm against the printed quartiles.
ac_q1, ac_q2, ac_q3 = .122, .520, .842
acousticness = tracks_eda_scaled['acousticness']

# One condition per level, from lowest to highest.
ac_conditions = [
    acousticness <= ac_q1,
    (acousticness > ac_q1) & (acousticness <= ac_q2),
    (acousticness > ac_q2) & (acousticness <= ac_q3),
    acousticness > ac_q3,
]

# Level assigned for each condition, in the same order.
ac_values = ['1', '2', '3', '4']

# default='0' spells out the fallback for rows that match no condition (for
# example missing values) as a string: np.select's implicit default is the
# integer 0, which has no common dtype with string choices.
tracks_eda_scaled['ac'] = np.select(ac_conditions, ac_values, default='0')

# display updated DataFrame
tracks_eda_scaled.head()

In [None]:
f, ax = plt.subplots(figsize=(7, 3))
ax=sns.set_style('darkgrid')
ax=sns.distplot(tracks_eda_scaled['energy'])

In [None]:
print("25th percentile: "+tracks_eda_scaled.energy.quantile(0.25).astype(str)) 
print("50th percentile: "+tracks_eda_scaled.energy.quantile(0.50).astype(str)) 
print("75th percentile: "+tracks_eda_scaled.energy.quantile(0.75).astype(str)) 

In [None]:
# Bin scaled energy into four levels, '1' (lowest) to '4' (highest).
# NOTE(review): the cut points look like the 25th/50th/75th percentiles of this
# column rounded to three decimals - confirm against the printed quartiles.
en_q1, en_q2, en_q3 = .272, .472, .691
energy = tracks_eda_scaled['energy']

# One condition per level, from lowest to highest.
en_conditions = [
    energy <= en_q1,
    (energy > en_q1) & (energy <= en_q2),
    (energy > en_q2) & (energy <= en_q3),
    energy > en_q3,
]

# Level assigned for each condition, in the same order.
en_values = ['1', '2', '3', '4']

# default='0' spells out the fallback for rows that match no condition (for
# example missing values) as a string: np.select's implicit default is the
# integer 0, which has no common dtype with string choices.
tracks_eda_scaled['en'] = np.select(en_conditions, en_values, default='0')

# display updated DataFrame
tracks_eda_scaled.head()

In [None]:
f, ax = plt.subplots(figsize=(7, 3))
ax=sns.set_style('darkgrid')
ax=sns.distplot(tracks_eda_scaled['loudness'])

In [None]:
print("25th percentile: "+tracks_eda_scaled.loudness.quantile(0.25).astype(str)) 
print("50th percentile: "+tracks_eda_scaled.loudness.quantile(0.50).astype(str)) 
print("75th percentile: "+tracks_eda_scaled.loudness.quantile(0.75).astype(str)) 

In [None]:
# Bin scaled loudness into four levels, '1' (lowest) to '4' (highest).
# NOTE(review): the cut points look like the 25th/50th/75th percentiles of this
# column rounded to three decimals - confirm against the printed quartiles.
ld_q1, ld_q2, ld_q3 = .711, .764, .809
loudness = tracks_eda_scaled['loudness']

# One condition per level, from lowest to highest.
ld_conditions = [
    loudness <= ld_q1,
    (loudness > ld_q1) & (loudness <= ld_q2),
    (loudness > ld_q2) & (loudness <= ld_q3),
    loudness > ld_q3,
]

# Level assigned for each condition, in the same order.
ld_values = ['1', '2', '3', '4']

# default='0' spells out the fallback for rows that match no condition (for
# example missing values) as a string: np.select's implicit default is the
# integer 0, which has no common dtype with string choices.
tracks_eda_scaled['ld'] = np.select(ld_conditions, ld_values, default='0')

# display updated DataFrame
tracks_eda_scaled.head()

In [None]:
tracks_eda_scaled['cluster']=tracks_eda_scaled['ac'].astype(str)+tracks_eda_scaled['en'].astype(str)+tracks_eda_scaled['ld'].astype(str)
tracks_eda_scaled.head()

In [None]:
tracks_eda_scaled.to_csv('tracks_w_simple_cluster.csv')

In [None]:
tracks_eda_scaled['cluster'].value_counts()

In [None]:
tracks_clus = pd.DataFrame(tracks_eda_scaled.groupby(['decade', 'cluster']).count().valence)
tracks_clus.head()

In [None]:
tracks_sum = pd.DataFrame(tracks_eda_scaled.groupby(['decade']).count().valence)
tracks_sum.head()

In [None]:
tracks_clus=tracks_clus.reset_index()
tracks_sum=tracks_sum.reset_index()

In [None]:
tracks_clus.info()

In [None]:
tracks_sum.info()

In [None]:
tracks_sum_all = pd.merge(tracks_clus, tracks_sum, on='decade', how='left')
tracks_sum_all.head()

In [None]:
tracks_sum_all['pct_clus']=tracks_sum_all['valence_x']/tracks_sum_all['valence_y']
tracks_sum_all.head()

In [None]:
tracks_sum_all.to_csv('tracks_sum_all.csv')

In [None]:
tracks_max_clus = pd.DataFrame(tracks_sum_all.groupby(['decade']).max().pct_clus)
tracks_max_clus=tracks_max_clus.reset_index()
tracks_max_clus.info()

In [None]:
tracks_max_clus

In [None]:
tracks_max_clus2 = pd.merge(tracks_sum_all, tracks_max_clus, on='pct_clus', how='inner')
tracks_max_clus2

In [None]:
#clean it up
# create a list of our conditions
decade_conditions = [
    (tracks_max_clus2['decade_x'] == 0.00),
    (tracks_max_clus2['decade_x'] == .25),
    (tracks_max_clus2['decade_x'] == .50),
    (tracks_max_clus2['decade_x'] == .75),
    (tracks_max_clus2['decade_x'] == 1.00)
    ]

# create a list of the values we want to assign for each conditionen = ['1', '2', '3', '4']
decade_values = ['50s', '60s', '70s', '80s', '90s']

# create a new column and use np.select to assign values to it using our lists as arguments
tracks_max_clus2['decade'] = np.select(decade_conditions, decade_values)
tracks_max_clus2['cluster_count'] = tracks_max_clus2['valence_x']
tracks_max_clus2['total_decade_song_count'] = tracks_max_clus2['valence_y']

#drop all of the extra columns
tracks_max_clus2=tracks_max_clus2.drop(columns=['decade_x', 'decade_y', 'valence_x', 'valence_y'])

# display updated DataFrame
tracks_max_clus2 = tracks_max_clus2[['decade', 'cluster', 'cluster_count', 'total_decade_song_count', 'pct_clus']]
tracks_max_clus2

In [None]:
#name the clusters
#create a list of our conditions
cname_conditions = [
    (tracks_max_clus2['cluster'] == '411'),
    (tracks_max_clus2['cluster'] == '144')
    
    ]

#create a list of the values we want to assign for each conditionen = ['1', '2', '3', '4']
cname_values = ['Acoustic,Quiet,Low Energy', 'Not Acoustic, Loud, High Energy']

#create a new column and use np.select to assign values to it using our lists as arguments
tracks_max_clus2['cluster_name'] = np.select(cname_conditions, cname_values)
tracks_max_clus2 = tracks_max_clus2[['decade', 'cluster', 'cluster_name','cluster_count', 'total_decade_song_count', 'pct_clus']]
tracks_max_clus2

In [None]:
tracks_max_clus2.to_csv('tracks_largest_cluster.csv')

In [None]:
tracks_eda_scaled.head()

In [None]:
!pip install kneed
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

kmeans_kwargs = {"init": "k-means++","n_init": 10,"max_iter": 300,"random_state": 0}

# A list holds the SSE values for each k
sse = []
for k in range(1, 11):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(tracks_eda_scaled.drop(columns=['decade', 'popularity']))
        sse.append(kmeans.inertia_)

In [None]:
kmeans.labels_[0:10]

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
kl = KneeLocator(range(1, 11), sse, curve="convex", direction="decreasing")

kl.elbow

In [None]:
# set number of clusters
kclusters = 3

# run k-means clustering
kmeans = KMeans(init="k-means++",n_clusters=kclusters,n_init=10,max_iter=300,random_state=0).fit(tracks_eda_scaled.drop(columns=['decade', 'popularity']))

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
#tracks_eda_scaled.drop(columns=['Kmeans'], inplace=True)
#tracks_eda_scaled.info()

In [None]:
#kmeans.labels_[0:10]

In [None]:
# add clustering labels
tracks_eda_scaled.insert(0, 'Kmeans', kmeans.labels_)

In [None]:
tracks_eda_scaled.head()

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2).fit(tracks_eda_scaled.drop(columns=['decade', 'Kmeans', 'popularity']))

In [None]:
x_pca = pca.transform(tracks_eda_scaled.drop(columns=['decade', 'Kmeans', 'popularity']))
print(tracks_eda_scaled.drop(columns=['decade', 'Kmeans']).shape, x_pca.shape)

In [None]:
percent = pca.explained_variance_ratio_
print(percent)
print(sum(percent))

In [None]:
def pca_explained(X, threshold):
    '''
    prints optimal principal components based on threshold of PCA's explained variance

    Parameters
    ----------
    X : dataframe or array
        of features
    threshold : float < 1
        percentage of explained variance as cut off point
    '''

    # find total no. of features
    features = X.shape[1]
    # iterate till total no. of features,
    # and find total explained variance for each principal component
    for i in range(2, features):
        pca = PCA(n_components = i).fit(X)
        sum_ = pca.explained_variance_ratio_
        # add all components explained variances
        percent = sum(sum_)
        print('{} components at {:.2f}% explained variance'.format(i, percent*100))
        if percent > threshold:
            break

pca_explained(tracks_eda_scaled.drop(columns=['decade', 'Kmeans', 'popularity']), 0.85)
# 2 components at 61.64% explained variance
# 3 components at 77.41% explained variance
# 4 components at 86.63% explained variance

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0], x_pca[:,1], c=tracks_eda_scaled['Kmeans'], cmap='plasma', alpha=0.4, edgecolors='black', s=65);
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')

In [None]:
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')
feature_names = list(tracks_eda_scaled.drop(columns=['decade', 'Kmeans', 'popularity']).columns)

plt.gca().set_xticks(np.arange(-.5, len(feature_names)));
plt.gca().set_yticks(np.arange(0.5, 2));
plt.gca().set_xticklabels(columns, rotation=90, ha='left', fontsize=12);
plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12);

plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0,
                                              pca.components_.max()], pad=0.65);

In [None]:
# Binary target: 1 for rows labelled '1990' in decade2, 0 for every other row.
is_1990s = tracks_eda_scaled['decade2'] == '1990'
tracks_eda_scaled['target'] = is_1990s.astype(int)

# The modelling table leaves out both decade columns.
decade_columns = ['decade', 'decade2']
tracks_eda_scaled_modeling = tracks_eda_scaled.drop(columns=decade_columns)
tracks_eda_scaled_modeling.head()


In [None]:
#tracks_eda_scaled_modeling.groupby(['decade2']).describe()

In [None]:
# Write the frame, including the 'target' column, to the working directory.
tracks_eda_scaled.to_csv('check_target_tags2.csv')

In [None]:
#tracks_eda_scaled_modeling=tracks_eda_scaled.drop(columns=['decade2'])
# Convert every column to a numeric dtype (pd.to_numeric is applied to each
# column in turn), then list the resulting dtypes.
tracks_eda_scaled_modeling = tracks_eda_scaled_modeling.apply(pd.to_numeric)
tracks_eda_scaled_modeling.info()

In [None]:
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
# split data into X and y
X = tracks_eda_scaled_modeling.drop(columns=['target', 'popularity'])
Y = tracks_eda_scaled_modeling.drop(columns=['acousticness','Kmeans', 'valence', 'loudness', 'danceability', 'energy','duration_ms', 'instrumentalness', 'liveness','popularity', 'speechiness', 'tempo', 'ac', 'en', 'ld', 'cluster'])

In [None]:
#Y=np.ravel(Y)

In [None]:
print("Total records:" + str(len(X)))

In [None]:
print("% True:" + np.mean(Y).astype(str))

In [None]:
# split data into train and test sets
seed = 7
test_size = 0.70
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
# fit model on training data
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)

In [None]:
# make predictions for test data
y_hats = model.predict(X_test)
predictions = [round(value) for value in y_hats]
predictions[0:10]

In [None]:
y_hats_df=pd.DataFrame(y_hats, columns = ['y_hats'])
y_hats_df.head()

In [None]:
preds = model.predict_proba(X_test)
pred_df = pd.DataFrame(preds).rename(columns={0:'prob_no', 1:'prob_yes'})
pred_df.loc[(check2[1]>0),:].head()

In [None]:
pred_merge = pd.merge(pred_df, y_hats_df, left_index = True, right_index = True)
pred_merge.head()

In [None]:
y_hats_df = pd.DataFrame(data = y_hats, columns = ['y_hats'], index = X_test.index.copy())
df_out_test = pd.merge(tracks_eda_scaled_modeling, pred_merge, left_index = True, right_index = True)
df_out_test.to_csv('testing_y_hats_merge.csv')

In [None]:
df_out_test2 = pd.merge(df_out_test, tracks_eda_scaled, left_index = True, right_index = True)
df_out_test2.to_csv('testing_y_hats_merge2.csv')

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
plot_importance(model)
pyplot.show()

In [None]:
# define data_dmatrix
# All rows of X with Y as the label, wrapped in xgboost's DMatrix container.
data_dmatrix = xgb.DMatrix(data=X,label=Y)

from xgboost import cv

# Booster parameters used for the cross-validation below.
params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

# 3-fold cross-validation with up to 50 boosting rounds, early stopping set to
# 10 rounds, AUC as the metric; as_pandas=True returns the results as a DataFrame.
xgb_cv = cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)

In [None]:
# First rows of the cross-validation results.
xgb_cv.head()

In [None]:
import matplotlib.pyplot as plt
# Train a 10-round booster on the full DMatrix, only to visualise a tree.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)
# The figure size must be fixed before the tree is drawn: changing
# rcParams['figure.figsize'] afterwards has no effect on an existing figure.
# The axes are created first and handed to plot_tree.
fig, ax = plt.subplots(figsize=(25, 5))
xgb.plot_tree(xg_reg, num_trees=0, rankdir='LR', ax=ax)
plt.show()