In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt, seaborn as sns

from kmodes.kprototypes import KPrototypes

In [None]:
# The charts CSV is read once; `spotify` and `df` are two independent copies of it.
# `df` is the working frame that the following cells transform.
spotify = pd.read_csv('/kaggle/input/spotify-top-200-charts-20202021/spotify_dataset.csv')
df = spotify.copy()
df.head()

In [None]:
# All the genres available in the data set are extracted.
# The slicing below assumes each 'Genre' cell is a stringified list such as
# "['pop', 'dance pop']" (TODO confirm against the raw CSV): the outer brackets
# are sliced off, the remainder is split on ', ' and the surrounding quote
# character is stripped from each name.
# NOTE: a cell with nothing between its brackets yields the empty string '',
# which is kept as a genre of its own.

# Every genre occurrence in row order, duplicates included
genre = [
    name[1:-1]
    for raw in df['Genre']
    for name in raw[1:-1].split(', ')
]

# Distinct genres in order of first appearance. dict.fromkeys de-duplicates in
# linear time, unlike repeated `not in` checks against a growing list.
new_genre = list(dict.fromkeys(genre))

new_genre[:10]

In [None]:
# Dummies are created for all genres.
# One record (dict) per row maps each of that row's genres to 1. Building the
# frame once from all records avoids growing a DataFrame one cell at a time
# with `.loc`, which is very slow on a frame with many rows and columns.
genre_records = [
    {name[1:-1]: 1 for name in raw[1:-1].split(', ')}
    for raw in df['Genre']
]

# Reusing df.index keeps the indicator rows aligned with df for the concat
# below, whatever the index looks like. Genres absent from a row come out as
# NaN and are filled with 0; the cast keeps every indicator column float64.
new = pd.DataFrame(genre_records, index=df.index).fillna(0).astype('float64')

df = pd.concat([df, new], axis=1)

In [None]:
# Display the working frame (pandas truncates the rendering of large frames)
df

In [None]:
# Highest charting months and years are extracted from 'Week of Highest Charting'

# Only the text before the first '--' is kept and parsed as a date
week_start = pd.to_datetime(
    df['Week of Highest Charting'].map(lambda week: week.split('--')[0])
)

df['Year_Week of Highest Charting'] = week_start.dt.year
df['Month_Week of Highest Charting'] = week_start.dt.month

# The raw text column is no longer needed once year and month are extracted
df = df.drop(columns='Week of Highest Charting')

In [None]:
# Release year is extracted from Release date
# (values that cannot be parsed as dates are coerced to NaT, so their year is missing)
release_dates = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Year'] = release_dates.dt.year
df = df.drop(columns='Release Date')

In [None]:
# Commas are stripped from the 'Streams' strings before the column is converted
# to a numeric dtype
streams_text = df['Streams'].map(lambda value: value.replace(',', ''))
df['Streams'] = pd.to_numeric(streams_text)
df.head()

In [None]:
# Unnecessary columns are dropped
to_drop = [
    'Index',
    'Song Name',
    'Artist',
    'Song ID',
    'Genre',
    'Weeks Charted',
]
df = df.drop(columns=to_drop)
df.head()

In [None]:
# All object type columns are selected which are to be converted to numeric data type.
# Genre indicator columns are excluded first, and 'Chord' is left out of the result.
genre_names = set(new_genre)
org = [name for name in df.columns if name not in genre_names]
object_cols = df[org].select_dtypes(include=['object']).columns
cols = [name for name in object_cols if name != 'Chord']
cols

In [None]:
# 'cols' are converted to numeric data type; values that cannot be parsed become NaN
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in cols}
)

In [None]:
# Dummies are created for categorical variables
# (drop_first=True omits the indicator of the first level of each variable)
cat_var = [
    'Chord',
    'Year_Week of Highest Charting',
    'Month_Week of Highest Charting',
    'Release Year',
]
df = pd.get_dummies(df, columns=cat_var, prefix=cat_var, drop_first=True)
df.head()

In [None]:
# Numerical and categorical columns are selected:
# int64/float64 columns count as numerical, every other column as categorical
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.columns[~df.columns.isin(num_cols)].tolist()

In [None]:
# Violin plot of every column in `cols`, one panel each on a 3x4 grid
plt.figure(figsize=(20, 12))
for position, column in enumerate(cols, start=1):
    plt.subplot(3, 4, position)
    sns.violinplot(data=df, x=column)
plt.show()

In [None]:
# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed, one column at a time.
# Each column's quartiles are computed on the frame as already filtered by the
# preceding columns. Rows with a missing value in the column fail the range
# test and are removed as well.
for column in cols:
    q1, q3 = df[column].quantile([0.25, 0.75])
    whisker = 1.5 * (q3 - q1)
    df = df[df[column].between(q1 - whisker, q3 + whisker)]

In [None]:
# Violin plot of every column in `cols` on the current (filtered) frame,
# one panel each on a 3x4 grid
plt.figure(figsize=(20, 12))
for position, column in enumerate(cols, start=1):
    plt.subplot(3, 4, position)
    sns.violinplot(data=df, x=column)
plt.show()

In [None]:
# Null valued rows are dropped (any row holding at least one missing value)
df = df.dropna(axis='index', how='any')
df.head()

In [None]:
# Index number of categorical variables are selected
# (positions of the `cat_cols` columns within df.columns)
categorical = [
    position
    for position, name in enumerate(df.columns)
    if name in cat_cols
]

In [None]:
# Plotting costs to find the optimal number of clusters

clusters = [2,3,4,5,6,7,8,9,10]
costs = []

# The array conversion does not depend on the cluster count, so it is done once
features = np.array(df)

for i in clusters:
    # A fixed random_state makes the cost curve reproducible from run to run
    kp = KPrototypes(n_clusters=i, n_init=5, random_state=20)
    # Only the fitted cost is needed here, so fit() is enough
    kp.fit(features, categorical=categorical)
    costs.append(kp.cost_)


plt.plot(clusters, costs)
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.grid(alpha=0.7)
plt.show()

In [None]:
# Final model: 6 clusters, 10 initialisations, fixed seed.
# The return value of fit_predict is not used; the labels are read from the
# fitted model and stored as a new 'cluster_id' column.
kp = KPrototypes(n_clusters=6, n_init=10, random_state=20)
features = np.array(df)
kp.fit_predict(features, categorical=categorical)
df = df.assign(cluster_id=kp.labels_)

In [None]:
# Box plot of every column in `cols` split by cluster, one panel each on a 3x4 grid
plt.figure(figsize=(20, 12))
for position, column in enumerate(cols, start=1):
    plt.subplot(3, 4, position)
    sns.boxplot(x=df['cluster_id'], y=df[column])
    # Clusters are drawn on the x-axis and the feature on the y-axis,
    # so the labels follow that orientation
    plt.xlabel('cluster_id', fontsize=14)
    plt.ylabel(column, fontsize=14)
plt.show()

In [None]:
# For every cluster, bar chart of how its rows are distributed over the
# columns whose name contains 'Release'
release = [name for name in df.columns if 'Release' in name]

for i in np.unique(df.cluster_id):
    # Column sums restricted to the rows of cluster i
    counts = df.loc[df['cluster_id'] == i, release].sum()
    # Normalise by this cluster's own total, so the bars of each chart are
    # percentages within that cluster
    percent = 100 * counts / counts.sum()
    plt.figure(figsize=(20,8))
    percent.plot.bar()
    plt.title('Cluster {0}'.format(i), fontsize=20)
    plt.ylabel('% of cluster')
    plt.show()

## Cluster 3 contains the songs that are the most popular and the most recent

In [None]:
# Cluster labels are copied onto the `spotify` frame. The assignment aligns on
# the index, so rows of `spotify` with no matching row in df get NaN.
spotify['cluster_id'] = df['cluster_id']
spotify.head()

In [None]:
# Song name and artist of every row assigned to cluster 2
spotify.loc[spotify['cluster_id'] == 2, ['Song Name', 'Artist']]