In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import plotnine
from plotnine import *
import itertools
import math
from prettytable import PrettyTable
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import plot_roc_curve, balanced_accuracy_score
from xgboost import XGBClassifier
from scipy.spatial import distance
from imblearn.over_sampling import SMOTE
import copy
import warnings
# Ignore every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the plotting libraries used below.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode so figures render in cell output.
plotly.offline.init_notebook_mode(connected = True)
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Global seaborn theme: "ticks" style with the larger "talk" context.
sns.set(style = "ticks", context = "talk")

In [None]:
data = pd.read_csv("/kaggle/input/dataset-of-songs-in-spotify/genres_v2.csv")
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
# Remove the "Unnamed: 0" column (presumably a stray index column from the CSV
# export -- verify against the raw file).
# Rebinding `data` with errors="ignore" keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was run a second time.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data.columns

In [None]:
# Keep only the numeric (and boolean) columns. The public select_dtypes API
# replaces the private DataFrame._get_numeric_data helper, which is not part of
# pandas' supported interface and may change without notice.
numeric_data = data.select_dtypes(include=["number", "bool"])
numeric_data.head()

In [None]:
numeric_data.describe()

In [None]:
numeric_data.info()

In [None]:
numeric_data.hist(layout=(7,2),figsize=(20, 30))

In [None]:
px.box(data_frame = data, y = 'duration_ms', color='genre')

In [None]:
# Interactive heatmap of the pairwise correlations between numeric columns.
# The matrix is computed once (the original recomputed data.corr() three times)
# and non-numeric columns are excluded explicitly, because DataFrame.corr raises
# on string columns in pandas >= 2.0 instead of silently skipping them.
corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
x = list(corr_matrix.columns)
y = list(corr_matrix.index)
values = corr_matrix.to_numpy()
fig = go.Figure(data = go.Heatmap(
    x = x,
    y = y,
    z = values,
    hoverongaps = False
))
fig.show()

In [None]:
# Static, annotated heatmap of the correlations between numeric columns.
# Non-numeric columns are filtered out first: DataFrame.corr raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(20, 12))
sns.heatmap(data.select_dtypes(include=["number", "bool"]).corr(), annot=True, cmap='viridis')

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.danceability, bins=40)

In [None]:
sns.kdeplot(data.danceability)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.instrumentalness, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.duration_ms, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.energy, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.loudness, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.speechiness, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.acousticness, bins=40)

In [None]:
px.histogram(data.acousticness)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.liveness, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.valence, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.tempo, bins=40)

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(data.duration_ms, bins=40)

In [None]:
data['genre'].value_counts()

In [None]:
plt.figure(figsize=(12, 6))
data['genre'].value_counts().plot(kind="bar", color="lightblue", title="Genres")

In [None]:
data.genre.unique()

In [None]:
px.pie(data.genre,
       labels = data.genre.value_counts().index,
       values = data.genre.value_counts().values,
       names = data.genre.value_counts().index,
       title = "Distribution of Genres on Spotify"
      )

In [None]:
px.histogram(data.genre)

In [None]:
# Max / mean / min of each audio feature for every (mode, key, genre) group,
# rendered with a colour gradient.
# Only numeric columns are aggregated. The original selection also contained
# 'type', 'id', 'uri', 'track_href' and 'analysis_url' (assumed to be text
# columns -- verify against data.info()); 'mean' is undefined for those and
# raises in pandas >= 2.0, while older versions silently dropped them.
group_keys = ['mode', 'key', 'genre']
summary_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo',
                'duration_ms', 'time_signature']
display(
    data[group_keys + summary_cols]
    .groupby(group_keys)
    .agg(['max', 'mean', 'min'])
    .style.background_gradient(cmap="flare")
)

In [None]:
features = ['danceability', 'energy', 'key', 'loudness', 'mode','speechiness','acousticness', 'instrumentalness', 'liveness','valence','tempo','duration_ms']

def plotting(feature):
    """Draw a horizontal bar plot of ``feature`` aggregated per genre.

    ``sns.catplot`` is a figure-level function that always creates its own
    figure, so the size is set through ``height`` / ``aspect`` (10 x 2 gives the
    intended 20 x 10 inches). The original called ``plt.figure(figsize=(20, 10))``
    first, which only left an extra empty figure in the output for every feature.

    Parameters
    ----------
    feature : str
        Column of the notebook-level ``data`` frame to plot on the x-axis.
    """
    grid = sns.catplot(x=feature, y='genre', data=data, kind='bar', height=10, aspect=2)
    grid.set_ylabels('Genre')
    plt.show()

# A descriptive loop variable avoids overwriting the notebook-level `x`.
for feature in features:
    plotting(feature)

In [None]:
def plotting2(feature):
    """Scatter ``data[feature]`` (x-axis) against genre (y-axis), without a fitted line."""
    plt.figure(figsize=(12, 6))
    sns.regplot(x=data[feature], y=data['genre'], fit_reg=False)
    plt.show()

# One scatter figure per entry of `features`.
for name in features:
    plotting2(name)

In [None]:
dims = (20,12)
fig, ax = plt.subplots(figsize=dims)
sns.boxplot(x='key', y='genre', data=data, ax=ax)

In [None]:
data['mode'].value_counts()

In [None]:
sns.countplot(x='mode', data=data, hue='genre', palette='bright')
plt.legend(bbox_to_anchor = (2, 1), borderaxespad=0)
plt.tight_layout()

In [None]:
data['time_signature'].value_counts()

In [None]:
sns.countplot(x='time_signature', data=data, hue='genre', palette='bright')
plt.legend(bbox_to_anchor = (2, 1), borderaxespad=0)
plt.tight_layout()

In [None]:
from plotly.subplots import make_subplots

# Columns to histogram, one per subplot.
# A dedicated name is used so the `features` list consumed by plotting() and
# plotting2() in earlier cells is not overwritten with non-plottable entries
# ('type', 'genre') -- the original reassignment broke those cells on re-run.
hist_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'duration_ms', 'time_signature', 'genre']
rows = 3
cols = 5
fig = make_subplots(rows=rows, cols=cols, subplot_titles=hist_features)
# Fill the grid row by row: divmod turns the running index into a 0-based
# (row, col) pair, which plotly expects as 1-based positions.
for idx, feature in enumerate(hist_features):
    row, col = divmod(idx, cols)
    fig.add_trace(
        go.Histogram(x = data[feature].values),
        row = row + 1,
        col = col + 1
    )

fig.update_layout(height=900, width=900, title_text='Feature Distribution', showlegend=False)
fig.show()

In [None]:
box_cols = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo', 'duration_ms']
rows = 3
cols = 4
fig = make_subplots(rows=rows, cols=cols, subplot_titles=box_cols)
# Iterate over the columns themselves rather than over every grid slot.
# The original walked all 12 slots and relied on a bare `except: break` to stop
# after the 11th column, which would also silently hide genuine errors
# (e.g. a misspelled column name) and truncate the figure.
for idx, column in enumerate(box_cols):
    row, col = divmod(idx, cols)
    fig.add_trace(
        go.Box(x = data[column].values, name=''),
        row = row + 1,
        col = col + 1
    )

fig.update_layout(height=900, width=900, title_text='Boxplot Distribution', showlegend=False)
fig.show()

In [None]:
px.imshow(img=data.isna(), title='Missing Values(Yellow: Missing, Blue: Not Missing)')

In [None]:
per_miss = data.isna().sum(axis=0) / data.shape[0] * 100
per_miss = per_miss.reset_index().rename(columns={'index':'feature', 0:'% missing'})
px.bar(per_miss, x='feature', y='% missing', title='% missing values for each feature')

In [None]:
cols = list(data.columns[11:])
cols

In [None]:
del cols[7]
cols

In [None]:
df = copy.deepcopy(data)
df.drop(columns = cols, inplace = True)
df.head()

In [None]:
sns.pairplot(df, corner=True, hue='genre')

In [None]:
sns.pairplot(data, hue='genre')

In [None]:
# For every numeric column, overlay one semi-transparent histogram per genre.
grouped_genre = data.groupby("genre")
for col in numeric_data.columns:
    # Size the figure once at creation instead of passing figsize to every
    # Series.hist call inside the inner loop.
    fig, ax = plt.subplots(figsize=(20, 10))
    for genre_name, genre_rows in grouped_genre:
        genre_rows[col].hist(alpha=0.4, ax=ax, label=genre_name)

    # The title is the same for all genres, so it is set once per figure.
    ax.set_title(col)
    ax.legend()
    plt.show()

In [None]:
# Summary statistics of every column, computed separately for each genre group.
grouped_genre.describe()

In [None]:
# Number of genre groups in the groupby object.
len(grouped_genre)

**Thank you! Further EDA and predictions will be added to this notebook as soon as I have time, so please check back later. Stay tuned! Peace out!**