In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read both CSV files of the Spotify songs dataset, then preview the genres table.
csv_paths = (
    '/kaggle/input/dataset-of-songs-in-spotify/genres_v2.csv',
    '/kaggle/input/dataset-of-songs-in-spotify/playlists.csv',
)
genres, playlist = (pd.read_csv(csv_path) for csv_path in csv_paths)
genres.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the genres table.
genres.describe()

In [None]:
# Column names, dtypes and non-null counts of the genres table.
genres.info()

In [None]:
# Number of missing values in each column.
genres.isnull().sum()

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plot

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map every genre name to an integer code and keep it in a new column next to the original label.
le = LabelEncoder()
le.fit(genres['genre'])
genres['genre_encoded'] = le.transform(genres['genre'])
genres.head()

# Visualization

In [None]:
# List the available columns before choosing the model inputs.
print(genres.columns)

In [None]:
# Audio features that are plotted one by one in the visualisation cells below.
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Model inputs: every plotted feature plus the time signature, in the same column order as before.
X = genres[features + ['time_signature']]

# Target as a 1-D Series. A one-column DataFrame makes scikit-learn estimators emit
# "A column-vector y was passed when a 1d array was expected" warnings on fit.
y = genres['genre_encoded']

X.head()

In [None]:
# Distinct genre labels present in the data.
genres['genre'].unique()


In [None]:
# Max / mean / min of every numeric audio feature per (mode, key, genre) group.
# The columns 'type', 'id', 'uri', 'track_href' and 'analysis_url' are left out:
# they appear to hold text identifiers (NOTE(review): confirm dtypes), and taking the
# mean of a text column raises a TypeError in pandas >= 2.0.
numeric_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                'duration_ms', 'time_signature']
genre_summary = (
    genres[numeric_cols + ['genre']]
    .groupby(['mode', 'key', 'genre'])
    .agg(['max', 'mean', 'min'])
)
display(genre_summary.style.background_gradient(cmap="Oranges"))

In [None]:
def ploting(feature):
    """Draw a horizontal bar chart of one audio feature for every genre.

    Parameters
    ----------
    feature : str
        Name of a column of the global ``genres`` frame; plotted on the x axis,
        with the genre label on the y axis.

    The figure is shown immediately; nothing is returned.
    """
    # catplot is a figure-level function that creates its own figure, so calling
    # plot.figure(...) first only leaves an extra blank canvas in the output.
    # The intended 24x12 inch size is requested through height/aspect instead.
    g = sns.catplot(x=feature, y='genre', data=genres, kind='bar', height=12, aspect=2)
    g.set_ylabels('Genre')

    plot.show()


In [None]:
# Draw the per-genre bar chart for each entry of `features`, one figure per feature.
for x in features:
    ploting(x)

In [None]:
def ploting2(feature):
    """Scatter one column of ``genres`` (x axis) against the genre label (y axis), without a fitted line."""
    sns.regplot(data=genres, x=feature, y='genre', fit_reg=False)
    plot.show()

In [None]:
# Draw the feature-versus-genre scatter plot for each entry of `features`.
for x in features:
    ploting2(x)

In [None]:
# Correlate the numeric columns only: the frame also contains text columns (e.g. 'genre'),
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently dropping them.
corr = genres.select_dtypes(include='number').corr()

plot.figure(figsize=(30, 12))
# `linewidths` is the documented seaborn spelling for the gap between cells.
sns.heatmap(corr, vmax=1, vmin=-1, linewidths=5, square=True, annot=True,
            annot_kws={'size': 8}, fmt='.1f', cmap='BrBG_r')
plot.title('Correlation')
plot.show()

# Model Selection

In [None]:
# Distinct genre labels, i.e. the classes the models below have to tell apart.
genres['genre'].unique()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows (the train_test_split default) for evaluation;
# the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

Checking with LogisticRegression

In [None]:
# Score board: the first entry is the header row; each model cell below appends [name, accuracy].
accuracy = [['model','accuracy']]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The inputs live on very different scales (e.g. duration_ms next to 0-1 ratios), which
# keeps the default solver from converging within its default 100 iterations.
# Standardise the features inside a pipeline and allow more iterations; the pipeline
# exposes the same fit/predict interface as the bare estimator.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a one-column frame.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the encoded genre for every row of the held-out test set.
yhat = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Score the predictions once, record the result on the score board, and display it.
logreg_accuracy = accuracy_score(y_test, yhat, normalize=True)
accuracy.append(['LogisticRegression', logreg_accuracy])
logreg_accuracy

Using Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes on the raw features.
model = GaussianNB()
# Pass the target as a 1-D array: a one-column DataFrame triggers scikit-learn's
# column-vector DataConversionWarning. np.ravel is a no-op for an already 1-D target.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the encoded genre for every row of the held-out test set.
yhat = model.predict(X_test)

In [None]:
# Score the predictions once, record the result on the score board, and display it.
nb_accuracy = accuracy_score(y_test, yhat)
accuracy.append(['Naive Bayes', nb_accuracy])
nb_accuracy

Using DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth- and split-limited decision tree with a fixed seed.
tree_params = {'max_depth': 10, 'min_samples_split': 10, 'random_state': 42}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)

In [None]:
# Predict the encoded genre for every row of the held-out test set.
yhat= clf.predict(X_test)

In [None]:
# Score the predictions once, record the result on the score board, and display it.
tree_accuracy = accuracy_score(y_test, yhat)
accuracy.append(['DecisionTreeClassifier', tree_accuracy])
tree_accuracy

Using K-Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN compares rows by distance, so on raw inputs the largest-valued column
# (e.g. duration_ms) would dominate every comparison. Standardise the features inside
# a pipeline; the pipeline exposes the same fit/predict interface as the bare estimator.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=8))
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a one-column frame.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the encoded genre for every row of the held-out test set.
yhat = model.predict(X_test)


In [None]:
# Score the predictions once, record the result on the score board, and display it.
knn_accuracy = accuracy_score(y_test, yhat)
accuracy.append(['K-Nearest Neighbours', knn_accuracy])
knn_accuracy

Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with depth-limited trees and a fixed seed.
model = RandomForestClassifier(max_depth=10, random_state=0)
# Pass the target as a 1-D array: a one-column DataFrame triggers scikit-learn's
# column-vector DataConversionWarning. np.ravel is a no-op for an already 1-D target.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the encoded genre for every row of the held-out test set.
yhat = model.predict(X_test)

In [None]:
# Score the predictions once, record the result on the score board, and display it.
rf_accuracy = accuracy_score(y_test, yhat)
accuracy.append(['Random Forest', rf_accuracy])
rf_accuracy

# Results 

In [None]:
from prettytable import PrettyTable
# Render the score board as a text table: the first entry of `accuracy` is the
# header row, every following entry is one [model, accuracy] row.
table = PrettyTable()
header, *score_rows = accuracy
table.field_names = header
for score_row in score_rows:
    table.add_row(score_row)
print(table)

# Conclusion

### From the table above, we can see that the Random Forest model gives the best accuracy.