In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory into the notebook-wide frame `df`.
df=pd.read_csv("/kaggle/input/spotifyclassification/data.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Numerical audio features; reused below for the skewness/kurtosis table and the correlation heatmap.
numerical_columns = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]
data_numerical = df[numerical_columns]

In [None]:
# One [column name, skewness, kurtosis] row per numerical feature.
skewness_array = [
    [column, data_numerical[column].skew(), data_numerical[column].kurt()]
    for column in data_numerical
]
    

In [None]:
# Turn the collected rows into a table with one line per feature.
skew_df = pd.DataFrame(
    data=skewness_array,
    columns=['Columns', 'Skewness', 'Kurtosis'],
)

In [None]:
# Show the skewness / kurtosis table.
skew_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Compare the acousticness distribution of liked (target == 1) and not-liked (target == 0) songs.
fig = plt.figure(figsize=(10, 10))

# Each KDE curve carries its own label, so the legend below is built from the
# artists themselves rather than from the order in which they were drawn.
sns.distplot(df.loc[df['target'] == 1, 'acousticness'], kde_kws={'label': 'Liked'}, color='green')
sns.distplot(df.loc[df['target'] == 0, 'acousticness'], kde_kws={'label': 'Not Liked'}, color='red')

plt.title('Acousticness Plots', weight='bold')
plt.legend()

In [None]:
# Compare the danceability distribution of liked (target == 1) and not-liked (target == 0) songs.
fig = plt.figure(figsize=(10, 10))

# The second curve used to be labelled 'Liked' as well; each curve now carries
# the text that should appear next to it in the legend.
sns.distplot(df.loc[df['target'] == 1, 'danceability'], kde_kws={'label': 'Liked The Song'}, color='green')
sns.distplot(df.loc[df['target'] == 0, 'danceability'], kde_kws={'label': 'Did Not Like the Song'}, color='purple')

plt.title('Danceability Plots', weight='bold')
# Build the legend from the labelled curves instead of matching labels to artists by draw order.
plt.legend()

In [None]:
# Compare the duration_ms distribution of liked (target == 1) and not-liked (target == 0) songs.
fig = plt.figure(figsize=(10, 10))

# The second curve used to be labelled 'liked' as well; each curve now carries
# the text that should appear next to it in the legend.
sns.distplot(df.loc[df['target'] == 1, 'duration_ms'], kde_kws={'label': 'Liked The Song'}, color='green')
sns.distplot(df.loc[df['target'] == 0, 'duration_ms'], kde_kws={'label': 'Did Not Like the Song'}, color='red')

plt.title('Duration_ms Plots', weight='bold')
# Build the legend from the labelled curves instead of matching labels to artists by draw order.
plt.legend()

In [None]:
# Remaining audio features whose distributions are plotted in a loop by generate_kdeplot.
remaining_feature_columns = [
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]
df1 = df[remaining_feature_columns]

In [None]:
# Preview the selected feature columns.
df1.head()

In [None]:
def generate_kdeplot(df1, data=None):
    """Draw one liked vs. not-liked distribution figure per column of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame or iterable of str
        Iterating over it must yield the column names to plot.
    data : pandas.DataFrame, optional
        Frame that holds those columns together with a ``target`` column
        (1 = liked, 0 = not liked). Defaults to the notebook-level ``df``,
        which is what this function always read from before.
    """
    if data is None:
        # Backward-compatible default: the original implementation read the global frame.
        data = df

    liked = data.loc[data['target'] == 1]
    not_liked = data.loc[data['target'] == 0]

    for column in df1:
        plt.figure(figsize=(10, 10))

        # Each curve carries its own legend text (both used to be labelled 'liked').
        sns.distplot(liked[f'{column}'], kde_kws={'label': 'Liked The Song'}, color='green')
        sns.distplot(not_liked[f'{column}'], kde_kws={'label': 'Did Not Like the Song'}, color='red')

        plt.title(f'{column} Plot')
        # Legend is built from the labelled curves, not from artist draw order.
        plt.legend()
        

In [None]:
# Draw one liked / not-liked distribution figure for every column selected in df1.
generate_kdeplot(df1)

In [None]:
# Column dtypes and non-null counts (same summary as shown earlier in the notebook).
df.info()

In [None]:
# The 20 artists with the most rows in the dataset, highest count first.
artist_counts = df['artist'].value_counts()
top_artists = artist_counts.sort_values(ascending=False).iloc[:20]

In [None]:
# Show the artist -> song-count series.
top_artists

In [None]:
# Bar chart of the song count for each of the 20 most frequent artists.
fig = plt.figure(figsize=(10, 10))
artist_names = top_artists.index
song_counts = top_artists.values
sns.barplot(x=artist_names, y=song_counts)
plt.xlabel('Artist Name')
plt.xticks(rotation=90)  # artist names are long; rotate them so they do not overlap
plt.title('Top 20 Artists')

In [None]:
# Inspect the song_title column.
df.song_title

In [None]:
# Heatmap of the pairwise correlations between the numerical features, with the coefficients annotated.
fig = plt.figure(figsize=(10, 10))
correlation_matrix = data_numerical.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='OrRd')
plt.show()

In [None]:
# Preview the frame again before selecting the discrete columns.
df.head()

In [None]:
# Discrete columns (including the target itself) that are count-plotted against the target below.
categorical_columns = ['mode', 'time_signature', 'target', 'key']
data_categorical = df[categorical_columns]

In [None]:
# Show the discrete-column subset.
data_categorical

In [None]:
# Preview the first rows of the discrete-column subset.
data_categorical.head()

In [None]:
# One count plot per discrete column, with the bars split by whether the song was liked.
# Legend text for hue levels 0 and 1, in that order.
target_labels = ['Did not like The Song', 'Liked the Song']

for col in data_categorical:
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.countplot(x=col, data=df, hue='target', ax=ax)
    ax.set_title(f'{col} vs Target')

    # Rename the entries of the hue legend seaborn already drew. The previous
    # fig.legend(labels=[...]) call matched labels to artists by draw order, which
    # can pair both labels with bars of the same hue level, and it added a second
    # legend on top of seaborn's own.
    hue_legend = ax.get_legend()
    if hue_legend is not None:  # seaborn may omit the legend when hue duplicates x
        for legend_text, readable_label in zip(hue_legend.get_texts(), target_labels):
            legend_text.set_text(readable_label)
    

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import plot_roc_curve,roc_auc_score,roc_curve

In [None]:
# Replace the two text columns with integer codes so the estimators can consume them.
# This overwrites the columns of `df` in place; the same encoder object is refit for
# each column, so after the loop `le` holds the classes of the last column only.
cols = ['song_title', 'artist']
le = LabelEncoder()
for text_column in cols:
    df[text_column] = le.fit_transform(df[text_column])

In [None]:
# Preview the frame after song_title and artist were replaced by integer codes.
df.head()

In [None]:
# Feature matrix: every column except the label. `columns=` states the intent directly;
# the previous `axis=True` only worked because True compares equal to 1.
# NOTE(review): all remaining columns become features, including the encoded
# song_title/artist codes - confirm no index-like column from the CSV is among them.
X = df.drop(columns='target')
# Label vector: 1 = liked, 0 = not liked.
y = df['target']

In [None]:
# Preview the label vector.
y.head()

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Random forest with default hyper-parameters. The seed (same value as the
# train/test split) makes the fitted model and its accuracy reproducible.
clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest = clf_random_forest.fit(X_train, y_train)
random_forest_predictions = clf_random_forest.predict(X_test)

# Test-set accuracy as a percentage.
accuracy_random_forest = accuracy_score(y_test, random_forest_predictions) * 100


In [None]:
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, so use the predicted probability of class 1 instead.
random_forest_scores = clf_random_forest.predict_proba(X_test)[:, 1]

r_fpr, r_tpr, _ = roc_curve(y_test, random_forest_scores)
r_auc = roc_auc_score(y_test, random_forest_scores)
plt.plot(r_fpr, r_tpr, label='Random Forest Prediction (area={:.3f})'.format(r_auc))
plt.title('ROC plot Random Forest Classifier')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()



In [None]:
# Support-vector classifier with default hyper-parameters.
# NOTE(review): the features are passed in unscaled; consider standardising them
# before fitting, since SVC is generally sensitive to feature scale.
s_regression = SVC().fit(X_train, y_train)
svc_predictions = s_regression.predict(X_test)

# Test-set accuracy as a percentage.
accuracy_svc = 100 * accuracy_score(y_test, svc_predictions)


In [None]:
# ROC analysis needs a continuous score per sample. SVC() is fitted without
# probability estimates, so use the signed distance to the decision boundary.
svc_scores = s_regression.decision_function(X_test)

# (The roc_curve call used to be duplicated on two consecutive lines.)
s_fpr, s_tpr, _ = roc_curve(y_test, svc_scores)
s_auc = roc_auc_score(y_test, svc_scores)
plt.plot(s_fpr, s_tpr, label='SVC Prediction (area={:.3f})'.format(s_auc))
plt.title('ROC plot SVC')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()
s_auc

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters.
KNC = KNeighborsClassifier().fit(X_train, y_train)
KNN_predictions = KNC.predict(X_test)

# Test-set accuracy as a percentage.
knn_accuracy = 100 * accuracy_score(y_test, KNN_predictions)

In [None]:
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, so use the predicted probability of class 1 instead.
knn_scores = KNC.predict_proba(X_test)[:, 1]

k_fpr, k_tpr, _ = roc_curve(y_test, knn_scores)
knn_score = roc_auc_score(y_test, knn_scores)
plt.plot(k_fpr, k_tpr, label='KNN prediction (area={:.3f})'.format(knn_score))
plt.title('ROC plot KNN')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()


In [None]:
# Decision tree with default hyper-parameters. The seed (same value as the
# train/test split) makes the fitted tree and its accuracy reproducible.
d_tree = DecisionTreeClassifier(random_state=42)
d_tree = d_tree.fit(X_train, y_train)
d_tree_predictions = d_tree.predict(X_test)

# Test-set accuracy as a percentage.
d_tree_accuracy = accuracy_score(y_test, d_tree_predictions) * 100

In [None]:
# ROC analysis needs a continuous score per sample, so use the predicted
# probability of class 1 rather than the hard 0/1 predictions.
d_tree_scores = d_tree.predict_proba(X_test)[:, 1]

d_fpr, d_tpr, _ = roc_curve(y_test, d_tree_scores)
d_tree_score = roc_auc_score(y_test, d_tree_scores)
plt.plot(d_fpr, d_tpr, label='Decision Tree prediction (area={:.3f})'.format(d_tree_score))
plt.title('ROC plot Decision Tree Classifier')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Logistic regression with default hyper-parameters.
lr = LogisticRegression().fit(X_train, y_train)
lr_prediction = lr.predict(X_test)

# Test-set accuracy as a percentage.
lr_score = 100 * accuracy_score(y_test, lr_prediction)

In [None]:

# ROC analysis needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, so use the predicted probability of class 1 instead.
lr_scores = lr.predict_proba(X_test)[:, 1]

l_fpr, l_tpr, _ = roc_curve(y_test, lr_scores)
# Keep the AUC in its own variable: it used to be stored in `lr_score`, which
# overwrote the accuracy percentage that the model-comparison table reads.
lr_auc = roc_auc_score(y_test, lr_scores)
plt.plot(l_fpr, l_tpr, label='Logistic Reg (area={:.3f})'.format(lr_auc))
plt.title('ROC plot Logistic Regression')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Gaussian naive Bayes classifier with default hyper-parameters.
clf = GaussianNB().fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

# Test-set accuracy as a percentage.
gb_score = 100 * accuracy_score(y_test, clf_predictions)

In [None]:

# ROC analysis needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, so use the predicted probability of class 1 instead.
gb_scores = clf.predict_proba(X_test)[:, 1]

gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_scores)
clf_score = roc_auc_score(y_test, gb_scores)
# The curve was previously labelled 'Logistic Reg' (copy-paste slip).
plt.plot(gb_fpr, gb_tpr, label='Gaussian NB (area={:.3f})'.format(clf_score))
plt.title('ROC plot GaussianNB')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Test-set accuracy (in percent) of every fitted model, one row per model.
# The logistic-regression accuracy is recomputed from its predictions here because
# `lr_score` is reassigned to a ROC AUC value in the logistic-regression ROC cell,
# which would otherwise put a 0-1 AUC next to 0-100 accuracies.
models = pd.DataFrame({
    'Model': [
        'Random Forest',
        'SVC',
        'KNN',
        'Decision Tree Classifier',
        'Logistic Regression',
        'Gaussian NB',
    ],
    'Score': [
        accuracy_random_forest,
        accuracy_svc,
        knn_accuracy,
        d_tree_accuracy,
        accuracy_score(y_test, lr_prediction) * 100,
        gb_score,
    ],
})

In [None]:
# Show the model comparison table.
models

In [None]:
# Bar chart comparing the score of every model in the `models` table.
fig = plt.figure(figsize=(10, 10))
sns.barplot(x=models.Model, y=models.Score)
plt.xticks(rotation=90)
# plt.show() matches the other plotting cells and renders with the notebook's
# inline backend, where Figure.show() only emits a non-GUI-backend warning.
plt.show()