In [None]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Global seaborn theme for every figure in this notebook.
sns.set(style='darkgrid', context='notebook')

In [None]:
# Original CSV headers -> snake_case names with a `track_` prefix.
column_renames = {
    "Album": "track_album",
    "Artist": "track_artist",
    "Track": "track_title",
    "TrackURI": "track_uri",
    "TrackID": "track_id",
    "Lyrics": "track_lyric",
}
df = pd.read_csv(
    "../input/taylorswiftlyricsfeatures/TaylorSwiftLyricsFeatureSet.csv"
).rename(columns=column_renames)

In [None]:
# Working copy without the lyric text column.
# (`columns=` already selects the axis, so no `axis` argument is needed.)
data = df.drop(columns=["track_lyric"])
data.head()

## Exploratory Data Analysis
We will now explore the data set, starting by splitting it into the set to predict (rows with no genre) and the feature set (rows with a genre).

In [None]:
# Rows with a genre form the labelled set; rows without one are left to predict.
has_genre = data['genres'].notnull()
id_columns = ['track_uri', 'track_id']

XtoPredict = data.loc[~has_genre, :].drop(columns=id_columns).reset_index(drop=True)
FeatureSet = data.loc[has_genre, :].drop(columns=id_columns).reset_index(drop=True)
FeatureSet.head()

`FeatureSet` is the subset that will be used to train and test our model.

In [None]:
# Number of distinct fine-grained genres among the labelled tracks.
print(FeatureSet['genres'].nunique(dropna=False))

# Hand-curated grouping of the fine-grained genres into three broad families.
country = [
    'Country Pop',
    'Country',
    'Folk Pop',
    'Blue grass',
    'Contemporary Country',
]
pop = [
    'Pop',
    'Electropop',
    'Synth Pop',
    'Dance Pop',
    'Dream Pop',
]
rock = [
    'Pop Rock',
    'Pop Punk',
    'Alternative Rock',
    'Soft Rock',
    'R&B',
    'Country Rock',
]

# Total number of genres covered by the three families (compare with the count above).
print(sum(len(family) for family in (rock, country, pop)))

In [None]:
# Build a fine-genre -> broad-family lookup from the three family lists.
# `setdefault` keeps the first family that claims a genre (country, then rock,
# then pop), so every track receives exactly one label.
broad_by_genre = {}
for family_name, family_genres in (('country', country), ('rock', rock), ('pop', pop)):
    for fine_genre in family_genres:
        broad_by_genre.setdefault(fine_genre, family_name)

mapped_genres = FeatureSet['genres'].map(broad_by_genre)

# Fail loudly, naming the offending genres, rather than producing a label list
# whose length silently differs from the number of rows.
unmapped_genres = sorted(FeatureSet.loc[mapped_genres.isna(), 'genres'].unique())
if unmapped_genres:
    raise ValueError(f"No broad genre defined for: {unmapped_genres}")

genre_broad = mapped_genres.tolist()
FeatureSet['genre_broad'] = genre_broad

In [None]:
# Number of titled tracks per broad genre.
FeatureSet.groupby('genre_broad')['track_title'].count().plot.bar()

In [None]:
# Box plot of track duration to make outliers visible.
fig = plt.figure(figsize=(7, 5))
duration_ax = sns.boxplot(data=FeatureSet, x='duration_ms')
duration_ax.set_title("Duration (to check outliers)")

We see there are a few duration outliers, but they won't matter for now.  
Next we will look at the correlation between the different features.

In [None]:
fig = plt.figure(figsize=(15,15))

# Pairwise correlation between the columns from `danceability` to `time_signature`.
corr = df.loc[:, 'danceability':'time_signature'].corr()

# Mask the upper triangle (diagonal included) so only the lower triangle is drawn.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, annot=True, mask=mask, cmap='RdBu')

Here, we see that loudness and energy are the most correlated. Thus, we can remove the loudness parameter altogether.  
Let us now analyse how the genres correlate with each other.

In [None]:
# Median of each feature per fine-grained genre. `numeric_only=True` skips the
# text columns, on which `median()` raises a TypeError in pandas >= 2.0.
genre_rel = (
    FeatureSet.groupby('genres')
    .median(numeric_only=True)
    .loc[:, 'danceability':'time_signature']
)

# Transpose so that genres become columns, then rank-correlate genre against genre.
corr = genre_rel.transpose().corr('kendall')

# Mask the upper triangle (diagonal included) so only the lower triangle is drawn.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(15,15))
sns.heatmap(corr, annot=True, mask=mask, cmap='RdBu')

In [None]:
# Median of each feature per broad genre. `numeric_only=True` skips the
# text columns, on which `median()` raises a TypeError in pandas >= 2.0.
genre_rel = (
    FeatureSet.groupby('genre_broad')
    .median(numeric_only=True)
    .loc[:, 'danceability':'time_signature']
)

# Transpose so that broad genres become columns, then rank-correlate them.
corr = genre_rel.transpose().corr('kendall')

# No triangle mask here: the full matrix is drawn.
fig = plt.figure(figsize=(5,5))
sns.heatmap(corr, annot=True, cmap='RdBu')

### Distribution Plots
We will look at the distribution of each of the Spotify audio feature parameters.

In [None]:
# Features to plot, in panel order (row by row on a 2 x 4 grid).
distribution_features = [
    'danceability', 'energy', 'key', 'loudness',
    'speechiness', 'acousticness', 'valence', 'tempo',
]

fig = plt.figure(figsize=(15,7))

for panel, feature in enumerate(distribution_features, start=1):
    ax = fig.add_subplot(2, 4, panel)
    # `histplot` with a density-scaled histogram and KDE overlay replaces the
    # deprecated `distplot`.
    sns.histplot(data[feature], kde=True, stat='density', ax=ax)

## Modelling Classifiers
We will now build our classifiers. But first, we must split the data into train and test sets.

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz

In [None]:
# Feature matrix: columns from `danceability` to `duration_ms`, minus the excluded ones.
excluded_features = ['loudness', 'mode']
X = FeatureSet.loc[:, 'danceability':'duration_ms'].drop(columns=excluded_features)

# Target: the broad genre label.
y = FeatureSet['genre_broad']

# Hold out 25% of the labelled tracks for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

### Decision Tree Classifier

In [None]:
# Seeded so the fitted tree (and therefore the score) is the same on every run.
dt = DecisionTreeClassifier(min_samples_leaf=1, random_state=1)
dt.fit(X_train, y_train)

fig = plt.figure(figsize=(25,10))

# `class_names` must follow the classifier's own class order (`dt.classes_`);
# `y.unique()` is in order of first appearance and could mislabel the leaves.
# `label` takes the string "root", not a set. Plain lists are passed for the
# names because some scikit-learn versions reject pandas Index objects here.
a = plot_tree(dt,
              feature_names=list(X.columns),
              class_names=list(dt.classes_),
              label="root",
              proportion=True,
              filled=True,
              impurity=False,
              rounded=True,
              fontsize=15)

yhat = dt.predict(X_test)
dt.score(X_test, y_test)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded (same seed as the train/test split) so the forest and its test
# accuracy are reproducible across runs.
clf = RandomForestClassifier(criterion='gini',
                             n_estimators=100,
                             random_state=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB

# Gaussian naive Bayes: fit, keep the test predictions, print test accuracy.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(gnb.score(X_test, y_test))

# Multinomial naive Bayes with smoothing alpha=1000: fit and print test accuracy.
mnb = MultinomialNB(alpha=1000)
mnb.fit(X_train, y_train)
print(mnb.score(X_test, y_test))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: fit on the training split,
# then report accuracy on the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)