In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the raw lyrics dataset from a CSV file located relative to the
# current working directory; the unmodified frame is kept under the name `f`.
f = pd.read_csv('spotify_millsongdata.csv')

In [4]:
# Preview the first five rows to see which columns the file provides.
f.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
# Summarise column dtypes, non-null counts and memory footprint.
f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [6]:
# Count missing values per column.
f.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# List the distinct artist names; these become the class labels later on.
f['artist'].unique()

array(['ABBA', 'Ace Of Base', 'Adam Sandler', 'Adele', 'Aerosmith',
       'Air Supply', 'Aiza Seguerra', 'Alabama', 'Alan Parsons Project',
       'Aled Jones', 'Alice Cooper', 'Alice In Chains', 'Alison Krauss',
       'Allman Brothers Band', 'Alphaville', 'America', 'Amy Grant',
       'Andrea Bocelli', 'Andy Williams', 'Annie', 'Ariana Grande',
       'Ariel Rivera', 'Arlo Guthrie', 'Arrogant Worms', 'Avril Lavigne',
       'Backstreet Boys', 'Barbie', 'Barbra Streisand', 'Beach Boys',
       'The Beatles', 'Beautiful South', 'Beauty And The Beast',
       'Bee Gees', 'Bette Midler', 'Bill Withers', 'Billie Holiday',
       'Billy Joel', 'Bing Crosby', 'Black Sabbath', 'Blur', 'Bob Dylan',
       'Bob Marley', 'Bob Rivers', 'Bob Seger', 'Bon Jovi', 'Boney M.',
       'Bonnie Raitt', 'Bosson', 'Bread', 'Britney Spears',
       'Bruce Springsteen', 'Bruno Mars', 'Bryan White', 'Cake',
       'Carly Simon', 'Carol Banawa', 'Carpenters', 'Cat Stevens',
       'Celine Dion', 'Chaka Khan

In [8]:
# NOTE: this binds `df` to the very same DataFrame object as `f` (no copy is
# made), so any in-place operation on `df` before it is reassigned would also
# be visible through `f`.
df = f

In [9]:
# Show the index labels of the rows credited to 'Various Artists'.
df.loc[df['artist'] == 'Various Artists'].index

Int64Index([54894, 54895, 54896], dtype='int64')

In [10]:
# Remove the rows credited to 'Various Artists'. The labels to drop are looked
# up from the data instead of being hard-coded, so the cell keeps working if
# the CSV changes and can be re-run safely (a second run simply drops nothing
# instead of raising KeyError for labels that are already gone).
df = df.drop(index=df.index[df['artist'] == 'Various Artists'])

In [11]:
# Display the frame to confirm the rows were removed.
df

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


In [12]:
# Discard the 'song' and 'link' columns; the remaining columns are kept as-is.
df = df.drop(columns=['song', 'link'])

In [13]:
# Remove line-break / tab artefacts from the lyrics: both real control
# characters and their literal backslash-escaped spellings (a backslash
# followed by t, n or r).
# - Each run is replaced by a single space rather than an empty string, so the
#   last word of one lyric line is never glued to the first word of the next.
# - Only the 'text' column is rewritten; the 'artist' labels are left untouched.
# - The result is assigned to a new frame instead of mutating in place, which
#   keeps the cell safe to re-run.
df = df.assign(
    text=df['text'].str.replace(r"(?:\\[tnr]|[\t\n\r])+", " ", regex=True)
)

In [14]:
# Display the frame again to check the cleaned lyrics.
df

Unnamed: 0,artist,text
0,ABBA,"Look at her face, it's a wonderful face And i..."
1,ABBA,"Take it easy with me, please Touch me gently ..."
2,ABBA,I'll never know why I had to go Why I had to ...
3,ABBA,Making somebody happy is a question of give an...
4,ABBA,Making somebody happy is a question of give an...
...,...,...
57645,Ziggy Marley,Irie days come on play Let the angels fly let...
57646,Ziggy Marley,Power to the workers More power Power to the...
57647,Zwan,all you need is something i'll believe flash...
57648,Zwan,northern star am i frightened where can i go...


In [15]:
# Target variable: the artist name for every row.
y= df['artist']
y

0                ABBA
1                ABBA
2                ABBA
3                ABBA
4                ABBA
             ...     
57645    Ziggy Marley
57646    Ziggy Marley
57647            Zwan
57648            Zwan
57649            Zwan
Name: artist, Length: 57647, dtype: object

In [16]:
# Input feature: the lyrics text for every row.
x = df['text']
x

0        Look at her face, it's a wonderful face  And i...
1        Take it easy with me, please  Touch me gently ...
2        I'll never know why I had to go  Why I had to ...
3        Making somebody happy is a question of give an...
4        Making somebody happy is a question of give an...
                               ...                        
57645    Irie days come on play  Let the angels fly let...
57646    Power to the workers  More power  Power to the...
57647    all you need  is something i'll believe  flash...
57648    northern star  am i frightened  where can i go...
57649    come in  make yourself at home  i'm a bit late...
Name: text, Length: 57647, dtype: object

In [17]:
# Map artist names to integer class ids. The fitted encoder is kept in `le`
# so that predictions can be mapped back to names with le.inverse_transform.
le = LabelEncoder().fit(y)
yle = le.transform(y)
yle

array([  1,   1,   1, ..., 641, 641, 641])

In [18]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    yle,
    test_size=0.2,
    random_state=101,
)

In [19]:
# Build bag-of-words features. NOTE: despite the name `tfi`, this is a
# CountVectorizer (raw term counts), not a TfidfVectorizer.
tfi = CountVectorizer()
# Learn the vocabulary from the training lyrics only and encode them.
s_xtr = tfi.fit_transform(xtrain)
s_xtr

<46117x77381 sparse matrix of type '<class 'numpy.int64'>'
	with 4149406 stored elements in Compressed Sparse Row format>

In [20]:
# Encode the test lyrics with the already-fitted vectorizer (transform only,
# no refit), so the test matrix has the same columns as the training matrix.
s_xte = tfi.transform(xtest)
s_xte

<11530x77381 sparse matrix of type '<class 'numpy.int64'>'
	with 1016118 stored elements in Compressed Sparse Row format>

In [21]:
# Multinomial Naive Bayes on the term-count features; report test accuracy.
mnb = MultinomialNB().fit(s_xtr, ytrain)
mnbpreds = mnb.predict(s_xte)
accuracy_score(y_true=ytest, y_pred=mnbpreds)

0.03642671292281006

In [None]:
# Gaussian Naive Bayes needs dense input. Converting the whole training matrix
# at once (46117 x 77381 int64, roughly 28 GB) is impractical, and the original
# cell also passed the *sparse* test matrix to predict(). Instead, train
# incrementally with partial_fit on small dense slices and predict the same way.
gnb = GaussianNB()
gnb_classes = np.unique(ytrain)  # partial_fit must know every class up front
gnb_batch = 256                  # rows densified at a time; lower if memory is tight

for start in range(0, s_xtr.shape[0], gnb_batch):
    stop = start + gnb_batch
    gnb.partial_fit(
        s_xtr[start:stop].toarray(),
        ytrain[start:stop],
        classes=gnb_classes,
    )

# Predict in dense mini-batches as well, then stitch the pieces together.
gnbpreds = np.concatenate([
    gnb.predict(s_xte[start:start + gnb_batch].toarray())
    for start in range(0, s_xte.shape[0], gnb_batch)
])
accuracy_score(ytest, gnbpreds)

In [22]:
# Bernoulli Naive Bayes on the same count features; report test accuracy.
bnb = BernoulliNB().fit(s_xtr, ytrain)
bnbpreds = bnb.predict(s_xte)
accuracy_score(y_true=ytest, y_pred=bnbpreds)

0.006504770164787511

In [23]:
# Logistic regression baseline with default settings on the count features.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# score is available; expect a long fit with this many classes and features.
lr = LogisticRegression()
lr.fit(s_xtr, ytrain)
lrpred = lr.predict(s_xte)
# Pass (y_true, y_pred) in the same order as the other model cells.
accuracy_score(ytest, lrpred)

KeyboardInterrupt: 