In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
import numpy as np

In [2]:
import re
import time
import operator
from tqdm import tqdm
from pprint import pprint
from textblob import TextBlob
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

'''Features'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD

In [3]:
'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

In [4]:
'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

In [5]:
stoplist = set(stopwords.words("english"))

In [6]:
df = pd.read_csv('songdata.csv')

In [7]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [8]:
df.shape

(57650, 4)

In [9]:
print('No. of unique artists =', len(df.artist.unique()))

No. of unique artists = 643


In [10]:
'''Removing unnecessary data'''
df = df.drop(['link', 'song'], axis=1)

In [11]:
def word_count(text):
    '''
    Purpose:
        To get word count of a string.
    Args:
        text: (str) - string
    Returns:
        number of words: (int) - number of words in string
    '''
    return len(str(text).split(' '))

In [12]:
def clean_text(text):
    '''
    Purpose:
        To clean text string of the punctuations, tags and special characters and digits.
    Args:
        text: (string) - uncleaned text string
    Returns:
        text: (string) - cleaned text string
    '''
    #Remove punctuations
    text = re.sub("[^a-zA-Z']", ' ', text.lower())
    #remove tags
    text=re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)
    # remove special characters and digits
#     text=re.sub("(\\d|\\W)+"," ",text)
    return text

In [13]:
def uni_gram(text_list):
    '''
    Purpose:
        To get unigram keyword's list from text of negative reviews.
    Args:
        text_list: (list) - list of cleaned reviews(string).
    Return:
        words_freq: (list) - list of keywords and occurance(normalized) of each keyword(tuple).
    '''
    d = dict()
    for text in tqdm(text_list):
        for w in text.split():
            if(w in stoplist):    continue
            d[w] = d.get(w, 0)+1
#     normalized_d = normalize(d)
#     words_freq = sort_dict(normalized_d)
    words_freq = sort_dict(d)
    return words_freq[:20]#, normalized_d

In [14]:
def avg_word_length(text):
    '''
    Purpose:
        To get average word length in a string.
    Args:
        text: (str) - string.
    Return:
        avg_word_len: (float) - value of average word length in a string.
    '''
    total_length = sum(list(map(len, text.split())))
    res = len(text.split())
    avg_word_len = total_length/res
    
    return avg_word_len

In [15]:
def sentiment_scores(sentence):
    '''
    Purpose:
        To get compound sentiment score of a string.
    Args:
        sentence: (str) - string.
    Return:
        score: (float) - value of compound sentiment score of a string.
    '''
    sid_obj = SentimentIntensityAnalyzer() 
    sentiment_dict = sid_obj.polarity_scores(sentence)
    score = sentiment_dict['compound']
    
    return score

In [16]:
def textblob_adj(text):
    '''
    Purpose:
        To get list of adjectives/adverbs, and number of adjectives/adverbs a string.
    Args:
        text: (str) - string.
    Return:
        adj_list: (list) - list of adjectives in the string.
        adv_list: (list) - list of adverbs in the string.
        count of adjectives and adverbs
    '''    
    blobed = TextBlob(text)
    counts = Counter(tag for word,tag in blobed.tags)
    adj_list = []
    adv_list = []
    adj_tag_list = ['JJ','JJR','JJS']
    adv_tag_list = ['RB','RBR','RBS']
    for (a, b) in blobed.tags:
        if b in adj_tag_list:
            adj_list.append(a)
        elif b in adv_tag_list:
            adv_list.append(a)
        else:
            pass
    return adj_list, adv_list, counts['JJ']+counts['JJR']+counts['JJS'], counts['RB']+counts['RBR']+counts['RBS']

In [17]:
'''Creating a new dataframe to use it for further analysis by using text column of original dataframe
    and calculating sentiment score/adjective count/adverb count/average word length/adjective list/
    adverb list from the text.'''
data = pd.DataFrame()
data['artist'] = df['artist']
data['text'] = df['text']
data['avg_word_len'] = ''
data['sentiment_score'] = ''
data['adj_list'] = ''
data['adv_list'] = ''
data['adj_count'] = ''
data['adv_count'] = ''

for index, row in tqdm(data.iterrows()):
    adj_list, adv_list, adj_count, adv_count = textblob_adj(row['text'])
    row['adj_list'] = adj_list
    row['adv_list'] = adv_list
    row['adj_count'] = adj_count
    row['adv_count'] = adv_count
    row['sentiment_score'] = sentiment_scores((row['text']))
    row['avg_word_len'] = avg_word_length((row['text']))

57650it [1:38:38, 29.16it/s]  


In [18]:
'''Appending the new columns like word count in the text and unique word count to the previous dataframe.'''
data['Count'] = ''
data['unique_count'] = ''

for index, row in tqdm(data.iterrows()):
    row['text'] = clean_text(row['text']).split(' ')
    row['Count'] = len(row['text'])
    row['unique_count'] = len(set(row['text']))

57650it [00:16, 3438.18it/s]


In [21]:
data.head()

Unnamed: 0,artist,text,avg_word_len,sentiment_score,adj_list,adv_list,adj_count,adv_count,Count,unique_count
0,ABBA,"[look, at, her, face, , it's, a, wonderful, fa...",3.71242,0.9587,"[wonderful, special, lucky, blue, blue]","[just, ever, just, ever, just, ever, just, ever]",5,8,206,68
1,ABBA,"[take, it, easy, with, me, , please, , , touch...",4.1,0.9877,"[easy, slow, soft, light, strong, strong, stro...","[gently, slowly, now, again, again, lightly, n...",8,16,405,70
2,ABBA,"[i'll, never, know, why, i, had, to, go, , , w...",3.47756,0.9986,"[lousy, tough, enough, ma, good, new, good, ne...","[never, n't, anymore, now, here, again, here, ...",36,34,411,111
3,ABBA,"[making, somebody, happy, is, a, question, of,...",4.825,0.9971,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, So, away, bang, always, arou...",17,15,342,88
4,ABBA,"[making, somebody, happy, is, a, question, of,...",4.96465,0.9974,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, away, bang, always, around, ...",22,13,340,90
5,ABBA,"[well, , you, hoot, and, you, holler, and, you...",3.89908,-0.9382,"[holler, mad, lousy, sick, tired, tedious, na,...","[Well, always, Now, n't, more, Once, again, no...",11,9,150,77
6,ABBA,"[down, in, the, street, they're, all, singing,...",4.24931,-0.9957,"[alive, dead, hollow, smart, last, final, slow...","[Down, alone, then, again, Now, Now, n't, real...",19,23,502,152
7,ABBA,"[chiquitita, , tell, me, what's, wrong, , , yo...",3.93421,-0.9573,"[wrong, own, sad, quiet, Your, best, sure, new...","[so, so, always, Now, together, once, again, s...",16,25,411,111
8,ABBA,"[i, was, out, with, the, morning, sun, , , cou...",3.70066,-0.1842,"[front, girl, crazy, girl, crazy, other, stupi...","[n't, hardly, never, Then, n't, ever, just, n'...",16,26,431,132
9,ABBA,"[i'm, waitin', for, you, baby, , , i'm, sittin...",3.73984,0.6904,"[waitin, cold, true, blue, cryin, waitin, sitt...","[alone, so, never, now, so, never, now, over]",19,8,178,42


In [23]:
'''Append new column of ratio of count/unique_count to dataframe.'''
data['c/uc'] = ''
for index, row in tqdm(data.iterrows()):
    row['c/uc'] = row['Count']/row['unique_count']

57650it [00:06, 9412.91it/s]


In [25]:
data.head(500)

Unnamed: 0,artist,text,avg_word_len,sentiment_score,adj_list,adv_list,adj_count,adv_count,Count,unique_count,c/uc
0,ABBA,"[look, at, her, face, , it's, a, wonderful, fa...",3.71242,0.9587,"[wonderful, special, lucky, blue, blue]","[just, ever, just, ever, just, ever, just, ever]",5,8,206,68,3.02941
1,ABBA,"[take, it, easy, with, me, , please, , , touch...",4.1,0.9877,"[easy, slow, soft, light, strong, strong, stro...","[gently, slowly, now, again, again, lightly, n...",8,16,405,70,5.78571
2,ABBA,"[i'll, never, know, why, i, had, to, go, , , w...",3.47756,0.9986,"[lousy, tough, enough, ma, good, new, good, ne...","[never, n't, anymore, now, here, again, here, ...",36,34,411,111,3.7027
3,ABBA,"[making, somebody, happy, is, a, question, of,...",4.825,0.9971,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, So, away, bang, always, arou...",17,15,342,88,3.88636
4,ABBA,"[making, somebody, happy, is, a, question, of,...",4.96465,0.9974,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, away, bang, always, around, ...",22,13,340,90,3.77778
5,ABBA,"[well, , you, hoot, and, you, holler, and, you...",3.89908,-0.9382,"[holler, mad, lousy, sick, tired, tedious, na,...","[Well, always, Now, n't, more, Once, again, no...",11,9,150,77,1.94805
6,ABBA,"[down, in, the, street, they're, all, singing,...",4.24931,-0.9957,"[alive, dead, hollow, smart, last, final, slow...","[Down, alone, then, again, Now, Now, n't, real...",19,23,502,152,3.30263
7,ABBA,"[chiquitita, , tell, me, what's, wrong, , , yo...",3.93421,-0.9573,"[wrong, own, sad, quiet, Your, best, sure, new...","[so, so, always, Now, together, once, again, s...",16,25,411,111,3.7027
8,ABBA,"[i, was, out, with, the, morning, sun, , , cou...",3.70066,-0.1842,"[front, girl, crazy, girl, crazy, other, stupi...","[n't, hardly, never, Then, n't, ever, just, n'...",16,26,431,132,3.26515
9,ABBA,"[i'm, waitin', for, you, baby, , , i'm, sittin...",3.73984,0.6904,"[waitin, cold, true, blue, cryin, waitin, sitt...","[alone, so, never, now, so, never, now, over]",19,8,178,42,4.2381


In [47]:
'''df_0: Creating new dataframe with only numeric values by dropping the columns with word lists
    df_1: creating dataframe by taking group average value for the different artists.'''
df_0 = data.drop(['text', 'adj_list', 'adv_list'], axis = 1)

df_0 = df_0.infer_objects()
print(df_0.dtypes)
df_1 = df_0.groupby('artist')['avg_word_len', 'sentiment_score', 'Count', 'unique_count', 'c/uc', 'adj_count', 'adv_count'].mean()
df_1.reset_index()

artist              object
avg_word_len       float64
sentiment_score    float64
adj_count            int64
adv_count            int64
Count                int64
unique_count         int64
c/uc               float64
dtype: object


Unnamed: 0,artist,avg_word_len,sentiment_score,Count,unique_count,c/uc,adj_count,adv_count
0,'n Sync,3.951154,0.586233,409.924731,95.688172,4.367851,14.000000,19.408602
1,ABBA,3.988681,0.530261,364.946903,92.699115,4.040352,14.327434,16.734513
2,Ace Of Base,3.917772,0.633720,365.270270,86.135135,4.617053,12.405405,16.148649
3,Adam Sandler,4.286613,0.156261,420.057143,142.028571,3.013755,18.685714,16.771429
4,Adele,3.836127,0.308248,430.370370,98.222222,4.458876,15.925926,25.851852
5,Aerosmith,4.021258,0.419040,336.356725,96.175439,3.657960,15.666667,16.807018
6,Air Supply,3.892208,0.669054,282.942529,85.350575,3.350153,9.781609,17.183908
7,Aiza Seguerra,3.996793,0.436620,328.560000,83.400000,4.128992,15.640000,15.800000
8,Alabama,4.053818,0.694949,288.165775,90.652406,3.293392,12.475936,13.925134
9,Alan Parsons Project,3.955867,0.166842,304.127451,92.460784,3.364656,11.284314,15.147059


In [84]:
df_dict = df_1.set_index('artist').T.to_dict('list')
df_dict

KeyError: 'artist'

In [41]:
df_1.loc['ABBA']

avg_word_len         3.988681
sentiment_score      0.530261
Count              364.946903
unique_count        92.699115
c/uc                 4.040352
adj_count           14.327434
adv_count           16.734513
Name: ABBA, dtype: float64

In [48]:
'''Creating copy of dataframe for further operation.'''
new = data.copy()

In [49]:
new

Unnamed: 0,artist,text,avg_word_len,sentiment_score,adj_list,adv_list,adj_count,adv_count,Count,unique_count,c/uc
0,ABBA,"[look, at, her, face, , it's, a, wonderful, fa...",3.71242,0.9587,"[wonderful, special, lucky, blue, blue]","[just, ever, just, ever, just, ever, just, ever]",5,8,206,68,3.02941
1,ABBA,"[take, it, easy, with, me, , please, , , touch...",4.1,0.9877,"[easy, slow, soft, light, strong, strong, stro...","[gently, slowly, now, again, again, lightly, n...",8,16,405,70,5.78571
2,ABBA,"[i'll, never, know, why, i, had, to, go, , , w...",3.47756,0.9986,"[lousy, tough, enough, ma, good, new, good, ne...","[never, n't, anymore, now, here, again, here, ...",36,34,411,111,3.7027
3,ABBA,"[making, somebody, happy, is, a, question, of,...",4.825,0.9971,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, So, away, bang, always, arou...",17,15,342,88,3.88636
4,ABBA,"[making, somebody, happy, is, a, question, of,...",4.96465,0.9974,"[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, away, bang, always, around, ...",22,13,340,90,3.77778
5,ABBA,"[well, , you, hoot, and, you, holler, and, you...",3.89908,-0.9382,"[holler, mad, lousy, sick, tired, tedious, na,...","[Well, always, Now, n't, more, Once, again, no...",11,9,150,77,1.94805
6,ABBA,"[down, in, the, street, they're, all, singing,...",4.24931,-0.9957,"[alive, dead, hollow, smart, last, final, slow...","[Down, alone, then, again, Now, Now, n't, real...",19,23,502,152,3.30263
7,ABBA,"[chiquitita, , tell, me, what's, wrong, , , yo...",3.93421,-0.9573,"[wrong, own, sad, quiet, Your, best, sure, new...","[so, so, always, Now, together, once, again, s...",16,25,411,111,3.7027
8,ABBA,"[i, was, out, with, the, morning, sun, , , cou...",3.70066,-0.1842,"[front, girl, crazy, girl, crazy, other, stupi...","[n't, hardly, never, Then, n't, ever, just, n'...",16,26,431,132,3.26515
9,ABBA,"[i'm, waitin', for, you, baby, , , i'm, sittin...",3.73984,0.6904,"[waitin, cold, true, blue, cryin, waitin, sitt...","[alone, so, never, now, so, never, now, over]",19,8,178,42,4.2381


In [50]:
'''new_text: (dataframe) - dataframe with only word list as columns'''
new_text = new[['artist', 'text', 'adj_list', 'adv_list']]

In [51]:
new_text

Unnamed: 0,artist,text,adj_list,adv_list
0,ABBA,"[look, at, her, face, , it's, a, wonderful, fa...","[wonderful, special, lucky, blue, blue]","[just, ever, just, ever, just, ever, just, ever]"
1,ABBA,"[take, it, easy, with, me, , please, , , touch...","[easy, slow, soft, light, strong, strong, stro...","[gently, slowly, now, again, again, lightly, n..."
2,ABBA,"[i'll, never, know, why, i, had, to, go, , , w...","[lousy, tough, enough, ma, good, new, good, ne...","[never, n't, anymore, now, here, again, here, ..."
3,ABBA,"[making, somebody, happy, is, a, question, of,...","[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, So, away, bang, always, arou..."
4,ABBA,"[making, somebody, happy, is, a, question, of,...","[happy, little, much, tender, boom-a-boomerang...","[so, n't, so, so, away, bang, always, around, ..."
5,ABBA,"[well, , you, hoot, and, you, holler, and, you...","[holler, mad, lousy, sick, tired, tedious, na,...","[Well, always, Now, n't, more, Once, again, no..."
6,ABBA,"[down, in, the, street, they're, all, singing,...","[alive, dead, hollow, smart, last, final, slow...","[Down, alone, then, again, Now, Now, n't, real..."
7,ABBA,"[chiquitita, , tell, me, what's, wrong, , , yo...","[wrong, own, sad, quiet, Your, best, sure, new...","[so, so, always, Now, together, once, again, s..."
8,ABBA,"[i, was, out, with, the, morning, sun, , , cou...","[front, girl, crazy, girl, crazy, other, stupi...","[n't, hardly, never, Then, n't, ever, just, n'..."
9,ABBA,"[i'm, waitin', for, you, baby, , , i'm, sittin...","[waitin, cold, true, blue, cryin, waitin, sitt...","[alone, so, never, now, so, never, now, over]"


In [52]:
'''keeping only unique words in the columns of word list.'''
for index, row in tqdm(new_text.iterrows()):
    row['text'] = list(set(row['text']))
    row['adj_list'] = list(set(row['adj_list']))
    row['adv_list'] = list(set(row['adv_list']))

57650it [00:11, 5198.81it/s]


In [53]:
new_text

Unnamed: 0,artist,text,adj_list,adv_list
0,ABBA,"[, we, a, can, look, at, i'm, and, be, wonderf...","[special, wonderful, blue, lucky]","[just, ever]"
1,ABBA,"[, slow, andante, a, play, summer, night, down...","[slow, soft, strong, light, easy]","[lightly, n't, slowly, again, gently, now, away]"
2,ABBA,"[, too, mistake, entitled, gotta, enough, used...","[ta, good, dumb, new, such, ma, enough, thank,...","[As, anymore, never, together, too, n't, alway..."
3,ABBA,"[, de, mean, take, i'll, to, on, boom, selfish...","[good, such, much, Dum-be-dum-dum, boom-a-boom...","[around, n't, always, love, so, anywhere, away..."
4,ABBA,"[, de, mean, take, i'll, to, on, boom, selfish...","[good, such, much, boom-a-boomerang, love, ten...","[around, n't, always, love, so, away, bang, ev..."
5,ABBA,"[, will, a, can, mad, pride, walkin', again, l...","[past, mad, last, na, free, lousy, sick, tired...","[Well, always, n't, Once, again, here, Now, no..."
6,ABBA,"[, sunrise, their, watched, us, sails, some, h...","[slow, final, sure, last, dead, blue, alive, c...","[then, Down, only, almost, n't, else, alone, a..."
7,ABBA,"[, patch, too, hate, sky, still, tomorrow, sad...","[own, sure, Your, new, sad, best, quiet, candl...","[once, down, together, too, always, again, so,..."
8,ABBA,"[, leave, everything, his, long, take, this, t...","[crazy, long, front, stupid, other, girl, nice]","[never, only, So, n't, blind, back, so, not, n..."
9,ABBA,"[, we, baby, cryin', over, i'm, leave, used, t...","[sittin, waitin, Little, blue, cold, true, cryin]","[never, over, alone, so, now]"


In [57]:
#Turning the labels into numbers (The names of artists are labels)
LE = LabelEncoder()
new_text['label_num'] = LE.fit_transform(df['artist'])
display(new_text['label_num'].unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29, 491,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 163, 179, 18

In [62]:
#Creating the features (tf-idf weights) for the processed text

texts = new_text['text'].astype('str')
adj = new_text['adj_list'].astype('str')
adv = new_text['adv_list'].astype('str')

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   min_df = 2, 
                                   max_df = .95)

X1 = tfidf_vectorizer.fit_transform(texts) #features
X2 = tfidf_vectorizer.fit_transform(adj)
X3 = tfidf_vectorizer.fit_transform(adv)

y = new_text['label_num'].values #target

print (X1.shape)
print(X2.shape)
print(X3.shape)
print(y.shape)

(57650, 365371)
(57650, 50834)
(57650, 14571)
(57650,)


In [63]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=100, 
                   n_iter=10, 
                   random_state=3)

X1 = lsa.fit_transform(X1)
X2 = lsa.fit_transform(X2)
X3 = lsa.fit_transform(X3)

print(X1.shape)
print(X2.shape)
print(X3.shape)

(57650, 100)
(57650, 100)
(57650, 100)


In [65]:
type(X1)

numpy.ndarray

In [72]:
X4 = np.concatenate((X1, X2, X3), axis = 1)

In [73]:
X4.shape

(57650, 300)

In [74]:
X5 = new[['avg_word_len', 'sentiment_score', 'adj_count', 'adv_count', 'Count', 'unique_count', 'c/uc']]

In [75]:
X5.shape

(57650, 7)

In [76]:
type(X5)

pandas.core.frame.DataFrame

In [77]:
X = np.concatenate((X4, X5), axis = 1)

In [78]:
type(X)

numpy.ndarray

In [87]:
X.shape

(57650, 307)

In [80]:
#Preliminary model evaluation using default parameters

#Creating a dict of the models
model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decsision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier()}

#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    shuffle = True, 
                                                    random_state = 42)

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, w

Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
5,Gaussian Naive Bayes,0.0262504,0.0289652,0.0358212,0.0258103
3,Decsision Tree,0.0185025,0.0210927,0.0209848,0.0196295
2,Random Forest,0.0168835,0.0307301,0.0183859,0.0180501
6,K Nearest Neighbor,0.00526164,0.00455936,0.0046643,0.00382553
0,Dummy,0.00225499,0.00192988,0.00194564,0.00188474
1,Stochastic Gradient Descent,0.00341139,0.000249148,0.00307373,0.000361591
4,AdaBoost,0.00462561,9.19429e-05,0.0029512,0.000173617
