In [22]:
# Load data
import pandas as pd
import numpy as np

df = pd.read_csv('lyrics.csv')
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [23]:
def mergeGenre(genre):
    if genre == "Country" or (genre) == "Folk":
        return "Country/Folk"
    if genre == "Hip-Hop" or (genre) == "R&B":
        return "Hip-Hop/R&B"
    if genre == "Rock" or (genre) == "Metal":
        return "Rock/Metal"
    return genre
    
# Clean data
df['lyrics'] = df['lyrics'].apply(lambda l: str(l)
                                  .replace('\n', ' ')
                                  .replace('.', '')
                                  .replace(',', '')
                                  .replace('!', '')
                                  .replace('?', '')
                                  .lower())

df['genre'] = df['genre'].apply(lambda row: mergeGenre(row))
df['word_count'] = df['lyrics'].str.split(' ').str.len()
df = df[df['word_count'] > 10]
df = df[df['genre'] != 'Other']
df = df[df['genre'] != 'Not Available']
df = df[df['genre'] != 'Indie']
df = df[df['song'].str.contains('remix') == False]
#df = df[df['genre'] != 'Jazz']
#df = df[df['genre'] != 'Electronic']

In [24]:
# Sample data (optional)
#df = df.sample(frac=0.5, replace=False, random_state=1)

In [25]:
# Get polarity scores of lyrics
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
df['pos_score'] = df['lyrics'].apply(lambda row: sid.polarity_scores(row)['pos'])
df['neg_score'] = df['lyrics'].apply(lambda row: sid.polarity_scores(row)['neg'])
df['neu_score'] = df['lyrics'].apply(lambda row: sid.polarity_scores(row)['neu'])
df['compound_score'] = df['lyrics'].apply(lambda row: sid.polarity_scores(row)['compound'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nickruspantini/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [26]:
# Get numerical features with Natural Language Toolkit
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lyrics = df['lyrics']
sWords = stopwords.words('english')
sWords.extend(('got', 'get', 'gets' 'getting', '2X', '2x', 'x2', 'x3', 'x4', 'x2chorus', 'chorus', 'verse', 'bridge', 'd\xe3', 'n\xe3', 'm\xe3', 'the', 'it', 'is', "it's", 'are', 'were', 'a', 'an', 'its', 'of', 'for'))

tfidfconverter = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, ngram_range = (1,1), max_features=1000, min_df=4, max_df=0.7, stop_words=sWords)
n_features = list(tfidfconverter.fit_transform(lyrics).toarray())
df['n_features'] = list(n_features).toarray())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickruspantini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Get top ranking words
features = (tfidfconverter.get_feature_names()) 
sums = n_features.sum(axis = 0) 
data = [] 
for col, term in enumerate(features): 
    data.append( (term, sums[0, col] )) 
ranking = pd.DataFrame(data, columns = ['term', 'rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
print (words.head(10))
#print tfidfconverter.get_stop_words()
#print tfidfconverter.get_feature_names()

      term          rank
499   love  10013.947153
439   know   8736.197116
471   like   8519.469945
598     oh   7001.511554
603    one   6563.517166
875   time   6502.576866
577  never   6299.593438
[u'across', u'act', u'afraid', u'ago', u'ah', u'ahead', u'ai', u'aint', u'air', u'al', u'alive', u'almost', u'alone', u'along', u'already', u'alright', u'always', u'amor', u'angel', u'angels', u'another', u'answer', u'anybody', u'anymore', u'anyone', u'anything', u'anyway', u'apart', u'arms', u'around', u'ask', u'asked', u'ass', u'auf', u'awake', u'away', u'ay', u'ba', u'babe', u'baby', u'back', u'bad', u'ball', u'band', u'bang', u'bar', u'battle', u'beat', u'beautiful', u'beauty', u'become', u'bed', u'begin', u'behind', u'believe', u'belong', u'beneath', u'beside', u'best', u'bet', u'better', u'beyond', u'big', u'bit', u'bitch', u'bitches', u'black', u'blame', u'bleed', u'blind', u'block', u'blood', u'blow', u'blue', u'blues', u'body', u'bone', u'bones', u'boom', u'born', u'bottle', u'bou

In [33]:
# Create input X
def f(row):    
    np.append(row[5], row[0])
    np.append(row[5], row[1])
    np.append(row[5], row[2])
    np.append(row[5], row[3])
    return np.append(row[5], row[4])
    return [row[0], row[]]

X = np.vstack((df['pos_score'], df['neg_score'], df['neu_score'], df['compound_score'], df['word_count'], df['n_features'])).T
X = np.array(map(f, X))

array([[0.156, 0.077, 0.767, 0.9632, 258,
        array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08475725, 0.        ,
       0.0887157 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.08182265, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04880726, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.   

In [38]:
# Train and Test
import time
startTime = time.time()

#X = df[['pos_score', 'neg_score', 'neu_score', 'compound_score', 'word_count']].values
#X = n_features
#X = df[['n_features']].tolist()

y = df['genre']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)

print "Score: ", classifier.score(X_test, y_test)
endTime = time.time()
elapsedTime = endTime - startTime
print "Time: ", elapsedTime

preds = classifier.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, preds))

('Score: ', 0.5261426592797784)
[[  270     8     3    53     2    29    73   306     1  2165]
 [   12   127     2    74     1     8    52   152     2  1021]
 [    6     1    29    13     1     3     9    36     2   294]
 [    7     9     0  3152     0     9    70   371     1  1169]
 [    5     4     0    24    28     5    17    59     0   476]
 [   30     4     5    37     3   326    33   159    10   942]
 [   19    12     2   162     5     5   760   196     6  3246]
 [  101    37    12   587     8    73   200  2051    14  4958]
 [    7     1     0    41     0    20    26    79    54   419]
 [  267    64    22   591    24   131   968  1786    29 17515]]
('Time: ', 53.86378002166748)


In [36]:
# Confusion Matrix
print sklearn.metrics.classification_report(y_test, preds)

              precision    recall  f1-score   support

     Country       0.24      0.10      0.14      2910
  Electronic       0.26      0.09      0.13      1451
        Folk       0.18      0.06      0.09       394
     Hip-Hop       0.58      0.54      0.56      4788
       Indie       0.18      0.05      0.08       618
        Jazz       0.38      0.20      0.27      1549
       Metal       0.28      0.17      0.21      4413
         Pop       0.34      0.26      0.29      8041
         R&B       0.24      0.09      0.13       647
        Rock       0.53      0.74      0.62     21397

   micro avg       0.48      0.48      0.48     46208
   macro avg       0.32      0.23      0.25     46208
weighted avg       0.43      0.48      0.44     46208

