In [None]:
import pandas as pd
import numpy as np

In [39]:
# Load the raw lyrics table from CSV. Only needed while no pickled frame
# has been written yet.
df = pd.read_csv(filepath_or_buffer='lyrics.csv')
# Bare last expression: the notebook renders the first five rows.
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [40]:
# Clean data (if no pickle)
def mergeGenre(genre):
    """Collapse closely related genres into one combined label.

    "Country" and "Folk" map to "Country/Folk", "Hip-Hop" and "R&B" map to
    "Hip-Hop/R&B", and "Rock" and "Metal" map to "Rock/Metal". Every other
    value (including non-string values) is returned unchanged.
    """
    merged_groups = (
        (("Country", "Folk"), "Country/Folk"),
        (("Hip-Hop", "R&B"), "Hip-Hop/R&B"),
        (("Rock", "Metal"), "Rock/Metal"),
    )
    for members, combined_label in merged_groups:
        # Plain equality test, so any value type can be passed in safely.
        if any(genre == member for member in members):
            return combined_label
    return genre
    
# Normalise the lyrics text: cast to str (a missing value becomes the text
# 'nan'), turn newlines into spaces, drop basic punctuation and lower-case.
df['lyrics'] = df['lyrics'].apply(lambda l: str(l)
                                  .replace('\n', ' ')
                                  .replace('.', '')
                                  .replace(',', '')
                                  .replace('!', '')
                                  .replace('?', '')
                                  .lower())

# Fold related genres together (Country/Folk, Hip-Hop/R&B, Rock/Metal).
df['genre'] = df['genre'].apply(mergeGenre)

# Count words by splitting on runs of whitespace. Splitting on a single ' '
# yields an empty token wherever two spaces are adjacent (e.g. every blank
# line that was turned into spaces above), which inflates the count.
df['word_count'] = df['lyrics'].str.split().str.len()

# Build one row mask instead of re-slicing the frame once per condition.
excluded_genres = ['Other', 'Not Available', 'Indie']
keep_rows = (
    (df['word_count'] > 10)
    & ~df['genre'].isin(excluded_genres)
    # na=True treats a missing song title like a remix, so such rows are
    # dropped -- the same outcome as comparing the match result to False.
    & ~df['song'].str.contains('remix', na=True)
)
# .copy() gives an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df = df[keep_rows].copy()
#df = df[df['genre'] != 'Jazz']
#df = df[df['genre'] != 'Electronic']

# Sample data (optional)
#df = df.sample(frac=0.5, replace=False, random_state=1)

In [25]:
# Get polarity scores of lyrics (if no pickle)
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Score every lyric exactly once. Each polarity_scores() result is a dict
# that is indexed below by 'pos', 'neg', 'neu' and 'compound'; running the
# analyzer separately for each of the four columns would repeat the same
# work four times per song.
polarity = [sid.polarity_scores(text) for text in df['lyrics']]
for key in ('pos', 'neg', 'neu', 'compound'):
    # Lists are assigned positionally, one value per row of df.
    df[key + '_score'] = [scores[key] for scores in polarity]

# Cache the cleaned, scored frame so the cells above can be skipped next time.
df.to_pickle("./data.pkl")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nickruspantini/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Reload the cleaned, scored frame from the pickle cache.
df = pd.read_pickle("./data.pkl")

# Get numerical features: TF-IDF (scikit-learn) using NLTK's English stop
# words plus a few corpus-specific extras.
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lyrics = df['lyrics']
sWords = stopwords.words('english')
# Every entry needs its own comma: adjacent string literals are concatenated
# by Python, so 'gets' 'getting' would become the single (useless) stop word
# 'getsgetting' and neither word would be filtered.
sWords.extend((
    'got', 'get', 'gets', 'getting',
    '2X', '2x', 'x2', 'x3', 'x4', 'x2chorus',
    'chorus', 'verse', 'bridge',
    'd\xe3', 'n\xe3', 'm\xe3',
    'the', 'it', 'is', "it's", 'are', 'were', 'a', 'an', 'its', 'of', 'for',
))

# Unigrams and bigrams, capped at the 1000 top features; terms must appear in
# at least 3 documents and in at most 70% of them.
tfidfconverter = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, ngram_range = (1,2), max_features=1000, min_df=3, max_df=0.7, stop_words=sWords)
n_features = tfidfconverter.fit_transform(lyrics)
# Store one dense TF-IDF vector per song alongside the other columns.
df['n_features'] = list(n_features.toarray())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickruspantini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Get top ranking words
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidfconverter, 'get_feature_names_out'):
    features = list(tfidfconverter.get_feature_names_out())
else:
    features = tfidfconverter.get_feature_names()
# Total TF-IDF weight of each term across all documents (column sums).
sums = n_features.sum(axis = 0)
# Flatten the column sums to 1-D so each total pairs with its term.
data = list(zip(features, np.asarray(sums).ravel()))
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head(10))
#print tfidfconverter.get_stop_words()
#print tfidfconverter.get_feature_names()

In [None]:
# X: Bag of words model with polarity scores, word count
def f(row):
    """Flatten one stacked feature row into a single 1-D numeric vector.

    Parameters
    ----------
    row : sequence of length 6
        row[0] .. row[4] are scalar features and row[5] is an array-like
        vector (the caller stacks polarity scores and word count in the
        first five slots and the TF-IDF vector in the last one).

    Returns
    -------
    numpy.ndarray
        The values of row[5] followed by row[0], row[1], row[2], row[3]
        and row[4], as floats.
    """
    # np.append / np.concatenate return a NEW array and never modify their
    # input, so all scalars are joined in one call and the result returned.
    vector_part = np.ravel(np.asarray(row[5], dtype=float))
    scalar_part = np.asarray([row[0], row[1], row[2], row[3], row[4]], dtype=float)
    return np.concatenate((vector_part, scalar_part))

# Stack the five scalar columns and the per-song TF-IDF vectors, then
# transpose so each row is one song:
# [pos_score, neg_score, neu_score, compound_score, word_count, n_features].
X = np.vstack((df['pos_score'], df['neg_score'], df['neu_score'], df['compound_score'], df['word_count'], df['n_features'])).T
# Flatten every row with f. A list comprehension is used because np.array()
# does not expand a bare map() iterator on Python 3 (it would wrap the
# iterator object in a 0-d array instead of building the matrix).
X = np.array([f(row) for row in X])

In [None]:
# X: Bag of words model
# Use the TF-IDF matrix returned by tfidfconverter.fit_transform as the only
# feature set. Several cells assign X; the one run last before training
# decides which features the classifier sees.
X = n_features

In [None]:
# X: Polarity scores, word count
# Feature matrix made only of the four polarity columns and the word count,
# one row per song.
X = df.loc[:, [
    'pos_score',
    'neg_score',
    'neu_score',
    'compound_score',
    'word_count',
]].values

In [None]:
# Train and Test
# Imports come first so that module loading is not counted in the timing.
# sklearn.metrics is imported explicitly: a plain `import sklearn` does not
# guarantee that the metrics submodule is loaded.
import time
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

startTime = time.time()

# Target labels; X must have been set by one of the feature cells above.
y = df['genre']
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)

# print(...) with one pre-formatted string behaves the same on Python 2 and
# Python 3; the "%s %s" form keeps the label/value spacing of the Python 2
# statement `print "Score: ", value`.
print("%s %s" % ("Score: ", classifier.score(X_test, y_test)))
endTime = time.time()
elapsedTime = endTime - startTime
print("%s %s" % ("Time: ", elapsedTime))

# Predictions on the held-out set, reused by the report cell below.
preds = classifier.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, preds))

In [None]:
# Classification report for the held-out predictions (the confusion matrix
# itself is printed by the training cell).
# Import the submodule explicitly so this cell does not rely on another
# import having loaded sklearn.metrics as a side effect.
import sklearn.metrics
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(sklearn.metrics.classification_report(y_test, preds))