In [45]:
import pandas as pd
import numpy as np
import time

df = pd.read_pickle("./data.pkl")

In [49]:
# Get numerical features with Natural Language Toolkit
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lyrics = df['lyrics']
sWords = stopwords.words('english')
sWords.extend(('got', 'get', 'gets' 'getting', '2X', '2x', 'x2', 'x3', 'x4', 'x2chorus', 'chorus', 'verse', 'bridge', 'd\xe3', 'n\xe3', 'm\xe3', 'the', 'it', 'is', "it's", 'are', 'were', 'a', 'an', 'its', 'of', 'for'))

tfidfconverter = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, ngram_range = (1,2), max_features=1000, min_df=3, max_df=0.7, stop_words=sWords)
n_features = tfidfconverter.fit_transform(lyrics)
df['n_features'] = list(n_features.toarray())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickruspantini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
# Get top ranking words
features = (tfidfconverter.get_feature_names()) 
sums = n_features.sum(axis = 0) 
data = [] 
for col, term in enumerate(features): 
    data.append( (term, sums[0, col] )) 
ranking = pd.DataFrame(data, columns = ['term', 'rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
print (words.head(10))
#print tfidfconverter.get_stop_words()
#print tfidfconverter.get_feature_names()

      term         rank
507   love  9603.486619
438   know  8380.652668
475   like  8303.497004
607     oh  6499.606956
615    one  6374.867464
874   time  6301.769009
585  never  6137.866518
731    see  6061.003503
327     go  5793.165852
933   want  5537.147598


Select an input for X below to test with classification models

In [56]:
# X: Bag of words model with polarity scores, word count
def f(row):    
    np.append(row[5], row[0])
    np.append(row[5], row[1])
    np.append(row[5], row[2])
    np.append(row[5], row[3])
    return np.append(row[5], row[4])

X = np.vstack((df['pos_score'], df['neg_score'], df['neu_score'], df['compound_score'], df['word_count'], df['n_features'])).T
X = np.array(map(f, X))

In [None]:
# X: Bag of words model
X = n_features

In [62]:
# X: Polarity scores, word count
X = df[['pos_score', 'neg_score', 'neu_score', 'compound_score', 'word_count']].values

In [60]:
# X: Polarity scores
X = df[['pos_score', 'neg_score', 'neu_score', 'compound_score']].values

In [63]:
# TEST: Random Forest vs Naive Bayes vs K Nearest Neighbors vs Decision Tree

y = df['genre']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)


# Random Forest
from sklearn.ensemble import RandomForestClassifier
RFclassifier = RandomForestClassifier(n_estimators=100, random_state=0)
startTime = time.time()
RFclassifier.fit(X_train, y_train)
print "\nRandom Forest Score: ", RFclassifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = RFclassifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


# Naive Bayes
from sklearn.naive_bayes import GaussianNB
NBclassifier = GaussianNB()
startTime = time.time()
NBclassifier.fit(X_train, y_train)
print "\nNaive Bayes Score: ", NBclassifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = NBclassifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


# K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
KNclassifier = KNeighborsClassifier(n_neighbors=3)
startTime = time.time()
KNclassifier.fit(X_train, y_train)
print "\nK Nearest Neighbor Score: ", KNclassifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = KNclassifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


# Decision Tree
from sklearn.tree import DecisionTreeClassifier
DTclassifier = DecisionTreeClassifier(random_state=0)
startTime = time.time()
DTclassifier.fit(X_train, y_train)
print "\nDecision Tree Score: ", DTclassifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = DTclassifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


Random Forest Score:  0.6352110204439765
Time Elapsed:  44.796710968

Confusion Matrix:
[[  292     9    57    37   230  2681]
 [    6   119    95     7   134  1068]
 [   15     8  3292    17   454  1807]
 [   37     6    56   306   137   999]
 [   91    44   557    71  1850  5274]
 [  247    65   717   133  1571 23099]]

Classification Report:
              precision    recall  f1-score   support

Country/Folk       0.42      0.09      0.15      3306
  Electronic       0.47      0.08      0.14      1429
 Hip-Hop/R&B       0.69      0.59      0.64      5593
        Jazz       0.54      0.20      0.29      1541
         Pop       0.42      0.23      0.30      7887
  Rock/Metal       0.66      0.89      0.76     25832

   micro avg       0.64      0.64      0.64     45588
   macro avg       0.53      0.35      0.38     45588
weighted avg       0.60      0.64      0.59     45588


Naive Bayes Score:  0.6116083179784154
Time Elapsed:  0.299283027649

Confusion Matrix:
[[    0     2    35 

In [65]:
# TEST: Bag of words vs Sentiment Analyis vs Word Count (Uses Random Forest)

X1 = n_features
X2 = df[['pos_score', 'neg_score', 'neu_score', 'compound_score']].values
X3 = df[['word_count']].values
y = df['genre']
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
from sklearn.model_selection import train_test_split


# X1: Bag of Words
print "\nBag of Words Model:\n"
X_train, X_test, y_train, y_test = train_test_split(X1, y, train_size=0.8, test_size=0.2, random_state=1)
startTime = time.time()
classifier.fit(X_train, y_train)
print "Score: ", classifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = classifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


# X2: Sentiment Analysis
print "\nSentiment Analysis:\n"
X_train, X_test, y_train, y_test = train_test_split(X2, y, train_size=0.8, test_size=0.2, random_state=1)
startTime = time.time()
classifier.fit(X_train, y_train)
print "Score: ", classifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = classifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


# X3: Word Count
print "\nWord Count:\n"
X_train, X_test, y_train, y_test = train_test_split(X3, y, train_size=0.8, test_size=0.2, random_state=1)
startTime = time.time()
classifier.fit(X_train, y_train)
print "Score: ", classifier.score(X_test, y_test)
print "Time Elapsed: ", time.time() - startTime
preds = classifier.predict(X_test)
print "\nConfusion Matrix:"
print(sklearn.metrics.confusion_matrix(y_test, preds))
print "\nClassification Report:"
print sklearn.metrics.classification_report(y_test, preds)


Bag of Words Model:

Score:  0.6964332719136614
Time Elapsed:  4555.94374895

Confusion Matrix:
[[  435     0    15    31    76  2749]
 [    2   148    67     0    76  1136]
 [    6     1  3857    19   263  1447]
 [   25     2    21   360   134   999]
 [   39    13   274    56  2166  5339]
 [  104    19   251    70   605 24783]]

Classification Report:
              precision    recall  f1-score   support

Country/Folk       0.71      0.13      0.22      3306
  Electronic       0.81      0.10      0.18      1429
 Hip-Hop/R&B       0.86      0.69      0.77      5593
        Jazz       0.67      0.23      0.35      1541
         Pop       0.65      0.27      0.39      7887
  Rock/Metal       0.68      0.96      0.80     25832

   micro avg       0.70      0.70      0.70     45588
   macro avg       0.73      0.40      0.45     45588
weighted avg       0.70      0.70      0.65     45588


Sentiment Analysis:

Score:  0.5943450030709836
Time Elapsed:  43.628526926

Confusion Matrix:
[[  2