In [146]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction import stop_words
import string
import re

In [147]:
# Load the lyrics dataset and keep a random subset of at most 100,000 rows.
# random_state pins the subset: without it every "Restart & Run All" draws
# different rows, so downstream results cannot be reproduced.
# min(...) keeps sample() from raising ValueError when the CSV holds fewer
# rows than the requested sample size.
df = pd.read_csv("lyrics.csv")
df = df.sample(n=min(100000, len(df)), random_state=42)

In [148]:
def clean_lyrics(text):
    """
    Turn one lyrics string into a list of lowercase content words.

    Punctuation, digits, carriage returns, tabs and newlines are each replaced
    by a space (rather than removed) so neighbouring words do not get glued
    together. The result is split on single spaces, pieces of two characters
    or fewer are dropped, the rest is lowercased, and anything found in
    `stop_words.ENGLISH_STOP_WORDS` is discarded.

    Returns the surviving words as a list, in their original order.
    """
    strip_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    spaced = re.sub(strip_pattern, " ", text)

    tokens = []
    for chunk in spaced.split(" "):
        # Length is checked on the original-case piece; this also removes the
        # empty strings produced by runs of consecutive spaces.
        if len(chunk) <= 2:
            continue
        lowered = chunk.lower()
        if lowered in stop_words.ENGLISH_STOP_WORDS:
            continue
        tokens.append(lowered)
    return tokens


# Keep only songs that have lyrics and whose genre is neither
# 'Not Available' nor 'Other', then add the tokenised lyrics as a new column.
# The filters are combined into one boolean mask and the result is copied so
# the column assignment below writes to an independent frame rather than to a
# slice of the previous one (which raises pandas' SettingWithCopyWarning).
has_lyrics = df['lyrics'].notnull()
usable_genre = ~df['genre'].isin(['Not Available', 'Other'])
df = df[has_lyrics & usable_genre].copy()
df['lyrics_clean'] = df['lyrics'].apply(clean_lyrics)

In [149]:
# Model input is the raw lyrics column; the prediction target is the genre.
X = df['lyrics']
y = df['genre']

In [150]:
def load_glove(filename):
    """
    Read all lines from the indicated file and return a dictionary
    mapping word:vector where vectors are of numpy `array` type.

    Each non-blank line is split on single spaces: the first field is used as
    the word and every remaining field is parsed as a float.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each word to a 1-D float64 numpy array. If a word appears on
        several lines, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field after the word cannot be parsed as a float.
    """
    gloves = {}
    # An explicit encoding keeps the result independent of the platform's
    # default codec; embedding vocabularies routinely contain non-ASCII tokens.
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of readlines(), which would hold the
        # whole (potentially very large) file in memory as a list first.
        for line in f:
            if not line.strip():
                continue  # a blank line would otherwise create a junk "\n" key
            data = line.rstrip("\n").split(" ")
            key = data.pop(0)
            # np.float was only an alias of the builtin float and was removed
            # in NumPy 1.24; np.float64 is the same dtype under a stable name.
            gloves[key] = np.array(data, dtype=np.float64)
    return gloves


In [151]:
# Load the pretrained GloVe vectors once into a module-level dict; the
# Word2Vec transformer in this notebook looks words up in `gloves` by name.
# (A `global` statement is not needed here: a cell-level assignment already
# creates a module-level name.)
gloves = load_glove("glove/glove.6B.300d.txt")

In [152]:
# Hold out 20% of the songs for evaluation.
# random_state fixes the shuffle so the same rows land in train/test on every
# run; without it the reported accuracy changes each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [153]:
# Below are the feature-extractor classes used with Pipeline and FeatureUnion:

In [154]:
class Word2Vec(BaseEstimator, TransformerMixin):
    """
    Takes in the lyrics column and outputs one embedding-based number per song.

    For every song the vectors of all words found in the module-level `gloves`
    dictionary are averaged, and the mean of that averaged vector's components
    is emitted as a single feature.
    """

    def __init__(self):
        pass

    def word2vec(self, song):
        """
        Helper code to compute the embedding feature for a single song.

        Parameters
        ----------
        song : str or iterable of str
            Either an already tokenised song (e.g. a list of words) or a raw
            lyrics string. A raw string is lowercased and split on whitespace
            first; otherwise iterating it would yield single characters
            instead of words.

        Returns
        -------
        float
            Mean component of the average vector over all words present in
            `gloves`, or 0.0 when no word of the song is known (including an
            empty song).
        """
        if isinstance(song, str):
            song = song.lower().split()

        total = 0.0  # becomes a numpy array after the first known word
        known = 0
        for word in song:
            vector = gloves.get(word)
            if vector is not None:
                total = total + vector
                known += 1

        # Decide only after the whole song has been scanned. Returning from
        # inside the loop would make the result depend on the first token
        # alone and yield None for an empty song.
        if known == 0:
            return 0.0
        return float(np.mean(total / known))

    def transform(self, X, y=None):
        """
        Transform function of this feature extractor.

        `X` must provide `.apply` (e.g. a pandas Series); the result is a
        float array of shape (n_songs, 1).
        """
        return np.array(X.apply(self.word2vec), dtype=float).reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data, so fitting just returns `self`."""
        return self

In [155]:
class Polarity(BaseEstimator, TransformerMixin):
    """Takes in the lyrics column and outputs VADER's compound polarity score"""

    def __init__(self):
        pass

    def _get_analyzer(self):
        """
        Return this transformer's SentimentIntensityAnalyzer, creating it on
        first use.

        The analyzer is built once and reused for every song instead of being
        re-created per row, which is loop-invariant work.
        """
        analyzer = getattr(self, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            self._analyzer = analyzer
        return analyzer

    def sent_score(self, song):
        """
        Helper code to compute sentiment for a single song.

        Line breaks are replaced by ", " before scoring. Returns the
        'compound' entry of the analyzer's polarity scores, or 0 when `song`
        is not a string (e.g. a missing value).
        """
        if not isinstance(song, str):
            return 0
        joined = ", ".join(song.split("\n"))
        return self._get_analyzer().polarity_scores(joined)['compound']

    def transform(self, X, y=None):
        """
        Transform function of this feature extractor.

        `X` must provide `.apply` (e.g. a pandas Series); the result has shape
        (n_songs, 1).
        """
        return np.array(X.apply(self.sent_score)).reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data, so fitting just returns `self`."""
        return self

In [156]:
class AverageWordLength(BaseEstimator, TransformerMixin):
    """Takes in the lyrics column and outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, song):
        """
        Helper code to compute average word length for a single song.

        Words are the whitespace-separated pieces of `song`. Returns 0.0 when
        `song` is not a string or contains no words: np.mean of an empty list
        is NaN (with a RuntimeWarning), and a NaN feature would make the
        downstream classifier reject the input.
        """
        if not isinstance(song, str):
            return 0.0
        lengths = [len(w) for w in song.split()]
        if not lengths:
            return 0.0
        return np.mean(lengths)

    def transform(self, X, y=None):
        """
        Transform function of this feature extractor.

        `X` must provide `.apply` (e.g. a pandas Series); the result has shape
        (n_songs, 1).
        """
        return np.array(X.apply(self.average_word_length)).reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data, so fitting just returns `self`."""
        return self

In [157]:
# features:
# Each entry is a (name, transformer) pair, the form FeatureUnion expects.
word2vec = ('word2vec', Word2Vec())
countVectorizer = ('vect', CountVectorizer(
    stop_words='english', lowercase=True, analyzer='word'))
tfidf = ('tfidf', TfidfVectorizer())
polarity = ('polarity', Polarity())
avg_len = ('avg_len', AverageWordLength())


# pipe these features into the pipeline and FeatureUnion
# Only avg_len, polarity and the word counts are combined here; the word2vec
# and tfidf entries above are defined but not part of this union.
pipeline = Pipeline([
    ('feats', FeatureUnion([avg_len, polarity, countVectorizer])),
    # random_state seeds the forest's bootstrap/feature sampling so that
    # refitting on the same data gives the same model and the same score.
    ('clf', RandomForestClassifier(
        n_estimators=200, n_jobs=-1, random_state=42))
])

In [158]:
# Fit the whole pipeline on the training split, predict the held-out split,
# and display the resulting accuracy as the cell's output.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
# accuracy_score is declared as (y_true, y_pred). Accuracy is symmetric, so
# the number is unchanged, but the documented order keeps this line safe to
# copy for metrics where the order does matter (precision, recall, ...).
accuracy_score(y_test, pred)

0.5712761091629822

In [None]:
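# With 11 genre categories, uniform random guessing would score only about 9%, so an accuracy of ~57% is a pretty good result (though it should still be compared against a majority-class baseline in case the genres are imbalanced).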