In [417]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [418]:
# Load the raw lyrics table and keep a working sample of at most 100000 rows.
df = pd.read_csv("lyrics.csv")
# Cap n at the table size so a smaller file does not make sample() raise, and
# fix random_state so every Restart & Run All trains on the same rows.
df = df.sample(n=min(100000, len(df)), random_state=42)
# Drop songs without lyrics and songs whose genre is a placeholder label.
df = df[pd.notnull(df['lyrics'])]
df = df[df['genre'] != 'Not Available']
df = df[df['genre'] != 'Other']
# NOTE: newlines inside the lyrics are left untouched here; replacing them
# up front with df.replace({'\n': ' '}, regex=True) is still an open option.

In [419]:
# Model input is the raw lyrics text; the prediction target is the genre label.
X = df['lyrics']
y = df['genre']

In [420]:
# Hold out 20% of the songs for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# These are the feature-extractor classes for the Pipeline and FeatureUnion:

In [421]:
class Polarity(BaseEstimator, TransformerMixin):
    """Feature extractor that turns each song's lyrics into one polarity score.

    The score is the 'compound' value returned by
    SentimentIntensityAnalyzer.polarity_scores. The transformer has no
    hyper-parameters and learns nothing during fit.
    """

    def __init__(self):
        pass

    def _get_analyzer(self):
        """Return this instance's analyzer, building it on first use.

        The analyzer is created lazily (not in __init__) and cached so that
        scoring many songs reuses one object instead of repeating the
        analyzer's setup work for every row.
        """
        analyzer = getattr(self, '_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            self._analyzer = analyzer
        return analyzer

    def sent_score(self, song):
        """Compute the compound sentiment score of a single song.

        Parameters
        ----------
        song : str or any
            Lyrics text. Line breaks are replaced with ", " before scoring.

        Returns
        -------
        float
            The compound score, or 0.0 when `song` is not a string
            (for example a missing value).
        """
        if not isinstance(song, str):
            return 0.0
        text = ", ".join(song.split("\n"))
        return self._get_analyzer().polarity_scores(text)['compound']

    def transform(self, X, y=None):
        """Score every song in X.

        Parameters
        ----------
        X : iterable of str
            Lyrics, e.g. a pandas Series, list or 1-D array.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Float array of shape (n_songs, 1), one column so it can be
            stacked with other features in a FeatureUnion.
        """
        scores = [self.sent_score(song) for song in X]
        return np.array(scores, dtype=float).reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing to learn; returns `self` so the transformer fits in a Pipeline."""
        return self

In [422]:
class AverageWordLength(BaseEstimator, TransformerMixin):
    """Feature extractor that turns each song's lyrics into its average word length.

    Words are the whitespace-separated tokens of the lyrics. The transformer
    has no hyper-parameters and learns nothing during fit.
    """

    def __init__(self):
        pass

    def average_word_length(self, song):
        """Compute the mean token length of a single song.

        Parameters
        ----------
        song : str or any
            Lyrics text.

        Returns
        -------
        float
            Mean length of the whitespace-separated tokens, or 0.0 when
            `song` is not a string or contains no tokens.
        """
        # Non-string input (e.g. a missing value) gets the same neutral
        # fallback the Polarity extractor uses instead of crashing on .split().
        if not isinstance(song, str):
            return 0.0
        words = song.split()
        # np.mean of an empty list is NaN (with a RuntimeWarning); a NaN
        # feature would then be handed to the downstream classifier.
        if not words:
            return 0.0
        return np.mean([len(w) for w in words])

    def transform(self, X, y=None):
        """Compute the average word length of every song in X.

        Parameters
        ----------
        X : iterable of str
            Lyrics, e.g. a pandas Series, list or 1-D array.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Float array of shape (n_songs, 1), one column so it can be
            stacked with other features in a FeatureUnion.
        """
        lengths = [self.average_word_length(song) for song in X]
        return np.array(lengths, dtype=float).reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing to learn; returns `self` so the transformer fits in a Pipeline."""
        return self

In [423]:
# Feature extractors, each as a (name, transformer) pair as FeatureUnion expects.
countVectorizer = ('vect', CountVectorizer(stop_words='english', lowercase=True, analyzer = 'word'))
tfidf = ('tfidf', TfidfVectorizer())  # defined for experiments; not part of the union below
polarity = ('polarity', Polarity())
avg_len = ('avg_len', AverageWordLength())

# Combine the extractors side by side with FeatureUnion, then feed the
# stacked feature matrix to the classifier.
pipeline = Pipeline([
    ('feats', FeatureUnion([avg_len, polarity, countVectorizer])),
    # n_estimators is the number of trees in the forest; random_state makes
    # the fitted model (and therefore the reported accuracy) reproducible.
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [424]:
# Train on the training split, then predict genres for the held-out split.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# value is unchanged, but the documented order keeps this line correct if it
# is later swapped for an asymmetric metric such as precision or recall.
accuracy_score(y_test, pred)

0.5753959146201515

# TO DO NEXT:
Try to eliminate '\n' in the dataframe up front and change the feature-extractor functions accordingly.

For this, I think we can use `df = df.replace({'\n': ' '}, regex=True)`.

Add some other features, like unique words or tf-idf.

Try other algorithms

try feature selection to improve scores

How can we use the full data set? Spark, maybe?