# Project: Lyrics Classifier

### Goal
* Scrape lyrics from lyrics.com and save them on hard drive
* Load lyrics into corpus
* Create vectors for bag-of-words approach
* Train models on lyrics
* Predict artist of song

### Import libraries

In [1]:
from bs4 import BeautifulSoup as soup
import os

### Load corpus

In [2]:
# other artists: "Peaches", "Barbra Streisand", "Britney Spears", "The Velvet Underground"
ARTISTS = [
    "Peaches",
    "Britney Spears",
    "Barbra Streisand",
    "Pavement",
    "Neutral Milk Hotel",
    "Sonic Youth",
    "Stephen Malkmus",
    "The Velvet Underground"
]  


def create_lyrics_corpus(artists):
    """loads song texts from files and stores lyrics and artist index in seperate lists"""
    complete_lyrics = []
    indices = []
    for i, artist in enumerate(ARTISTS):
        directory = f"lyrics/{artist.lower().replace(' ', '-')}-lyrics"
        allfiles = os.listdir(directory)
        all_lyrics = []
        for file in allfiles:
            with open(directory + "/" + file, "r", encoding="utf-8") as f:
                song_lyrics = f.read()
                all_lyrics.append(song_lyrics)
        indices += [i] * len(all_lyrics)
        print(artist, len(all_lyrics))
        complete_lyrics += all_lyrics
    return complete_lyrics, indices

In [3]:
# Store lists into variables, print out number of songs by artist
complete_lyrics, indices = create_lyrics_corpus(ARTISTS)

Peaches 67
Britney Spears 152
Barbra Streisand 428
Pavement 79
Neutral Milk Hotel 22
Sonic Youth 195
Stephen Malkmus 29
The Velvet Underground 77


### Create vectors

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def vectors_and_df(complete_lyrics, indices):
    """creates vectors for songs and returns dataframe with songs as word vectors by all artists"""
    cv = TfidfVectorizer(stop_words="english")
    cv.fit(complete_lyrics)
    corpus_vecs = cv.transform(complete_lyrics)
    return pd.DataFrame(corpus_vecs.todense(), index=indices, columns=cv.get_feature_names()), cv


In [5]:
# Store results into dataframe, keep cv for later prediction
df, cv = vectors_and_df(complete_lyrics, indices)
df

Unnamed: 0,002,03,0h,10,100,12,125,14th,15,150,...,électrique,étaient,être,הו,יו,ימ,ירו,נו,עו,צו
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Define features and target column
X = df
y = df.index

### Train test split

In [7]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

### Train models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

models_params = {
    "MultinomialNB": {},
    "CategoricalNB": {},
    "RandomForestClassifier": {
        "n_estimators": 500,
        "max_depth": 200,
        "max_features": "auto",
        "n_jobs": -1,
        "random_state": 1,
    },
    "LogisticRegression": {"C": 1e6},
}

def train_models(models_params):
    """trains models on corpus and returns dataframe with scores"""
    scores = {}
    for model in models_params:
        if model == "LogisticRegression":
            m = LogisticRegression(**models_params[model])
        elif model == "RandomForestClassifier":
            m = RandomForestClassifier(**models_params[model])
        elif model == "MultinomialNB":
            m = MultinomialNB(**models_params[model])
        elif model == "CategoricalNB":
            m = MultinomialNB(**models_params[model])

        m.fit(Xtrain, ytrain)
        score_train = m.score(Xtrain, ytrain)
        score_test = m.score(Xtest, ytest)
        scores[f"{model}"] = {
            "params": models_params[model],
            "train score": score_train,
            "test score": score_test,
        }
    return pd.DataFrame(scores).T

In [9]:
df_scores = train_models(models_params)
df_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,params,train score,test score
MultinomialNB,{},0.460072,0.414286
CategoricalNB,{},0.460072,0.414286
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",0.998808,0.533333
LogisticRegression,{'C': 1000000.0},0.998808,0.528571


### Train on full data set

In [10]:
# Train on most promising model
model = "LogisticRegression"
m = LogisticRegression(**models_params[model])
m.fit(X, y)
m.score(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9990467111534795

In [11]:
# Input Elton John's "Can you feel the love tonight" for prediction
new_song = [
    """
    There's a calm surrender
    To the rush of day
    When the heat of a rolling wave
    Can be turned away
    An enchanted moment
    And it sees me through
    It's enough for this restless warrior
    Just to be with you
    And can you feel the love tonight?
    It is where we are
    It's enough for this wide-eyed wanderer
    That we've got this far
    And can you feel the love tonight? (Tonight)
    How it's laid to rest?
    It's enough to make kings and vagabonds
    Believe the very best
    There's a time for everyone
    If they only learn
    That the twisting kaleidoscope
    Moves us all in turn
    There's a rhyme and reason
    To the wild outdoors
    When the heart of this star-crossed voyager
    Beats in time with yours
    And can you feel the love tonight? (Tonight)
    It is where we are
    It's enough for this wide-eyed wanderer
    That we've got this far
    And can you feel the love tonight? (Tonight)
    How it's laid to rest?
    It's enough to make kings and vagabonds
    Believe the very best
    It's enough to make kings and vagabonds
    Believe the very best
    """
]

In [14]:
def predict_artist(song_lyrics):
    """predicts artist of song based on artists in corpus"""
    # transform song into vector matrix
    new_song_vecs = cv.transform(new_song)
    ynew = new_song_vecs.todense()
    
    print(f"This classifier predicts the song to be written by:\n")
    for i, artist in enumerate(ARTISTS):
        print(f"{artist}: {round(m.predict_proba(ynew)[0][i], 3) * 100}%.")
    song_pred = m.predict(ynew)[0]
    confidence = m.predict_proba(ynew).max()
    if confidence > 0.9:
        confidence_word = "definitely"
    elif confidence > 0.7:
        confidence_word = "probably"
    else:
        confidence_word = "maybe"
    print(f"\nThis song is {confidence_word} by {ARTISTS[song_pred]}!")

In [15]:
predict_artist(new_song)

This classifier predicts the song to be written by:

Peaches: 0.0%.
Britney Spears: 43.1%.
Barbra Streisand: 56.89999999999999%.
Pavement: 0.0%.
Neutral Milk Hotel: 0.0%.
Sonic Youth: 0.0%.
Stephen Malkmus: 0.0%.
The Velvet Underground: 0.0%.

This song is maybe by Barbra Streisand!
