# Project: Lyrics Classifier

### Goal
* Scrape lyrics from lyrics.com and save them on hard drive
* Load lyrics into corpus
* Create vectors for bag-of-words approach
* Train models on lyrics
* Predict artist of song

### Import libraries

In [1]:
from bs4 import BeautifulSoup as soup
import os
import spacy

### Load corpus

In [2]:
lang_model = spacy.load("en_core_web_md")

In [23]:
# other artists: "Peaches", "Barbra Streisand", "Britney Spears", "The Velvet Underground"
ARTISTS = [
    "Peaches",
    "Barbra Streisand",
    "Britney Spears",
]  


def clean_my_song(song, model):
    # parse the song through the spacy model
    tokenised_song = model(song)
    clean_song = ""
    # loop through words, drop stop words
    for word in tokenised_song:
        if not word.is_stop:
            clean_song += word.lemma_ + " "
    # return the lemmatized version to the call
    return clean_song.strip()


def create_lyrics_corpus(artists):
    """loads song texts from files and stores lyrics and artist index in seperate lists"""
    complete_lyrics = []
    indices = []
    for i, artist in enumerate(ARTISTS):
        directory = f"lyrics/{artist.lower().replace(' ', '-')}-lyrics"
        allfiles = os.listdir(directory)
        all_lyrics = []
        for file in allfiles:
            with open(directory + "/" + file, "r", encoding="utf-8") as f:
                song_lyrics = f.read()
                all_lyrics.append(song_lyrics)
                all_lyrics.append(clean_my_song(song_lyrics, lang_model))
        indices += [i] * len(all_lyrics)
        print(artist, len(all_lyrics))
        complete_lyrics += all_lyrics
    return complete_lyrics, indices

In [24]:
# Store lists into variables, print out number of songs by artist
complete_lyrics, indices = create_lyrics_corpus(ARTISTS)

Peaches 67
Barbra Streisand 428
Britney Spears 152


### Save to csv (optional)

In [25]:
import pandas as pd

df = pd.DataFrame(data=complete_lyrics, index=indices)

In [7]:
df.to_csv('songs.csv', sep=';')

In [8]:
pd.read_csv('songs.csv', sep=';', index_col=0)

Unnamed: 0,0
0,people watch scrub floor \n scrubbin ' floor g...
0,Exuma \n hear \n got to time \n Gettin ' start...
0,evening moonlight \n mother finish work \n sit...
0,love funny sad \n quiet mad \n good thing bad ...
0,"be get home , be get shoe \n be get money , be..."
...,...
3,"touch , taste , breath , face \n hand , head ,..."
3,"oh , love \n oh , yeah , yeah \n oh , yeah \n\..."
3,"ooh hey , yeah \n\n hush , stop \n , baby \n \..."
3,"yeah , smash radio bet , pen ! \n Britney ( br..."


### Create vectors

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def vectors_and_df(complete_lyrics, indices):
    """creates vectors for songs and returns dataframe with songs as word vectors by all artists"""
    cv = TfidfVectorizer(stop_words="english")
    cv.fit(complete_lyrics)
    corpus_vecs = cv.transform(complete_lyrics)
    return pd.DataFrame(corpus_vecs.todense(), index=indices, columns=cv.get_feature_names()), cv


In [27]:
# Store results into dataframe, keep cv for later prediction
df, cv = vectors_and_df(complete_lyrics, indices)
df

Unnamed: 0,0h,10,12,150,17,1776,20,24,2x,30,...,électrique,étaient,être,הו,יו,ימ,ירו,נו,עו,צו
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Define features and target column
X = df
y = df.index

### Train test split

In [29]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

### Train models

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

models_params = {
    "MultinomialNB": {"alpha": 0.005},
    "CategoricalNB": {"alpha": 0.01},
    "RandomForestClassifier": {
        "n_estimators": 500,
        "max_depth": 200,
        "max_features": "auto",
        "n_jobs": -1,
        "random_state": 1,
    },
    "LogisticRegression": {"C": 1e6},
}

def train_models(models_params):
    """trains models on corpus and returns dataframe with scores"""
    scores = {}
    for model in models_params:
        if model == "LogisticRegression":
            m = LogisticRegression(**models_params[model])
        elif model == "RandomForestClassifier":
            m = RandomForestClassifier(**models_params[model])
        elif model == "MultinomialNB":
            m = MultinomialNB(**models_params[model])
        elif model == "CategoricalNB":
            m = MultinomialNB(**models_params[model])

        m.fit(Xtrain, ytrain)
        score_train = m.score(Xtrain, ytrain)
        score_test = m.score(Xtest, ytest)
        scores[f"{model}"] = {
            "params": models_params[model],
            "train score": score_train,
            "test score": score_test,
        }
    return pd.DataFrame(scores).T

In [41]:
df_scores = train_models(models_params)
df_scores

Unnamed: 0,params,train score,test score
MultinomialNB,{'alpha': 0.005},0.992263,0.792308
CategoricalNB,{'alpha': 0.01},0.992263,0.784615
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",1.0,0.746154
LogisticRegression,{'C': 1000000.0},1.0,0.761538


### Train on full data set

In [42]:
# Train on most promising model
model = "MultinomialNB"
m = MultinomialNB(**models_params[model])
m.fit(X, y)
m.score(X, y)

0.990726429675425

In [43]:
# Input Elton John's "Can you feel the love tonight" for prediction
new_song = [
    """
    There's a calm surrender
    To the rush of day
    When the heat of a rolling wave
    Can be turned away
    An enchanted moment
    And it sees me through
    It's enough for this restless warrior
    Just to be with you
    And can you feel the love tonight?
    It is where we are
    It's enough for this wide-eyed wanderer
    That we've got this far
    And can you feel the love tonight? (Tonight)
    How it's laid to rest?
    It's enough to make kings and vagabonds
    Believe the very best
    There's a time for everyone
    If they only learn
    That the twisting kaleidoscope
    Moves us all in turn
    There's a rhyme and reason
    To the wild outdoors
    When the heart of this star-crossed voyager
    Beats in time with yours
    And can you feel the love tonight? (Tonight)
    It is where we are
    It's enough for this wide-eyed wanderer
    That we've got this far
    And can you feel the love tonight? (Tonight)
    How it's laid to rest?
    It's enough to make kings and vagabonds
    Believe the very best
    It's enough to make kings and vagabonds
    Believe the very best
    """
]

In [44]:
def predict_artist(song_lyrics):
    """predicts artist of song based on artists in corpus"""
    # transform song into vector matrix
    new_song_vecs = cv.transform(new_song)
    ynew = new_song_vecs.todense()
    
    print(f"This classifier predicts the song to be written by:\n")
    for i, artist in enumerate(ARTISTS):
        print(f"{artist}: {round(m.predict_proba(ynew)[0][i] * 100, 1)}%.")
    song_pred = m.predict(ynew)[0]
    confidence = m.predict_proba(ynew).max()
    if confidence > 0.9:
        confidence_word = "definitely"
    elif confidence > 0.7:
        confidence_word = "probably"
    else:
        confidence_word = "maybe"
    print(f"\nThis song is {confidence_word} by {ARTISTS[song_pred]}!")

In [45]:
predict_artist(new_song)

This classifier predicts the song to be written by:

Peaches: 0.0%.
Barbra Streisand: 99.6%.
Britney Spears: 0.4%.

This song is definitely by Barbra Streisand!
