In [1]:
from bs4 import BeautifulSoup as soup
import os

In [2]:
ARTISTS = ["Pavement", "Sonic Youth", "Barbra Streisand", "Peaches"]

complete_lyrics = []
lyrics = {}
for artist in ARTISTS:
    directory = f"lyrics/{artist.lower().replace(' ', '-')}-lyrics"
    allfiles = os.listdir(directory)
    all_lyrics =  []
    for file in allfiles:
        with open(directory + "/" + file, "r", encoding="utf-8") as f:
            song_lyrics = f.read()
            all_lyrics.append(song_lyrics)
    lyrics[artist] = all_lyrics
    complete_lyrics += all_lyrics

In [3]:
len(complete_lyrics)

769

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words="english")
cv.fit(complete_lyrics)
corpus_vecs = cv.transform(complete_lyrics)

indices = []
for i, artist in enumerate(ARTISTS):
    indices += [i] * len(lyrics[artist])

In [5]:
import pandas as pd
df = pd.DataFrame(corpus_vecs.todense(), index=indices, columns=cv.get_feature_names())
df

Unnamed: 0,002,03,0h,10,100,12,14th,15,1776,19,...,éclairé,étaient,être,הו,יו,ימ,ירו,נו,עו,צו
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
X = df
y = df.index

In [7]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

models_params = {
    "MultinomialNB": {},
    "CategoricalNB": {},
    "RandomForestClassifier": {"n_estimators":500, "max_depth":200, "max_features":'auto', "n_jobs":-1, "random_state":1}, 
    "LogisticRegression": {"C":1e6}
}

scores = {}
for model in models_params:
    if model == "LogisticRegression":
        m = LogisticRegression(**models_params[model])
    elif model == "RandomForestClassifier":
        m = RandomForestClassifier(**models_params[model])
    elif model == "MultinomialNB":
        m = MultinomialNB(**models_params[model])

    m.fit(Xtrain, ytrain)
    score_train = m.score(Xtrain, ytrain)
    score_test = m.score(Xtest, ytest)
    scores[f"{model}"] = {"params": models_params[model], "train score": score_train, "test score": score_test}

In [9]:
df_scores = pd.DataFrame(scores).T
df_scores

Unnamed: 0,params,train score,test score
MultinomialNB,{},0.596748,0.545455
CategoricalNB,{},0.596748,0.545455
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",1.0,0.681818
LogisticRegression,{'C': 1000000.0},1.0,0.75974


In [10]:
new_song = ["""
Baby, can't you see
I'm calling
A guy like you
Should wear a warning
It's dangerous
I'm fallin'

There's no escape
I can't wait
I need a hit
Baby, give me it
You're dangerous
I'm lovin' it

Too high
Can't come down
Losing my head
Spinning 'round and 'round
Do you feel me now

With a taste of your lips
I'm on a ride
You're toxic
I'm slipping under
With a taste of poison paradise
I'm addicted to you
Don't you know that you're toxic
And I love what you do
Don't you know that you're toxic

It's getting late
To give you up
I took a sip
From my devil cup
Slowly
It's taking over me

Too high
Can't come down
It's in the air
And it's all around
Can you feel me now

With a taste of your lips
I'm on a ride
You're toxic
I'm slipping under
With a taste of poison paradise
I'm addicted to you
Don't you know that you're toxic
And I love what you do
Don't you know that you're toxic

Don't you know that you're toxic
Taste of my lips and having fun
With a taste of your lips
I'm on a ride
You're toxic
I'm slipping under
With a taste of poison paradise
I'm addicted to you
Don't you know that you're toxic
And I love what you do
Don't you know that you're toxic

With a taste of your lips
I'm on a ride
You're toxic
I'm slipping under
With a taste of poison paradise
I'm addicted to you
Don't you know that you're toxic
And I love what you do
Don't you know that you're toxic

I'm intoxicated now
I think you'll love it now
I think I'm ready now
I think I'm ready now
I'm intoxicated now
I think you'll love it now
I think I'm ready now
"""]

new_song_vecs = cv.transform(new_song)
ynew = new_song_vecs.todense()

In [11]:
song_pred = m.predict(ynew)[0]
print(f"This song is definitely by {ARTISTS[song_pred]}!")

This song is definitely by Sonic Youth!
