In [1]:
from bs4 import BeautifulSoup as soup
import os

In [2]:
ARTISTS = [
    "Pavement",
    "Neutral Milk Hotel",
    "Sonic Youth",
    "Stephen Malkmus",
]  # "Peaches", "Barbra Streisand", "Britney Spears"

complete_lyrics = []
lyrics = {}
for artist in ARTISTS:
    directory = f"lyrics/{artist.lower().replace(' ', '-')}-lyrics"
    allfiles = os.listdir(directory)
    all_lyrics = []
    for file in allfiles:
        with open(directory + "/" + file, "r", encoding="utf-8") as f:
            song_lyrics = f.read()
            all_lyrics.append(song_lyrics)
    lyrics[artist] = all_lyrics
    complete_lyrics += all_lyrics

In [3]:
for artist in ARTISTS:
    print(artist, len(lyrics[artist]))

Pavement 79
Neutral Milk Hotel 22
Sonic Youth 195
Stephen Malkmus 29


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words="english")
cv.fit(complete_lyrics)
corpus_vecs = cv.transform(complete_lyrics)

indices = []
for i, artist in enumerate(ARTISTS):
    indices += [i] * len(lyrics[artist])

In [5]:
import pandas as pd

df = pd.DataFrame(corpus_vecs.todense(), index=indices, columns=cv.get_feature_names())
df

Unnamed: 0,002,03,10,100,14th,15,19,1929,1945,1966,...,young,yourr,youth,youths,yr,zany,zenith,zero,zoom,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.162997,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [6]:
X = df
y = df.index

In [7]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

models_params = {
    "MultinomialNB": {},
    "CategoricalNB": {},
    "RandomForestClassifier": {
        "n_estimators": 500,
        "max_depth": 200,
        "max_features": "auto",
        "n_jobs": -1,
        "random_state": 1,
    },
    "LogisticRegression": {"C": 1e6},
}

scores = {}
for model in models_params:
    if model == "LogisticRegression":
        m = LogisticRegression(**models_params[model])
    elif model == "RandomForestClassifier":
        m = RandomForestClassifier(**models_params[model])
    elif model == "MultinomialNB":
        m = MultinomialNB(**models_params[model])
    elif model == "CategoricalNB":
        m = MultinomialNB(**models_params[model])

    m.fit(Xtrain, ytrain)
    score_train = m.score(Xtrain, ytrain)
    score_test = m.score(Xtest, ytest)
    scores[f"{model}"] = {
        "params": models_params[model],
        "train score": score_train,
        "test score": score_test,
    }

In [9]:
df_scores = pd.DataFrame(scores).T
df_scores

Unnamed: 0,params,train score,test score
MultinomialNB,{},0.611538,0.615385
CategoricalNB,{},0.611538,0.615385
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",1.0,0.630769
LogisticRegression,{'C': 1000000.0},1.0,0.676923


In [10]:
new_song = [
    """
    I'm waiting for my man
    Got 26 dollars in my hand
    Up to lexington 125
    Feelin' sick and dirty
    Huh, I'm waiting for my man

    Hey white boy, what you doin' uptown
    Hey white boy, you chasin' my women around
    Pardon me sir, it's furthest from my mind
    I'm just waitin' for a dear-dear friend of mine
    I'm waiting for my man, come on

    Here he comes, he's all dressed in black
    Pr shoes and a big straw hat
    He's never early, he's always late
    First thing you learn is that you always gotta wait
    I'm waiting for my man

    Up to a brownstone, up three flights of stairs
    Everybody's pinned you and nobody cares
    He's got the works, gives you sweet taste
    Then you gotta split because you got no time to waste
    I'm waiting for my man

    Hey baby, don't you holler, don't you ball and shout
    I'm feeling good, I'm gonna work it on out
    I'm feeling good, feeling so fine
    Until tomorrow, but that's just some other time
    I'm waiting for my man
    I'm waiting for my man
    I'm waiting for my man
    Man-man-man-man-man-man-man
    """
]

In [11]:
new_song_vecs = cv.transform(new_song)
ynew = new_song_vecs.todense()

for i, artist in enumerate(ARTISTS):
    print(f"{artist}: {round(m.predict_proba(ynew)[0][i], 3)}")
song_pred = m.predict(ynew)[0]
confidence = m.predict_proba(ynew).max()
if confidence > 0.9:
    confidence_word = "definitely"
elif confidence > 0.7:
    confidence_word = "probably"
else:
    confidence_word = "maybe"
print(f"This song is {confidence_word} by {ARTISTS[song_pred]}!")

Pavement: 0.163
Neutral Milk Hotel: 0.001
Sonic Youth: 0.835
Stephen Malkmus: 0.001
This song is probably by Sonic Youth!
