## Train with data from Kaggle
[How to access datasets directly from Kaggle](https://accredianpublication.medium.com/how-to-access-datasets-directly-from-kaggle-6a3552ea891c#:~:text=Login%20to%20your%20kaggle.com,this%20on%20your%20local%20system.)

In [3]:
import pandas as pd
import sqlite3

conn = sqlite3.connect("spotify_db/spotify.db")
songs = pd.read_sql_query("SELECT * from songs_lyrics", conn)
conn.close()

In [8]:
songs.head(5)

Unnamed: 0,title,lyrics
0,In a Crowd of Thousands,it was june. i was ten. i still think of that ...
1,A Rumor in St. Petersburg,the neva flows a new wind blows. and soon it w...
2,Wake Up,i don't wanna wake up. i want you spread out o...
3,No,my mind is invaded. my gates are ignored. my t...
4,Perfect,sometimes is never quite enough. if you're fla...


## Preprocess

In [4]:
import string
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# remove stopwords and punctuation
stopwords = set(stopwords.words('english'))
music_stopwords = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa']
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def preprocess(doc):
    stop_free = ' '.join([i for i in doc.lower().split() if i not in stopwords])
    stop_free = ' '.join([i for i in stop_free.split() if i not in music_stopwords])
    punc_free = ''.join(c for c in stop_free if c not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

clean_corpus = []
for song, doc in zip(songs['title'], songs['lyrics']):
    clean_lyrics = []
    for token in preprocess(song + " " + doc).split():
        if len(token) > 2 and not token.isnumeric():
            clean_lyrics.append(token)
    clean_corpus.append(clean_lyrics)

In [5]:
from gensim import corpora

# create document-term matrix
dictionary = corpora.Dictionary(clean_corpus)
dictionary.filter_extremes(no_below=0.2, no_above=0.8)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

## LSA

In [6]:
from gensim.models import LsiModel

lsa = LsiModel(doc_term_matrix, num_topics=6, id2word=dictionary, onepass=False)
lsa.print_topics(num_topics=6, num_words=20)

[(0,
  '0.443*"love" + 0.328*"like" + 0.300*"know" + 0.230*"want" + 0.206*"baby" + 0.153*"let" + 0.152*"you" + 0.146*"one" + 0.141*"got" + 0.136*"cause" + 0.114*"get" + 0.108*"never" + 0.106*"come" + 0.105*"need" + 0.104*"bad" + 0.102*"take" + 0.099*"yeah" + 0.099*"see" + 0.099*"time" + 0.099*"say"'),
 (1,
  '-0.623*"love" + -0.352*"want" + -0.265*"bad" + -0.248*"romance" + 0.216*"know" + 0.190*"like" + 0.122*"got" + -0.118*"higher" + 0.113*"get" + 0.111*"work" + -0.106*"bring" + 0.093*"let" + 0.084*"girl" + 0.083*"aint" + 0.076*"cause" + 0.075*"yeah" + 0.074*"good" + 0.073*"make" + 0.069*"come" + -0.065*"caught"'),
 (2,
  '0.940*"work" + 0.159*"ohoh" + 0.098*"gotta" + 0.096*"home" + -0.091*"like" + 0.083*"want" + 0.068*"bad" + 0.065*"romance" + 0.058*"put" + -0.053*"gimme" + -0.046*"one" + -0.045*"got" + 0.042*"body" + -0.041*"you" + 0.035*"yeah" + -0.035*"never" + -0.034*"way" + 0.034*"ima" + -0.033*"keep" + -0.032*"time"'),
 (3,
  '0.893*"gimme" + 0.255*"more" + 0.178*"danja" + 0.14

In [9]:
from gensim.models import LdaModel

lda = LdaModel(doc_term_matrix, num_topics=6, id2word=dictionary)
lda.print_topics(num_topics=6, num_words=20)

[(0,
  '0.017*"like" + 0.014*"love" + 0.013*"know" + 0.010*"you" + 0.009*"let" + 0.008*"one" + 0.008*"take" + 0.008*"work" + 0.008*"way" + 0.008*"never" + 0.007*"cause" + 0.006*"time" + 0.006*"got" + 0.006*"get" + 0.005*"seventeen" + 0.005*"ever" + 0.005*"day" + 0.005*"could" + 0.005*"feel" + 0.005*"come"'),
 (1,
  '0.011*"love" + 0.009*"good" + 0.009*"baby" + 0.009*"like" + 0.008*"need" + 0.008*"you" + 0.008*"one" + 0.007*"know" + 0.007*"make" + 0.006*"time" + 0.006*"yeah" + 0.006*"girl" + 0.006*"aint" + 0.005*"come" + 0.005*"got" + 0.005*"life" + 0.005*"let" + 0.005*"way" + 0.005*"cant" + 0.005*"want"'),
 (2,
  '0.020*"love" + 0.019*"know" + 0.018*"like" + 0.010*"got" + 0.009*"one" + 0.007*"girl" + 0.007*"day" + 0.007*"never" + 0.007*"come" + 0.007*"you" + 0.007*"baby" + 0.007*"still" + 0.006*"say" + 0.006*"think" + 0.006*"cause" + 0.006*"cant" + 0.006*"yeah" + 0.005*"gimme" + 0.005*"keep" + 0.005*"take"'),
 (3,
  '0.016*"like" + 0.015*"know" + 0.011*"get" + 0.008*"cause" + 0.007*"lo

## BERTopic 

In [None]:
from bertopic import BERTopic
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(songs['lyrics'])

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,94,-1_you_the_and_me,"[you, the, and, me, it, your, to, in, on, that]",[yeah y'all know what it is. katy perry juicy ...
1,0,523,0_you_the_and_it,"[you, the, and, it, to, me, my, that, we, in]",[you know i want you. it's not a secret i try ...
2,1,50,1_she_you_the_me,"[she, you, the, me, her, and, it, to, ooh, that]",[oh. oh. oh. oh. . . oh her eyes her eyes make...


## Test

In [None]:
import sqlite3

# import data from spotify database as df
conn = sqlite3.connect("spotify_db/spotify.db")
songs = pd.read_sql_query("SELECT * from songs_lyrics", conn)
conn.close()

In [2]:
songs

Unnamed: 0,title,lyrics
0,In a Crowd of Thousands,it was june. i was ten. i still think of that ...
1,A Rumor in St. Petersburg,the neva flows a new wind blows. and soon it w...
2,Wake Up,i don't wanna wake up. i want you spread out o...
3,No,my mind is invaded. my gates are ignored. my t...
4,Perfect,sometimes is never quite enough. if you're fla...
...,...,...
662,Baby,baby baby tell me what's the antidote. wouldn'...
663,Till the World Ends,aw. . . this kitten got your tongue tied in kn...
664,NEL,. . hoy pensé en salir a buscarte pero nel. ll...
665,hot girl bummer,fuck you and you and you. i hate your friends ...
