# GOAL:
The goal of this notebook is to perform LDA topic modeling on the training data and then apply the resulting model to the validation and test sets. This completes the dataset-construction part of the pipeline, allowing us to move on to the models.

In [None]:
# Mount Google Drive so the shared project folder is reachable from Colab.
import os

from google.colab import drive

drive.mount('/content/drive/')
# Work from the shared data folder; the relative './train', './test' and './val'
# paths used when loading the CSVs resolve against this directory.
os.chdir('/content/drive/Shareddrives/CS260-Project/data/')

Mounted at /content/drive/


In [None]:
import csv


def load_artist_lyric_csv(path):
  """Read an (artist, lyric) CSV file into two parallel lists.

  The first row is treated as a header and skipped. For every remaining row,
  column 0 goes into the first list and column 1 into the second.

  Args:
    path: location of the CSV file.

  Returns:
    A tuple ``(artists, lyrics)`` of equally long lists.

  Raises:
    IndexError: if a data row has fewer than two columns.
  """
  artists = []
  lyrics = []
  # newline='' is what the csv module documents for files passed to csv.reader,
  # so that line breaks inside quoted fields are handled by the reader itself.
  with open(path, newline='') as handle:
    reader = csv.reader(handle, delimiter=',')
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
      artists.append(row[0])
      lyrics.append(row[1])
  return artists, lyrics


# open the training, test and validation sets
X_train, Y_train = load_artist_lyric_csv('./train/big-kaggle-train.csv')
X_test, Y_test = load_artist_lyric_csv('./test/big-kaggle-test.csv')
X_val, Y_val = load_artist_lyric_csv('./val/kaggle-val.csv')
print(X_val[0])
print(Y_val[0])

mukesh
hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late late play break string feel heart want feel tell real truth hurt lie worse anymore little know little hold time feel


Functions below come from: https://tim-denzler.medium.com/whats-in-a-song-using-lda-to-find-topics-in-over-120-000-songs-53785767b692

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

def lyric_tokenization(lyric_list):
  """Lower-case every lyric and split it into word tokens.

  Args:
    lyric_list: iterable of lyric strings.

  Returns:
    A new list with one list of tokens per lyric, in input order.
  """
  # The pattern matches runs of word characters, so punctuation and
  # whitespace never end up inside a token.
  word_tokenizer = RegexpTokenizer(r'\w+')
  return [word_tokenizer.tokenize(text.lower()) for text in lyric_list]

def token_filtering(lyric_tokenized_list):
  """Drop very short and purely numeric tokens from every song.

  A token is kept only when it is longer than two characters and
  ``str.isnumeric()`` is false for it.

  Args:
    lyric_tokenized_list: list of token lists, one per song. The list is
      updated in place: each entry is replaced by its filtered token list.

  Returns:
    The same list object that was passed in.
  """
  for position, tokens in enumerate(lyric_tokenized_list):
    lyric_tokenized_list[position] = [
        word for word in tokens if len(word) > 2 and not word.isnumeric()
    ]
  return lyric_tokenized_list

def lemmatization(lyric_corpus_tokenized):
  """Replace every token with the result of ``WordNetLemmatizer.lemmatize``.

  The lemmatizer is called without a part-of-speech argument, so its
  default is used for every token.

  Args:
    lyric_corpus_tokenized: list of token lists, one per song. The list is
      updated in place: each entry is replaced by its lemmatized token list.

  Returns:
    The same list object that was passed in.
  """
  wordnet_lemmatizer = WordNetLemmatizer()
  for position, tokens in enumerate(lyric_corpus_tokenized):
    lyric_corpus_tokenized[position] = [
        wordnet_lemmatizer.lemmatize(word) for word in tokens
    ]
  return lyric_corpus_tokenized

def remove_stop_words(lyric_corpus_tokenized):
  """Remove stop words, lyric filler words and profanities from every song.

  The excluded vocabulary is NLTK's English stop-word list plus a handful of
  filler interjections common in lyrics and a short profanity list.

  Args:
    lyric_corpus_tokenized: list of token lists, one per song. The list is
      updated in place: each entry is replaced by its filtered token list.

  Returns:
    The same list object that was passed in.
  """
  profanities = ['fuck', 'shit', 'bitch']
  new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh', 'hmm','deh','doh','jah','wa']
  # Collect everything in one set so each token costs a single hash lookup
  # instead of a linear scan over two lists.
  excluded = set(stopwords.words('english'))
  excluded.update(new_stop_words)
  excluded.update(profanities)
  for s, song in enumerate(lyric_corpus_tokenized):
    lyric_corpus_tokenized[s] = [token for token in song if token not in excluded]
  return lyric_corpus_tokenized

In [None]:
import nltk

# Fetch the NLTK resources needed by the preprocessing helpers: the WordNet
# data for the lemmatizer and the stop-word lists for remove_stop_words.
for resource in ('wordnet', 'omw-1.4', 'stopwords'):
  nltk.download(resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Preprocess the training lyrics: tokenize, then apply the three cleaning
# stages in order (short/numeric token filter, lemmatization, stop words).
Y_train_tokenized = lyric_tokenization(Y_train)
for stage in (token_filtering, lemmatization, remove_stop_words):
  Y_train_tokenized = stage(Y_train_tokenized)

# NEXT STEP: LDA Modeling

In [None]:
import gensim
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus

# Build the token <-> integer-id vocabulary from the preprocessed training lyrics only.
dictionary = Dictionary(Y_train_tokenized)
# Prune the vocabulary: per the gensim docs, no_below=100 drops tokens found in
# fewer than 100 songs and no_above=0.8 drops tokens found in more than 80% of songs.
dictionary.filter_extremes(no_below=100, no_above=0.8)

# Bag-of-words form of every training song: a list of (token_id, count) pairs.
gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]
# The looked-up value is never used. NOTE(review): this access appears to exist
# only to make gensim populate `id2token` before it is read on the next line,
# so the order of these two statements matters — confirm against the gensim docs.
temp = dictionary[0]
id2word = dictionary.id2token

In [None]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.5 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
import gensim
from gensim import models, test
#from gensim.models import LDAModel
from gensim.test import utils
from gensim.test.utils import datapath

# Train the LDA topic model on the bag-of-words training corpus.
# alpha/eta='auto' let gensim learn the document-topic and topic-word priors.
lda_model = models.LdaModel(
    corpus=gensim_corpus,
    id2word=id2word,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=400,
    num_topics=10, # FOR GPT-3 BASELINES
    passes=20,
    # Fixed seed: without it every run learns different topics, so topic ids
    # in the generated CSVs could not be reproduced.
    random_state=42,
)

In [None]:
# Sanity check: list the current working directory and print its path.
!ls 
!pwd

 big-kaggle-dataset.csv   big-lda-val-20.csv	    Lyrics_LDA_k_30.html
 big-lda-test-20.csv	  big-lda-val-30.csv	    Lyrics_LDA_k_40.html
 big-lda-test-20.gsheet   big-lda-val-40.csv	    Lyrics_LDA_k_6.html
 big-lda-test-30.csv	 'deprecated (old info)'    out
 big-lda-test-40.csv	  lda-test-6-updated.csv    test
 big-lda-train-20.csv	  lda-train-6-updated.csv   train
 big-lda-train-30.csv	  lda-val-6-updated.csv     val
 big-lda-train-40.csv	  Lyrics_LDA_k_20.html	    wandb
/content/drive/Shareddrives/CS260-Project/data


In [None]:
# Persist the trained model to the shared Drive folder.
# The path is passed directly: gensim.test.utils.datapath is a helper for
# gensim's bundled test data and only returned this path unchanged because
# it is absolute.
train_model_save = "/content/drive/Shareddrives/CS260-Project/models/lda/big-lda-train-10"
lda_model.save(train_model_save)

In [None]:
# Print the 15 highest-weighted words of every topic. The topic count is read
# from the model so this cell stays correct when num_topics is changed.
for topic_idx, word_weights in lda_model.show_topics(formatted=False, num_words=15, num_topics=lda_model.num_topics):
    top_words = '|'.join(word for word, _weight in word_weights)
    print('Topic: {} \nWords: {}'.format(topic_idx, top_words))

Topic: 0 
Words: time|one|never|know|way|see|say|could|back|day|thing|still|would|always|ever
Topic: 1 
Words: know|baby|want|get|let|got|gonna|wanna|girl|make|right|come|need|cause|take
Topic: 2 
Words: song|lyric|dance|sing|music|blue|sweet|play|hear|bill|boom|bring|singing|dancing|wild
Topic: 3 
Words: like|rock|get|head|beat|roll|party|hand|round|put|two|shake|look|house|drink
Topic: 4 
Words: man|little|good|boy|got|well|bad|said|woman|new|old|young|big|work|town
Topic: 5 
Words: love|heart|feel|hold|fall|need|like|cry|give|break|leave|kiss|forever|true|believe
Topic: 6 
Words: nigga|get|got|like|money|back|cause|chorus|know|verse|gon|see|real|hit|bout
Topic: 7 
Words: die|fire|dead|stand|fight|black|hell|alive|burn|blood|lie|pain|inside|kill|death
Topic: 8 
Words: night|away|come|light|home|dream|eye|long|day|sun|run|sky|star|rain|see
Topic: 9 
Words: life|world|live|god|free|lord|people|child|soul|heaven|come|save|angel|christmas|hand


# NEXT: Write New Dataset Files w/ LDA Labels
This assigns a topic to each (artist, lyric) pair in the train, test, and validation sets, and writes each labeled dataset to a new CSV file.

In [None]:
# Preprocess the test lyrics with the same stages, in the same order, as the
# training lyrics so the trained dictionary and model apply to them.
Y_test_tokenized = lyric_tokenization(Y_test)
for stage in (token_filtering, lemmatization, remove_stop_words):
  Y_test_tokenized = stage(Y_test_tokenized)

In [None]:
# write the training csv
# Each row is (artist, dominant topic id, raw lyric).
# newline='' is what the csv module documents for files passed to csv.writer,
# so that the writer alone controls the line endings.
with open('/content/drive/Shareddrives/CS260-Project/data/big-lda-train-10.csv', 'w', newline='') as traindata:
  writer = csv.writer(traindata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_train):
    curr_doc = dictionary.doc2bow(Y_train_tokenized[i])
    # (topic_id, probability) pairs the model assigns to this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id stays -1 if the model returned no topics at all.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    # Show the first four songs as a spot check of the assignment.
    if i < 4:
      print(probs)
      print(Y_train[i])
      print(topic_id)
    lyrics = Y_train[i]
    writer.writerow([artist, topic_id, lyrics])

[(0, 0.077606104), (1, 0.058531664), (2, 0.017395582), (3, 0.20967111), (4, 0.064934164), (6, 0.4323828), (7, 0.03407869), (8, 0.10057696)]
i'm a political refugee that's how the fuck i felt  birds for the summer hummers for the runners candy on the paint nine for the thunder throw a couple of hundreds fishing on the fishtail with big money, cash money everywhere high roller, shot caller, big boss original, real nigga from the start head hunting, price on a nigga tab hit 'em up for playing with a nigga mail  say i'm better than beethoven to the beat that i rap over stay outta that medicine cabinet yeah, that what they told me giving us piss tests, cause we stay rolling and know a nigga act better than a .45 caliber pistol when they loaded they penalize us, tryna slow us down they constantly fucking us up that's why we're buck wild call me porch monkey, call me jigaboo when you know you wanna fuck my woman and eat my barbecue how the fuck you wanna watch my house but don't wanna live on

In [None]:
# write the test csv
print(Y_test[0])

# Each row is (artist, dominant topic id, raw lyric).
# newline='' is what the csv module documents for files passed to csv.writer,
# so that the writer alone controls the line endings.
with open('/content/drive/Shareddrives/CS260-Project/data/big-lda-test-10.csv', 'w', newline='') as testdata:
  writer = csv.writer(testdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_test):
    curr_doc = dictionary.doc2bow(Y_test_tokenized[i])
    # (topic_id, probability) pairs the model assigns to this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id stays -1 if the model returned no topics at all.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    # Show the first four songs as a spot check of the assignment.
    if i < 4:
      print(probs)
      print(Y_test[i])
      print(topic_id)
    lyrics = Y_test[i]
    writer.writerow([artist, topic_id, lyrics])

[zazu] it's an honor and a privilege, a duty i perform with due sense of decorum and with pride with deference and great respect very much the norm plus a hint of sycophancy on the side to lay before my ruler all the facts about his realm to fill him in on all the beastly news  [mufasa] [spoken] yes, yes, zazu, get on with it!  [zazu] in order that his majesty stands sturdy at the helm aware of all the fauna's latest views  [mufasa] [spoken] zazu! the morning report!  [zazu] [spoken] er - yes, sire - the morning report  chimps are going ape, firaffes remain above it all elephants remember, though just what i can't recall crocodiles are snapping up fresh offers from the banks showed interes in my nest egg but i quickly said, "no thanks!" we haven't paid the hornbills and the vultures have a hunch not everyone invited (sung) will be coming back from lunch this is the morning report gives you the long and the short every grunt, roar and snort not a tale i distort on the morning report  [m

In [None]:
# Preprocess the validation lyrics with the same stages, in the same order,
# as the training lyrics so the trained dictionary and model apply to them.
Y_val_tokenized = lyric_tokenization(Y_val)
for stage in (token_filtering, lemmatization, remove_stop_words):
  Y_val_tokenized = stage(Y_val_tokenized)

In [None]:
# write the validation CSV file
print(Y_val[0])

# Each row is (artist, dominant topic id, raw lyric).
# newline='' is what the csv module documents for files passed to csv.writer,
# so that the writer alone controls the line endings.
with open('/content/drive/Shareddrives/CS260-Project/data/big-lda-val-10.csv', 'w', newline='') as valdata:
  writer = csv.writer(valdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_val):
    curr_doc = dictionary.doc2bow(Y_val_tokenized[i])
    # (topic_id, probability) pairs the model assigns to this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id stays -1 if the model returned no topics at all.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    # Show the first four songs as a spot check of the assignment.
    if i < 4:
      print(probs)
      print(Y_val[i])
      print(topic_id)
    lyrics = Y_val[i]
    writer.writerow([artist, topic_id, lyrics])

hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late late play break string feel heart want feel tell real truth hurt lie worse anymore little know little hold time feel
[(0, 0.2170925), (1, 0.11802082), (2, 0.04169471), (3, 0.033879146), (4, 0.014512294), (5, 0.44570422), (7, 0.053964373), (8, 0.06522671)]
hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt

# Write the train-lda-10 file again
DON'T RUN THIS!!

In [None]:
# import gensim
# from gensim import models, test
# #from gensim.models import LDAModel
# from gensim.test import utils
# from gensim.test.utils import datapath

# lda_model = models.LdaModel.load("/content/drive/Shareddrives/CS260-Project/models/lda/lda-train-6")

In [None]:
# import gensim
# from gensim.corpora import Dictionary
# from gensim.corpora import MmCorpus

# dictionary = Dictionary(Y_train_tokenized)
# dictionary.filter_extremes(no_below=100, no_above=0.8)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so running the
# cell only echoes the text. If it were re-enabled it would overwrite
# big-lda-train-20.csv using whichever `lda_model` and `dictionary` are
# currently in memory.
"""# write the training csv
with open('/content/drive/Shareddrives/CS260-Project/data/big-lda-train-20.csv', 'w') as traindata:
  writer = csv.writer(traindata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  line = 0
  for i, artist in enumerate(X_train):
    curr_doc = dictionary.doc2bow(Y_train_tokenized[i])
    #gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]
    #print(curr_doc)
    probs = lda_model[curr_doc]
    max_prob = -1
    topic_id = -1
    for j in range(len(probs)):
      idx, curr_prob = probs[j]
      if curr_prob > max_prob:
        max_prob = curr_prob
        topic_id = idx
    if line < 4:
      print(probs)
      print(Y_train[i])
      print(topic_id)
    #max_prob = max(probs)
    #topic_id = probs.index(max_prob)
    lyrics = Y_train[i]
    writer.writerow([artist, topic_id, lyrics])
    line += 1"""

"# write the training csv\nwith open('/content/drive/Shareddrives/CS260-Project/data/big-lda-train-20.csv', 'w') as traindata:\n  writer = csv.writer(traindata, delimiter=',')\n  writer.writerow(['artist', 'topic_id', 'lyric'])\n  line = 0\n  for i, artist in enumerate(X_train):\n    curr_doc = dictionary.doc2bow(Y_train_tokenized[i])\n    #gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]\n    #print(curr_doc)\n    probs = lda_model[curr_doc]\n    max_prob = -1\n    topic_id = -1\n    for j in range(len(probs)):\n      idx, curr_prob = probs[j]\n      if curr_prob > max_prob:\n        max_prob = curr_prob\n        topic_id = idx\n    if line < 4:\n      print(probs)\n      print(Y_train[i])\n      print(topic_id)\n    #max_prob = max(probs)\n    #topic_id = probs.index(max_prob)\n    lyrics = Y_train[i]\n    writer.writerow([artist, topic_id, lyrics])\n    line += 1"

# Evaluation

In [None]:
# Switch gensim to 4.1.0 for the evaluation cells (the install log below shows
# this replacing 4.2.0).
# NOTE(review): gensim was already imported earlier in this notebook, so a
# runtime restart is presumably needed before 4.1.0 is actually used — confirm.
!pip install gensim==4.1.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.1.0
  Downloading gensim-4.1.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 225 kB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.2.0
    Uninstalling gensim-4.2.0:
      Successfully uninstalled gensim-4.2.0
Successfully installed gensim-4.1.0


In [None]:
import gensim
from gensim import models, test
#from gensim.models import LDAModel
from gensim.test import utils
from gensim.test.utils import datapath

# Reload the saved 10-topic model from Drive; this rebinds `lda_model`.
lda_model = models.LdaModel.load("/content/drive/Shareddrives/CS260-Project/models/lda/big-lda-train-10")


In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the model with the c_v coherence measure, computed from the
# preprocessed training lyrics and the training dictionary.
coherencemodel = CoherenceModel(
    model=lda_model,
    texts=Y_train_tokenized,
    dictionary=dictionary,
    coherence='c_v',
)
print(coherencemodel.get_coherence())


0.41135759337280653


# Visualize

In [None]:
!pip install pyLDAvis
!pip install gensim==3.7.1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=6b916211df3ce2d637cbd109283ecb11c3c02915ebfa76d55833926b435cfce9
  Stored in directory: /root/.cache/pip/wheels/90/61/ec/9dbe9efc3acf9c4e37ba70fbbcc3f3a0ebd121060aa593181a
  Building wheel for sklearn (

In [None]:
import pyLDAvis
# import pyLDAvis.gensim as gensimvis
import pyLDAvis.gensim_models as gensimvis
import numpy as np

# Copy of the bag-of-words corpus with both the token id and the count of every
# pair converted to float before it is handed to pyLDAvis.
# NOTE(review): the reason pyLDAvis needs floats here is not recorded — confirm.
gensim_corpus_float = [
  [(float(token_id), float(count)) for token_id, count in doc]
  for doc in gensim_corpus
]

vis_data = gensimvis.prepare(lda_model, gensim_corpus_float, dictionary)
# Name the report after the loaded model's topic count so the file name cannot
# drift from the model being visualized.
html_path = './Lyrics_LDA_k_' + str(lda_model.num_topics) + '.html'
pyLDAvis.save_html(vis_data, html_path)
print(html_path)
pyLDAvis.display(vis_data)

  from collections import Iterable
  default_term_info = default_term_info.sort_values(


./Lyrics_LDA_k_10.html
