# GOAL:
The goal of this notebook is to fit an LDA topic model on the training data and then apply the fitted model to the test set. This completes the dataset-construction part of the pipeline, allowing us to move on to the models.

Note: This notebook does not perform topic visualization and does not compute the coherence of the topic model.

In [None]:
# mount google drive
# This cell is written for a Google Colab runtime (it imports `google.colab`).
from google.colab import drive
import os

# Mount Drive, then make the shared project's data folder the working directory
# so the relative './train', './test' and './val' paths used when loading the
# CSV files resolve against it.
drive.mount('/content/drive/')
os.chdir('/content/drive/Shareddrives/CS260-Project/data/')

Mounted at /content/drive/


In [None]:
import csv


def _read_two_column_csv(path):
  """Read a CSV file and return its first two columns as parallel lists.

  The first row is treated as a header and skipped. For the project files
  loaded below, column 0 holds the artist and column 1 the lyric.

  Args:
    path: Path of the CSV file to read.

  Returns:
    A `(first_column, second_column)` tuple of lists of strings; index `i`
    of both lists comes from the same CSV row.

  Raises:
    IndexError: If a data row has fewer than two columns.
  """
  first_column, second_column = [], []
  # newline='' is what the csv module documents for files passed to
  # csv.reader, so that newlines embedded in quoted fields (multi-line lyrics)
  # are parsed by the csv module itself.
  with open(path, newline='') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader, None)  # skip the header row; no-op for an empty file
    for row in reader:
      first_column.append(row[0])
      second_column.append(row[1])
  return first_column, second_column


# open the training and test set
X_train, Y_train = _read_two_column_csv('./train/kaggle-train.csv')
X_test, Y_test = _read_two_column_csv('./test/kaggle-test.csv')

# add one of the datasets that Christina developed
X_val, Y_val = _read_two_column_csv('./val/kaggle-val.csv')
print(X_val[0])
print(Y_val[0])

mukesh
hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late late play break string feel heart want feel tell real truth hurt lie worse anymore little know little hold time feel


Functions below come from: https://tim-denzler.medium.com/whats-in-a-song-using-lda-to-find-topics-in-over-120-000-songs-53785767b692

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

def lyric_tokenization(lyric_list):
  """Lower-case every lyric and split it into tokens.

  Tokens are the matches of the regular expression `\\w+`, so punctuation and
  whitespace are discarded.

  Args:
    lyric_list: Iterable of lyric strings.

  Returns:
    A new list with one list of tokens per input lyric, in input order.
  """
  word_splitter = RegexpTokenizer(r'\w+')
  return [word_splitter.tokenize(text.lower()) for text in lyric_list]

def token_filtering(lyric_tokenized_list):
  """Drop very short and purely numeric tokens from every song.

  A token is kept only if it is longer than two characters and
  `str.isnumeric()` is false for it.

  Args:
    lyric_tokenized_list: List of token lists (one per song). It is modified
      in place: each entry is replaced by its filtered token list.

  Returns:
    The same list object that was passed in.
  """
  for position, tokens in enumerate(lyric_tokenized_list):
    lyric_tokenized_list[position] = [
        word for word in tokens
        if not (len(word) <= 2 or word.isnumeric())
    ]
  return lyric_tokenized_list

def lemmatization(lyric_corpus_tokenized):
  """Lemmatize every token of every song with NLTK's WordNetLemmatizer.

  `lemmatize` is called without a part-of-speech argument, so the
  lemmatizer's default is used.

  Args:
    lyric_corpus_tokenized: List of token lists (one per song). It is modified
      in place: each entry is replaced by its lemmatized token list.

  Returns:
    The same list object that was passed in.
  """
  wordnet = WordNetLemmatizer()
  for position, tokens in enumerate(lyric_corpus_tokenized):
    lyric_corpus_tokenized[position] = [wordnet.lemmatize(word) for word in tokens]
  return lyric_corpus_tokenized

def remove_stop_words(lyric_corpus_tokenized):
  """Remove stop words, lyric filler words and profanities from every song.

  The excluded vocabulary is NLTK's English stop-word list plus the
  lyric-specific filler words and profanities listed below.

  Args:
    lyric_corpus_tokenized: List of token lists (one per song). It is modified
      in place: each entry is replaced by its filtered token list.

  Returns:
    The same list object that was passed in.
  """
  profanities = ['fuck', 'shit', 'bitch']
  new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh', 'hmm','deh','doh','jah','wa']
  # One set for all exclusions: membership is tested once per token of the
  # whole corpus, and a set lookup avoids scanning two lists every time.
  excluded = set(stopwords.words('english'))
  excluded.update(new_stop_words)
  excluded.update(profanities)
  for s, song in enumerate(lyric_corpus_tokenized):
    lyric_corpus_tokenized[s] = [token for token in song if token not in excluded]
  return lyric_corpus_tokenized

In [None]:
import nltk
# Download the NLTK resources used by the preprocessing functions:
# 'wordnet' for the WordNetLemmatizer and 'stopwords' for stopwords.words('english').
# NOTE(review): 'omw-1.4' is presumably a companion resource needed by the
# WordNet reader in this NLTK version -- confirm before removing it.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Preprocess the training lyrics, innermost call first:
# tokenize -> drop short/numeric tokens -> lemmatize -> remove stop words.
Y_train_tokenized = remove_stop_words(
    lemmatization(
        token_filtering(
            lyric_tokenization(Y_train))))

# NEXT STEP: LDA Modeling

In [None]:
# NOTE(review): gensim is imported here, before the later cell that upgrades
# it with pip; the upgraded version may only take effect after a runtime
# restart -- confirm which version actually runs.
import gensim
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus

# Build the token <-> id vocabulary from the preprocessed training lyrics only.
dictionary = Dictionary(Y_train_tokenized)
# Prune the vocabulary. Per gensim's documented meaning of these arguments,
# this keeps tokens found in at least 100 songs and in at most 80% of songs.
dictionary.filter_extremes(no_below=100, no_above=0.8)

# Bag-of-words representation of every training song.
gensim_corpus = [dictionary.doc2bow(song) for song in Y_train_tokenized]
# NOTE(review): the value of `temp` is never used. This lookup looks like the
# usual gensim idiom for forcing the lazily built `id2token` mapping to be
# populated before it is read on the next line -- confirm before removing.
temp = dictionary[0]
id2word = dictionary.id2token

In [None]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.8 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
import gensim
from gensim import models, test
from gensim.test import utils
from gensim.test.utils import datapath

# Fit the LDA topic model on the bag-of-words training corpus.
lda_model = models.LdaModel(
    corpus=gensim_corpus,
    id2word=id2word,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=400,
    num_topics=6, # FOR GPT-3 BASELINES
    passes=20,
    # Fixed seed so that re-running the notebook reproduces the same topics
    # (and therefore the same topic ids written to the CSV files).
    random_state=42
)



In [None]:
# Persist the trained model on the shared drive.
# The absolute path is used directly: gensim's `datapath` helper is meant for
# locating gensim's bundled test data, and joining it with an absolute path
# simply yields that same absolute path.
train_model_save = "/content/drive/Shareddrives/CS260-Project/models/lda-train-6"
lda_model.save(train_model_save)

In [None]:
# Print the 15 top words of every topic, separated by '|'.
for topic_idx, weighted_words in lda_model.show_topics(formatted=False, num_words=15):
    top_words = '|'.join(entry[0] for entry in weighted_words)
    print('Topic: {} \nWords: {}'.format(topic_idx, top_words))

Topic: 0 
Words: light|eye|see|come|like|world|sky|soul|fire|sun|inside|take|fall|hand|run
Topic: 1 
Words: like|get|got|nigga|money|back|chorus|know|cause|see|hit|make|verse|girl|put
Topic: 2 
Words: man|said|well|one|people|old|got|black|street|big|little|new|town|two|boy
Topic: 3 
Words: time|never|one|day|life|away|could|still|way|would|heart|long|ever|every|gone
Topic: 4 
Words: love|know|baby|like|want|make|let|got|say|get|feel|need|take|cause|wanna
Topic: 5 
Words: lyric|come|night|dance|rock|tonight|let|music|gonna|bill|sing|shake|party|song|roll


# NEXT: Write New Dataset Files w/ LDA Labels
This assigns a topic to each (artist, lyric) pair in the train, test, and validation sets, and writes each dataset, with its topic labels, to a new CSV file.

In [None]:
# Preprocess the test lyrics, innermost call first:
# tokenize -> drop short/numeric tokens -> lemmatize -> remove stop words.
Y_test_tokenized = remove_stop_words(
    lemmatization(
        token_filtering(
            lyric_tokenization(Y_test))))

In [None]:
# write the training csv
# One row per training song: artist, dominant LDA topic id, original lyric.
# newline='' is what the csv module documents for files passed to csv.writer,
# so the writer alone controls line endings (the lyrics contain newlines).
with open('/content/drive/Shareddrives/CS260-Project/data/lda-train-6-updated.csv', 'w', newline='') as traindata:
  writer = csv.writer(traindata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_train):
    curr_doc = dictionary.doc2bow(Y_train_tokenized[i])
    # (topic_id, probability) pairs the model returns for this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id is -1 if the model returns no topics for the song.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    if i < 4:
      # preview the first four songs as a sanity check
      print(probs)
      print(Y_train[i])
      print(topic_id)
    writer.writerow([artist, topic_id, Y_train[i]])

[(0, 0.25901562), (2, 0.03261063), (3, 0.43661842), (4, 0.23224649), (5, 0.034634035)]
Bee Gees - When A Lonely Heart Breaks


I stumble in the night
Never really knew what it would've been like
You're no longer there to break my fall
The heartache over you
I'd give it everything but I couldn't live through
I never saw the signs
You're the last to know when love is blind.

All the tears and the turbulent years
When I would not wait for no-one
Didn't stop and take a look at myself
And see me losing you.

(Chorus)
When a lonely heart breaks
It's the one that forsakes
It's the dream that we stole
And I'm missing you more
Than the fire that will roar
There's a hole in my soul
For you it's good-bye
For me it's to cry
For whom the bell tolls.

Seen you in a magazine
A picture at a party where you shouldn't have been
Hanging on the arm of someone else
I'm still in love with you
Won't you come back to your little boy blue
I've come to feel inside
This precious love was never mine.

Now I know 

In [None]:
# write the test csv
print(Y_test[0])

# One row per test song: artist, dominant LDA topic id, original lyric.
# newline='' is what the csv module documents for files passed to csv.writer,
# so the writer alone controls line endings (the lyrics contain newlines).
with open('/content/drive/Shareddrives/CS260-Project/data/lda-test-6-updated.csv', 'w', newline='') as testdata:
  writer = csv.writer(testdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_test):
    curr_doc = dictionary.doc2bow(Y_test_tokenized[i])
    # (topic_id, probability) pairs the model returns for this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id is -1 if the model returns no topics for the song.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    if i < 4:
      # preview the first four songs as a sanity check
      print(probs)
      print(Y_test[i])
      print(topic_id)
    writer.writerow([artist, topic_id, Y_test[i]])

Your love is so good for me
Your love is so good for me

Every day you're on my mind
Wanna be near you all the time
You made my poem rhyme
And my heart began to sing again
The day your eyes met mine

Your love is so good for me
Your love is so good for me

Two hearts just running free
Like a wind song through the trees
Your love does not possess
It just holds me where I wanna be
With binds of tenderness

Your love is so good for me
Your love is so good for me

Baby, you know your love is so good
It's good
You know it's good

Like a star up in the sky
Burning brightly, you and I
Time will tell if love survives
For we only have today
And today love is alive

Your love is so good for me
Your love is so good for me

It's so good
You know your love is so good,
So good for me...
It's so good

No need to say the words
When you touch me they will be heard
You gave so much to me
And you showed me how to love the way
True love was meant to be

Your love is so good for me
Your love is so good for

In [None]:
# Preprocess the validation lyrics, innermost call first:
# tokenize -> drop short/numeric tokens -> lemmatize -> remove stop words.
Y_val_tokenized = remove_stop_words(
    lemmatization(
        token_filtering(
            lyric_tokenization(Y_val))))

In [None]:
# write the validation CSV file
print(Y_val[0])

# One row per validation song: artist, dominant LDA topic id, original lyric.
# newline='' is what the csv module documents for files passed to csv.writer,
# so the writer alone controls line endings.
with open('/content/drive/Shareddrives/CS260-Project/data/lda-val-6.csv', 'w', newline='') as valdata:
  writer = csv.writer(valdata, delimiter=',')
  writer.writerow(['artist', 'topic_id', 'lyric'])
  for i, artist in enumerate(X_val):
    curr_doc = dictionary.doc2bow(Y_val_tokenized[i])
    # (topic_id, probability) pairs the model returns for this song
    probs = lda_model[curr_doc]
    # Dominant topic = pair with the highest probability (first one on ties);
    # topic_id is -1 if the model returns no topics for the song.
    topic_id, max_prob = max(probs, key=lambda pair: pair[1], default=(-1, -1))
    if i < 4:
      # preview the first four songs as a sanity check
      print(probs)
      print(Y_val[i])
      print(topic_id)
    writer.writerow([artist, topic_id, Y_val[i]])

hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late late play break string feel heart want feel tell real truth hurt lie worse anymore little know little hold time feel
[(0, 0.10326173), (2, 0.054414734), (3, 0.10834018), (4, 0.72854453)]
hold time feel break feel untrue convince speak voice tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little turn dust play house ruin run leave save like chase train late late tear try hold hurt try forgive okay play break string feel heart want feel tell real truth hurt lie worse anymore little run leave save like chase train know late l