**This notebook takes an uploaded file of tokens from our lyrics and trains a GloVe model on that corpus. It generates and exports a GloVe embedding vector for each token.**

# Setup

In [None]:
# installations
!pip install glove-python-binary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting glove-python-binary
  Downloading glove_python_binary-0.2.0-cp38-cp38-manylinux1_x86_64.whl (974 kB)
[K     |████████████████████████████████| 974 kB 7.5 MB/s 
Installing collected packages: glove-python-binary
Successfully installed glove-python-binary-0.2.0


In [None]:
from glove import Glove, Corpus

## File Upload

In [None]:
# Upload the token file into the runtime. The loading cell below opens
# 'processed_tokens_ALL.txt', so the uploaded file must have that name.
from google.colab import files
 
# Keep the return value of the upload call; it is not read again in the visible cells.
uploaded = files.upload()

Saving processed_tokens_ALL.txt to processed_tokens_ALL.txt


In [None]:
# Build a list of songs, where each song is the list of its tokens
# (one line of the input file per song).
# str.split() with no argument splits on any run of whitespace and discards the
# line's trailing newline, so no '\n'-suffixed or empty-string tokens leak into
# the vocabulary (which split(' ') allowed).
# A blank line still yields an empty list, keeping one entry per input line.
with open('processed_tokens_ALL.txt', 'r', encoding='utf-8') as infile:
  all_tokens = [line.split() for line in infile]

In [None]:
# Sanity check: number of songs loaded, plus a short preview of the first song.
# Only a small slice is shown; printing whole songs floods the notebook output.
print(len(all_tokens))
print(all_tokens[0][:20] if all_tokens else [])

86893
[['feel', 'unsur', 'take', 'hand', 'lead', 'danc', 'floor', 'music', 'die', 'someth', 'eye', 'call', 'mind', 'silver', 'screen', 'sad', 'goodby', 'never', 'gon', 'na', 'danc', 'guilti', 'foot', 'got', 'rhythm', 'though', 'easi', 'pretend', 'know', 'fool', 'known', 'better', 'cheat', 'friend', 'wast', 'chanc', 'given', 'never', 'gon', 'na', 'danc', 'way', 'danc', 'time', 'never', 'mend', 'careless', 'whisper', 'good', 'friend', 'heart', 'mind', 'ignor', 'kind', 'comfort', 'truth', 'pain', 'find', 'never', 'gon', 'na', 'danc', 'guilti', 'foot', 'got', 'rhythm', 'though', 'easi', 'pretend', 'know', 'fool', 'known', 'better', 'cheat', 'friend', 'wast', 'chanc', 'given', 'never', 'gon', 'na', 'danc', 'way', 'danc', 'never', 'without', 'love', 'tonight', 'music', 'seem', 'loud', 'wish', 'could', 'lose', 'crowd', 'mayb', 'better', 'way', 'hurt', 'thing', 'want', 'say', 'could', 'good', 'togeth', 'could', 'live', 'danc', 'forev', 'gon', 'na', 'danc', 'plea', 'stay', 'never', 'gon', 'na',

# Training GloVe on Custom Corpus

In [None]:
# Hyperparameters gathered in one place so they are easy to find and tune.
WINDOW_SIZE = 10       # context window passed to Corpus.fit
EMBEDDING_DIM = 5      # number of components in each learned vector
LEARNING_RATE = 0.05
EPOCHS = 30
NUM_THREADS = 4
SEED = 42              # seeds the model's random state for repeatable runs

# create corpus consisting of our tokens (one token list per song)
corpus = Corpus()
corpus.fit(all_tokens, window=WINDOW_SIZE)

# train GloVe model on our corpus
# NOTE(review): training uses several threads, so results may still differ
# slightly between runs even with a fixed seed; confirm if exact
# reproducibility matters (set NUM_THREADS = 1 to check).
glove = Glove(no_components=EMBEDDING_DIM, learning_rate=LEARNING_RATE, random_state=SEED)
glove.fit(corpus.matrix, epochs=EPOCHS, no_threads=NUM_THREADS, verbose=True)

# attach the corpus dictionary so vectors can be looked up by token, then persist the model
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [None]:
# show the learned embedding for one sample token
sample_index = glove.dictionary['bill']  # position of 'bill' within glove.word_vectors
print(glove.word_vectors[sample_index])

[ 0.08225988 -1.19179927 -0.86380483 -0.924785    0.40373187]


In [None]:
# export glove embeddings

import json

# One text line per vector in glove.word_vectors; every component is followed
# by a single space (including the last one), then a newline.
with open('glove_vectors.txt', 'w') as vectors_file:
    for row in glove.word_vectors:
        vectors_file.write(''.join(str(component) + ' ' for component in row))
        vectors_file.write('\n')

# The token dictionary (token -> position in glove.word_vectors) as a single JSON document.
with open('glove_dictionary.json', 'w') as dictionary_file:
    serialized_dictionary = json.dumps(glove.dictionary)
    dictionary_file.write(serialized_dictionary)
