In [5]:
import multiprocessing
import os
import pprint
import re

import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.manifold
import sklearn.preprocessing

In [12]:
# Load the lyrics dataset into a DataFrame; later cells read its 'lyrics' column.
songs = pd.read_json(
    "edSheeranLyrics.json",
    encoding='utf-8',
)


In [23]:
# Tokenise every song: lower-case the lyrics and split on whitespace, giving
# one list of word tokens per song. This list of token lists is what the
# Word2Vec model is fed further down.
text_corpus = [lyrics.lower().split() for lyrics in songs['lyrics']]


# --- Word2Vec hyperparameters -------------------------------------------

# Dimensionality of the learned word vectors.
num_features = 50

# Minimum number of occurrences for a word to be kept in the vocabulary.
# With 1, every word in the corpus is kept.
min_word_count = 1

# Number of worker threads used for training.
# gensim's documentation states that a fully reproducible run requires a
# single worker thread, because multi-threaded training introduces ordering
# jitter from OS thread scheduling. Reproducibility is the stated purpose of
# `seed` below, so training is single-threaded.
# (gensim's docs additionally call for a fixed PYTHONHASHSEED on Python 3.)
num_workers = 1

# Context window length.
context_size = 7

# Downsample setting for frequent words.
# gensim's documentation gives (0, 1e-5) as the useful range.
# NOTE(review): 1e-1 is far above that range, so frequent words are barely
# downsampled -- confirm that this value is intended.
downsampling = 1e-1

# Seed for the random number generator, so that results are reproducible
# (deterministic runs are easier to debug).
seed = 2

# Untrained skip-gram (sg=1) model; the vocabulary is built and the model is
# trained in later statements.
songs2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

# Scan the tokenised songs to build the model's vocabulary, then report how
# many distinct tokens it contains.
songs2vec.build_vocab(text_corpus)
vocab_size = len(songs2vec.vocab)
print("Number of unique words in Ed Sheeran's Songs: " + str(vocab_size))

Number of unique words in Ed Sheeran's Songs: 2328


In [24]:
# Train the word embeddings on the tokenised lyrics.
# NOTE(review): newer gensim releases appear to require explicit
# total_examples / epochs arguments here -- confirm against the installed version.
songs2vec.train(text_corpus)

# Persist the trained model under ./trained, creating the directory if it
# does not exist yet.
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
songs2vec.save(os.path.join(model_dir, "songs2vec.w2v"))

In [25]:
# Reload the model from trained/songs2vec.w2v, so the cells below work from
# the saved copy.
model_path = os.path.join("trained", "songs2vec.w2v")
songs2vec = w2v.Word2Vec.load(model_path)

In [29]:
import sklearn
def songVector(row):
    """Return an L2-normalised vector representing one song's lyrics.

    The lyrics are lower-cased and split on whitespace, the ``songs2vec``
    vectors of all resulting tokens are summed, and the sum is scaled to
    unit L2 norm.

    Parameters
    ----------
    row : str
        Lyrics of a single song.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, songs2vec.vector_size)``. Lyrics that contain
        no tokens (empty or whitespace-only) yield an all-zero row.

    Notes
    -----
    Every token is looked up with ``songs2vec[word]``; presumably this raises
    ``KeyError`` for a token missing from the model's vocabulary -- verify
    before applying this to text the model was not trained on.
    """
    words = row.lower().split()

    # Without this guard the sum below would be the plain int 0, and the
    # call to .reshape() would fail with AttributeError.
    if not words:
        # float32 is assumed to match the dtype of gensim's word vectors, so
        # stacking this row with others does not upcast them -- TODO confirm.
        return np.zeros((1, songs2vec.vector_size), dtype=np.float32)

    # Built-in sum() starts from 0 and adds left to right, i.e. the same
    # accumulation order as an explicit loop.
    vector_sum = sum(songs2vec[word] for word in words)

    # normalize() expects a 2-D array: one row holding the song's vector.
    vector_sum = vector_sum.reshape(1, -1)
    normalised_vector_sum = sklearn.preprocessing.normalize(vector_sum)
    return normalised_vector_sum

songs['lyrics_vector'] = songs['lyrics'].apply(songVector)

In [31]:
# Display the dimensions of the songs frame as a (rows, columns) tuple.
songs.shape


(47, 4)

In [36]:
# Stack the per-song vectors into one 2-D matrix with a row per song.
# Each entry of songs['lyrics_vector'] is shaped (1, n_features) by
# songVector, so np.vstack yields (n_songs, n_features). Both dimensions are
# derived from the data rather than hard-coded, so this keeps working when
# the number of songs or the embedding size changes.
songs_vector = list(songs['lyrics_vector'])

X = np.vstack(songs_vector)

# Project the rows of X down to two dimensions with t-SNE. random_state is
# fixed, and verbose=2 makes the fit log its progress.
tsne = sklearn.manifold.TSNE(
    n_components=2,
    n_iter=500,
    random_state=0,
    verbose=2,
)

# Despite its name, this holds one 2-D point per row of X.
all_word_vectors_matrix_2d = tsne.fit_transform(X)

# Wrap the 2-D t-SNE coordinates in a frame, one row per row of the t-SNE
# output. A freshly built DataFrame already has a default 0..n-1 index, so
# it needs no reset.
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])

# Give songs the same 0..n-1 index so that the column-wise concat below
# pairs rows by position. Rebinding the name (instead of inplace=True) avoids
# mutating the frame object in place.
songs = songs.reset_index(drop=True)

# Original song columns plus the X / Y coordinates, side by side.
two_dimensional_songs = pd.concat([songs, df], axis=1)

two_dimensional_songs.head()

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 47 / 47
[t-SNE] Mean sigma: 0.141871
[t-SNE] Iteration 25: error = 1.5810152, gradient norm = 0.0028508
[t-SNE] Iteration 50: error = 1.4712532, gradient norm = 0.0021362
[t-SNE] Iteration 75: error = 1.3032110, gradient norm = 0.0011651
[t-SNE] Iteration 100: error = 1.2693498, gradient norm = 0.0010068
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.269350
[t-SNE] Iteration 125: error = 1.2138315, gradient norm = 0.0008863
[t-SNE] Iteration 125: gradient norm 0.000886. Finished.
[t-SNE] Error after 125 iterations: 1.269350


Unnamed: 0,lyrics,song,url,lyrics_vector,X,Y
0,The club isn't the best place to find a lover\...,Shape Of You,https://www.directlyrics.com/ed-sheeran-shape-...,"[[0.156699, -0.174665, -0.0537184, 0.00572745,...",538.300882,13.58604
1,When I was six years old I broke my leg\nI was...,Castle On The Hill,https://www.directlyrics.com/ed-sheeran-castle...,"[[0.155166, -0.205254, -0.0616643, -0.00568822...",142.166006,23.13836
2,"You are the one, girl\nYou know that it's true...",How Would You Feel,https://www.directlyrics.com/ed-sheeran-how-wo...,"[[0.123813, -0.143095, -0.0610325, -0.0159541,...",-126.436687,-44.398532
3,When your legs don't work like they used to be...,Thinking Out Loud,https://www.directlyrics.com/ed-sheeran-thinki...,"[[0.140063, -0.174749, -0.0692255, -0.0133274,...",108.299064,-12.31462
4,"It's just another night, and I'm staring at th...",All Of The Stars,https://www.directlyrics.com/ed-sheeran-all-of...,"[[0.182078, -0.191971, -0.0634723, -0.028479, ...",-123.368071,50.674563


In [None]:
# Export the combined table (song columns, lyric vectors and t-SNE X / Y
# coordinates) to a CSV file in the working directory.
output_csv = "songs.csv"
two_dimensional_songs.to_csv(output_csv)