In [1]:
import pandas as pd
from collections import Counter
import numpy as np

<br>
<br>

### **Loading Dataset**

In [2]:
def load_csv():
    df = pd.read_csv("spotify_million_song_dataset.csv", nrows=10000)
    return df

In [3]:
df = load_csv()
print(df["text"][1658])
len(df["text"])

Back In A us Made Car  
(Back In The ussr-Beatles)  
  
Ohhh...  
Used to drive a Honda C-I-V-I-C  
Didn't sleep a wink at night  
Auto-workers laid-off in Detroit last week  
Man it's such a dreadful sight  
  
I'm back in a us made car  
You know how lazy we are, boys  
Back in a us made car  
  
Power steering, power windows, seats and brakes  
It's bigger than the Astrodome  
Drive it under 50 or the back-end shakes  
God I hope it gets me home  
  
I'm back in a us made car  
You know how lazy we are, boys  
Back in a us  
Back in the Do-Less  
Back in a us made car  
  
Those foreign cars really knock me out  
They leave Chevettes behind  
They cruise right past all the screams and shouts  
From gm unemployment li-li  
Li-li-li-li-li-li-lines  
  
Oh tow that Iacocca monster to my house  
You didn't have to twist my arm  
It's my patriotic duty to be helping out  
Can't let the big three buy the farm  
  
I'm back in us made car  
A gallon don't go too far boy  
Back in a us made

10000

<br>
<br>

### **Cleaning and Tokenizing Dataset**

In [4]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
              'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
              'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
              'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
              'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
              'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
              'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
              'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
              'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
              'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
              'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
              's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
              'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
              'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
              'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [5]:
import re


def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

def load_data(df):
    tokenized_songs = []
    total_tokens = 0

    for index, song in df.iterrows():
        song_lyrics_paragraphs = song["text"].replace("\n\n", "").split("\n  \n")

        tokenized_song = []

        for paragraph in song_lyrics_paragraphs:
            cleaned_paragraph = paragraph.replace("\n", "").replace("  ", " ")
            tokenized_paragraph = tokenize(cleaned_paragraph)
            tokenized_filtered_paragraph = []

            for token in tokenized_paragraph:
                if token not in stop_words:
                    tokenized_filtered_paragraph.append(token)
                    total_tokens += 1

            tokenized_song.append(tokenized_filtered_paragraph)

        tokenized_songs.append(tokenized_song)

    print("Total number of words without stop words:", total_tokens)
    return tokenized_songs

In [6]:
tokenized_songs = load_data(df)

print("Total number of songs:", len(tokenized_songs))

Total number of words without stop words: 1106211
Total number of songs: 10000


<br>
<br>

### **Mapping words to ids and ids to words**

In [7]:
def mapping(tokenized_songs):

    word_to_id = {}
    id_to_word = {}
    all_tokens_cache = set()

    for tokenized_song in tokenized_songs:
        for tokenized_para in tokenized_song:
            for token in tokenized_para:
                all_tokens_cache.add(token)

    for index, word in enumerate(all_tokens_cache):
        word_to_id[word] = index
        id_to_word[index] = word

    return word_to_id, id_to_word

In [8]:
word_to_id, id_to_word = mapping(tokenized_songs)
print("Total Unique words: ", len(word_to_id))
print("Mapping preview:")
word_to_id
id_to_word

Total Unique words:  36605
Mapping preview:


{0: 'indulgence',
 1: 'means',
 2: 'recorded',
 3: 'frat',
 4: 'rent',
 5: 'drugstore',
 6: 'jerk',
 7: 'skin',
 8: 'busier',
 9: 'motored',
 10: 'officially',
 11: 'wong',
 12: 'readeth',
 13: "muthafuckas'll",
 14: 'incidentally',
 15: 'singt',
 16: 'duo',
 17: 'nissi',
 18: 'wanders',
 19: 'augmented',
 20: 'intricately',
 21: 'wingtip',
 22: "way'",
 23: 'bouche',
 24: 'dionne',
 25: 'upheaval',
 26: 'imma',
 27: 'praises',
 28: 'forcast',
 29: "sighin'",
 30: "moun'",
 31: 'performer',
 32: 'theprojects',
 33: 'cassidy',
 34: 'ben',
 35: 'saws',
 36: 'poodle',
 37: 'buffer',
 38: 'gears',
 39: 'biggity',
 40: "dependin'",
 41: 'tupelo',
 42: 'change',
 43: 'tattooed',
 44: 'rockstar',
 45: "'where",
 46: "other's",
 47: 'florida',
 48: "ain't",
 49: 'perico',
 50: 'yeh',
 51: 'colorful',
 52: 'trails',
 53: 'gouge',
 54: 'mot',
 55: 'exciety',
 56: 'bogie',
 57: 'ceremonies',
 58: 'fridges',
 59: 'unmerited',
 60: "approachin'",
 61: 'optimistically',
 62: 'impulsion',
 63: 'endea

<br>
<br>

### **One hot Encoding**

In [9]:
def one_hot_encode(id, vocab_size):
    base_vector = [0] * vocab_size
    base_vector[id] = 1
    return base_vector

<br>
<br>

### **Creating Training Dataset**

For this we first need to create the X,Y pairs from the song lyrics.

In [10]:
def generate_pairs(tokenized_songs, window):
    pairs = []

    for tokenized_song in tokenized_songs:
        for tokenized_paragraph in tokenized_song:
            for k, word in enumerate(tokenized_paragraph):
                start = max(0, k - window)
                end = min(len(tokenized_paragraph), k + window + 1)

                for index in range(start, end):
                    if index == k:
                        continue
                    else:
                        pairs.append((tokenized_paragraph[k], tokenized_paragraph[index]))
    return pairs

In [11]:
pairs = generate_pairs(tokenized_songs, 2)

In [12]:
print("Pairs Preview: ")
pairs[:10]
len(pairs)

Pairs Preview: 


4075414

In [13]:
def encode_data(batch, vocab_size, word_to_id):
    X = []
    y = []
    for pair in batch:
        X.append(one_hot_encode(word_to_id[pair[0]], vocab_size))
        y.append(one_hot_encode(word_to_id[pair[1]], vocab_size))

    return np.asarray(X), np.asarray(y)

<br>
<br>

### **Creating Batches**

In [14]:
batch_size = 3000
mini_batches = []
total_examples = len(pairs)
num_of_batches = total_examples // batch_size
leftover = total_examples % batch_size

for i in range(num_of_batches):
    start_index = i * batch_size
    end_index = (i + 1) * batch_size
    mini_batch = pairs[start_index:end_index]
    mini_batches.append(mini_batch)

if leftover > 0:
    last_mini_batch = pairs[-leftover:]
    mini_batches.append(last_mini_batch)

# return mini_batches

In [15]:
len(mini_batches[0])
len(mini_batches)

1359

In [16]:
X, y = encode_data(mini_batches[i], len(word_to_id), word_to_id)
del X
del y

In [None]:
test = np.empty((10, 2048, len(word_to_id)))

In [None]:
for i in range(10):
    X, y = encode_data(mini_batches[i], len(word_to_id), word_to_id)
    np.append(test, X)
del X
del y

In [17]:
W1 = np.random.randn(len(word_to_id), 300) * (np.sqrt(2. / 300))
W2 = np.random.randn(300, len(word_to_id)) * (np.sqrt(2. / len(word_to_id)))
W2.shape

(300, 36605)

In [18]:
def softmax(x):
    t = np.exp(x)
    t_sum = np.sum(t, axis=1, keepdims=True)
    a = t / t_sum
    return a
    

In [19]:
def cross_entropy(z, y):
    return - np.sum(np.log(z) * y)

In [20]:
for epoch in range(1):
    for index, batch in enumerate(mini_batches):
        X, y = encode_data(batch, len(word_to_id), word_to_id)
        Z1 = np.dot(X, W1)
        Z2 = np.dot(Z1, W2)
        A2 = softmax(Z2)

        loss = cross_entropy(A2, y)
        print("Loss:", loss)

        dA2 = - y / A2
        dA2.shape

        dZ2 = dA2 * (A2 * (1 - A2))
        dW2 = np.dot(Z1.T, dZ2)
        dW1 = np.dot(dZ2.T, Z1)

        W1 -= 0.001 * dW1
        W2 -= 0.001 * dW2

        # dA2 = A - y
        # dW2 = np.dot(Z1.T, dA2)
        # dA1 = np.dot(dA2, W2.T)
        # dW1 = np.dot(X.T, dA1)

        # W1 -= 0.01 * dW1
        # W2 -= 0.01 * dW2

        # if index == 20:
        #     break

Loss: 31522.700815724213
Loss: 31523.342339752777
Loss: 31522.913073406322
Loss: 31523.798469898225
Loss: 31523.297664745107
Loss: 31521.17921222732
Loss: 31523.506463022255
Loss: 31516.165782916123
Loss: 31521.501674154493
Loss: 31510.477611631257


KeyboardInterrupt: 

### End of Code 
### Rough Testing Code

In [None]:
dA2 = - y / A2
dA2.shape

dZ2 = dA2 * (A2 * (1 - A2))
dW2 = np.dot(Z1.T, dZ2)
dW1 = np.dot(dZ2.T, Z1)

W1 -= 0.01 * dW1
W2 -= 0.01 * dW2
dW1.shape

In [None]:
Z1.shape

<br>
<br>

##### Ignore the method below it was designed to be faster but it is a bit slower due to some extra computations. It functions the same as the above method just the approach is different.

In [None]:
tally = [0]

songs = []

tokens = []
para_count = 0
para_prev_length = 0

for index, song in df.iterrows():
    song_lyrics_paragraphs = song["text"].replace("\n\n", "").split("\n  \n")

    for paragraph in song_lyrics_paragraphs:
        para_count += 1
        tokenized_paragraph = tokenize(paragraph.replace("\n", "").replace("  ", " "))
        tokens.extend(tokenized_paragraph)
        para_prev_length += len(tokenized_paragraph)
        tally.append(para_prev_length)

In [None]:
def generate_pairs_fast(tokens, window):
    pairs = []

    for i, value in enumerate(tally):
        if i+1 == len(tally):
            break

        for k in range(tally[i], tally[i+1]):
            localized_k = k - tally[i]
            start = max(0, localized_k - window)
            end = min(tally[i+1] - tally[i], localized_k + window + 1)

            for index in range(start, end):
                if index == localized_k:
                    continue
                else:
                    pairs.append((tokens[localized_k + tally[i]], tokens[index + tally[i]]))
    print(pairs[0])
    return pairs

<br>
<br>

### **Generating a summary of words (Not required this was just to learn the Counter Collections Class)**

In [None]:
def get_words_summary(all_tokens):
    words_count_dict = Counter(all_tokens)
    print("Top 3 most common words are:", words_count_dict.most_common(3))
    print("Total unique words are:", len(words_count_dict))
    print("Total number of words:", sum(words_count_dict.values()))
    print("Total number of words:", len(all_tokens))

    return words_count_dict

In [None]:
words_count_dict = get_words_summary(all_tokens)

In [None]:
a = np.array([
    [1, 1, 1, 1],
    [2, 2, 2, 5],
    [3, 3, 3, 3],
])

In [None]:
b = np.exp(a)
c = np.sum(a, axis=1, keepdims=True)

a / c

In [None]:
def softmax_old(X):
    res = []
    for x in X:
        exp = np.exp(x)
        res.append(exp / exp.sum())
    return res