<center>
    Predicting Song Similarity using Deep Neural Networks
    <br>
    Part 02: Models
</center>
<p style="text-align:right">
    Sudheer Kumar Reddy Beeram
    <br>
    Sivaraman Lakshmipathy
    <br>
    Sneha Shet
</p>

<b>Models</b>
<br>
This Jupyter Notebook contains the source code to train the models using the generated dataset.
<br>
Note: All models were trained on Google Collab using TPU accelerator.

<center><b>Loading the lyric pairs and preprocessing the data</b></center>

In [1]:
import os,sys,csv
import statistics
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Removing words that occur less than 100 times 
import csv
from collections import defaultdict
nltk.download('stopwords')

def clean(string):
  string = string.replace('(','')
  string = string.replace(')','')
  string = remove_stopwords(string)
  return string

def remove_stopwords(text):
    retStr = ""
    for w in w_tokenizer.tokenize(text):
        if w not in stopWords:
            lyrics = [''.join(e.lower() for e in w if e.isalpha())]
            retStr += lyrics[0] + " "
    return retStr

stopWords = set(stopwords.words('english'))
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()   
counter = defaultdict(int)
lines = open('fullDump.txt').read().splitlines()

for sentence in lines:
  sentence = clean(sentence)
  for word in sentence.split():
    counter[word]+=1

mx_word_count=counter[max(counter.keys(), key=(lambda k: counter[k]))]
min_word_count=counter[min(counter.keys(), key=(lambda k: counter[k]))]

eliminate=set()
for word,count in counter.items():
  if count<100:eliminate.add(word)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Preprocessing the lyrics
def basic_preprocessor(data):
    data['X_left'] = data['X_left'].str.lower() # converting to lower case
    data['X_right'] = data['X_right'].str.lower() # converting to lower case
    data['X_left'] = data['X_left'].str.replace('(','') # Removing parenthesis
    data['X_left'] = data['X_left'].str.replace(')','') # Removing parenthesis
    data['X_right'] = data['X_right'].str.replace('(','') # Removing parenthesis
    data['X_right'] = data['X_right'].str.replace(')','') # Removing parenthesis
    data['X_left'] = data['X_left'].apply(remove_stopwords) # Removing stopwords
    data['X_right'] = data['X_right'].apply(remove_stopwords) # Removing stopwords
    return data

#Reading DataFiles
dataset_filepath = "Project_dataset"
dataset_file = "final_dataset.csv"

data = pd.read_csv(dataset_file, sep = "\t", encoding = "utf-8")
data = data.reindex(np.random.permutation(data.index))

stopWords = set(stopwords.words('english'))|eliminate
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
data = basic_preprocessor(data)

In [0]:
dataCorpus = data['X_left'].copy()
dataCorpus = dataCorpus.append(data['X_right'].copy())

In [5]:
# Vectorizing the sentences into integer indices
def getEmbeddedData(data, tokenizerObj):
    return tokenizerObj.texts_to_sequences(data)

word_tokenizer = tf.keras.preprocessing.text.Tokenizer()
word_tokenizer.fit_on_texts(dataCorpus)

# Dictionary of all the unique words and their indices in the order of their occuring frequency
word_index = word_tokenizer.word_index

# No of unique words
vocab_length = len(word_index) + 1
 
print("Vocabulary length:", vocab_length)

data['X_left'] = getEmbeddedData(data['X_left'], word_tokenizer)
data['X_right'] = getEmbeddedData(data['X_right'], word_tokenizer)

#Changing Y label to 1 if >0.5 and to 0 otherwise
data['Y']=data['Y'].where(data['Y']<=0.5,1)
data['Y']=data['Y'].where(data['Y']>0.5,0)
data['Y']=data['Y'].astype(int)
data.head()

Vocabulary length: 13419


Unnamed: 0,X_left_trackid,X_left,X_right_trackid,X_right,Y
16112,TRMCJGU128E0791C67,"[21, 21, 18, 5656, 18, 366, 558, 202, 115, 717...",TRGJHMO128F42478CC,"[130, 669, 511, 2107, 406, 25, 2957, 7, 1965, ...",0
6919,TRFMITB128F92C91AD,"[17, 30, 1, 544, 379, 14, 14, 58, 6, 3445, 1, ...",TRPTPKD128F426A91D,"[13, 38, 1, 28, 111, 1, 1207, 1762, 43, 25, 14...",1
21111,TRWGQDF128F92E4F04,"[354, 4082, 1054, 484, 5657, 753, 9, 611, 182,...",TRWMAFQ128F932E475,"[404, 404, 13, 53, 2675, 262, 1572, 53, 462, 4...",0
17165,TRYGTCS128F93146AA,"[49, 2, 13, 78, 39, 2, 246, 18, 252, 2, 18, 52...",TRRSFKQ128F1484AB8,"[731, 64, 80, 658, 397, 2412, 88, 54, 28, 94, ...",0
21035,TRWDNTC128F92E87C9,"[137, 33, 1029, 85, 288, 943, 133, 122, 13, 45...",TRATBUH128EF34066F,"[165, 351, 202, 51, 370, 264, 392, 3384, 350, ...",0


In [6]:
from keras.preprocessing.sequence import pad_sequences
#Max Length of a sentence 
word_count = lambda sentence: len(word_tokenize(sentence))
longest_sentence = max(dataCorpus, key=word_count)
length_long_sentence = len(word_tokenize(longest_sentence))
print("Max length:", length_long_sentence)

dataCorpus_1 = data['X_left'].copy()
dataCorpus_1 = dataCorpus_1.append(data['X_right'].copy())
k =dataCorpus_1.tolist()
f = [len(g) for g in k ]
max_len = max(f)
x = statistics.mean(f)

#Mean length was 38.2 and stand dev = 22 hence restricting max seq length to 60
sequence_length = 60
data2 = data.copy()
data2['X_left'] = pad_sequences(data['X_left'], sequence_length, padding='post').tolist()
data2['X_right'] = pad_sequences(data['X_right'], sequence_length, padding='post').tolist()

Using TensorFlow backend.


Max length: 255


In [7]:
data2.head()

Unnamed: 0,X_left_trackid,X_left,X_right_trackid,X_right,Y
16112,TRMCJGU128E0791C67,"[21, 21, 18, 5656, 18, 366, 558, 202, 115, 717...",TRGJHMO128F42478CC,"[130, 669, 511, 2107, 406, 25, 2957, 7, 1965, ...",0
6919,TRFMITB128F92C91AD,"[17, 30, 1, 544, 379, 14, 14, 58, 6, 3445, 1, ...",TRPTPKD128F426A91D,"[13, 38, 1, 28, 111, 1, 1207, 1762, 43, 25, 14...",1
21111,TRWGQDF128F92E4F04,"[354, 4082, 1054, 484, 5657, 753, 9, 611, 182,...",TRWMAFQ128F932E475,"[404, 404, 13, 53, 2675, 262, 1572, 53, 462, 4...",0
17165,TRYGTCS128F93146AA,"[49, 2, 13, 78, 39, 2, 246, 18, 252, 2, 18, 52...",TRRSFKQ128F1484AB8,"[731, 64, 80, 658, 397, 2412, 88, 54, 28, 94, ...",0
21035,TRWDNTC128F92E87C9,"[137, 33, 1029, 85, 288, 943, 133, 122, 13, 45...",TRATBUH128EF34066F,"[165, 351, 202, 51, 370, 264, 392, 3384, 350, ...",0


In [0]:
#Splitting into train/test/dev and adding (b,a) pairs
validationDataSize = 3500
testDataSize = 3500
trainingDataSize = data2.shape[0] - validationDataSize - testDataSize

#Train data
trainData_similar = data2[:trainingDataSize//2]
trainData_non_similar = data2[data2.shape[0]//2:data2.shape[0]//2 + trainingDataSize//2]

trainData = pd.concat([trainData_similar,trainData_non_similar],ignore_index=True, sort = False)

trainData1 = pd.DataFrame({'X_left_trackid': trainData['X_right_trackid'],'X_left':trainData['X_right'],'X_right_trackid': trainData['X_left_trackid'],'X_right':trainData['X_left'],'Y':trainData['Y']})
trainData_both = pd.concat([trainData,trainData1],ignore_index=True, sort = False)
trainData_both= trainData_both.sample(frac=1).reset_index(drop=True)

#Validation Data
validationData_similar = data2[trainingDataSize//2:trainingDataSize//2 + validationDataSize//2]
validationData_non_similar = data2[data2.shape[0]//2 + trainingDataSize//2:data2.shape[0]//2 + trainingDataSize//2 + validationDataSize//2]

validationData = pd.concat([validationData_similar,validationData_non_similar],ignore_index=True, sort = False)

validationData1 = pd.DataFrame({'X_left_trackid': validationData['X_right_trackid'],'X_left':validationData['X_right'],'X_right_trackid': validationData['X_left_trackid'],'X_right':validationData['X_left'],'Y':validationData['Y']})
validationData_both = pd.concat([validationData,validationData1],ignore_index=True, sort = False)
validationData_both= validationData_both.sample(frac=1).reset_index(drop=True)


#Test Data
testData_similar = data2[(trainingDataSize + validationDataSize)//2:data2.shape[0]//2]
testData_non_similar = data2[data2.shape[0] - testDataSize//2:data2.shape[0]]

testData = pd.concat([testData_similar,testData_non_similar],ignore_index=True, sort = False)

testData1 = pd.DataFrame({'X_left_trackid': testData['X_right_trackid'],'X_left':testData['X_right'],'X_right_trackid': testData['X_left_trackid'],'X_right':testData['X_left'],'Y':testData['Y']})
testData_both = pd.concat([testData,testData1],ignore_index=True, sort = False)
testData_both= testData_both.sample(frac=1).reset_index(drop=True)


In [0]:
#Using Pretrained embeddings
embeddings_index = dict()
f = open(r'lyrics_model_100dim.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    if index > vocab_length - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

<center><b>Training and Evaluating the Models</b></center>
<br>
<b>1. Siamese [Base Line]</b>

In [0]:
def exponent_neg_manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

In [0]:
from keras.callbacks import EarlyStopping
X_train_left = np.array(trainData_both['X_left'].tolist())
X_train_right = np.array(trainData_both['X_right'].tolist())
X_valid_left = np.array(validationData_both['X_left'].tolist())
X_valid_right = np.array(validationData_both['X_right'].tolist())
Y_train = np.array(trainData_both['Y'].tolist())
Y_valid = np.array(validationData_both['Y'].tolist())
early_stop = EarlyStopping(monitor='val_loss', patience=5)

In [12]:
from keras.layers import Lambda
import keras.backend as K
from keras.models import Input,Model
from keras import regularizers
from keras.initializers import RandomNormal
from keras.layers import Dense, Dropout,Embedding, Activation,Concatenate,LSTM,Subtract,Multiply, GRU
from keras import optimizers
from keras.callbacks import EarlyStopping


max_input_length = sequence_length
vocabulary_size = vocab_length

n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 25

def Train_Siamese(learning_rate):
    
    global max_input_length,vocabulary_size  
    embedding_layer = Embedding(vocabulary_size,
                                embedding_matrix.shape[1],
                                weights=[embedding_matrix],
                                input_length=max_input_length,
                                trainable=True,
                                name = "embedding")

    sequence_input_A = Input(shape=(max_input_length,), dtype='int32',name="Input_A")
    sequence_input_B = Input(shape=(max_input_length,), dtype='int32',name="Input_B")

    embedded_sequences_A = embedding_layer(sequence_input_A)
    embedded_sequences_B = embedding_layer(sequence_input_B)

    lstm = LSTM(max_input_length)

    hfinal_A = lstm(embedded_sequences_A)
    hfinal_B = lstm(embedded_sequences_B)
    
    # Calculates the distance as defined by the MaLSTM model
    malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                             output_shape=lambda x: (x[0][0], 1))([hfinal_A, hfinal_B])

    # Pack it all up into a model
    malstm = Model([sequence_input_A, sequence_input_B], [malstm_distance])

    # Adadelta optimizer, with gradient clipping by norm
    optimizer = optimizers.Adadelta(clipnorm=gradient_clipping_norm)

    malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
    malstm.summary()
    
    malstm_trained = malstm.fit([X_train_left, X_train_right], Y_train, 
                                batch_size=batch_size, 
                                nb_epoch=n_epoch,
                                validation_data=([X_valid_left, X_valid_right], Y_valid))
    return malstm
    
malstm_trained = Train_Siamese(0.01)











Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_A (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
Input_B (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 100)      1341900     Input_A[0][0]                    
                                                                 Input_B[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 60)           38640       embedding[0][0]  



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 34040 samples, validate on 7000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [13]:
#Evaluate
X_test_left = np.array(testData_both['X_left'].tolist())
X_test_right = np.array(testData_both['X_right'].tolist())
Y_test = np.array(testData_both['Y'].tolist())
malstm_trained.evaluate([X_test_left,X_test_right], Y_test)



[0.28533432408741544, 0.48228571428571426]

<b>2. Twin Architecture [LSTM]</b>

In [0]:
def Train_LSTM(learning_rate,combine_type,output_units):
    
    global max_input_length,vocabulary_size
    random_weights = RandomNormal(mean=0.0, stddev=0.05, seed=100)


    embedding_layer = Embedding(vocabulary_size,
                                embedding_matrix.shape[1],
                                weights=[embedding_matrix],
                                input_length=max_input_length,
                                trainable=True,
                                name = "embedding")

    sequence_input_A = Input(shape=(max_input_length,), dtype='int32',name="Input_A")
    sequence_input_B = Input(shape=(max_input_length,), dtype='int32',name="Input_B")

    embedded_sequences_A = embedding_layer(sequence_input_A)
    embedded_sequences_B = embedding_layer(sequence_input_B)

    lstm = LSTM(output_units, return_sequences=False, 
                  dropout=0.1, recurrent_dropout=0.1)

    hfinal_A = lstm(embedded_sequences_A)
    hfinal_B = lstm(embedded_sequences_B)


    if combine_type=="Subtract":
        combined = Subtract(name="Subtract")([hfinal_A, hfinal_B])
    elif combine_type == "Multiply":
        combined = Multiply(name="Multiply")([hfinal_A, hfinal_B])
    elif combine_type=="Concatenate":
        combined = Concatenate(axis=-1,name="Concatenate")([hfinal_A, hfinal_B])

    model = Dropout(0.1,name="Dropout_layer_1")(combined)
    model = Dense(64,activation='relu',name="Dense_1")(model)
    model = Dense(1,activation='sigmoid',name="Ouput_layer")(model)
    final_model = Model(inputs=[sequence_input_A,sequence_input_B], outputs=model)

    #optimizers
    Adam=optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)

    final_model.compile(loss='binary_crossentropy',optimizer=Adam,metrics=['accuracy'])
    final_model.summary()
    print('......................................................')

    return final_model

In [15]:
#fitting the model
final_model_LSTM = Train_LSTM(0.01,"Multiply",64)
final_model_LSTM.fit([X_train_left,X_train_right], Y_train, batch_size=64, epochs=25, validation_data = ([X_valid_left ,X_valid_right],Y_valid ), callbacks = [early_stop])#, verbose=2)


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_A (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
Input_B (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 100)      1341900     Input_A[0][0]                    
                                                                 Input_B[0][0]                    
__________________________________________________________________________

<keras.callbacks.History at 0x7f33aebc89e8>

In [16]:
# Evaluating the model
final_model_LSTM.evaluate([X_test_left,X_test_right], Y_test)



[0.7678706512451172, 0.49914285710879736]

<b>3.Twin Architecture [GRU]</b>

In [0]:
def Train_GRU(learning_rate,combine_type,output_units):
    
    global max_input_length,vocabulary_size
    embedding_layer = Embedding(vocabulary_size,
                                embedding_matrix.shape[1],
                                weights=[embedding_matrix],
                                input_length=max_input_length,
                                trainable=True,
                                name = "embedding")

    sequence_input_A = Input(shape=(max_input_length,), dtype='int32',name="Input_A")
    sequence_input_B = Input(shape=(max_input_length,), dtype='int32',name="Input_B")

    embedded_sequences_A = embedding_layer(sequence_input_A)
    embedded_sequences_B = embedding_layer(sequence_input_B)

    gru = GRU(embedding_matrix.shape[1], return_sequences=False, 
                  dropout=0.1, recurrent_dropout=0.1)

    hfinal_A = gru(embedded_sequences_A)
    hfinal_B = gru(embedded_sequences_B)


    if combine_type=="Subtract":
        combined = Subtract(name="Subtract")([hfinal_A, hfinal_B])
    elif combine_type == "Multiply":
        combined = Multiply(name="Multiply")([hfinal_A, hfinal_B])
    elif combine_type=="Concatenate":
        combined = Concatenate(axis=-1,name="Concatenate")([hfinal_A, hfinal_B])

    model = Dropout(0.1,name="Dropout_layer_1")(combined)
    model = Dense(64,activation='relu',name="Dense_1")(model)
    model = Dense(1,activation='sigmoid',name="Ouput_layer")(model)
    final_model = Model(inputs=[sequence_input_A,sequence_input_B], outputs=model)


    #optimizers
    Adam=optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)

    final_model.compile(loss='binary_crossentropy',optimizer=Adam,metrics=['accuracy'])
    final_model.summary()
    print('......................................................')
    return final_model

In [18]:
#fitting the model
final_model_GRU = Train_GRU(0.01,"Multiply",64)
final_model_GRU.fit([X_train_left,X_train_right], Y_train, batch_size=64, epochs=25, validation_data = ([X_valid_left ,X_valid_right],Y_valid ), callbacks = [early_stop])#, verbose=2)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_A (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
Input_B (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 100)      1341900     Input_A[0][0]                    
                                                                 Input_B[0][0]                    
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, 100)          60300       embedding[0][0]            

<keras.callbacks.History at 0x7f33a08f6518>

In [19]:
# Evaluating the model
final_model_GRU.evaluate([X_test_left,X_test_right], Y_test)



[0.6937591751643589, 0.49514285714285716]