# **Representation Learning**

### The aim of this work is to learn different types of embeddings, such as Word2vec, GloVe and fastText, and to compare their performance on a sentence classification task. A further goal is to learn which embeddings the network prefers for a given problem by predicting a weight for each embedding type.

## **Dynamic Meta-Embeddings for Improved Sentence Representations**

Dynamic meta-embeddings are a simple yet effective method for the supervised learning of embedding ensembles. The method gives a network access to multiple types of embeddings, allowing it to learn which embeddings it
prefers by predicting a weight for each embedding type, optionally depending on the context.

## Reference

Kiela et al., Dynamic Meta-Embeddings for Improved Sentence Representations, EMNLP, 2018.



In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; later cells read
# the pre-trained embedding files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Input, Reshape, Concatenate, Permute, Activation, multiply, Lambda
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras import Model
import tensorflow.keras.backend as K

In [None]:
# Read the positive reviews, one review per line.
# The file is opened in binary mode and decoded with errors ignored so that
# stray non-UTF-8 bytes in the corpus do not abort the load.
# The context manager guarantees the handle is closed even if decoding fails.
with open('rt-polarity.pos', 'rb') as file:
  lines = file.read().decode('utf-8', 'ignore').splitlines()

positive_data = list(lines)

  

In [None]:
# Show the first positive review as a quick check of the loaded text.
positive_data[0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . '

In [None]:
# Read the negative reviews, one review per line.
# The file is opened in binary mode and decoded with errors ignored so that
# stray non-UTF-8 bytes in the corpus do not abort the load.
# The context manager guarantees the handle is closed even if decoding fails.
with open('rt-polarity.neg', 'rb') as file:
  lines = file.read().decode('utf-8', 'ignore').splitlines()

negative_data = list(lines)

In [None]:
# Removing punctuations

# Characters to delete: punctuation plus the digits 0-9.
# A raw string keeps the backslash as a literal character; the previous
# non-raw literal contained the invalid escape sequence "\,".
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~0123456789'''

# One translation table deletes every listed character in a single pass per
# sentence, instead of calling str.replace once per offending character.
_strip_table = str.maketrans('', '', punctuations)

# Slice assignment updates both lists in place, as the original loops did.
positive_data[:] = [sentence.translate(_strip_table) for sentence in positive_data]
negative_data[:] = [sentence.translate(_strip_table) for sentence in negative_data]
  



In [None]:
# Show the first positive review again to see the effect of the character stripping.
positive_data[0]

'the rock is destined to be the st centurys new  conan  and that hes going to make a splash even greater than arnold schwarzenegger  jeanclaud van damme or steven segal  '

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-09-06 09:49:30--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-09-06 09:49:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-06 09:49:30--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# !unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Binary sentiment labels: 1 = positive, 0 = negative.
# Each split lists all of its positive labels first, then all of its negative
# labels: 4500 + 4500 for training and 831 + 831 for testing.
train_label = np.repeat([1, 0], 4500)
test_label = np.repeat([1, 0], 831)

In [None]:
import nltk
# Download the WordNet corpus; the WordNetLemmatizer used later in the notebook
# needs it to be available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Using Lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_sentences(sentences, lemmatizer):
  """Lemmatize every whitespace-separated token of every sentence.

  Args:
    sentences: iterable of strings.
    lemmatizer: object exposing ``lemmatize(word) -> str``.

  Returns:
    A list with one string per input sentence, where the lemmatized tokens
    are joined by single spaces.
  """
  return [
      " ".join(lemmatizer.lemmatize(word) for word in sentence.split())
      for sentence in sentences
  ]


# The same preprocessing is applied to both classes through one helper,
# replacing two copy-pasted loops.
pre_positive_data = _lemmatize_sentences(positive_data, lemmatizer)
pre_negative_data = _lemmatize_sentences(negative_data, lemmatizer)

In [None]:
# The first 4500 reviews of each class form the training split and the rest
# form the test split; positives come before negatives in both.
train_data = [*pre_positive_data[:4500], *pre_negative_data[:4500]]
test_data = [*pre_positive_data[4500:], *pre_negative_data[4500:]]

In [None]:
# Longest sentence (in whitespace-separated tokens) of each split.
train_max_len = max(len(sentence.split()) for sentence in train_data)
test_max_len = max(len(sentence.split()) for sentence in test_data)

# Use the larger of the two as the common length.
max_len = max(train_max_len, test_max_len)


In [None]:
# Display the common sequence length computed from both splits.
max_len

51

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the TRAINING split only. The embedding matrices built
# later are indexed by train_word_index, so every sequence fed to the models
# must use exactly these ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
sequences = tokenizer.texts_to_sequences(train_data)

#pad sequences
train_word_index = tokenizer.word_index
print("number of unique tokens = ", len(train_word_index))

train_padded_sequences = pad_sequences(sequences, padding='post', maxlen = max_len)

# Vocabulary of the test split, for reporting only. A separate tokenizer is
# used because re-fitting the training tokenizer on the test data rebuilds its
# word_index: test ids would stop matching the embedding rows and could exceed
# the embedding matrix size.
test_tokenizer = Tokenizer()
test_tokenizer.fit_on_texts(test_data)
test_word_index = test_tokenizer.word_index
print("number of unique tokens = ", len(test_word_index))

# Encode the test split with the train-fitted tokenizer. No oov_token is
# configured, so words unseen during training are dropped from the sequence.
sequences = tokenizer.texts_to_sequences(test_data)

#pad sequences
test_padded_sequences = pad_sequences(sequences, padding='post', maxlen = max_len)

number of unique tokens =  16763
number of unique tokens =  18310


In [None]:
# Inspect the first encoded and padded test sequence.
test_padded_sequences[0]

array([   2,   54,  245,  167,  334,   22,    4,    1,   82,  286,    1,
        259, 9102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

Using GloVe embeddings

In [None]:
import numpy as np
# load the whole embedding into memory
# Each line of the GloVe text file is "<word> <v1> <v2> ... <vN>".
embeddings_index = dict()
embedding_dim = 300
# Explicit UTF-8 avoids depending on the platform default encoding, and the
# context manager closes the file even if a line fails to parse.
with open('/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/glove.6B.300d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# Row i of the matrix holds the GloVe vector of the word with tokenizer id i.
# Row 0 stays zero because id 0 is the padding value.
unique_words = len(train_word_index) + 1
glove_embedding = np.zeros((unique_words, embedding_dim))

for word, i in train_word_index.items():
  # Look the word up afresh on every iteration. Previously `embedding` kept
  # its value from the last word that was found, so out-of-vocabulary words
  # silently received another word's vector.
  embedding = embeddings_index.get(word)
  if embedding is not None:
    glove_embedding[i] = embedding
  # Words missing from GloVe keep their all-zero row.

In [None]:
# Baseline classifier on frozen GloVe vectors:
# embedding lookup -> single LSTM -> sigmoid unit for the binary label.
embedding_layer = Embedding(
    input_dim=unique_words,
    output_dim=embedding_dim,
    weights=[glove_embedding],
    trainable=False,
)

model = Sequential([
    embedding_layer,
    LSTM(units=32, dropout=0.2, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Keras takes the validation split from the END of the arrays, before any
# shuffling. The training arrays hold all positives followed by all negatives,
# so without this permutation the validation set would contain only negatives.
shuffle_idx = np.random.RandomState(42).permutation(len(train_label))
model.fit(train_padded_sequences[shuffle_idx], train_label[shuffle_idx], epochs=10, verbose = 1, validation_split= 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9740558610>

In [None]:
# Score the GloVe baseline on the test split; the result is [loss, accuracy],
# matching the loss and metric given to compile().
model.evaluate(test_padded_sequences, test_label)



[0.831078052520752, 0.5072202086448669]

Implementing Word2vec

In [None]:
# !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

--2021-09-06 09:35:22--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.98.75
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.98.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-09-06 09:35:58 (43.7 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
# import gzip
# import shutil

# with gzip.open("GoogleNews-vectors-negative300.bin.gz", 'rb') as f_in:
#   with open("GoogleNews-vectors-negative300.bin", 'wb') as f_out:
#     shutil.copyfileobj(f_in, f_out)

In [None]:
from gensim.models import Word2Vec, KeyedVectors
# Load the pre-trained GoogleNews vectors from Drive; binary=True because the
# file is in the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format("/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Row i of the matrix holds the word2vec vector of the word with tokenizer id i.
# Row 0 stays zero because id 0 is the padding value.
unique_words = len(train_word_index) + 1
w2v_embedding_dim = 300
w2v_embedding = np.zeros((unique_words, w2v_embedding_dim))

for word, i in train_word_index.items():
  # Membership test and indexing go through the KeyedVectors object itself,
  # avoiding the version-specific `.vocab` / `.wv` attributes.
  # The lookup is redone on every iteration so an out-of-vocabulary word never
  # inherits the vector of the previously found word.
  embedding = w2v_model[word] if word in w2v_model else None
  if embedding is not None:
    w2v_embedding[i] = embedding
  # Words missing from word2vec keep their all-zero row.


In [None]:
# Check the matrix shape: (number of training tokens + 1, embedding dimension).
w2v_embedding.shape

(16764, 300)

In [None]:
# Baseline classifier on frozen word2vec vectors:
# embedding lookup -> single LSTM -> sigmoid unit for the binary label.
embedding_layer = Embedding(
    input_dim=unique_words,
    output_dim=w2v_embedding_dim,
    weights=[w2v_embedding],
    trainable=False,
)

model = Sequential([
    embedding_layer,
    LSTM(units=32, dropout=0.2, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Keras takes the validation split from the END of the arrays, before any
# shuffling. The training arrays hold all positives followed by all negatives,
# so without this permutation the validation set would contain only negatives.
shuffle_idx = np.random.RandomState(42).permutation(len(train_label))
model.fit(train_padded_sequences[shuffle_idx], train_label[shuffle_idx], epochs=10, verbose = 1, validation_split= 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa6903d3950>

In [None]:
# Score the word2vec baseline on the test split; the result is [loss, accuracy],
# matching the loss and metric given to compile().
model.evaluate(test_padded_sequences, test_label)



[0.7705963253974915, 0.49638989567756653]

Implementing fastText

In [None]:
import gensim.downloader

# List the names of the pre-trained models offered by gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# !pip install fasttext



In [None]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

--2021-09-06 10:34:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2021-09-06 10:37:14 (31.4 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
# import gzip
# import shutil

# with gzip.open("cc.en.300.bin.gz", 'rb') as f_in:
#   with open("cc.en.300.bin", 'wb') as f_out:
#     shutil.copyfileobj(f_in, f_out)

In [None]:
# import fasttext
# from gensim.models import Word2Vec, KeyedVectors
# ft_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/cc.en.300.bin')

In [None]:
import gensim.downloader
import pickle
import os

# Location of the pickled fastText vectors on Drive.
fastText_cache_path = '/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/fastText_dict.pkl'

# Download the vectors only when no cached pickle exists yet.
if not os.path.isfile(fastText_cache_path):
  fastText_embed = gensim.downloader.load('fasttext-wiki-news-subwords-300')
  # The context manager closes the file even if pickling fails part-way.
  with open(fastText_cache_path, "wb") as fastText_embedding_dict:
    pickle.dump(fastText_embed, fastText_embedding_dict)





In [None]:
# Load the cached fastText vectors.
# NOTE(review): pickle.load executes arbitrary code from the file; only load a
# cache that this notebook itself wrote.
# The context manager closes the handle, which the previous version left open.
with open("/content/drive/MyDrive/IISc_Assignment/DLNLP_Assignment3/fastText_dict.pkl", "rb") as fastText_embed:
  fastText_embed_dict = pickle.load(fastText_embed)

In [None]:
# Row i of the matrix holds the fastText vector of the word with tokenizer id i.
# Row 0 stays zero because id 0 is the padding value.
unique_words = len(train_word_index) + 1
ft_embedding_dim = 300
ft_embedding = np.zeros((unique_words, ft_embedding_dim))

for word, i in train_word_index.items():
  # Membership test and indexing go through the KeyedVectors object itself,
  # avoiding the version-specific `.vocab` / `.wv` attributes.
  # The lookup is redone on every iteration so an out-of-vocabulary word never
  # inherits the vector of the previously found word.
  embedding = fastText_embed_dict[word] if word in fastText_embed_dict else None
  if embedding is not None:
    ft_embedding[i] = embedding
  # Words missing from fastText keep their all-zero row.


In [None]:
# Baseline classifier on frozen fastText vectors:
# embedding lookup -> single LSTM -> sigmoid unit for the binary label.
embedding_layer = Embedding(
    input_dim=unique_words,
    output_dim=ft_embedding_dim,
    weights=[ft_embedding],
    trainable=False,
)

model = Sequential([
    embedding_layer,
    LSTM(units=32, dropout=0.2, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Keras takes the validation split from the END of the arrays, before any
# shuffling. The training arrays hold all positives followed by all negatives,
# so without this permutation the validation set would contain only negatives.
shuffle_idx = np.random.RandomState(42).permutation(len(train_label))
model.fit(train_padded_sequences[shuffle_idx], train_label[shuffle_idx], epochs=10, verbose = 1, validation_split= 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa10269ee50>

In [None]:
# Score the fastText baseline on the test split; the result is [loss, accuracy],
# matching the loss and metric given to compile().
model.evaluate(test_padded_sequences, test_label)



[0.725334644317627, 0.5]

Implementing DME

In [None]:
### DEFINE INPUT LAYER FOR EMBEDDINGS READING AND CONCATENATION ###
def Concat_Emb(list_emb, maxlen):
    """Build a model that looks up several frozen embeddings and stacks them.

    One integer input of length ``maxlen`` is created per embedding matrix.
    Each input goes through its own non-trainable ``Embedding`` layer, gains a
    trailing axis of size 1, and the results are concatenated along that axis.

    Args:
        list_emb: sequence of 2-D embedding matrices shaped
            (vocabulary size, embedding dimension). All matrices must share
            the same embedding dimension for the concatenation to succeed.
        maxlen: number of token ids per input sequence.

    Returns:
        A Keras ``Model`` with ``len(list_emb)`` inputs whose output stacks
        the looked-up vectors along the last axis.
    """
    inputs = []
    output = []
    for embedding in list_emb:
        # Size the layer from the matrix itself instead of relying on the
        # global word index and a hard-coded dimension of 300.
        vocab_size, emb_dim = np.shape(embedding)

        inp = Input(shape=(maxlen,))
        emb = Embedding(vocab_size, emb_dim, weights=[embedding], trainable=False)(inp)
        # Add a trailing axis so the embeddings can be stacked side by side.
        emb = Reshape((-1, emb_dim, 1))(emb)
        inputs.append(inp)
        output.append(emb)

    concat = Concatenate(axis=-1)(output)

    return Model(inputs, concat)

In [None]:
def DME(maxlen):
    """Build the dynamic meta-embedding block as a Keras ``Model``.

    Reads the module-level ``embedding_dim`` and ``no_of_embeddings`` when it
    is called, so both must be defined beforehand.

    The stacked embeddings are flattened per token, passed through one linear
    projection, and reshaped back. A scalar score is then predicted for each
    embedding type, the scores are normalized with a softmax ACROSS embedding
    types, and the projected embeddings are summed using those weights.

    Args:
        maxlen: number of time steps per sequence.

    Returns:
        A ``Model`` mapping an input of shape
        (maxlen, embedding_dim, no_of_embeddings) to an output of shape
        (maxlen, embedding_dim).
    """
    inp = Input(shape=(maxlen, embedding_dim, no_of_embeddings))
    x = Reshape((maxlen, embedding_dim*no_of_embeddings))(inp)
    temp = Dense(embedding_dim*no_of_embeddings, activation=None)(x)
    proj2mul = Reshape((maxlen, embedding_dim, no_of_embeddings))(temp)
    # (batch, maxlen, no_of_embeddings, embedding_dim)
    proj = Permute((1, 3, 2))(proj2mul)
    # One unnormalized score per embedding type:
    # (batch, maxlen, no_of_embeddings, 1)
    alphas = Dense(1, activation=None)(proj)
    # Normalize over the embedding-type axis (axis 2). The default softmax
    # axis is the last one, which has size 1 here and would make every weight
    # exactly 1.0.
    alphas = keras.layers.Softmax(axis=2)(alphas)
    # (batch, maxlen, 1, no_of_embeddings), broadcastable against proj2mul.
    alphas2mul = Permute((1, 3, 2))(alphas)
    x = multiply([proj2mul, alphas2mul])
    # Weighted sum over embedding types.
    out = Lambda(lambda t: K.sum(t, axis=-1))(x)
    print('Out',out.shape)
    return Model(inp, out)


In [None]:
# The three matrices must have identical shapes before they are stacked.
for matrix in (w2v_embedding, glove_embedding, ft_embedding):
    print(matrix.shape)

(16764, 300)
(16764, 300)
(16764, 300)


In [None]:
# Emits a single blank line; this cell has no other effect.
print()

In [None]:
# Globals read by DME() when it is called below.
embedding_dim = 300
no_of_embeddings = 3
# One input and one frozen embedding lookup per embedding type, stacked on the last axis.
concat_inp = Concat_Emb([w2v_embedding, ft_embedding, glove_embedding], maxlen=max_len)
dme = DME(max_len)
# Combine the stacked embeddings into one vector per token.
x = dme(concat_inp.output)
# Two stacked GRUs: the first returns the full sequence, the second only its final state.
x = GRU(128, dropout=0.2, return_sequences=True)(x)
x = GRU(32, dropout=0.2)(x)
# Two-way softmax output, paired with sparse_categorical_crossentropy on integer labels.
out = Dense(2, activation='softmax')(x)

dme_model = Model(concat_inp.input, out)
dme_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Out (None, 51, 300)


In [None]:
# Keras takes the validation split from the END of the arrays, before any
# shuffling. The training arrays hold all positives followed by all negatives,
# so without this permutation the validation set would contain only negatives.
shuffle_idx = np.random.RandomState(42).permutation(len(train_label))
shuffled_sequences = train_padded_sequences[shuffle_idx]
# The same token ids are fed to every embedding branch of the model.
dme_model.fit([shuffled_sequences]*no_of_embeddings, train_label[shuffle_idx], batch_size=64, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7de9590dd0>

In [None]:
# Score the DME model on the test split, feeding the same token ids to every
# embedding branch; the result is [loss, accuracy].
dme_model.evaluate([test_padded_sequences]*no_of_embeddings, test_label)



[1.3546805381774902, 0.46871238946914673]