# Sample Code for Testing Saved Model
This file provides a sample to test the saved model. Make necessary changes so that we can test your CNN/RNN model with this file. If you developed a RNN model, change the name of this file to *test_rnn*.

## Load test data
In the sample below, it loads the dev set for testing. But in real marking, the markers will load held-out test set.

In [1]:
# read data
import pandas as pd

train_data = pd.read_csv('cw2_train.csv')
dev_data = pd.read_csv('cw2_dev.csv')

train_data

Unnamed: 0.1,Unnamed: 0,Sent1,Sent2,SimScore
0,0,"U.S., EU Widen Sanctions On Russia","U.S., EU Boost Sanctions On Russia",1.00
1,1,The lawyers advised the judges .,The lawyers advised the judges behind the acto...,0.79
2,2,Man kills 4 in Calif. before police shoot him ...,Police: Gunman killed 6 in California shootings,0.40
3,3,Someone is playing a piano.,A man is playing a guitar.,0.24
4,4,In an E-mail statement to the Knoxville News S...,I am not giving any consideration to resignati...,0.80
...,...,...,...,...
11493,11493,A man is playing piano.,A man is laying on the ground.,0.15
11494,11494,"The doctors resigned , or the secretaries supp...",The doctors resigned .,0.50
11495,11495,The artist contacted the banker .,The banker contacted the artist by the student .,0.29
11496,11496,"While the professors arrived , the student wai...",The professors arrived .,0.61


## Load Embeddings
Clearly specify the embeddings your implementation requires. Also provide the link for downloading the embeddings. 

In [2]:
# load pre-trained glove embeddings
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np

embd_name = 'glove.6B.300d'
link_to_embd = 'https://archive.org/download/glove.6B.50d-300d/glove.6B.300d.txt' # TODO: you should provide the link to download the embedding here

# Below is a sample to load the glove embeddings. ADJUST the code according to the
# embedding you want to use. 
word_vec_dim = 300
path_of_downloaded_files = "E:/RHUL/CW2-Handout/handout/glove.6B.300d.txt".format(word_vec_dim)
glove_file = datapath(path_of_downloaded_files)
word2vec_glove_file = get_tmpfile("glove.6B.300d.txt")
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)



  glove2word2vec(glove_file, word2vec_glove_file)


## Provide Functions Needed for Evaluation
All functions used to run and evaluate your model should be provided. 

In [8]:
oov_vec = np.random.rand(word_vec_dim)

def get_sent_word_vecs(word_vectors, sent_words):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    return np.array(vecs)

In [9]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

def get_sent_word_vecs(word_vectors, sent_words):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    return np.array(vecs)

def evaluate_trained_model(trained_model, dev_data):
    pred_scores = []
    true_scores = []
    cos_sim = nn.CosineSimilarity()
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval()
        for i, entry in tqdm(dev_data.iterrows()):
            sent1 = entry['Sent1']
            sent2 = entry['Sent2']
            gold_score = entry['SimScore']
            sent1_embds = get_sent_word_vecs(word_vectors, sent1.split())
            sent2_embds = get_sent_word_vecs(word_vectors, sent2.split())
            sent1_repr, sent2_repr = trained_model(sent1_embds, sent2_embds)
            pred_sim = cos_sim(sent1_repr, sent2_repr)
        
            pred_scores.append(pred_sim)
            true_scores.append(gold_score)

    assert len(true_scores) == len(pred_scores)
    squared_errors = [np.square(ts-ps) for (ts, ps) in zip(true_scores, pred_scores)]
    print('MSE of the method on the dev set:', np.mean(squared_errors))

    # check the distribution (histo gram) of the squared errors
    plt.hist(squared_errors)

## Provide Your Model
You should provide the implementaiton of your encoder model below. 

In [13]:
# define the baseline model
import numpy as np
import torch
import torch.nn as nn

class BaselineModel(nn.Module):
    def __init__(self, embd_dim):
        super(BaselineModel, self).__init__()
        self.relu = nn.ReLU()
        self.leaky = nn.LeakyReLU()
        self.dropout = nn.Dropout(0.25)
        self.convo1 = nn.Conv1d(in_channels=embd_dim,out_channels=100,kernel_size=1)
        self.max_pool = nn.MaxPool1d(1, stride=2)
        self.fully_connected_layer = nn.Linear(embd_dim, embd_dim)
        
    
       
    def forward(self, sent1_vecs, sent2_vecs):
        
        def reshape_sent1(a):
            return a.reshape(sent1_vecs.shape[0], sent1_vecs.shape[1], 1)
         
    
        def reshape_sent2(a):
            return a.reshape(sent2_vecs.shape[0], sent2_vecs.shape[1], 1)
        sent1_repr = self.leaky(self.convo1(reshape_sent1(torch.FloatTensor(sent1_vecs))))
        sent2_repr = self.leaky(self.convo1(reshape_sent2(torch.FloatTensor(sent2_vecs))))
        sent1_repr = self.max_pool(sent1_repr)
        sent2_repr = self.max_pool(sent2_repr)
       

        sent1_repr_1 = self.leaky(self.convo1(reshape_sent1(torch.FloatTensor(sent1_vecs))))
        sent2_repr_1 = self.leaky(self.convo1(reshape_sent2(torch.FloatTensor(sent2_vecs))))
        sent1_repr_1 = self.max_pool(sent1_repr_1)
        sent2_repr_1 = self.max_pool(sent2_repr_1)
       

        sent1_repr_2 = self.leaky(self.convo1(reshape_sent1(torch.FloatTensor(sent1_vecs))))
        sent2_repr_2 = self.leaky(self.convo1(reshape_sent2(torch.FloatTensor(sent2_vecs))))
        sent1_repr_2 = self.max_pool(sent1_repr_2)
        sent2_repr_2 = self.max_pool(sent2_repr_2)

        
        
        pool_1 = torch.cat([sent1_repr.squeeze(dim=2),sent1_repr_1.squeeze(dim=2),sent1_repr_2.squeeze(dim=2)], dim=1)
        pool_2 = torch.cat([sent1_repr.squeeze(dim=2),sent1_repr_1.squeeze(dim=2),sent1_repr_2.squeeze(dim=2)], dim=1)
        # Applying Dropout with
        pool_1 = self.fully_connected_layer(self.dropout(pool_1))
        pool_2 = self.fully_connected_layer(self.dropout(pool_2))
             
       
        return pool_1, pool_2

In [14]:
from tqdm.notebook import tqdm

def train_model(train_data, n_epochs, lr, optimizer, loss_fnc, model):
    cos_sim = nn.CosineSimilarity()
    for epoch_i in tqdm(range(n_epochs)):
        ep_loss = []
        cnt = 0
        for i, entry in tqdm(train_data.sample(frac=1).iterrows()):
            cnt += 1
            sent1 = entry['Sent1']
            sent2 = entry['Sent2']
            sent1_embds = get_sent_word_vecs(word_vectors, sent1.split())
            sent2_embds = get_sent_word_vecs(word_vectors, sent2.split())

            # Step 1: Clear the gradients 
            optimizer.zero_grad()

            # Step 2: Compute the forward pass of the model
            sent1_repr, sent2_repr = model(sent1_embds, sent2_embds)
            pred_sim = cos_sim(sent1_repr, sent2_repr)
            true_sim = torch.FloatTensor([entry['SimScore']])

            # Step 3: Compute the loss value that we wish to optimize
            loss = loss_fnc(pred_sim, true_sim)
            ep_loss.append(loss.detach())

            # Step 4: Propagate the loss signal backward
            loss.backward()

            # Step 5: Trigger the optimizer to perform one update
            optimizer.step()

            if  cnt%1000 == 0:
                print('epoch {}, avg loss until step {}: {}'.format(epoch_i, cnt, np.mean(ep_loss)))

        print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
        

In [15]:
model = BaselineModel(word_vec_dim)
loss_fnc = nn.MSELoss()

# hyper parameters
n_epochs = 4 
lr = 1e-2 

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) 

train_model(train_data, n_epochs, lr, optimizer, loss_fnc, model)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
 

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 1000: 0.11310171335935593


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 2000: 0.1045951396226883


  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 3000: 0.09898602962493896


  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 4000: 0.0963294506072998
epoch 0, avg loss until step 5000: 0.09390513598918915


  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 6000: 0.09198470413684845
epoch 0, avg loss until step 7000: 0.09105119854211807
epoch 0, avg loss until step 8000: 0.08974477648735046


  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 9000: 0.08859039098024368


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0, avg loss until step 10000: 0.08739475905895233
epoch 0, avg loss until step 11000: 0.08711204677820206




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

epoch 1, avg loss until step 1000: 0.08253300189971924
epoch 1, avg loss until step 2000: 0.0807281956076622
epoch 1, avg loss until step 3000: 0.0799618512392044
epoch 1, avg loss until step 4000: 0.07910382002592087
epoch 1, avg loss until step 5000: 0.07774107903242111
epoch 1, avg loss until step 6000: 0.07776038348674774
epoch 1, avg loss until step 7000: 0.07768284529447556
epoch 1, avg loss until step 8000: 0.07788888365030289
epoch 1, avg loss until step 9000: 0.0779138058423996
epoch 1, avg loss until step 10000: 0.07791011780500412
epoch 1, avg loss until step 11000: 0.07775583863258362




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

epoch 2, avg loss until step 1000: 0.07237749546766281
epoch 2, avg loss until step 2000: 0.07571989297866821
epoch 2, avg loss until step 3000: 0.07594526559114456
epoch 2, avg loss until step 4000: 0.07482713460922241
epoch 2, avg loss until step 5000: 0.07487545907497406
epoch 2, avg loss until step 6000: 0.07523693889379501
epoch 2, avg loss until step 7000: 0.07507190108299255
epoch 2, avg loss until step 8000: 0.07529424130916595
epoch 2, avg loss until step 9000: 0.07544352114200592
epoch 2, avg loss until step 10000: 0.07554395496845245
epoch 2, avg loss until step 11000: 0.07538053393363953




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

epoch 3, avg loss until step 1000: 0.07396243512630463
epoch 3, avg loss until step 2000: 0.0751955434679985
epoch 3, avg loss until step 3000: 0.07450584322214127
epoch 3, avg loss until step 4000: 0.07352545112371445
epoch 3, avg loss until step 5000: 0.0732509046792984
epoch 3, avg loss until step 6000: 0.07342078536748886
epoch 3, avg loss until step 7000: 0.07375172525644302
epoch 3, avg loss until step 8000: 0.07361449301242828
epoch 3, avg loss until step 9000: 0.07410828769207001
epoch 3, avg loss until step 10000: 0.07387755066156387
epoch 3, avg loss until step 11000: 0.07385565340518951





## Run and Evaluate Model
The code below creates an instance of the model, loads the saved weights (sample_model.state_dict; run cw2_sample.ipynb will generate this file), and tests it.

In [16]:
def evaluate_trained_model(trained_model, dev_data):
    pred_scores = []
    true_scores = []
    cos_sim = nn.CosineSimilarity()
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        for i, entry in tqdm(dev_data.iterrows()):
            sent1 = entry['Sent1']
            sent2 = entry['Sent2']
            gold_score = entry['SimScore']
            sent1_embds = get_sent_word_vecs(word_vectors, sent1.split())
            sent2_embds = get_sent_word_vecs(word_vectors, sent2.split())
            sent1_repr, sent2_repr = trained_model(sent1_embds, sent2_embds)
            pred_sim = cos_sim(sent1_repr, sent2_repr)
       
            pred_scores.append(pred_sim)
            true_scores.append(gold_score)

    assert len(true_scores) == len(pred_scores)
    squared_errors = [np.square(ts-ps) for (ts, ps) in zip(true_scores, pred_scores)]
    a = [float(squared_errors[i][0]) for i in range(len(squared_errors))]
    print('MSE of the method on the dev set:', np.mean(a))
    # check the distribution (histo gram) of the squared errors

In [17]:
evaluate_trained_model(model, dev_data)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


MSE of the method on the dev set: 0.07644416806479487


In [18]:
import pickle

info_to_save = {
    'model_state_dict': model.state_dict(),
    'oov_vec': oov_vec
}

with open('cnn.state_dict', 'wb') as ff:
    pickle.dump(info_to_save, ff)

