In [0]:
!pip install pytorch-pretrained-bert

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as data
from torchvision import transforms
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import logging
import urllib
import sys
import os
import zipfile
from os.path import join, exists


In [0]:
## Helper functions to read data from URLs, store it in dataframes, and build sentence embeddings

def readData(file):
    """Load a tab-separated annotated SICK file into a DataFrame.

    The first line of the file is skipped and the five columns are named
    pair_ID, sentence_A, sentence_B, relatedness_score and entailment_judgment.

    Args:
        file: path, URL or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the five named columns.
    """
    column_names = ["pair_ID", "sentence_A", "sentence_B", "relatedness_score", "entailment_judgment"]
    frame = pd.read_csv(file, sep='\t', header=None, skiprows=1)
    frame.columns = column_names
    return frame

def column_values_tolist(output_list,df,col_name):
  """Append every value of ``df[col_name]`` to ``output_list`` in row order.

  Values are taken positionally, so the frame may carry any index (for
  example after filtering or shuffling), not only the default 0..n-1 one.

  Args:
    output_list: list that is extended in place.
    df: pandas DataFrame to read from.
    col_name: name of the column whose values are copied.

  Returns:
    The same ``output_list`` object, now holding the column values.
  """
  # Series.tolist() walks the column by position; the previous df[col][i]
  # lookup was label-based and raised KeyError on a non-default index.
  output_list.extend(df[col_name].tolist())
  return output_list

def encode_class_labels(ColumnAsList):
  """Replace entailment label strings with integer codes, in place.

  CONTRADICTION becomes 0, NEUTRAL becomes 1 and ENTAILMENT becomes 2.
  Any other value is left exactly as it is.

  Args:
    ColumnAsList: mutable sequence of labels; it is modified in place.

  Returns:
    The same sequence object that was passed in.
  """
  label_codes = (('CONTRADICTION', 0), ('NEUTRAL', 1), ("ENTAILMENT", 2))
  for position in range(len(ColumnAsList)):
    current = ColumnAsList[position]
    for label, code in label_codes:
      if current == label:
        ColumnAsList[position] = code
        break
  return ColumnAsList

def get_sent_em(sentence):
  """Return a mean-pooled BERT embedding for a single sentence.

  Relies on the module-level ``tokenizer`` and ``model`` objects. The
  sentence is wrapped in [CLS] / [SEP], tokenized, run through ``model``
  without gradient tracking, and the token vectors found at index 11 of the
  returned encoder layers are averaged over the token axis.

  Args:
    sentence: raw sentence text.

  Returns:
    torch.Tensor holding the mean of ``encoded_layers[11][0]`` along dim 0.
  """
  # Add the special tokens.
  marked_text = "[CLS] " + sentence + " [SEP]"
  # Split the sentence into tokens and map them to vocabulary indices.
  tokenized_text = tokenizer.tokenize(marked_text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  # NOTE(review): every token gets segment id 1; BERT inputs made of a single
  # sentence presumably use 0 - confirm before changing, as it alters results.
  segments_ids = [1] * len(tokenized_text)
  # The model is fed a batch of one sentence.
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])
  with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
  # Only layer index 11 of the single batch item is needed; the previous
  # stack/squeeze/permute of every layer was computed and then discarded.
  token_vecs = encoded_layers[11][0]
  return torch.mean(token_vecs, dim=0)


def get_bert_mean_pool(list_of_sentences):
  """Embed each sentence with ``get_sent_em``.

  Args:
    list_of_sentences: iterable of sentence strings.

  Returns:
    List of tensors, one mean-pooled BERT sentence embedding per input
    sentence, in the same order.
  """
  return [get_sent_em(sentence) for sentence in list_of_sentences]


def absolute_listoflists(u,v):
  """Element-wise |u - v| for two lists of lists.

  Both arguments must have the same number of rows. The number of columns
  read from every row is the length of the first row of ``u``.

  Args:
    u: list of numeric lists.
    v: list of numeric lists with as many rows as ``u``.

  Returns:
    New list of lists containing abs(u[i][j] - v[i][j]).

  Raises:
    AssertionError: if ``u`` and ``v`` differ in length.
  """
  assert len(u) == len(v)
  return [
    [abs(u[row][col] - v[row][col]) for col in range(len(u[0]))]
    for row in range(len(u))
  ]



def final_embedding(u,v,auv):
  """Concatenate u, v and |u-v| row by row into [u, v, |u-v|] vectors.

  The inputs are left untouched: each output row is a freshly built list.
  (The earlier version extended the rows of ``u`` in place, so the caller's
  embeddings grew and a second call produced longer rows.)

  Args:
    u: list of embedding rows for the first sentences.
    v: list of embedding rows for the second sentences.
    auv: list of rows holding |u - v|.

  Returns:
    New list with one concatenated row per row of ``u``.

  Raises:
    IndexError: if ``v`` or ``auv`` has fewer rows than ``u``.
  """
  return [[*u[i], *v[i], *auv[i]] for i in range(len(u))]


In [0]:
def get_sbert_embeddings(dataframe):
  """Build the siamese-style feature vector [u, v, |u-v|] for every pair.

  u and v are the mean-pooled BERT embeddings of the ``sentence_A`` and
  ``sentence_B`` columns of ``dataframe``.

  Args:
    dataframe: DataFrame with ``sentence_A`` and ``sentence_B`` columns.

  Returns:
    List of lists, one concatenated embedding per row of ``dataframe``.
  """
  # Sentence pairs in row order.
  first_sentences = column_values_tolist([], dataframe, "sentence_A")
  second_sentences = column_values_tolist([], dataframe, "sentence_B")

  # Mean-pooled sentence embeddings (lists of tensors).
  pooled_first = get_bert_mean_pool(first_sentences)
  pooled_second = get_bert_mean_pool(second_sentences)

  # Convert each tensor to a plain Python list.
  u_rows = [tensor.tolist() for tensor in pooled_first]
  v_rows = [tensor.tolist() for tensor in pooled_second]

  # |u - v| for every pair, then the concatenation (u, v, |u - v|).
  abs_diff_rows = absolute_listoflists(u_rows, v_rows)
  return final_embedding(u_rows, v_rows, abs_diff_rows)



def df_for_task1(embeddings_list,dataframe):
  """Build the classification table: embedding columns plus an 'Outcome' label.

  Args:
    embeddings_list: list of embedding rows, one per row of ``dataframe``.
    dataframe: DataFrame with an ``entailment_judgment`` column.

  Returns:
    DataFrame whose leading columns are the embedding values and whose last
    column, 'Outcome', holds 0 for CONTRADICTION, 1 for NEUTRAL and 2 for
    ENTAILMENT.
  """
  labels_list = encode_class_labels(
    column_values_tolist([], dataframe, "entailment_judgment")
  )

  task1_df = pd.DataFrame.from_records(embeddings_list)

  # Append after the last embedding column whatever the embedding width is;
  # the former fixed position 2304 only matched 3 x 768-dimensional rows.
  task1_df.insert(len(task1_df.columns), column='Outcome', value=labels_list)

  return task1_df

In [0]:
def get_sbert_embeddings_task2(dataframe):
  """Return mean-pooled BERT embeddings for both sentence columns.

  Args:
    dataframe: DataFrame with ``sentence_A`` and ``sentence_B`` columns.

  Returns:
    Tuple ``(sent_A_list, sent_B_list)``: two lists of embedding rows (plain
    Python lists), aligned with the rows of ``dataframe``.
  """
  # Sentence pairs in row order.
  first_sentences = column_values_tolist([], dataframe, "sentence_A")
  second_sentences = column_values_tolist([], dataframe, "sentence_B")

  # Mean-pooled sentence embeddings (lists of tensors).
  pooled_first = get_bert_mean_pool(first_sentences)
  pooled_second = get_bert_mean_pool(second_sentences)

  # Convert each tensor to a plain Python list.
  sent_A_list = [tensor.tolist() for tensor in pooled_first]
  sent_B_list = [tensor.tolist() for tensor in pooled_second]
  return sent_A_list, sent_B_list

In [0]:
# Load the pre-trained tokenizer. get_sent_em() reads this global `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pre-trained model weights. get_sent_em() reads this global `model`.
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in evaluation mode (inference behaviour for layers such as
# Dropout). As the last expression of the cell, this also displays the model.
model.eval()

100%|██████████| 231508/231508 [00:00<00:00, 1298542.38B/s]
100%|██████████| 407873900/407873900 [00:10<00:00, 39858978.78B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [0]:
# Locations of the annotated SICK splits (tab-separated text, fetched over HTTP).
train_data = "http://www.site.uottawa.ca/~diana/csi5386/A2_2020/SICK_train.txt"
test_data ="http://www.site.uottawa.ca/~diana/csi5386/A2_2020/SICK_test_annotated.txt"
validation_data ="http://www.site.uottawa.ca/~diana/csi5386/A2_2020/SICK_trial.txt"


# readData skips the first line of each file and names the five columns.
train = readData(train_data)
validation = readData(validation_data)
test = readData(test_data)

In [0]:
# Approach 1: mean-pooled BERT embeddings for both sentences of every training
# pair. Each sentence is passed through the model individually by get_sent_em.
train_sentence_A , train_sentence_B = get_sbert_embeddings_task2(train)

In [0]:
# Gold relatedness scores of the training pairs, collected into a plain list.
relatedness_score = column_values_tolist([], train, "relatedness_score")

# The same scores as a NumPy array.
rs_numpy = np.asarray(relatedness_score)

In [0]:
# Cosine similarity (1 - cosine distance) of each training sentence pair,
# collected in a list and then converted to a NumPy array.

from scipy.spatial.distance import cosine

Cosine_Similarity = [
    1-cosine(embedding_a, embedding_b)
    for embedding_a, embedding_b in zip(train_sentence_A, train_sentence_B)
]

Cosine_Similarity_num = np.asarray(Cosine_Similarity)

In [0]:
# Scale the training relatedness scores to the range [0, 1] with a min-max scaler.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# MinMaxScaler works on 2-D (n_samples, n_features) input, while rs_numpy is a
# 1-D array of scores, so present it as a single column. Cells that consume the
# result call .squeeze() to get back to 1-D.
relatedness_score_scaled = scaler.fit_transform(rs_numpy.reshape(-1, 1))

SPEARMAN CORRELATION OF Approach 1

In [0]:
# Spearman rank correlation between the scaled gold relatedness scores and the
# cosine similarities of approach 1; prints the coefficient, then the p-value.

from scipy.stats import spearmanr

coef , p = spearmanr(
    relatedness_score_scaled.squeeze(),
    Cosine_Similarity_num,
)

print(coef, p, sep="\n")

0.591389536061306
0.0


PEARSON CORRELATION OF Approach 1

In [0]:
# Pearson correlation between the scaled gold relatedness scores and the cosine
# similarities of approach 1. corrcoef returns a correlation matrix; [0, 1] is
# the off-diagonal entry relating the two inputs.
# Later cells call `numpy.corrcoef` without importing it in the same cell, so
# they rely on an `import numpy` like this one having run first.

import numpy
numpy.corrcoef(relatedness_score_scaled.squeeze(),Cosine_Similarity_num)[0, 1]

0.6183053064192804

MSE of predicted score and actual relatedness Score

In [0]:
# Mean squared error between the min-max scaled gold relatedness scores and the
# cosine similarities of approach 1. The similarities are used as computed
# (1 - cosine distance); they are not rescaled before the comparison.

from sklearn.metrics import mean_squared_error

mean_squared_error(relatedness_score_scaled.squeeze(),Cosine_Similarity_num)


0.1074915262514193

In [0]:
# Round the approach-1 cosine similarities to three decimals.
Cosine_Similarity = [float("{0:.3f}".format(similarity)) for similarity in Cosine_Similarity]
# `labels` was read here without being defined in any earlier cell, which raises
# NameError on a fresh kernel.
# NOTE(review): assumed to mean the gold training relatedness scores - confirm.
labels = [float("{0:.3f}".format(score)) for score in relatedness_score]

In [0]:
## Approach 2: sentence embeddings from a pretrained siamese (Sentence-BERT) model
# The install is unpinned. The recorded output of this cell shows
# sentence-transformers 0.2.5.1 together with transformers 2.3.0 being fetched;
# consider pinning those versions to reproduce the reported numbers.

pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/07/32/e3d405806ea525fd74c2c79164c3f7bc0b0b9811f27990484c6d6874c76f/sentence-transformers-0.2.5.1.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 9.7MB/s 
[?25hCollecting transformers==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 36.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 49.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████

In [0]:
## Instance of the pretrained model: BERT with a pooling layer on top of it.
# NOTE: this rebinds the global name `model`, which get_sent_em() also reads.
# Once this cell has run, get_sent_em() (and every helper built on it) will
# call the SentenceTransformer instead of the BertModel loaded earlier.

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:24<00:00, 16.2MB/s]


In [0]:
## Training sentences of both columns as plain lists, in row order

sentence_A_list = column_values_tolist([], train, "sentence_A")

sentence_B_list = column_values_tolist([], train, "sentence_B")

In [0]:
## Sentence embeddings of the training pairs from the SentenceTransformer
## bound to `model`; one embedding per sentence, in list order.

SentA_pretrained_embeddings = model.encode(sentence_A_list)

SentB_pretrained_embeddings = model.encode(sentence_B_list)

In [0]:
from scipy.spatial.distance import cosine

# Cosine similarity (1 - cosine distance) of each training pair, approach 2.
Cosine_Similarity_withpretrained = [
    1-cosine(embedding_a, embedding_b)
    for embedding_a, embedding_b in zip(SentA_pretrained_embeddings, SentB_pretrained_embeddings)
]

Cosine_Similarity_withpretrained_num = np.asarray(Cosine_Similarity_withpretrained)

SPEARMAN CORRELATION TRAIN : 

In [0]:
from scipy.stats import spearmanr

# Spearman rank correlation on the training split for approach 2;
# prints the coefficient, then the p-value.
coef , p = spearmanr(
    relatedness_score_scaled.squeeze(),
    Cosine_Similarity_withpretrained_num,
)

print(coef, p, sep="\n")

0.732856992616356
0.0


PEARSON TRAIN :

In [0]:
# Pearson correlation on the training split for approach 2: the off-diagonal
# entry [0, 1] of the correlation matrix between scaled gold scores and
# cosine similarities.
import numpy
numpy.corrcoef(relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num)[0,1]

0.7314216694251019

MSE TRAIN:

In [0]:
# Mean squared error on the training split for approach 2, between the min-max
# scaled gold scores and the cosine similarities as computed.
from sklearn.metrics import mean_squared_error

mean_squared_error(relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num)

0.03764292691783672

In [0]:
# Map the training cosine similarities back onto the original relatedness
# scale by inverting the min-max scaling. reshape(-1, 1) builds the single
# column the scaler expects for any number of rows, instead of assuming
# exactly 4500 training pairs.
train_output_scores = scaler.inverse_transform(Cosine_Similarity_withpretrained_num.reshape(-1, 1)).squeeze().tolist()

In [0]:
## Test and validation sentences with their gold relatedness scores, as plain lists

test_sentence_A = column_values_tolist([], test, "sentence_A")
test_sentence_B = column_values_tolist([], test, "sentence_B")
test_relatedness_score = column_values_tolist([], test, "relatedness_score")

val_sentence_A = column_values_tolist([], validation, "sentence_A")
val_sentence_B = column_values_tolist([], validation, "sentence_B")
val_relatedness_score = column_values_tolist([], validation, "relatedness_score")

In [0]:
## Sentence embeddings of the test and validation pairs, produced by the
## SentenceTransformer bound to `model`.

SentA_pretrained_embeddings_test = model.encode(test_sentence_A)

SentB_pretrained_embeddings_test = model.encode(test_sentence_B)

SentA_pretrained_embeddings_val = model.encode(val_sentence_A)

SentB_pretrained_embeddings_val = model.encode(val_sentence_B)

In [0]:
from scipy.spatial.distance import cosine

# Cosine similarity (1 - cosine distance) of each test pair.
Cosine_Similarity_withpretrained_test = [
    1-cosine(embedding_a, embedding_b)
    for embedding_a, embedding_b in zip(SentA_pretrained_embeddings_test, SentB_pretrained_embeddings_test)
]

Cosine_Similarity_withpretrained_num_test = np.asarray(Cosine_Similarity_withpretrained_test)

# Cosine similarity (1 - cosine distance) of each validation pair.
Cosine_Similarity_withpretrained_val = [
    1-cosine(embedding_a, embedding_b)
    for embedding_a, embedding_b in zip(SentA_pretrained_embeddings_val, SentB_pretrained_embeddings_val)
]

Cosine_Similarity_withpretrained_num_val = np.asarray(Cosine_Similarity_withpretrained_val)

In [0]:

# Min-max scale the gold test scores to [0, 1]. A fresh scaler is fitted so the
# train-fitted `scaler` is not overwritten, and reshape(-1, 1) builds the single
# column for any number of rows instead of assuming exactly 4927.
# NOTE(review): the scaling is still fitted on the test scores themselves;
# consider scaler.transform(...) with the train-fitted scaler - confirm intent.
test_relatedness_score_scaled = MinMaxScaler().fit_transform(np.asarray(test_relatedness_score).reshape(-1, 1))

TEST SPEARMAN : 

In [0]:
from scipy.stats import spearmanr

# Spearman rank correlation on the test split for approach 2. Only the
# coefficient is printed; the p-value stays in `p`.
coef , p = spearmanr(test_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_test)

print(coef)


0.7291454051597714


TEST PEARSON Approach 2:

In [0]:

# Pearson correlation on the test split: entry [0, 1] of the correlation matrix.
# Uses the name `numpy`, so an earlier cell's `import numpy` must have run.
numpy.corrcoef(test_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_test)[0,1]

0.7294967112365477

MSE TEST Approach 2:

In [0]:

# Mean squared error on the test split between the scaled gold scores and the
# cosine similarities. mean_squared_error is imported in an earlier cell.
mean_squared_error(test_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_test)

0.03691746826457032

In [0]:

# Min-max scale the gold validation scores to [0, 1]. A fresh scaler is fitted
# so the train-fitted `scaler` is not overwritten, and reshape(-1, 1) builds the
# single column for any number of rows instead of assuming exactly 500.
# NOTE(review): the scaling is still fitted on the validation scores themselves;
# consider scaler.transform(...) with the train-fitted scaler - confirm intent.
val_relatedness_score_scaled = MinMaxScaler().fit_transform(np.asarray(val_relatedness_score).reshape(-1, 1))

VALIDATION SPEARMAN Approach 2:

In [0]:
# Spearman rank correlation on the validation split for approach 2. Only the
# coefficient is printed. spearmanr is imported in an earlier cell.
coef , p = spearmanr(val_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_val)

print(coef)

0.7286048633682622


PEARSON COEFF VALIDATION :

In [0]:

# Pearson correlation on the validation split: entry [0, 1] of the correlation
# matrix. Uses the name `numpy`, so an earlier `import numpy` must have run.
numpy.corrcoef(val_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_val)[0,1]

0.7041413759964165

In [0]:

# Mean squared error on the validation split between the scaled gold scores and
# the cosine similarities. mean_squared_error is imported in an earlier cell.
mean_squared_error(val_relatedness_score_scaled.squeeze(),Cosine_Similarity_withpretrained_num_val)

0.03997629858469614

In [0]:
"""test_output_scores = scaler.inverse_transform(Cosine_Similarity_withpretrained_num_test.reshape(4927,1)).squeeze().tolist()
test_output_scores = [float("{0:.3f}".format(test_output_scores[i])) for i in range(len(test_output_scores))]"""

Saving results to Results.txt file

In [1]:
import pickle

# Load the predictions saved earlier as pickle files. No visible cell of this
# notebook writes these files, so they must already exist in the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created.
with open('final_report_test.data', 'rb') as filehandle:
     # Entailment predictions, read from the binary stream.
     entailment_judgment = pickle.load(filehandle)

with open('final_report_scores.data', 'rb') as filehandle:
     # Relatedness-score predictions, read from the binary stream.
     relatedness_score = pickle.load(filehandle)

In [2]:
import pandas as pd
def readtest(file):
    """Load the unannotated tab-separated SICK test file into a DataFrame.

    The first line of the file is skipped and the three columns are named
    pair_ID, sentence_A and sentence_B.

    Args:
        file: path, URL or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the three named columns.
    """
    column_names = ["pair_ID", "sentence_A", "sentence_B"]
    frame = pd.read_csv(file, sep='\t', header=None, skiprows=1)
    frame.columns = column_names
    return frame

In [3]:
# Unannotated test split: pair id and the two sentences only (three columns).
test_data ="http://www.site.uottawa.ca/~diana/csi5386/A2_2020/SICK_test.txt"
test = readtest(test_data)
print("Test Data Shape :",test.shape)

Test Data Shape : (4927, 3)


In [4]:
# Add the unpickled predictions as two new columns of the test frame.
test["entailment_judgment"] = entailment_judgment
test["relatedness_score"] = relatedness_score
print("Test Data Shape :", test.shape)

# Keep only pair_ID and the two prediction columns for the results file.
test = test.drop(columns=["sentence_A", "sentence_B"])
print("Test Data Shape :", test.shape)

Test Data Shape : (4927, 5)
Test Data Shape : (4927, 3)


In [5]:
import numpy as np
# Write one tab-separated line per test pair to Results.txt. Every value is
# formatted with "%s" and no header row is written.
np.savetxt("Results.txt", test.values, delimiter="\t", newline = "\n", fmt="%s")