In [None]:
import os

import pandas as pd
import requests
import tensorflow as tf


def load_sts_dataset(filename):
    """
    Read one tab-separated STS benchmark split into a DataFrame.

    Every line is stripped and split on tabs; field 5 and field 6 are kept as
    the two sentences and field 4 is parsed as the float similarity score.

    :param filename: path of the split file, opened through ``tf.io.gfile``.
    :return: DataFrame with columns ``sent_1``, ``sent_2`` and ``sim``.
    """
    with tf.io.gfile.GFile(filename, "r") as handle:
        rows = (raw_line.strip().split("\t") for raw_line in handle)
        # Consume the generator while the file is still open.
        records = [(cols[5], cols[6], float(cols[4])) for cols in rows]
    return pd.DataFrame(records, columns=["sent_1", "sent_2", "sim"])

In [None]:
def download_and_load_sts_data():
    """
    Download the STS benchmark archive and load its three splits.

    The archive is fetched and extracted with ``tf.keras.utils.get_file``; the
    split files are then read from the ``stsbenchmark`` folder that sits next
    to the downloaded archive.

    :return: tuple ``(sts_train, sts_dev, sts_test)`` of DataFrames.
    """
    archive_path = tf.keras.utils.get_file(
        fname="Stsbenchmark.tar.gz",
        origin="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
        extract=True)

    data_dir = os.path.join(os.path.dirname(archive_path), "stsbenchmark")
    sts_train, sts_dev, sts_test = (
        load_sts_dataset(os.path.join(data_dir, split_file))
        for split_file in ("sts-train.csv", "sts-dev.csv", "sts-test.csv")
    )
    return sts_train, sts_dev, sts_test


In [None]:
# Fetch the benchmark once and preview the first five training pairs.
sts_train, sts_dev, sts_test = download_and_load_sts_data()
sts_train.head(5)

Downloading data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz


Unnamed: 0,sent_1,sent_2,sim
0,A plane is taking off.,An air plane is taking off.,5.0
1,A man is playing a large flute.,A man is playing a flute.,3.8
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,Three men are playing chess.,Two men are playing chess.,2.6
4,A man is playing the cello.,A man seated is playing the cello.,4.25


In [None]:
# Lower-case both sentence columns with pandas' vectorised string methods.
# Whole-column assignment replaces the chained indexing
# (`frame[col][row] = ...`) that raised SettingWithCopyWarning and is not
# guaranteed to write back into the DataFrame.
for column in ('sent_1', 'sent_2'):
    sts_train[column] = sts_train[column].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# `df` is a second name for the same DataFrame object as `sts_train`; no copy is made.
df = sts_train

#### Pre-analysis of the data
##### Conversion from DataFrame to lists

In [None]:
# Pull the score and both sentence columns out of the DataFrame as plain Python lists.
labels, sent1, sent2 = (df[column].tolist() for column in ('sim', 'sent_1', 'sent_2'))

In [None]:
# Data samples: show one sentence pair together with its human score.
sample_idx = 106
print('Sentence 1 - ',sent1[sample_idx])
print('Sentence 2 -', sent2[sample_idx])
print('Score -- ',labels[sample_idx])

Sentence 1 -  a man is playing on his keyboard.
Sentence 2 - a man is playing a keyboard piano.
Score --  4.0


##### Cleaning

In [None]:
#Converting to Lower Case
# One lower-cased copy per sentence list; the input lists are left untouched.
low1 = [sentence.lower() for sentence in sent1]
print(low1)

low2 = [sentence.lower() for sentence in sent2]
print(low2)



In [None]:
#Remove Numbers
# Drop every character for which str.isdigit() is true; all other characters are kept.
n1 = [''.join(ch for ch in text if not ch.isdigit()) for text in low1]
print(n1)

n2 = [''.join(ch for ch in text if not ch.isdigit()) for text in low2]
print(n2)




In [None]:
#Remove Punctuations
from string import punctuation

# Keep only the characters that are not listed in string.punctuation.
p1 = [''.join(ch for ch in text if ch not in punctuation) for text in n1]
print(p1)

p2 = [''.join(ch for ch in text if ch not in punctuation) for text in n2]
print(p2)




#####   Tokenisation

In [None]:
#4. Treebank Tokeniser
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

t = TreebankWordTokenizer()

# One token list per cleaned sentence.
tree1 = [t.tokenize(text) for text in p1]
print(tree1)

tree2 = [t.tokenize(text) for text in p2]
print(tree2)



##### Lemmatization

In [None]:
import nltk

# Download only what the notebook uses instead of the complete 'all' collection:
#   wordnet   -> WordNetLemmatizer
#   stopwords -> nltk.corpus.stopwords
#   punkt     -> nltk.tokenize.word_tokenize
# NOTE(review): 'omw-1.4' and 'punkt_tab' are believed to be required by newer
# NLTK releases for the same two features - confirm against the pinned version.
for resource in ("wordnet", "omw-1.4", "stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, keeping the sentence-by-sentence nesting.
lem_word1 = [[wordnet_lemmatizer.lemmatize(token) for token in tokens] for tokens in tree1]
print (lem_word1)

lem_word2 = [[wordnet_lemmatizer.lemmatize(token) for token in tokens] for tokens in tree2]
print (lem_word2)
                            
    



##### Removing Stop Words

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stop-word lists read by the next cell (the logged output shows it is a no-op when already present).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
import pandas as pd

# NOTE: this rebinds the imported name `stopwords` to a plain list of English
# stop words; the name is kept for any cell that reads it.
stopwords = nltk.corpus.stopwords.words('english')
# stop_words = [line.strip() for line in open('stopwords', 'r')]# adding extra stpwords from pubmed(https://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur/IRET/DATASET/)


# for i in stop_words:
#     stopwords.append(i)

# Membership is tested once per token, so look tokens up in a set rather than
# scanning the list each time; the filtered result is unchanged.
stopword_set = frozenset(stopwords)

cleaned1 = [[word for word in tokens if word not in stopword_set] for tokens in lem_word1]
print (cleaned1)

cleaned2 = [[word for word in tokens if word not in stopword_set] for tokens in lem_word2]
print (cleaned2)

In [None]:
#length of token
# Total number of tokens left in each sentence list after stop-word removal.
b1 = sum(len(tokens) for tokens in cleaned1)
print("sentence 1 has",b1, "tokens")

b2 = sum(len(tokens) for tokens in cleaned2)
print("sentence 2 has",b2,"tokens")
print("Totally there are",b1+b2, "tokens")

In [None]:
#storing variables for future use
%store cleaned1   
%store cleaned2


##### Detokenizing Sentence

In [None]:
d = TreebankWordDetokenizer()
dtree1=[]
dtree2=[]

for i in range (0,len(sent1)):
    bak1 = d.detokenize(cleaned1[i])
    dtree1.append(bak1)
print(dtree1)

for i in range (0,len(sent2)):
    bak2 = d.detokenize(cleaned2[i])
    dtree2.append(bak2)
print(dtree2)

%store dtree1
%store dtree2

Stored 'dtree1' (list)
Stored 'dtree2' (list)


In [None]:
# Export Final Dataset
#have not used these data anywhere. just for experimental purpose


import pandas

# Build the full cleaned table once, then write the first 4000 rows, the
# remaining rows, and the whole table to their respective CSV files.
df = pandas.DataFrame(data={"Sent1": dtree1, "Sent2": dtree2,"Score":labels})
df.iloc[:4000].to_csv("/content/sample_data/train.csv", sep=',',index=False)
df.iloc[4000:].to_csv("/content/sample_data/test.csv", sep=',',index=False)
df.to_csv("/content/sample_data/STSData.csv", sep=',',index=False)


# Prepare Data 

In [None]:
import pandas as pd
# Re-load the cleaned dataset written by the export cell; `df` is rebound to this new frame.
# NOTE(review): read_csv parses empty fields as NaN by default, so a sentence that
# became empty during cleaning would come back as a float - confirm none exist.
df = pd.read_csv('/content/sample_data/STSData.csv')
df.head()

Unnamed: 0,Sent1,Sent2,Score
0,plane taking,air plane taking,5.0
1,man playing large flute,man playing flute,3.8
2,man spreading shreded cheese pizza,man spreading shredded cheese uncooked pizza,3.8
3,three men playing chess,two men playing chess,2.6
4,man playing cello,man seated playing cello,4.25


In [None]:
# Convert the columns of the re-loaded DataFrame to plain Python lists
# (this rebinds `sent1` / `sent2` to the cleaned sentences).
# NOTE: the score list is bound to the name `lables` (sic); later cells slice it under that spelling.
sent1 = df['Sent1'].tolist()
sent2 = df['Sent2'].tolist()
lables = df['Score'].tolist()


In [None]:
#retrieve stored values
%store -r cleaned1
%store -r cleaned2
%store -r dtree1
%store -r dtree2

# data split.
# The first 4000 pairs are used for training, everything after index 4000 for testing.
# Sentences come from the stored `dtree*` lists, scores from `lables` (read from the CSV).
# NOTE(review): this assumes both sources have the same row order - confirm.
train_text1 = dtree1[:4000]
train_text2 = dtree2[:4000]
train_labels =lables[:4000]

test_text1 = dtree1[4000:]
test_text2 = dtree2[4000:]
test_labels =lables[4000:]


In [None]:
# Display the first cleaned training sentence as a quick check of the split.
train_text1[0]

'plane taking'

# TF-IDF

In [None]:
# training: tf-idf + Random Forest

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer1 = TfidfVectorizer()

sent1_vec = vectorizer1.fit_transform(dtree1)
sent2_vec = vectorizer1.fit_transform(dtree2)


train_vecs1 = vectorizer1.transform(train_text1)
train_vecs2 = vectorizer1.transform(train_text2)

test_vecs1 = TfidfVectorizer(vocabulary=vectorizer1.vocabulary_).fit_transform(test_text1)
test_vecs2 = TfidfVectorizer(vocabulary=vectorizer1.vocabulary_).fit_transform(test_text2)
feat = vectorizer1.get_feature_names()
print(feat)
%store feat


Stored 'feat' (list)




In [None]:
# train model
from sklearn.ensemble import RandomForestRegressor

# One regressor per sentence column, each fitted on that column's TF-IDF
# vectors against the pair's similarity score.
# random_state is fixed so that re-running the notebook reproduces the reported MSE.
reg1 = RandomForestRegressor(max_depth=6, random_state=0).fit(train_vecs1, train_labels)
reg2 = RandomForestRegressor(max_depth=6, random_state=0).fit(train_vecs2, train_labels)
reg1

RandomForestRegressor(max_depth=6)

In [None]:
# test model

from sklearn.metrics import mean_squared_error

# Predict with each per-column regressor, then score both against the test labels.
test_pred1, test_pred2 = reg1.predict(test_vecs1), reg2.predict(test_vecs2)
a_mse1, a_mse2 = (
    mean_squared_error(test_labels, predictions)
    for predictions in (test_pred1, test_pred2)
)

print('MSE for Sentence  1: ', a_mse1)
print("--------------------------------------------")
print('MSE for Sentence  2: ', a_mse2)


MSE for Sentence  1:  2.356574255488694
--------------------------------------------
MSE for Sentence  2:  2.329020308784681


#### Similarity Measures - Cosine

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the TF-IDF vectors of each sentence pair.
# The loop bounds come from the matrices themselves instead of the hard-coded
# 4000 / 1749, so the cell keeps working if the split sizes change.
a_cc = [
    cosine_similarity(train_vecs1[i].reshape(1, -1), train_vecs2[i].reshape(1, -1))[0][0]
    for i in range(train_vecs1.shape[0])
]
print(a_cc[:10])  # preview only

a_cc1 = [
    cosine_similarity(test_vecs1[i].reshape(1, -1), test_vecs2[i].reshape(1, -1))[0][0]
    for i in range(test_vecs1.shape[0])
]
print(a_cc1[:10])  # preview only
len(a_cc1)

[0.8423168740681976, 0.8242615289773747, 0.7494947354353714, 0.8498308773663568, 0.7945625124915482, 0.907883759300555, 0.14355447646974948, 0.4633241698094659, 0.6822686108957946, 0.6087910488724203, 0.6198628250330963, 0.9211213938854794, 0.7081137444566037, 0.8282292425229256, 0.6321514567663024, 0.7701791085915068, 0.9273890351417449, 0.19839622700003084, 0.8879295814975486, 1.0000000000000002, 0.6822124752488989, 0.7258520052331087, 0.7476884309870555, 0.5132424947458442, 0.4865191892071508, 0.5766367259008427, 1.0, 0.5434177331503995, 0.7879554657630508, 0.9200089248136307, 0.7528483030559141, 1.0000000000000002, 0.758384581578129, 0.4363504029195094, 0.6345730283729158, 0.6771236980758886, 0.6568367586721726, 0.8457472621747246, 0.4791992611695498, 0.5582676995303236, 0.49559169265282627, 0.6889850055375828, 0.39201009769358164, 0.5635044495811155, 0.650313842913878, 0.2345194720690872, 0.2865137484935979, 0.47961175889351015, 0.5749349057338147, 0.7226356301560015, 0.7226356301

1749

#### Pearson Coefficient

In [None]:
# pearson Correlation 

import numpy as np
from scipy.stats import pearsonr

# pearsonr returns (coefficient, p-value); only the coefficient is kept.
a_pc = pearsonr(a_cc, train_labels)[0]
print('Pearsons correlation for Train: %.5f' % a_pc)

Pearsons correlation for Train: 0.70542


In [None]:
import numpy as np
from scipy.stats import pearsonr

# Same measure on the held-out pairs; only the coefficient is kept.
a_pc_t = pearsonr(a_cc1, test_labels)[0]
print('Pearsons correlation for Test: %.5f' % a_pc_t)

Pearsons correlation for Test: 0.69666


In [None]:
#storing variables
%store a_mse1
%store a_mse2
%store a_cc
%store a_cc1
%store a_pc
%store a_pc_t

Stored 'a_mse1' (float64)
Stored 'a_mse2' (float64)
Stored 'a_cc' (list)
Stored 'a_cc1' (list)
Stored 'a_pc' (float64)
Stored 'a_pc_t' (float64)


# Word2Vec

In [None]:
import gensim
import gensim.downloader as gensim_api
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

In [None]:
path_of_downloaded_bin = "/content/drive/MyDrive/PubMed-w2v.bin" #https://bio.nlplab.org
# Load straight from the file path. gensim's `datapath` helper resolves names
# inside gensim's own bundled test-data folder and is not meant for user files.
word_vectors = KeyedVectors.load_word2vec_format(path_of_downloaded_bin, binary=True)


In [None]:
from scipy import spatial
# Set of in-vocabulary words for fast membership tests.
# `index_to_key` is the gensim 4 attribute; fall back to `index2word` on older
# releases. Reading it from the KeyedVectors object itself avoids the
# deprecated `.wv` indirection.
_vocab_words = getattr(word_vectors, "index_to_key", None)
if _vocab_words is None:
    _vocab_words = word_vectors.index2word
index2word_set = set(_vocab_words)


  


In [None]:
import numpy as np
from scipy import spatial

# Set of in-vocabulary words for fast membership tests.
# `index_to_key` is the gensim 4 attribute; fall back to `index2word` on older
# releases. Reading it from the KeyedVectors object itself avoids the
# deprecated `.wv` indirection.
_vocab_words = getattr(word_vectors, "index_to_key", None)
if _vocab_words is None:
    _vocab_words = word_vectors.index2word
index2word_set = set(_vocab_words)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    """
    Average the embeddings of the known words in a sentence.

    :param sentence: text, split on whitespace into words.
    :param model: mapping indexed as ``model[word]`` that yields a vector.
    :param num_features: length of the returned vector.
    :param index2word_set: words considered in-vocabulary; others are skipped.
    :return: element-wise mean of the known words' vectors, or a float32 zero
        vector of length ``num_features`` when no word is known.
    """
    known_vectors = [model[token] for token in sentence.split() if token in index2word_set]
    total = np.zeros((num_features, ), dtype='float32')
    for vector in known_vectors:
        total = np.add(total, vector)
    if known_vectors:
        return np.divide(total, len(known_vectors))
    return total


  after removing the cwd from sys.path.


In [None]:
# calculating average feature vector for each sentences
# The lists are iterated directly and the embedding width is read from the
# loaded vectors, replacing the hard-coded 4000 / 1749 / 200.
embedding_dim = word_vectors.vector_size

#train data
cc1 = [avg_feature_vector(sentence, model=word_vectors, num_features=embedding_dim, index2word_set=index2word_set)
       for sentence in train_text1]
cc2 = [avg_feature_vector(sentence, model=word_vectors, num_features=embedding_dim, index2word_set=index2word_set)
       for sentence in train_text2]
print(len(cc1))
print(cc1[1])
print(len(cc2))
print(cc2[1])

#test data
cc3 = [avg_feature_vector(sentence, model=word_vectors, num_features=embedding_dim, index2word_set=index2word_set)
       for sentence in test_text1]
cc4 = [avg_feature_vector(sentence, model=word_vectors, num_features=embedding_dim, index2word_set=index2word_set)
       for sentence in test_text2]
print(len(cc3))
print(cc3[1])
print(len(cc4))
print(cc4[1])


4000
[-0.05343106 -0.10542451 -0.21266805  0.01632174  0.11383443 -0.1084749
  0.00907643  0.09286619  0.08387265  0.03182872 -0.013578   -0.04195878
 -0.00820775 -0.07350737  0.0237173  -0.01480702  0.36618176 -0.02480449
  0.1798574   0.01502042  0.08958536 -0.10146477  0.01187034 -0.01564995
  0.01698668  0.11359234 -0.16563196 -0.00677783  0.00214775  0.22018439
  0.12110197  0.06383767  0.02359292 -0.14217709  0.00145256 -0.12381602
  0.07311392 -0.07404344 -0.10095495 -0.06024169 -0.06609877 -0.15567082
 -0.13718398  0.05481745  0.16850576  0.01041579 -0.15290967  0.00768882
 -0.06688306 -0.05662205 -0.15240744  0.30984923 -0.011554   -0.00537762
  0.04716162 -0.06810151 -0.0456871   0.32119432 -0.12882847  0.06863739
  0.05114546  0.12201869  0.00686693  0.00107539 -0.00990393 -0.21206172
 -0.0349366  -0.01384683  0.09903654  0.07417025  0.00885041 -0.03517038
 -0.1131655  -0.05568472  0.04492363 -0.10070217 -0.18117926 -0.11380967
 -0.07867791 -0.06523567 -0.09991676  0.2844167

In [None]:
#word2vec + Random Forest
from sklearn.ensemble import RandomForestRegressor

# One regressor per sentence column, fitted on the averaged word vectors.
# random_state is fixed so that re-running the notebook reproduces the reported MSE.
reg1 = RandomForestRegressor(max_depth=6, random_state=0).fit(cc1, train_labels)
reg2 = RandomForestRegressor(max_depth=6, random_state=0).fit(cc2, train_labels)
reg2

RandomForestRegressor(max_depth=6)

In [None]:
from sklearn.metrics import mean_squared_error
# test model
# Predict with each per-column regressor, then score both against the test labels.
test_pred1, test_pred2 = reg1.predict(cc3), reg2.predict(cc4)
b_mse1, b_mse2 = (
    mean_squared_error(test_labels, predictions)
    for predictions in (test_pred1, test_pred2)
)

print('MSE for Sentence  1: ', b_mse1)
print("--------------------------------------------")
print('MSE for Sentence  2: ', b_mse2)



MSE for Sentence  1:  2.408931071300139
--------------------------------------------
MSE for Sentence  2:  2.399153537466094


In [None]:
#word movers distance
#train data
import numpy as np
from pyemd import emd

# wmdistance expects each document as a list of tokens. Passing the raw strings
# (as before) makes it iterate over single characters, so each sentence is
# split into words first.
wmd_train = np.array(
    [word_vectors.wmdistance(s1.split(), s2.split()) for s1, s2 in zip(train_text1, train_text2)],
    dtype=float,
)
# gensim returns inf when a document has no in-vocabulary word. Cap those pairs
# at the largest finite distance so the correlation below stays defined.
finite_mask = np.isfinite(wmd_train)
if finite_mask.any():
    wmd_train[~finite_mask] = wmd_train[finite_mask].max()

a_wmd = (1 - wmd_train).tolist() # changing distance to similarity
print(a_wmd[:10])  # preview only
len(a_wmd)



[0.4610990071373503, 0.5210095190228226, 0.5100526708316134, 0.4035996901521266, 0.29367957117520604, 0.36305903278109275, 0.31738491300582883, 0.16296071352708053, 0.29049810840900736, 0.672910046948263, 0.05684052257811012, 0.7308208383523911, 0.32076546284538765, 0.5539533304986968, 0.3222727010011651, 0.8381704286976432, 0.5910311612114876, 0.14229626892378122, 0.46653464932646127, 1.0, 0.42747674424674176, 0.3550832375754942, 0.5297793495928607, 0.32163114154948946, 0.49836466560105175, 0.717541814909056, 1.0, 0.297952875620373, 0.5297839931905162, 0.3645493724124276, 0.5059992626950525, 1.0, 0.5899649916517398, 0.40359955082407506, -0.011819555384541935, 0.23514068650337572, 0.5265651212884563, 0.2918403872156977, 0.12864228434264613, 0.546258101748336, 0.18511027897433463, 0.25478805963712403, 0.2104265396853806, 0.5470243061184373, 0.499996115124538, 0.28479289580402944, 0.08109112823881881, 0.3564244722485542, 0.4648420832805634, 0.4197171402096749, 0.4197171402096749, 0.32929

4000

In [None]:
# wmd for test data
import numpy as np
from pyemd import emd

# wmdistance expects each document as a list of tokens. Passing the raw strings
# (as before) makes it iterate over single characters, so each sentence is
# split into words first.
wmd_test = np.array(
    [word_vectors.wmdistance(s1.split(), s2.split()) for s1, s2 in zip(test_text1, test_text2)],
    dtype=float,
)
# gensim returns inf when a document has no in-vocabulary word. Cap those pairs
# at the largest finite distance so the correlation below stays defined.
finite_mask = np.isfinite(wmd_test)
if finite_mask.any():
    wmd_test[~finite_mask] = wmd_test[finite_mask].max()

a_wmd1 = (1 - wmd_test).tolist() # changing distance to similarity
print(a_wmd1[:10])  # preview only
len(a_wmd1)

[0.4159094847300068, 0.06726954205861357, 0.44446120315987614, 0.18176371276172232, 0.14437014884636645, 0.293312365119487, 0.6972795753317669, 0.6803879732489387, 0.23229719776029545, 0.4147659010123834, 0.5604576605016853, 0.4293160661722588, 0.1413072001246568, 0.6309217234735929, -0.06887137132621524, 0.4982151123706464, 0.34894692888517753, 0.366715753664106, 0.4896228611442759, -0.14123692136098653, 0.2133372424915002, 0.5384019950546292, 0.4116527745267964, 0.20598243044536435, 0.445917690849298, 0.6451004918092633, 0.19267790075561242, 0.0339261926744725, -0.2609986379344946, 0.23046685644928855, 0.4542465244288383, -0.02609197570041566, 0.024992353823547586, 0.35902110072660354, 0.22183940281901293, 0.3688887798997784, 0.5562418903493629, 0.14949707527587308, 0.052012244609077274, -0.2301239586047934, 0.2582830809602388, 0.5616838970264688, 0.7492164906613829, 0.4033532579066761, 0.4054634978126693, 0.5135870963345046, 0.3748024077417581, 0.33040137940884595, -0.04851190833339

1749

#### Pearson Coefficient

In [None]:
# pearson Correlation 

import numpy as np
from scipy.stats import pearsonr

# pearsonr returns (coefficient, p-value); only the coefficient is kept.
b_pc = pearsonr(a_wmd, train_labels)[0]
print('Pearsons correlation for Train: %.5f' % b_pc)

Pearsons correlation for Train: 0.60622


In [None]:

from scipy.stats import pearsonr

# Same measure on the held-out pairs; only the coefficient is kept.
b_pc_t = pearsonr(a_wmd1, test_labels)[0]
print('Pearsons correlation for Test: %.5f' % b_pc_t)

Pearsons correlation for Test: 0.55660


In [None]:
#storing variables
%store b_mse1
%store b_mse2
%store a_wmd
%store a_wmd1
%store b_pc
%store b_pc_t

Stored 'b_mse1' (float64)
Stored 'b_mse2' (float64)
Stored 'a_wmd' (list)
Stored 'a_wmd1' (list)
Stored 'b_pc' (float64)
Stored 'b_pc_t' (float64)


# Prepare Data for Neural Network Model

In [None]:
# data split. 
# %store -r cleaned1
# %store -r cleaned2
# %store -r dtree1
# %store -r dtree2

# Rows [0, 3000) are train, [3000, 4000) are dev and [4000, end) are test.
# The slices are taken from the CSV-derived lists sent1 / sent2 / lables.
train_text1, dev_text1, test_text1 = sent1[:3000], sent1[3000:4000], sent1[4000:]
train_text2, dev_text2, test_text2 = sent2[:3000], sent2[3000:4000], sent2[4000:]
train_lables, dev_lables, test_lables = lables[:3000], lables[3000:4000], lables[4000:]

In [None]:
#concatenating sentences
# Each pair becomes one string: sentence 1, a single space, sentence 2.

sentences_pair_tr = [' '.join((first, second)) for first, second in zip(train_text1, train_text2)]
#len(sentences_pair_tr)

sentences_pair_dv = [' '.join((first, second)) for first, second in zip(dev_text1, dev_text2)]
#len(sentences_pair_dv)

sentences_pair_tt = [' '.join((first, second)) for first, second in zip(test_text1, test_text2)]
sentences_pair_tt[1]
#len(sentences_pair_tt)



'wikileaks begin publishing two million syria email wikileaks releasing trove syria document'

# Neural Network

In [None]:
# # load pre-trained Pubmed embeddings
# from gensim.test.utils import datapath
# from gensim.models import KeyedVectors
# path_of_downloaded_bin = "/Users/aswath/PycharmProjects/mfac038/IndividualProject/PubMed-and-PMC-w2v.bin"
# word_vectors = KeyedVectors.load_word2vec_format(datapath(path_of_downloaded_bin), binary=True)


In [None]:
# define functions that build mini-batches
from nltk.tokenize import word_tokenize
import numpy as np

word_vec_dim = 200 # make sure this number matches the embedding
# Shared vector used for every out-of-vocabulary token. It is drawn from a
# seeded generator so that the same vector is produced on every run.
oov_vec = np.random.RandomState(42).rand(word_vec_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    """
    Build the embedding matrix of one tokenised sentence.

    :param word_vectors: container supporting ``token in word_vectors`` and
        ``word_vectors[token]``.
    :param sent_words: list of tokens of the sentence.
    :param largest_len: number of columns to pad the sentence to.
    :return: array with one column per token: the token's vector when it is in
        ``word_vectors``, the shared ``oov_vec`` otherwise, followed by
        all-zero columns up to ``largest_len``.
    """
    columns = [word_vectors[token] if token in word_vectors else oov_vec for token in sent_words]
    # Right-pad with zero vectors so every sentence in a batch has the same width.
    columns.extend([0.] * word_vec_dim for _ in range(largest_len - len(sent_words)))
    return np.array(np.transpose(columns))

def build_mini_batch(sent_list, word_vectors):
    """
    Turn a list of sentences into one padded batch of embedding matrices.

    :param sent_list: list of sentence strings. A single string is treated as a
        one-sentence batch instead of being iterated character by character.
    :param word_vectors: embedding lookup passed on to ``get_sent_word_vecs``.
    :return: array stacking one matrix per sentence, each padded to the longest
        sentence of the batch. An empty ``sent_list`` gives an array whose first
        dimension is 0, which is what the callers' empty-batch check tests for.
    """
    if isinstance(sent_list, str):
        sent_list = [sent_list]
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    if not tokenized_sents:
        # np.max over an empty list raises; return an explicit empty batch instead.
        return np.empty((0, word_vec_dim, 0))
    largest_len = max(len(tokens) for tokens in tokenized_sents)
    text_vecs = [get_sent_word_vecs(word_vectors, ts, largest_len) for ts in tokenized_sents]
    return np.array(text_vecs)
    


In [None]:
# define the CNN model

import numpy as np
import torch
import torch.nn as nn

class CNN_reg(nn.Module):
    """
    1-D convolutional regressor over a matrix of word embeddings.

    Several Conv1d filters of different widths are applied to the input. Each
    feature map goes through tanh and max-over-time pooling, the pooled
    features are concatenated, passed through dropout and mapped to
    ``class_num`` outputs by a linear layer.

    Input:  tensor of shape (batch, embd_dim, seq_len).
    Output: tensor of shape (batch, class_num), including for a batch of one.
    """

    def __init__(self, embd_dim, filter_size_list, filter_num_list, class_num, dp_rate=0.5, gpu=False):
        """
        :param embd_dim: size of each word vector (input channels).
        :param filter_size_list: kernel width of each convolution.
        :param filter_num_list: number of filters of each convolution
            (same length as ``filter_size_list``).
        :param class_num: number of outputs of the final linear layer.
        :param dp_rate: dropout probability applied before the linear layer.
        :param gpu: when true, the module is moved to CUDA.
        """
        super(CNN_reg, self).__init__()
        assert len(filter_size_list) == len(filter_num_list)
        self.embd_dim = embd_dim
        self.output_dim = class_num
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dp_rate)
        # int(...) so nn.Linear receives a plain Python int rather than a numpy scalar
        self.fc = nn.Linear(int(np.sum(filter_num_list)), class_num)
        self.gpu = gpu
        self.convs = self.build_convs(filter_size_list, filter_num_list, gpu)
        if self.gpu:
            self.to('cuda')

    def build_convs(self, f_sizes, f_nums, gpu):
        """Create one Conv1d per (kernel width, filter count) pair."""
        convs = nn.ModuleList()
        for fs, fn in zip(f_sizes, f_nums):
            # padding of (width - 1) lets the kernel slide over inputs shorter than itself
            m = nn.Conv1d(self.embd_dim, fn, fs, padding=fs - 1)
            if gpu: m.to('cuda')
            convs.append(m)
        return convs

    def get_conv_output(self, input_matrix, conv, gpu):
        """Apply one convolution, tanh and max-over-time pooling; returns (batch, filters, 1)."""
        assert input_matrix.shape[1] == self.embd_dim
        # step 1: compute convolution
        conv_output = conv(input_matrix)
        # step 2: pass through the tanh activation
        activated = self.tanh(conv_output)
        # step 3: max-over-time pooling across the whole sequence axis
        return nn.functional.max_pool1d(activated, activated.shape[2])

    def forward(self, all_text_vectors):
        """Return the regression output, shape (batch, class_num)."""
        pooled = [self.get_conv_output(all_text_vectors, cv, self.gpu) for cv in self.convs]
        cnn_repr = torch.cat(pooled, dim=1)
        # Squeeze only the pooled time axis. A bare squeeze() also removed the
        # batch axis when the batch held a single example.
        after_dp = self.dropout(cnn_repr.squeeze(2))
        logit = self.fc(after_dp)
        return logit

In [None]:
dropout_rate = 0.5 # dropout rate
filter_sizes = [2,3,4]
filter_nums = [100]*len(filter_sizes)

# Fix the torch RNG before the model is built so that weight initialisation and
# dropout masks are the same on every run.
torch.manual_seed(0)

gpu = False
model = CNN_reg(word_vec_dim, filter_sizes, filter_nums,1, dropout_rate, gpu)
loss_fnc = torch.nn.MSELoss() # MSE loss

# hyper parameters
n_epochs = 10 # number of epoch 
batch_size = 32
lr = 0.001 # initial learning rate

# init optimizer and scheduler
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95) # after each epoch, the learning rate is multiplied by 0.95

In [None]:
# let's first see its performance on the dev set
# (evaluates the model as it currently stands, before the training cell runs)
from sklearn.metrics import mean_squared_error

with torch.no_grad():
    model.eval()
    dev_predictions = []
    for idx in range(0,len(sentences_pair_dv),batch_size):
        x_data = build_mini_batch(sentences_pair_dv[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        # same device handling as the training loop, so the cell also works with gpu=True
        if gpu:
            x_tensor = x_tensor.to('cuda')
        y_pred = model(x_tensor).cpu().detach().numpy()
        # flatten to one float per sentence pair
        dev_predictions += y_pred.reshape(-1).tolist()
    d_mse = mean_squared_error(dev_lables, dev_predictions)
    print('\n MSE on dev set is ',d_mse)
    


 MSE on dev set is  10.464801709907318


In [None]:
# training the CNN model

import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from nltk.tokenize import word_tokenize
from tqdm import tqdm


def _predict_in_batches(sentence_pairs):
    """Run `model` over `sentence_pairs` in mini-batches; return a flat list of floats."""
    predictions = []
    for start in range(0, len(sentence_pairs), batch_size):
        x_data = build_mini_batch(sentence_pairs[start:start+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        if gpu:
            x_tensor = x_tensor.to('cuda')
        predictions += model(x_tensor).cpu().detach().numpy().reshape(-1).tolist()
    return predictions


best_dev_mse = float('inf')  # model-selection criterion, measured on the dev set
a_best_mse = float('inf')    # test MSE of the model selected on the dev set
best_model = None

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # training mode: dropout is active
    ep_loss = []
    for idx in range(0,len(sentences_pair_tr),batch_size):
        # Step 0: Get the data
        x_data = build_mini_batch(sentences_pair_tr[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        y_target = torch.tensor(train_lables[idx:idx+batch_size], dtype=torch.float32)
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        if gpu:
            y_target = y_target.to('cuda')
            x_tensor = x_tensor.to('cuda')

        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        # Flatten to one value per example. Comparing (batch, 1) predictions
        # with a (batch,) target makes MSELoss broadcast them against each
        # other (the warning this cell used to print) and optimise the wrong
        # quantity.
        y_pred = model(x_tensor).reshape(-1)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        ep_loss.append(loss.item())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()

    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))

    # after each epoch, evaluate on the dev set (used to choose the best model)
    # and on the test set (reported only, never used for the choice)
    with torch.no_grad(): # no gradient is needed for evaluation
        model.eval() # evaluation mode: dropout is disabled
        dev_predictions = _predict_in_batches(sentences_pair_dv)
        test_predictions = _predict_in_batches(sentences_pair_tt)
    dev_mse = mean_squared_error(dev_lables, dev_predictions)
    mse = mean_squared_error(test_lables, test_predictions)
    print('\n---> after epoch {} the mse on dev set is {}'.format(epoch_i, dev_mse))
    print('\n---> after epoch {} the mse on test set is {}'.format(epoch_i, mse))
    for param_group in optimizer.param_groups:
        print('learning rate', param_group['lr'])

    # save the best model: chosen by dev MSE so the test set stays unseen by the
    # selection; a_best_mse records the test MSE of that chosen model
    if dev_mse < best_dev_mse:
        best_dev_mse = dev_mse
        a_best_mse = mse
        best_model = copy.deepcopy(model.state_dict())
        print('best model updated; new best dev mse', dev_mse)

    # (optional) adjust learning rate according to the scheduler
    scheduler.step()

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 10%|█         | 1/10 [00:07<01:03,  7.08s/it]


---> after epoch 0 the mse on test set is 2.57060866900767
learning rate 0.001
best model updated; new best mse 2.57060866900767


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 20%|██        | 2/10 [00:11<00:44,  5.53s/it]


---> after epoch 1 the mse on test set is 2.550695301151223
learning rate 0.00095
best model updated; new best mse 2.550695301151223


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 30%|███       | 3/10 [00:15<00:34,  4.86s/it]


---> after epoch 2 the mse on test set is 2.513994093630401
learning rate 0.0009025
best model updated; new best mse 2.513994093630401


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 40%|████      | 4/10 [00:19<00:27,  4.56s/it]


---> after epoch 3 the mse on test set is 2.5218932452843665
learning rate 0.000857375


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 50%|█████     | 5/10 [00:23<00:21,  4.37s/it]


---> after epoch 4 the mse on test set is 2.5002230954270908
learning rate 0.0008145062499999999
best model updated; new best mse 2.5002230954270908


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 60%|██████    | 6/10 [00:28<00:17,  4.41s/it]


---> after epoch 5 the mse on test set is 2.4768804656908565
learning rate 0.0007737809374999998
best model updated; new best mse 2.4768804656908565


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 70%|███████   | 7/10 [00:35<00:15,  5.23s/it]


---> after epoch 6 the mse on test set is 2.498831072699872
learning rate 0.0007350918906249997


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 80%|████████  | 8/10 [00:40<00:10,  5.39s/it]


---> after epoch 7 the mse on test set is 2.4955783247502783
learning rate 0.0006983372960937497


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 90%|█████████ | 9/10 [00:49<00:06,  6.30s/it]


---> after epoch 8 the mse on test set is 2.496564335737702
learning rate 0.0006634204312890621


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





100%|██████████| 10/10 [00:57<00:00,  5.71s/it]


---> after epoch 9 the mse on test set is 2.50578173930433
learning rate 0.000630249409724609





In [None]:
# Report the value left in `a_best_mse` by the training cell.
print("the best MSE is ----> ", a_best_mse)

the best MSE is ---->  2.4768804656908565


##### Cosine Similarity 

In [None]:
# # sanity check 
# vc=[]
# for i in range(0,4000):
#     m = build_mini_batch(sent1[:4000][i], word_vectors)
#     vc.append(m)
# print(vc)

# vc2=[]
# for i in range(0,1749):
#     m = build_mini_batch(sent2[:4000][i], word_vectors)
#     vc2.append(m)
# #print(vc2)



# Embedding matrices for the test sentences (rows 4000 onwards).
# build_mini_batch takes a LIST of sentences, so each sentence is wrapped in a
# one-element list. Passing the bare string made it iterate over single
# characters. The slices are iterated directly instead of using a hard-coded
# count of 1749.
vc1 = [build_mini_batch([sentence], word_vectors) for sentence in sent1[4000:]]
#print(vc1)

vc3 = [build_mini_batch([sentence], word_vectors) for sentence in sent2[4000:]]
#print(vc3)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the first 750 entries of vc1 / vc3 pair by pair.
# NOTE(review): each entry is whatever build_mini_batch returned (a stacked batch
# array), while cosine_similarity presumably expects 2-D inputs with matching
# feature counts - confirm this cell runs and what `cc` is meant to hold.
# NOTE(review): 750 is smaller than the 1749 entries built above - confirm intended.
cc=[]
for i in range(0,750):
    c= cosine_similarity(vc1[i], vc3[i])
    cc.append(c)
#print(cc)
#len(cc)

In [None]:
%store a_best_mse
%store d_mse

Stored 'a_best_mse' (float64)
Stored 'd_mse' (float64)


# LSTM

In [None]:
# # load pre-trained Pubmed embeddings
# from gensim.test.utils import datapath
# from gensim.models import KeyedVectors
# path_of_downloaded_bin = "/Users/aswath/PycharmProjects/mfac038/IndividualProject/PubMed-and-PMC-w2v.bin"
# word_vectors = KeyedVectors.load_word2vec_format(datapath(path_of_downloaded_bin), binary=True)


In [None]:
# then we define the RNN-based Regressor
import torch
import torch.nn as nn

class RNN_Regressor(nn.Module):
    """Recurrent encoder + pooling over tokens + linear head.

    The input is run through an RNN / LSTM / bidirectional LSTM / GRU
    (``batch_first=True``), the per-token hidden states are max- or
    average-pooled over the token axis, and the pooled vector is mapped to
    ``cls_num`` outputs by a linear layer.

    :param embd_dim: size of each input token vector (RNN ``input_size``).
    :param hidden_dim: RNN hidden size (doubled internally for ``'bilstm'``).
    :param model_type: one of ``'rnn'``, ``'lstm'``, ``'bilstm'``, ``'gru'``.
    :param cls_num: number of output values of the linear head.
    :param pooler_type: ``'max'`` or ``'avg'`` pooling over tokens.
    :param dropout: dropout probability in ``[0, 1]`` applied *between*
        stacked recurrent layers. It has no effect when ``num_layers == 1``.
    :param gpu: if truthy, the module is moved to ``'cuda'``.
    :param num_layers: number of stacked recurrent layers (default 1, which
        reproduces the previous single-layer behaviour).
    :raises AssertionError: on an unknown ``model_type`` / ``pooler_type``.
    :raises ValueError: if ``dropout`` is outside ``[0, 1]``.
    """

    def __init__(self, embd_dim, hidden_dim, model_type, cls_num, pooler_type, dropout, gpu, num_layers=1):
        super(RNN_Regressor, self).__init__()
        assert model_type in ['rnn','lstm','bilstm','gru']
        assert pooler_type in ['max','avg']
        if not 0 <= dropout <= 1:
            raise ValueError("dropout should be a number in range [0, 1], got {}".format(dropout))

        # PyTorch only applies `dropout` between stacked layers; handing a
        # non-zero value to a single-layer RNN is ignored and triggers a
        # UserWarning, so it is forwarded only when it can take effect.
        rnn_kwargs = dict(
            input_size=embd_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        is_bidirectional = (model_type == 'bilstm')
        if model_type == 'rnn':
            self.rnn = nn.RNN(**rnn_kwargs)
        elif model_type == 'gru':
            self.rnn = nn.GRU(**rnn_kwargs)
        else:  # 'lstm' or 'bilstm'
            self.rnn = nn.LSTM(bidirectional=is_bidirectional, **rnn_kwargs)

        # map from pooled rnn output to logits; a bidirectional encoder
        # concatenates both directions, hence twice the width
        fc_in_dim = 2 * hidden_dim if is_bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_dim, cls_num)

        self.pooler_type = pooler_type
        self.gpu = gpu
        if gpu: self.to('cuda')

    def forward(self, input_matrix):
        """Encode ``input_matrix`` and return the linear head's output.

        ``input_matrix.shape[1]`` is used as the number of tokens, i.e. the
        input is laid out as (batch, tokens, embd_dim).

        NOTE(review): the argument-less ``squeeze()`` is kept for backward
        compatibility; it also drops the batch axis when the batch holds a
        single item, so a batch of one yields a 1-D output of size cls_num.
        """
        token_num = input_matrix.shape[1]
        hidden_vecs = self.rnn(input_matrix)[0]
        # pooling works on (batch, channels, length), so move tokens last
        channels_first = torch.transpose(hidden_vecs, 1, 2)
        # functional pooling: no per-call module construction / device move
        if self.pooler_type == 'max':
            pooled_hidden = nn.functional.max_pool1d(channels_first, token_num)
        else:
            pooled_hidden = nn.functional.avg_pool1d(channels_first, token_num)
        return self.fc(pooled_hidden.squeeze())

In [None]:
# define functions that build mini-batches
from nltk.tokenize import word_tokenize
import numpy as np

# --- configuration for the RNN-based regressor ---
embd_dim = 200        # size of each word vector fed to the RNN
hidden_dim = 200      # RNN hidden size
rnn_type = 'bilstm'   # one of 'rnn', 'lstm', 'bilstm', 'gru'
pooler_type = 'avg'   # 'max' or 'avg' pooling over tokens
# A dropout probability of 1 would zero every activation wherever it is
# applied; PyTorch ignores the value for a single-layer RNN (with a warning),
# so 0.0 gives the same numbers without the warning.
dropout = 0.0
gpu = False

# Shared vector used for every out-of-vocabulary word. Drawn uniformly from
# [0, 1) with a dedicated, fixed-seed generator so reruns are reproducible.
oov_vec = np.random.RandomState(0).rand(embd_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    """Look up one vector per token of a sentence.

    Tokens found in ``word_vectors`` map to their stored vector; every other
    token maps to the module-level ``oov_vec``.

    :param word_vectors: mapping supporting ``in`` and ``[...]`` lookups.
    :param sent_words: iterable of tokens.
    :param largest_len: accepted for interface compatibility; not used, so no
        padding is applied.
    :return: ``np.ndarray`` with one row per token, in token order.
    """
    return np.array([
        word_vectors[token] if token in word_vectors else oov_vec
        for token in sent_words
    ])

def build_mini_batch(sent_list, word_vectors):
    """Tokenize and embed a list of sentences.

    Each sentence is lower-cased, tokenized with ``word_tokenize`` and turned
    into a (tokens, dim) matrix by ``get_sent_word_vecs``. No padding is done.

    :param sent_list: iterable of sentence strings.
    :param word_vectors: word -> vector mapping passed to
        ``get_sent_word_vecs``.
    :return: when all sentences have the same token count, a regular array
        stacking the per-sentence matrices; otherwise a 1-D ``object`` array
        whose i-th element is the matrix of the i-th sentence. Either way
        ``result.shape[0]`` is the number of sentences and ``result[i]`` is
        the i-th sentence matrix. An empty ``sent_list`` gives an empty array.
    """
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    sent_lens = [len(tokens) for tokens in tokenized_sents]
    largest_len = max(sent_lens, default=0)
    text_vecs = [
        get_sent_word_vecs(word_vectors, tokens, largest_len)
        for tokens in tokenized_sents
    ]
    if len(set(sent_lens)) <= 1:
        # homogeneous (or empty) batch: a regular array can be built directly
        return np.array(text_vecs)
    # Sentences of different lengths form a ragged nested sequence; calling
    # np.array on it raises ValueError on NumPy >= 1.24 (it used to yield an
    # object array with a deprecation warning), so build that array explicitly.
    batch = np.empty(len(text_vecs), dtype=object)
    for row, sent_matrix in enumerate(text_vecs):
        batch[row] = sent_matrix
    return batch

def make_batch_prediction(sent_list, word_vectors, model, use_gpu=False):
    """Run ``model`` on each sentence separately and stack the outputs.

    The sentences are embedded with ``build_mini_batch``; every sentence
    matrix is converted to a float tensor, given a leading batch axis of 1 and
    passed through ``model``. The per-sentence outputs are concatenated and
    reshaped to ``(number_of_sentences, -1)``.

    :param sent_list: sentences to score.
    :param word_vectors: word -> vector mapping used for the embedding.
    :param model: callable taking a (1, tokens, dim) float tensor.
    :param use_gpu: if truthy, inputs and the result live on ``'cuda'``.
    :return: tensor with one row per sentence.
    """
    batch = build_mini_batch(sent_list, word_vectors)
    n_sents = batch.shape[0]

    # start from an empty tensor so the concatenation is defined for any batch
    seed = torch.tensor([])
    if use_gpu:
        seed = seed.to('cuda')
    pieces = [seed]

    for sent_idx in range(n_sents):
        sent_tensor = torch.from_numpy(batch[sent_idx]).float()
        if use_gpu:
            sent_tensor = sent_tensor.to('cuda')
        pieces.append(model(sent_tensor.unsqueeze(0)))

    return torch.cat(pieces).view(n_sents, -1)
  
# sanity check: embed (up to) the first 600 training sentences one at a time
# and print a short summary instead of dumping every embedding value.
vc = [build_mini_batch([sent], word_vectors) for sent in sentences_pair_tr[:600]]
print('built', len(vc), 'mini-batches; first shapes:', [m.shape for m in vc[:5]])

[array([[[ 1.58106178e-01, -8.16389918e-02,  4.15682886e-03,
         -7.47984946e-02,  3.76325518e-01, -7.03254700e-01,
          1.52429089e-01,  3.63919050e-01,  2.00187460e-01,
          1.69908822e-01,  9.01601017e-02,  1.80233449e-01,
          1.01542354e-01, -1.13195240e-01,  9.48374495e-02,
          1.32810161e-01,  4.33510423e-01,  1.63026318e-01,
          9.12018120e-03, -7.72754923e-02, -3.49835068e-01,
         -3.93431447e-03,  6.82015866e-02,  3.23716551e-01,
         -4.32807475e-01, -1.52139977e-01, -1.12877585e-01,
          9.84692425e-02,  1.25597566e-01, -1.26391992e-01,
         -1.26065955e-01, -1.07196614e-01,  2.40593359e-01,
          4.55424860e-02, -4.24680673e-02, -1.38063088e-01,
          8.10099840e-02,  9.47656184e-02, -2.11172163e-01,
          5.53010367e-02,  1.65989120e-02, -3.08284342e-01,
          1.49524465e-01,  3.20503324e-01,  1.59940317e-01,
         -2.30685130e-01,  8.42225850e-02,  1.14080720e-01,
         -1.69698641e-01,  1.70290563e-

In [None]:
# --- hyper-parameters ---
n_epochs = 10      # number of passes over the training data
batch_size = 32    # sentences per optimisation step
lr = 0.001         # initial learning rate

# --- objective and model (single regression output) ---
loss_fnc = torch.nn.MSELoss()
model = RNN_Regressor(
    embd_dim=embd_dim,
    hidden_dim=hidden_dim,
    model_type=rnn_type,
    cls_num=1,
    pooler_type=pooler_type,
    dropout=dropout,
    gpu=gpu,
)

# --- optimiser and learning-rate schedule ---
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=lr)
# StepLR with step_size=1 multiplies the learning rate by gamma (0.999, i.e.
# a 0.1% decay) on every scheduler.step() call.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.999)

  "num_layers={}".format(dropout, num_layers))


In [None]:
# training the LSTM model

import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Start from +inf so the first evaluated epoch always becomes the best one,
# whatever the scale of the MSE.
best_mse = float('inf')
best_model = None

for epoch_i in tqdm(range(n_epochs)):
    # ---------------- training: one pass over the mini-batches ----------------
    model.train() # enable training-mode behaviour (e.g. dropout)
    ep_loss = []
    for idx in range(0,len(sentences_pair_tr),batch_size):
        # Step 0: Get the data
        sents = sentences_pair_tr[idx:idx+batch_size]
        if len(sents) == 0: break
        # Flatten the targets to shape (batch,). The predictions are flattened
        # the same way below so the loss is computed element-wise.
        y_target = torch.tensor([train_lables[idx:idx+batch_size]], dtype=torch.float32).reshape(-1)
        if gpu:
            y_target = y_target.to('cuda')

        # Step 1: Clear the gradients
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model.
        # make_batch_prediction returns (batch, 1); comparing that with a
        # (batch,) target would broadcast to (batch, batch) inside MSELoss
        # (PyTorch warns about the size mismatch) and optimise the wrong
        # objective, so both sides are flattened to (batch,).
        y_pred = make_batch_prediction(sents, word_vectors, model, gpu).reshape(-1)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        ep_loss.append(loss.item())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 4+: clip the gradient, to avoid gradient explosion
        nn.utils.clip_grad_value_(model.parameters(), clip_value=3.)

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()

    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))

    # ---------------- evaluation after each epoch ----------------
    with torch.no_grad(): # no gradients are needed for evaluation
        model.eval() # switch to evaluation mode (disables dropout)
        predictions = []
        test_docs = sentences_pair_tt
        test_labels = test_lables

        for idx in range(0,len(test_docs),batch_size):
            y_pred = make_batch_prediction(
                test_docs[idx:idx+batch_size], word_vectors, model, gpu)
            # one flat float per sentence, aligned with test_labels
            predictions.extend(y_pred.reshape(-1).cpu().tolist())
        mse = mean_squared_error(test_labels, predictions)

        print('\n---> after epoch {} the mse on test set is {}'.format(epoch_i, mse))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])

        # save the best model (deep copy: state_dict() references live tensors)
        if mse < best_mse:
            best_mse = mse
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best mse',mse)

    # adjust learning rate according to the scheduler (once per epoch)
    scheduler.step()

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 10%|█         | 1/10 [01:01<09:17, 61.92s/it]


---> after epoch 0 the mse on test set is 2.370354254389553
learning rate 0.001


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 20%|██        | 2/10 [01:51<07:15, 54.48s/it]


---> after epoch 1 the mse on test set is 2.5119061645352687
learning rate 0.000999


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 30%|███       | 3/10 [02:40<06:06, 52.33s/it]


---> after epoch 2 the mse on test set is 2.6122924362918223
learning rate 0.000998001


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 40%|████      | 4/10 [03:31<05:09, 51.63s/it]


---> after epoch 3 the mse on test set is 2.6702109069133315
learning rate 0.000997002999


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 50%|█████     | 5/10 [04:20<04:14, 50.81s/it]


---> after epoch 4 the mse on test set is 2.6946257528912088
learning rate 0.000996005996001


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 60%|██████    | 6/10 [05:09<03:20, 50.15s/it]


---> after epoch 5 the mse on test set is 2.697139653888916
learning rate 0.000995009990004999


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 70%|███████   | 7/10 [05:59<02:29, 49.94s/it]


---> after epoch 6 the mse on test set is 2.687902785150012
learning rate 0.000994014980014994


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 80%|████████  | 8/10 [06:49<01:40, 50.00s/it]


---> after epoch 7 the mse on test set is 2.674408429872379
learning rate 0.0009930209650349789


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





 90%|█████████ | 9/10 [07:39<00:50, 50.00s/it]


---> after epoch 8 the mse on test set is 2.659328107364323
learning rate 0.0009920279440699439


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)





100%|██████████| 10/10 [08:40<00:00, 52.09s/it]


---> after epoch 9 the mse on test set is 2.6460253690551703
learning rate 0.0009910359161258739





In [None]:
# Report `best_mse`, which the training loop above lowers whenever an epoch's
# test-set MSE beats the previous best.
print("the best MSE is ----> ", best_mse)

the best MSE is ---->  1.285


In [None]:
# Persist best_mse with IPython's %store magic so it can be restored in a
# later kernel/notebook via `%store -r best_mse`.
%store best_mse


Stored 'best_mse' (float64)
