# Encode Sentences and Subsentences to Embeddings
## Given two entity mentions, extract the valid subsentences, then encode the sentences and subsentences into embeddings using:
* GEM
* sentBERT
* Skip-Thought
* InferSentv1
* InferSentv2
* DCT
* QuickThought
* GloVe
* LASER

### This notebook uses GEM to generate embeddings for sentences and subsentences. The implementations of all the embedding methods are available under **clustering-network-analysis/src/embedding/src/**

### To generate the embeddings with another method, simply replace the embedding method used in this notebook.

## The sentence and subsentences for embedding generation:
* whole sentence
* span between two mentions
* span and before and after 1 word
* span and before and after 2 words
* span and before and after 3 words
* span and before and after 4 words
* span and before and after 5 words
* span and before and after 6 words
* span and before and after 7 words
* span and before and after 8 words
* span and before and after 9 words
* span and before and after 10 words
* span and before and after 15 words
* span and before and after 20 words
* if span has enough words:
 - contexts of mentions by 1 before and 1 after word
 - contexts of mentions by 2 before and 2 after words
 - contexts of mentions by 3 before and 3 after words
 - contexts of mentions by 4 before and 4 after words
 - contexts of mentions by 5 before and 5 after words
 - contexts of mentions by n before and n after words as long as the span has more than 2n words


In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import sys
# Add the RTER project directory to the module search path.
# Presumably the helper modules imported in the next cell live there -- TODO confirm.
sys.path.append('/content/drive/My Drive/Colab Notebooks/RTER')

In [None]:
import sentence_context as sc
import clusterability as clus
import clustering_validation as clus_val

# Load NYT Train Data

In [None]:
# Load the NYT training table; the first CSV column is used as the row index.
train_df_nn = pd.read_csv("/content/drive/My Drive/Colab Notebooks/RTER/data/NYT_train_df_nn.csv", index_col=0)

In [None]:
# Output directory and filename prefix used to build the path of every
# embedding/label file that the cells below save or load.
folder = "/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem"
prefix = "NYT_train_gem"

# GEM Encoding

In [None]:
import torch

In [None]:
import gem, embeddings, utils, build_gem_space

In [None]:
sys.path.append('/content/drive/My Drive/Colab Notebooks/RTER/embedding/src/sent-embeddings/gem')

In [None]:
# Path of the GloVe vector file; it is passed to
# build_gem_space.transform_sentences in every encoding cell below.
glove_model_path = '/content/drive/My Drive/Colab Notebooks/RTER/embedding/src/sent-embeddings/glove/glove.6B.300d.txt'

# Set up a Timer

In [None]:
# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager that yields a callable reporting elapsed seconds.

    While the ``with`` body is running, calling the yielded function returns
    the time elapsed since the block was entered.  Once the body finishes --
    normally or by raising -- the reading is frozen at the body's total
    duration, so later calls keep returning the same value.

    Yields:
        A zero-argument callable returning the elapsed time as a float.
    """
    start = default_timer()
    # ``elapser`` is rebound on exit; the yielded lambda looks the name up at
    # call time, which is how it switches from a live reading to a frozen one.
    elapser = lambda: default_timer() - start
    try:
        yield lambda: elapser()
    finally:
        # Freeze the reading even when the body raised; without ``finally``
        # the statements after ``yield`` are skipped on an exception.
        end = default_timer()
        elapser = lambda: end - start

## GEM embedding on Sentences

In [None]:
# Sentence text column of the training table; these are the full sentences
# that get embedded in the next cell.
sents = train_df_nn.sentTextClndUni

In [None]:
# Encode every full sentence with GEM, timing the call, then report how many
# sentences were processed and how long it took.
with elapsed_timer() as timer:
    sent_space = build_gem_space.transform_sentences(sents, glove_model_path)
    duration = '%.1f' % timer()
print(" GEM Embedding {} sentences takes {} seconds".format(len(sents), duration))

In [None]:
# Persist the sentence embeddings as "<folder>/<prefix>_embs.pt".
torch.save(sent_space, folder+"/"+prefix+"_embs.pt")

In [None]:
import torch
# Reload the sentence embeddings saved above and display their shape.
sent_embs = torch.load(folder+"/"+prefix+"_embs.pt")
sent_embs.shape

# Extract Span Subsentences and Embed the Span Subsentences

In [None]:
# Extract spans between two entity mentions
spans = []
for i in range(train_df_nn.shape[0]):
    sent = train_df_nn.iloc[i].sentTextClndUni
    try:
        em1 = train_df_nn.iloc[i].em1
        em2 = train_df_nn.iloc[i].em2
        span, _, _, _, _ = sc.spanEMs(sent, em1, em2)
        spans.append(span)
    except:
        print(sent)

In [None]:
# Encode the extracted spans with GEM and report how long the encoding took.
with elapsed_timer() as elapsed:
    spans_space = build_gem_space.transform_sentences(spans, glove_model_path)
    duration = '%.1f' % elapsed()
# ``spans`` is a plain list, so its size comes from len() rather than ``.shape``.
print(" GEM Embedding {} sentence spans takes {} seconds".format(len(spans), duration))

In [None]:
import torch
# NOTE(review): saving is disabled here, yet the next cell loads
# "<folder>/<prefix>_embs_span.pt"; the file presumably already exists from an
# earlier run -- confirm before running the notebook from scratch.
#torch.save(spans_space, folder+"/"+prefix+"_embs_span.pt")

In [None]:
# Load the span embeddings from "<folder>/<prefix>_embs_span.pt" and display
# their shape.
span_embs = torch.load(folder+"/"+prefix+"_embs_span.pt")
span_embs.shape

In [None]:
import pickle
# Pickle ``kls_embs_span`` to "<folder>/<prefix>_kls_embs_span.pk".
# NOTE(review): ``kls_embs_span`` is not assigned anywhere in the visible part
# of this notebook; it presumably comes from a cell that is not shown -- confirm.
file_dest = folder+"/"+prefix+"_kls_embs_span.pk"
with open(file_dest, 'wb') as ff:
    pickle.dump(kls_embs_span, ff)

### Span and Before and After n Words

In [None]:
# Extract spans between two entity mentions and extra n words before and after
n_words = [1,2,3,4,5,6,7,8,9,10,15,20]

for n in n_words:

    spans_ba = []

    with elapsed_timer() as elapsed_n:

        for i in range(train_df_nn.shape[0]):
        # for i in range(14):
            try:
                sent = train_df_nn.iloc[i].sentTextClndUni
                em1 = train_df_nn.iloc[i].em1
                em2 = train_df_nn.iloc[i].em2
                span_ba = sc.baN(sent, em1, em2, n)
                spans_ba.append(span_ba.strip())
            except:
                print("Something was wrong at {}th instance".format(i))
                continue
        
        duration = '%.1f' % elapsed_n()
    print("Extracting the span and before and after {} words takes {} seconds".format(n, duration))   

    with elapsed_timer() as elapsed_encoding:
        embs_span_ba = build_gem_space.transform_sentences(spans_ba, glove_model_path)
        duration = '%.1f' % elapsed_encoding()
    print("Encoding the span and before and after {} words takes {} seconds".format(n, duration))

    print("The length of the list of span_ba embeddings is: {}".format(len(embs_span_ba)))
    #with open(folder + "/" + prefix + "_embs_span_ba{}.pk".format(n), 'wb') as f:
    #    pickle.dump(embs_span_ba, f)

    torch.save(embs_span_ba, folder+"/"+prefix+"_embs_span_ba{}.pt".format(n))

# Extract Surrounding Subsentences and Embed the Subsentences

In [None]:
# Extract context of mentions by surrounding 2n words
n_words = [1,2,3,4,5,6,7,8,9,10,15,20]

for n in n_words:
    surroundings = []
    labels = []
    with elapsed_timer() as elapsed_n:
        for i in range(train_df_nn.shape[0]):
        
            sent = train_df_nn.iloc[i].sentTextClndUni
            em1 = train_df_nn.iloc[i].em1
            em2 = train_df_nn.iloc[i].em2
            surr = sc.surrounding2N(sent, em1, em2, n)
            if surr != None:
                surroundings.append(surr.strip())
                labels.append(train_df_nn.iloc[i].relIdx)

        if len(surroundings) == 0:
            print("No more context extraction at {} words".format(n))
            break
        duration = '%.1f' % elapsed_n()
    print("Extraction the context surrounding {} words takes {} seconds".format(n, duration))

    with elapsed_timer() as elapsed_encoding:
        embs_surroundings = build_gem_space.transform_sentences(surroundings, glove_model_path)
        duration = '%.1f' % elapsed_encoding()
    print("Encoding the context surrounding {} words takes {} seconds".format(n, duration)) 

    print("The length of the list of surrounding embeddings is: {}".format(len(embs_surroundings)))

    #embs_surroundings_labels = zip(embs_surroundings, labels)

    #with open(folder + "/" + prefix + "_embs_surroundings_labels{}.pk".format(n), 'wb') as f:
    #    pickle.dump(embs_surroundings_labels, f)

    torch.save(embs_surroundings, folder+"/"+prefix+"_embs_surroundings{}.pt".format(n))

    with open(folder + "/" + prefix + "_embs_surroundings_labels{}.pk".format(n), 'wb') as f:
        pickle.dump(labels, f)

In [None]:
import pickle
# Read back the "_embs_surroundings_labels9.pk" pickle, i.e. the file name the
# loop above writes for n = 9 (it pickles the ``labels`` list there).
with open(folder + "/" + prefix + "_embs_surroundings_labels9.pk", 'rb') as f:
    embs_surroundings_labels9 = pickle.load(f)

In [None]:
# The "_embs_surroundings_labels9.pk" file holds only the label list (the loop
# above pickles ``labels`` by itself), so it cannot be unzipped into
# (embedding, label) pairs.  The matching embeddings are stored separately in
# the "_embs_surroundings9.pt" file written by the same loop.
labels = embs_surroundings_labels9
embs = torch.load(folder+"/"+prefix+"_embs_surroundings9.pt")

In [None]:
# Display how many entries ``embs`` holds.
len(embs)