# Analysis of the Embeddings of Sentences and Subsentences

## The embedding methods under investigation are:
* GEM
* sentBERT
* Skip-Thought
* InferSentv1
* InferSentv2
* DCT
* QuickThought
* GloVe
* Laser

### This notebook uses the embeddings generated by GEM. To analyze the embeddings generated by other methods, simply load the corresponding embeddings instead.

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import sentence_context as sc
import clusterability as clus
import clustering_validation as clus_val

## Set a Timer

In [None]:
# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports elapsed seconds.

    While the ``with`` body is running, calling the yielded function returns
    the time elapsed since the block was entered. Once the block has exited --
    normally or through an exception -- the reading is frozen at the total
    duration of the block.

    Yields:
        A callable returning the elapsed time in seconds as a float.
    """
    start = default_timer()
    end = None

    def elapsed():
        # Live reading while the block runs, frozen total once it has ended.
        stop = default_timer() if end is None else end
        return stop - start

    try:
        yield elapsed
    finally:
        # ``finally`` makes sure the clock also stops when the body raises;
        # otherwise the reading would keep growing after a failed block.
        end = default_timer()

# Test Clusterability

In [None]:
import pickle

# Clusterability: Spatial Histogram

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_gem_kls_embs.pk', 'rb') as f:
    kls_embs = pickle.load(f)

In [None]:
kls_embs.mean(), kls_embs.std()

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_gem_kls_embs_span.pk', 'rb') as f:
    kls_embs_span = pickle.load(f)

In [None]:
kls_embs_span.mean(), kls_embs_span.std()

# Test Clusterability of All Embeddings by GEM

In [None]:
folder = '/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/'

In [None]:
import os
os.listdir(folder)

# Load the Embeddings for Sentences, Spans, and Spans with n Words Before and After

In [None]:
files = ['NYT_train_gem_embs.pt',
 'NYT_train_gem_embs_span.pt',
 'NYT_train_gem_embs_span_ba1.pt',
 'NYT_train_gem_embs_span_ba2.pt',
 'NYT_train_gem_embs_span_ba3.pt',
 'NYT_train_gem_embs_span_ba4.pt',
 'NYT_train_gem_embs_span_ba5.pt',
 'NYT_train_gem_embs_span_ba6.pt',
 'NYT_train_gem_embs_span_ba7.pt',
 'NYT_train_gem_embs_span_ba8.pt',
 'NYT_train_gem_embs_span_ba9.pt',
 'NYT_train_gem_embs_span_ba10.pt',
 'NYT_train_gem_embs_span_ba15.pt',
 'NYT_train_gem_embs_span_ba20.pt',]

In [None]:
files

In [None]:
import torch

# Load every serialized embedding object named in `files` from `folder`,
# keeping the order of `files`.
embs = [torch.load(folder + emb_name) for emb_name in files]

In [None]:
len(embs)

In [None]:
from sklearn.decomposition import PCA

# Reduce each embedding set to two principal components.
# A fresh PCA is fitted per set, so the 2-D axes are not shared between sets.
embs_pcas = [
    PCA(n_components=2).fit_transform(np.array(emb_set)) for emb_set in embs
]

In [None]:
#with open('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_gem_embs_pcas.pk', 'wb') as f:
#    pickle.dump(embs_pcas, f)

In [None]:
# Compute KLs for embeddings
%%time
kls_embs_pcas = []
for i in embs_pcas:
    kls_embs_pcas.append(clus.spaHist(i, bins=74, n = 500))

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_gem_kls_embs_pcas_all.pk', 'wb') as f:
    pickle.dump(kls_embs_pcas, f)

# Load All KLs for Embeddings of Sentences, Spans, and Spans_Before_After n Words

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_gem_kls_embs_pcas_all.pk', 'rb') as f:
    kls_embs_pcas = pickle.load(f)

In [None]:
import seaborn as sns

# Legend text for each entry of `kls_embs_pcas`, in list order.
curve_labels = [
    "Sentence",
    "Span",
    "Span and ba1",
    "Span and ba2",
    "Span and ba3",
    "Span and ba4",
    "Span and ba5",
    "Span and ba6",
    "Span and ba7",
    "Span and ba8",
    "Span and ba9",
    "Span and ba10",
    "Span and ba15",
    "Span and ba20",
]

# Overlay one density curve (no histogram bars) per embedding variant.
plt.figure(figsize=(15, 9))
for curve_idx, curve_label in enumerate(curve_labels):
    sns.distplot(kls_embs_pcas[curve_idx], hist=False, label=curve_label)
plt.legend()

In [None]:
# Summarise each KL sample by its mean and its standard deviation.
kls_means = [kl_sample.mean() for kl_sample in kls_embs_pcas]
kls_stds = [kl_sample.std() for kl_sample in kls_embs_pcas]

In [None]:
import pandas as pd

In [None]:
kls_df = pd.DataFrame({'kls_means':kls_means, 'kls_stds':kls_stds})

In [None]:
kls_df.index = ['sentence', 'span', 'span and ba1', 'span and ba2', 'span and ba3', 'span and ba4', \
        'span and ba5', 'span and ba6', 'span and ba7', 'span and ba8', 'span and ba9', 'span and ba10', \
        'span and ba15', 'span and ba20']

In [None]:
kls_df.to_csv('/content/drive/My Drive/Colab Notebooks/RTER/data/embs_gem/NYT_train_kls_embs_pcas_means_stds.csv')

In [None]:
# Tick labels, one per plotted mean (14 in total).
labs = [
    'sentence', 'span',
    'span and ba1', 'span and ba2', 'span and ba3', 'span and ba4',
    'span and ba5', 'span and ba6', 'span and ba7', 'span and ba8',
    'span and ba9', 'span and ba10', 'span and ba15', 'span and ba20',
]

# Line plot of the mean KL value per embedding variant.
plt.figure(figsize=(15, 9))
plt.plot(kls_means)
tick_positions = np.arange(len(labs))
plt.xticks(tick_positions, labs, rotation=45)

# Load the Embeddings for Surrounding n Words

In [None]:
files_surr = ['NYT_train_gem_embs_surroundings1.pt',
 'NYT_train_gem_embs_surroundings2.pt',
 'NYT_train_gem_embs_surroundings3.pt',
 'NYT_train_gem_embs_surroundings4.pt',
 'NYT_train_gem_embs_surroundings5.pt',
 'NYT_train_gem_embs_surroundings6.pt',
 'NYT_train_gem_embs_surroundings7.pt',
 'NYT_train_gem_embs_surroundings8.pt',
 'NYT_train_gem_embs_surroundings9.pt',
 'NYT_train_gem_embs_surroundings10.pt',
 'NYT_train_gem_embs_surroundings15.pt',
 'NYT_train_gem_embs_surroundings20.pt']

In [None]:
files_surr

In [None]:
files_surr_labels = [
 'NYT_train_gem_embs_surroundings_labels1.pk',
 'NYT_train_gem_embs_surroundings_labels2.pk',
 'NYT_train_gem_embs_surroundings_labels3.pk',
 'NYT_train_gem_embs_surroundings_labels4.pk',
 'NYT_train_gem_embs_surroundings_labels5.pk',
 'NYT_train_gem_embs_surroundings_labels6.pk',
 'NYT_train_gem_embs_surroundings_labels7.pk',
 'NYT_train_gem_embs_surroundings_labels8.pk',
 'NYT_train_gem_embs_surroundings_labels9.pk',
 'NYT_train_gem_embs_surroundings_labels10.pk',
 'NYT_train_gem_embs_surroundings_labels15.pk',
 'NYT_train_gem_embs_surroundings_labels20.pk']

In [None]:
files_surr_labels

In [None]:
import os
import torch
import pickle

# Load every "surroundings" embedding file from the local embs_gem/ directory,
# keeping the order of `files_surr`.
embs_surr = [torch.load("embs_gem/" + surr_name) for surr_name in files_surr]

In [None]:
len(embs_surr)

In [None]:
embs_surr[4].shape

In [None]:
# Unpickle the label object that belongs to each surroundings embedding file,
# in the order given by `files_surr_labels`.
embs_surr_labels = []
for label_name in files_surr_labels:
    label_path = "embs_gem/" + label_name
    with open(label_path, 'rb') as label_fh:
        loaded_labels = pickle.load(label_fh)
    embs_surr_labels.append(loaded_labels)

In [None]:
len(embs_surr_labels)

In [None]:
len(embs_surr_labels[4])

In [None]:
# Sanity check: print each embedding's shape next to its number of labels.
for surr_idx, surr_emb in enumerate(embs_surr):
    print(surr_emb.shape, len(embs_surr_labels[surr_idx]))

In [None]:
from sklearn.decomposition import PCA

# Reduce each surroundings embedding set to two principal components,
# fitting a separate PCA for every set.
embs_surr_pcas = [
    PCA(n_components=2).fit_transform(np.array(surr_emb))
    for surr_emb in embs_surr
]

In [None]:
with open('embs_gem/NYT_train_gem_embs_surr_all_pcas.pk', 'wb') as f:
    pickle.dump(embs_surr_pcas, f)

In [None]:
# Print the shape of every 2-D PCA projection.
for pca_points in embs_surr_pcas:
    print(pca_points.shape)

In [None]:
(np.sqrt(embs_surr_pcas[3].shape[0] / 20)).astype(int)

In [None]:
%%time
kls_embs_surr_pcas = []
for i in embs_surr_pcas:
    bins = (np.sqrt(i.shape[0] / 20)).astype(int)
    print("The number of bins is {}.".format(bins))
    kls_embs_surr_pcas.append(clus.spaHist(i, bins=bins, n = 500))

In [None]:
with open('embs_gem/NYT_train_gem_kls_embs_surr_pcas_all.pk', 'wb') as f:
    pickle.dump(kls_embs_surr_pcas, f)

# Load All KLs for Surroundings of 2n Words

In [None]:
with open('embs_gem/NYT_train_gem_kls_embs_surr_pcas_all.pk', 'rb') as f:
    kls_embs_surr_pcas = pickle.load(f)

In [None]:
import seaborn as sns

# Surrounding-window sizes, in the same order as `files_surr`.
surr_windows = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20)

# Overlay one density curve (no histogram bars) per surroundings window.
plt.figure(figsize=(15, 9))
for curve_idx, window in enumerate(surr_windows):
    # Legend text is spelled "surroundings<n>" (previously misspelled
    # "surroungs<n>"), matching the labels used for the summary table.
    sns.distplot(
        kls_embs_surr_pcas[curve_idx],
        hist=False,
        label="surroundings{}".format(window),
    )
plt.legend()

In [None]:
# Summarise each surroundings KL sample by its mean and standard deviation.
kls_embs_surr_pcas_means = [kl_sample.mean() for kl_sample in kls_embs_surr_pcas]
kls_embs_surr_pcas_stds = [kl_sample.std() for kl_sample in kls_embs_surr_pcas]

In [None]:
kls_surr_pcas_df = pd.DataFrame({'kls_surr_means':kls_embs_surr_pcas_means, 'kls_surr_stds':kls_embs_surr_pcas_stds})

In [None]:
kls_surr_pcas_df.shape

In [None]:
kls_surr_pcas_df.index = ['surroundings1', 'surroundings2', 'surroundings3', 'surroundings4', 'surroundings5', 'surroundings6', \
        'surroundings7', 'surroundings8', 'surroundings9', 'surroundings10', 'surroundings15', 'surroundings20']

In [None]:
kls_surr_pcas_df.to_csv('embs_gem/NYT_train_gem_kls_embs_surr_pcas_means_stds.csv')

In [None]:
# Tick labels: one per surroundings window (12 in total, including
# 'surroundings20', which was previously missing).
labs = [
    'surroundings1', 'surroundings2', 'surroundings3', 'surroundings4',
    'surroundings5', 'surroundings6', 'surroundings7', 'surroundings8',
    'surroundings9', 'surroundings10', 'surroundings15', 'surroundings20',
]

# Line plot of the mean KL value per surroundings window.
plt.figure(figsize=(15, 9))
plt.plot(kls_embs_surr_pcas_means)
# Derive the tick positions from the labels so both always have the same
# length; a hard-coded np.arange(14) does not match the label count and makes
# plt.xticks raise.
plt.xticks(np.arange(len(labs)), labs, rotation=45)

# Clustering Validation

In [None]:
import pickle

In [None]:
train_df = pd.read_csv("embs_gem/NYT_train_df_nn.csv")

In [None]:
train_df.shape

In [None]:
train_df.columns

In [None]:
relIdxs = train_df.relIdx

In [None]:
relIdxs.shape

In [None]:
n_clusts = relIdxs.nunique()

In [None]:
n_clusts

In [None]:
from sklearn.cluster import KMeans

In [None]:
files = ['NYT_train_gem_embs.pt',
 'NYT_train_gem_embs_span.pt',
 'NYT_train_gem_embs_span_ba1.pt',
 'NYT_train_gem_embs_span_ba2.pt',
 'NYT_train_gem_embs_span_ba3.pt',
 'NYT_train_gem_embs_span_ba4.pt',
 'NYT_train_gem_embs_span_ba5.pt',
 'NYT_train_gem_embs_span_ba6.pt',
 'NYT_train_gem_embs_span_ba7.pt',
 'NYT_train_gem_embs_span_ba8.pt',
 'NYT_train_gem_embs_span_ba9.pt',
 'NYT_train_gem_embs_span_ba10.pt',
 'NYT_train_gem_embs_span_ba15.pt',
 'NYT_train_gem_embs_span_ba20.pt']


In [None]:
import torch
emb_span_ba3 = torch.load('embs_gem/'+files[4])

In [None]:
emb_span_ba3.shape

In [None]:
# Dry run: print the output path chosen for each entry of `files`.
# Nothing is clustered or written to disk here.
km_named_outputs = {
    0: "embs_gem/NYT_train_gem_km_embs_sentence.pk",
    1: "embs_gem/NYT_train_gem_km_embs_span.pk",
    12: "embs_gem/NYT_train_gem_km_embs_span_ba15.pk",
    13: "embs_gem/NYT_train_gem_km_embs_span_ba20.pk",
}

for i, file in enumerate(files):
    with elapsed_timer() as elapsed:
        if i in km_named_outputs:
            file_save = km_named_outputs[i]
        else:
            # Every other position i is saved under the suffix ba<i-1>.
            file_save = "embs_gem/NYT_train_gem_km_embs_span_ba{}.pk".format(i-1)

        print(i, file_save)

In [None]:
# Output paths for the list positions that do not follow the ba<i-1> pattern.
km_named_outputs = {
    0: "embs_gem/NYT_train_gem_km_embs_sentence.pk",
    1: "embs_gem/NYT_train_gem_km_embs_span.pk",
    12: "embs_gem/NYT_train_gem_km_embs_span_ba15.pk",
    13: "embs_gem/NYT_train_gem_km_embs_span_ba20.pk",
}

# Fit one KMeans model per embedding file, pickle it, and report the time taken.
km_span_ba_all = []
for i, file in enumerate(files):
    with elapsed_timer() as elapsed:
        emb = torch.load('embs_gem/'+file)
        clusts_embs = KMeans(
            init='k-means++',
            n_clusters=n_clusts,
            n_init=20,
            random_state=21,
        )
        km_embs = clusts_embs.fit(np.array(emb))
        km_span_ba_all.append(km_embs)

        if i in km_named_outputs:
            file_save = km_named_outputs[i]
        else:
            # Every other position i is saved under the suffix ba<i-1>.
            file_save = "embs_gem/NYT_train_gem_km_embs_span_ba{}.pk".format(i-1)

        with open(file_save, 'wb') as f:
            pickle.dump(km_embs, f)

    # The timer is read after the block has closed, so loading, fitting and
    # pickling are all included.
    duration = '%.1f' % elapsed()
    print("Clustering the {}th file takes {} secs".format(i, duration))


In [None]:
len(km_span_ba_all)

In [None]:
relIdxs.nunique(), pd.Series(km_span_ba_all[3].labels_).nunique()

In [None]:
# Compare the cluster labels of every fitted KMeans model with `relIdxs`.
km_embs_meas = [
    clus_val.clustering_measures(relIdxs, fitted_km.labels_)
    for fitted_km in km_span_ba_all
]

In [None]:
km_embs_meas[4]

In [None]:
meas_df = pd.DataFrame(km_embs_meas)


In [None]:
meas_df

In [None]:
meas_df.index = ['Sentence', 'Span', 'Span and ba1', 'Span and ba2', 'Span and ba3', 'Span and ba4',
                 'Span and ba5', 'Span and ba6', 'Span and ba7', 'Span and ba8', 'Span and ba9',
                 'Span and ba10', 'Span and ba15', 'Span and ba20']

In [None]:
meas_df.to_csv('embs_gem/NYT_train_gem_km_embs_span_ba_all_measures.csv')

In [None]:
# Plot the transposed table: x axis = columns of meas_df, one line per row.
# DataFrame.plot opens its own figure when no `ax` is passed, so a preceding
# plt.figure() call would only leave an empty extra figure behind.
meas_df.T.plot(figsize=(12, 15))
plt.yticks(np.arange(0, 0.6, 0.01))
plt.xticks(rotation=45)
plt.autoscale(axis='y')

# Clustering Validation on Surroundings with 2n Words

In [None]:
files_surr

In [None]:
files_surr_labels

In [None]:
labs = ['surroundings1', 'surroundings2', 'surroundings3', 'surroundings4', 'surroundings5', 'surroundings6', \
        'surroundings7', 'surroundings8', 'surroundings9', 'surroundings10', 'surroundings15', 'surroundings20']

In [None]:
# Load every surroundings embedding file, keeping the order of `files_surr`.
embs_surr_all = [torch.load('embs_gem/' + surr_name) for surr_name in files_surr]

In [None]:
# Unpickle the label object for each surroundings embedding file,
# in the order given by `files_surr_labels`.
embs_surr_labels_all = []
for label_name in files_surr_labels:
    label_path = 'embs_gem/' + label_name
    with open(label_path, 'rb') as label_fh:
        loaded_labels = pickle.load(label_fh)
    embs_surr_labels_all.append(loaded_labels)

In [None]:
len(embs_surr_all), len(embs_surr_labels_all)

In [None]:
type(embs_surr_labels_all[3])

In [None]:
# Per file: embedding shape, number of labels, number of distinct labels.
for surr_idx, surr_emb in enumerate(embs_surr_all):
    surr_labels = embs_surr_labels_all[surr_idx]
    print(surr_emb.shape, len(surr_labels), len(np.unique(surr_labels)))

In [None]:
# Output paths for the list positions that do not follow the <i+1> pattern.
km_surr_named_outputs = {
    10: "embs_gem/NYT_train_gem_km_embs_surrounding15.pk",
    11: "embs_gem/NYT_train_gem_km_embs_surrounding20.pk",
}

# Fit one KMeans model per surroundings embedding, pickle it, and report the time.
km_surr_all = []
for i, emb in enumerate(embs_surr_all):
    with elapsed_timer() as elapsed:
        # Number of clusters = number of distinct labels for this file.
        n_clusts = len(np.unique(embs_surr_labels_all[i]))
        clusts_embs = KMeans(
            init='k-means++',
            n_clusters=n_clusts,
            n_init=20,
            random_state=21,
        )
        km_embs = clusts_embs.fit(np.array(emb))
        km_surr_all.append(km_embs)

        if i in km_surr_named_outputs:
            file_save = km_surr_named_outputs[i]
        else:
            file_save = "embs_gem/NYT_train_gem_km_embs_surrounding{}.pk".format(i+1)

        with open(file_save, 'wb') as f:
            pickle.dump(km_embs, f)

    # The timer is read after the block has closed, so fitting and pickling
    # are both included.
    duration = '%.1f' % elapsed()
    print("Clustering the {}th file takes {} secs".format(i+1, duration))


In [None]:
# Compare the cluster labels of every fitted KMeans model with the labels
# loaded for the same surroundings file.
km_embs_meas = [
    clus_val.clustering_measures(embs_surr_labels_all[surr_idx], fitted_km.labels_)
    for surr_idx, fitted_km in enumerate(km_surr_all)
]

In [None]:
meas_df = pd.DataFrame(km_embs_meas)
meas_df.index=labs
meas_df

In [None]:
# Plot the transposed table: x axis = columns of meas_df, one line per row.
# DataFrame.plot opens its own figure when no `ax` is passed, so a preceding
# plt.figure() call would only leave an empty extra figure behind.
meas_df.T.plot(figsize=(12, 15))
plt.yticks(np.arange(0, 0.6, 0.01))
plt.xticks(rotation=45)
plt.autoscale(axis='y')

In [None]:
meas_df.to_csv('embs_gem/NYT_train_gem_km_embs_surroundings_measures.csv')