In [2]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from string import punctuation
import numpy as np
import pandas as pd
import pickle
import pickle, re

## Imports
import torch
import torch.nn as nn
import torch.optim as optim

## Embeddings
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

## Plotting
%matplotlib inline
from torch.nn.functional import interpolate
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")

## Crawling NIPS Paper Information w/ Beautiful Soup

In [None]:
# URL of the 2019 accepted-papers listing that the next cells parse.
url = 'https://nips.cc/Conferences/2019/AcceptedPapersInitial'
# Without a timeout a stalled connection blocks the cell indefinitely, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page be parsed as if it were the listing.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
def strip_punctuation(s):
    """Drop every character of ``s`` that appears in ``string.punctuation``."""
    kept = []
    for ch in s:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

In [None]:
def clean_title(x):
    """Turn ``x`` into lowercase text without bold markers or punctuation.

    ``x`` is stringified first, so both plain strings and the ``<b>`` tags
    collected from the parsed page can be passed in.
    """
    text = str(x)
    # Remove the literal bold markers left over from stringifying a <b> tag.
    for marker in ("<b>", "</b>"):
        text = text.replace(marker, "")
    # Hyphens become spaces first, so hyphenated words are split rather than
    # fused together when the punctuation is stripped.
    text = text.replace("-", " ")
    return strip_punctuation(text).lower()

In [None]:
soup = BeautifulSoup(response.text, "html.parser")
# find_all is the supported spelling; findAll is only kept as a legacy alias.
bold_tags = soup.find_all('b')
# The first three <b> elements are skipped as initial garbage; the remaining
# tags are cleaned into plain strings.
paper_title = [clean_title(tag) for tag in bold_tags[3:]]
print(len(paper_title))
paper_title[0:10]

# Embeddings - Using Flair

In [None]:
# Instantiate the 'news-forward' Flair embeddings once; the Flair helper
# functions below read this module-level object.
flair_embedding_forward = FlairEmbeddings('news-forward')

In [None]:
def get_flair_embedding(sent):
    """Embed ``sent`` with the Flair forward model and average over its tokens.

    Parameters
    ----------
    sent : str
        Raw text; it is wrapped in a flair ``Sentence`` before embedding.

    Returns
    -------
    torch.Tensor
        1-D tensor of length ``flair_embedding_forward.embedding_length``
        holding the mean of the token embeddings. A text that produces no
        tokens yields an all-zero vector instead of the NaNs that dividing
        by a zero token count would give.
    """
    sentence = Sentence(sent)
    token_count = len(sentence)
    token_sum = torch.zeros(flair_embedding_forward.embedding_length)
    if token_count == 0:
        # Nothing to embed or average.
        return token_sum
    flair_embedding_forward.embed(sentence)
    for token in sentence:
        token_sum += token.embedding
    return token_sum / token_count

In [None]:
def get_cosine_distance(text_1, text_2):
    """Cosine similarity of two tensors along dim 0, returned as a NumPy array.

    Despite the name, the value is a similarity rather than a distance:
    callers sort it in descending order to find the closest papers.
    """
    similarity = nn.functional.cosine_similarity(text_1, text_2, dim=0, eps=1e-6)
    return similarity.detach().numpy()

In [None]:
def get_flair_embedding_batch(sent_list):
    """Embed every text in ``sent_list`` with the Flair forward model.

    Parameters
    ----------
    sent_list : list of str
        Texts to embed, one output row per entry.

    Returns
    -------
    torch.Tensor
        Floating-point tensor of shape
        ``(len(sent_list), flair_embedding_forward.embedding_length)`` whose
        row ``i`` is ``get_flair_embedding(sent_list[i])``.
    """
    # torch.zeros always allocates a floating tensor. torch.full with an
    # integer fill_value infers an integer dtype on recent PyTorch releases,
    # which would silently truncate every embedding written into the rows.
    tensor_array = torch.zeros(len(sent_list), flair_embedding_forward.embedding_length)
    # Wrap the list itself (not the enumerate object) so the progress bar
    # knows the total number of items.
    for i, sent in enumerate(tqdm_notebook(sent_list)):
        # Reuse the single-text helper instead of repeating its averaging loop.
        tensor_array[i] = get_flair_embedding(sent)
    return tensor_array

## Embeddings - BERT

In [None]:
from flair.embeddings import BertEmbeddings

In [None]:
# Instantiate flair's BertEmbeddings with its default arguments once; the
# BERT helper functions below read this module-level object.
bert_embedding = BertEmbeddings()

In [None]:
bert_embedding.embedding_length

In [None]:
def get_bert_embedding(sent):
    """Embed ``sent`` with the BERT embedder and average over its tokens.

    Parameters
    ----------
    sent : str
        Raw text; it is wrapped in a flair ``Sentence`` before embedding.

    Returns
    -------
    torch.Tensor
        1-D tensor of length ``bert_embedding.embedding_length`` holding the
        mean of the token embeddings. A text that produces no tokens yields
        an all-zero vector instead of the NaNs that dividing by a zero token
        count would give.
    """
    sentence = Sentence(sent)
    token_count = len(sentence)
    token_sum = torch.zeros(bert_embedding.embedding_length)
    if token_count == 0:
        # Nothing to embed or average.
        return token_sum
    bert_embedding.embed(sentence)
    for token in sentence:
        token_sum += token.embedding
    return token_sum / token_count

In [None]:
get_bert_embedding("is this it").shape

In [None]:
def get_bert_embedding_batch(sent_list):
    """Embed every text in ``sent_list`` with the BERT embedder.

    Parameters
    ----------
    sent_list : list of str
        Texts to embed, one output row per entry.

    Returns
    -------
    torch.Tensor
        Floating-point tensor of shape
        ``(len(sent_list), bert_embedding.embedding_length)`` whose row ``i``
        is ``get_bert_embedding(sent_list[i])``.
    """
    # torch.zeros always allocates a floating tensor. torch.full with an
    # integer fill_value infers an integer dtype on recent PyTorch releases,
    # which would silently truncate every embedding written into the rows.
    tensor_array = torch.zeros(len(sent_list), bert_embedding.embedding_length)
    # Wrap the list itself (not the enumerate object) so the progress bar
    # knows the total number of items.
    for i, sent in enumerate(tqdm_notebook(sent_list)):
        tensor_array[i] = get_bert_embedding(sent)
    return tensor_array

In [None]:
# NOTE: this cell reads nips_2018_dataframe['clean_field'], which is only
# built further down (in the section that crawls the paper pages), so on a
# fresh kernel it cannot run until those cells have been executed.
# detach() drops any autograd history from the stacked embeddings.
embeddings_papers_bert = get_bert_embedding_batch(list(nips_2018_dataframe['clean_field'])).detach()

In [None]:
# Persist the dataframe together with its BERT embeddings as one tuple; the
# loading section reads this file back when embed_type is 'bert'.
with open("nips_2018_bert.pkl", "wb") as f:
    pickle.dump((nips_2018_dataframe,embeddings_papers_bert),f)

## Getting Abstracts w/ BeautifulSoup

In [None]:
# Single paper page used to try out get_title_abstract() below.
# Note: this rebinds the `url` name used earlier for the accepted-papers page.
url = 'http://papers.nips.cc/paper/7288-kalman-normalization-normalizing-internal-representations-across-network-layers'

In [None]:
def get_title_abstract(url, timeout=30):
    """Download one paper page and extract its title and abstract text.

    Parameters
    ----------
    url : str
        Address of the paper page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long crawl. Defaults to 30.

    Returns
    -------
    tuple of (str, str)
        Text of the first element with class ``subtitle`` and of the first
        element with class ``abstract``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is
        never parsed as if it were a paper.
    AttributeError
        If the page contains no ``subtitle`` or ``abstract`` element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.find(attrs={'class' : 'subtitle'})
    abstract = soup.find(attrs={'class' : 'abstract'})
    return title.get_text(), abstract.get_text()

In [None]:
get_title_abstract(url)

## Get all links for a given NIPS year

In [None]:
# Proceedings index page from which the individual paper links are collected.
links_url = 'http://papers.nips.cc/book/advances-in-neural-information-processing-systems-31-2018'

In [None]:
# A timeout prevents the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an
# error page.
response = requests.get(links_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# Container element whose anchors are scanned for paper links in the next cell.
main_wrapper = soup.find(attrs={'class' : 'main wrapper clearfix'})

In [None]:
# Keep every anchor inside the main wrapper whose markup mentions '/paper/'
# and prefix its href with the site host.
paper_urls = [
    "http://papers.nips.cc" + anchor['href']
    for anchor in main_wrapper.find_all('a', href=True)
    if '/paper/' in str(anchor)
]
print("Total papers in 2018: {}".format(len(paper_urls)))
paper_urls[0:5]

In [None]:
# Download every paper page and store its title/abstract under the paper's
# position in paper_urls.
nips_2018 = {}
for paper_idx, paper_url in enumerate(tqdm_notebook(paper_urls)):
    title_text, abstract_text = get_title_abstract(paper_url)
    nips_2018[paper_idx] = {"title": title_text, "abstract": abstract_text}
nips_2018[2]

In [None]:
# One row per crawled paper: the dict keys become the index and the inner
# dicts supply the 'title' and 'abstract' columns.
nips_2018_dataframe = pd.DataFrame.from_dict(nips_2018, orient='index')
nips_2018_dataframe.tail()

In [None]:
# Text that gets embedded: title and abstract joined by a space, then
# normalised with clean_title().
combined_text = nips_2018_dataframe['title'] + ' ' + nips_2018_dataframe['abstract']
nips_2018_dataframe['clean_field'] = combined_text.apply(clean_title)
# Spot-check one cleaned entry.
nips_2018_dataframe.loc[1008, 'clean_field']

In [None]:
# Flair embedding of every paper's cleaned title+abstract, one row per paper;
# detach() drops any autograd history before the tensor is stored.
embeddings_papers = get_flair_embedding_batch(list(nips_2018_dataframe['clean_field'])).detach()

In [None]:
# Persist the dataframe together with its Flair embeddings as one tuple; the
# loading section reads this file back when embed_type is 'flair'.
with open("nips_2018.pkl", "wb") as f:
    pickle.dump((nips_2018_dataframe,embeddings_papers),f)

# Embeddings - Using ELMo

In [None]:
## AllenNLP
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids

In [None]:
!ls ../../../vectors/

In [None]:
# Relative paths to the ELMo weights and options files.
elmo_weights_key = '../../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_config_key = '../../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
# A single output representation is requested. get_elmo() calls elmo.eval()
# before each forward pass, so the dropout configured here should not be
# active while embedding (NOTE(review): confirm this is the intent).
elmo = Elmo(elmo_config_key, elmo_weights_key, num_output_representations = 1, dropout=0.3, requires_grad = False)

In [None]:
def get_elmo(sent):
    """Embed ``sent`` with ELMo and return a unit-length mean token vector.

    Parameters
    ----------
    sent : str
        Raw text; it is split on whitespace into tokens.

    Returns
    -------
    torch.Tensor
        1-D CPU tensor: the token representations averaged over the sentence
        and divided by their L2 norm. It carries no autograd history.
    """
    elmo.eval()  # inference mode for the module
    tokens = [sent.split()]
    character_ids = batch_to_ids(tokens)
    # Put the ids on whatever device the model lives on. Moving them to CUDA
    # merely because a GPU is present fails with a device mismatch when the
    # model itself was left on the CPU.
    model_device = next(elmo.parameters()).device
    character_ids = character_ids.to(model_device)
    # No gradients are needed for feature extraction; skipping them keeps the
    # batch helper from retaining one autograd graph per paper.
    with torch.no_grad():
        embeddings = elmo(character_ids)
    rep = embeddings['elmo_representations'][0].squeeze(dim=0)
    avg = rep.mean(dim=0)
    avg = avg / torch.norm(avg)
    # Return on the CPU so the vector can be compared with stored embeddings
    # and converted to NumPy regardless of where the model ran.
    return avg.cpu()

In [None]:
def get_elmo_embedding_batch(sent_list):
    """Embed every text in ``sent_list`` with ELMo.

    Parameters
    ----------
    sent_list : list of str
        Texts to embed, one output row per entry.

    Returns
    -------
    torch.Tensor
        Floating-point tensor of shape
        ``(len(sent_list), elmo.get_output_dim())`` whose row ``i`` is
        ``get_elmo(sent_list[i])``.
    """
    # torch.zeros always allocates a floating tensor. torch.full with an
    # integer fill_value infers an integer dtype on recent PyTorch releases,
    # which would silently truncate every embedding written into the rows.
    tensor_array = torch.zeros(len(sent_list), elmo.get_output_dim())
    # Wrap the list itself (not the enumerate object) so the progress bar
    # knows the total number of items.
    for i, sent in enumerate(tqdm_notebook(sent_list)):
        tensor_array[i] = get_elmo(sent)
    return tensor_array

In [None]:
# ELMo embedding of every paper's cleaned title+abstract, one row per paper.
# Unlike the Flair and BERT runs, the result is not detached here.
embeddings_papers_elmo = get_elmo_embedding_batch(list(nips_2018_dataframe['clean_field']))

In [None]:
embeddings_papers_elmo[0]

In [None]:
# Persist the dataframe together with its ELMo embeddings as one tuple; the
# loading section reads this file back when embed_type is 'elmo'.
with open("nips_2018_elmo.pkl", "wb") as f:
    pickle.dump((nips_2018_dataframe,embeddings_papers_elmo),f)

## Loading Dataframe

In [None]:
!ls -lah

In [None]:
# Which embedding to work with from here on. The value selects the pickle
# loaded below and the embedder calculate_nearest_paper() applies to the
# query text, so the two always match.
embed_type = 'bert'
# Pickle file written by the corresponding embedding section above.
file_embeds = {
    'elmo':   'nips_2018_elmo.pkl',
    'flair':  'nips_2018.pkl',
    'bert':   'nips_2018_bert.pkl'
}

In [None]:
# Replaces nips_2018_dataframe and embeddings_papers with the saved pair for
# the chosen embedding type.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open(file_embeds[embed_type], "rb") as f:
    nips_2018_dataframe, embeddings_papers = pickle.load(f)

In [None]:
# The ELMo embeddings are pickled without detach(), so a loaded tensor may
# still require grad; detach it so the later .numpy() conversions work.
if embeddings_papers.requires_grad:
    embeddings_papers = embeddings_papers.detach()

In [None]:
embeddings_papers.shape

In [None]:
nips_2018_dataframe.head()

In [None]:
def plot_nearest_papers(nearest_dataframe, perplexity = 20, random_state = None):
    """Plot all papers in a 2-D t-SNE projection and highlight a selection.

    The projection is fitted on the module-level ``embeddings_papers`` every
    time the function is called.

    Parameters
    ----------
    nearest_dataframe : pandas.DataFrame
        Needs an ``'index'`` column holding row positions into
        ``embeddings_papers`` and a ``'Paper Title'`` column with the label
        to draw next to each highlighted point.
    perplexity : float, optional
        Passed to ``TSNE``.
    random_state : int or None, optional
        Passed to ``TSNE``. Give an integer to get the same layout on every
        call; the default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    ## Do TSNE and plot
    tsne = TSNE(n_components=2, perplexity=perplexity, method='barnes_hut',
                verbose=1, random_state=random_state)
    sentences_tsne = tsne.fit_transform(embeddings_papers.numpy())
    # Pick the projected coordinates of the selected papers in one indexing
    # step; the explicit int conversion also covers object-typed columns.
    selected_rows = np.asarray(list(nearest_dataframe['index']), dtype=int)
    nearest_array = sentences_tsne[selected_rows].astype('float32')

    ## Plotting and adding label
    fig, ax = plt.subplots(figsize=(30, 15))
    ax.grid()
    ax.scatter(sentences_tsne[:, 0], sentences_tsne[:, 1], c='y', marker='o')
    ax.scatter(nearest_array[:, 0], nearest_array[:, 1], c='r', marker='x', s=30)
    # The font settings are the same for every label, so set them once
    # instead of on each loop iteration.
    plt.rc('font', size=16, weight='normal')
    for label, x, y in zip(nearest_dataframe['Paper Title'], nearest_array[:, 0], nearest_array[:, 1]):
        ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset pixels')
    return None

In [None]:
### Get distance function
def calculate_nearest_paper(query_text, topn=5, query_type='full'):
    """Rank the stored papers by cosine similarity to ``query_text``.

    The query is embedded with the embedder selected by the module-level
    ``embed_type`` and compared against every row of ``embeddings_papers``.

    Parameters
    ----------
    query_text : str
        Text to search with.
    topn : int, optional
        Number of best matches to return.
    query_type : str, optional
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        At most ``topn`` rows, most similar first, with the columns
        ``'Paper Title'`` (title text up to its first '.'), ``'Distance'``
        (the cosine similarity, higher is closer) and ``'index'`` (row
        position of the paper in ``embeddings_papers``).

    Raises
    ------
    ValueError
        If ``embed_type`` is not one of 'flair', 'elmo' or 'bert'.
    """
    # An if/elif chain only touches the selected embedder, so the other
    # embedding sections of the notebook need not have been run.
    if embed_type == 'flair':
        query_embedding = get_flair_embedding(query_text)
    elif embed_type == 'elmo':
        query_embedding = get_elmo(query_text)
    elif embed_type == 'bert':
        query_embedding = get_bert_embedding(query_text)
    else:
        # Previously an unknown value left query_embedding unbound and failed
        # later with a confusing name error.
        raise ValueError("Unknown embed_type: {!r}".format(embed_type))

    # Collect all rows first and build the frame once; growing a DataFrame
    # row by row re-allocates it on every insertion.
    records = []
    for i, title in enumerate(nips_2018_dataframe['title']):
        dist = get_cosine_distance(query_embedding, embeddings_papers[i])
        records.append((title.split('.')[0], float(dist), i))
    text_distances = pd.DataFrame(records, columns=['Paper Title', 'Distance', 'index'])
    ranked = text_distances.sort_values(by=['Distance'], ascending=False).reset_index(drop=True)
    return ranked[0:topn]

In [None]:
# Number of papers to retrieve, both for the nearest-neighbour query and for
# the diverse (DPP) sample further down.
num_papers = 10

In [None]:
# Pick a random paper (no seed is set, so the choice differs between runs)
# and use its own cleaned text as the query.
ix = np.random.choice(len(nips_2018_dataframe))
print("{} - {}".format(ix, nips_2018_dataframe['title'][ix]))
nearest_papers = calculate_nearest_paper(nips_2018_dataframe['clean_field'][ix], topn=num_papers)
nearest_papers

In [None]:
plot_nearest_papers(nearest_papers, perplexity = 4)

## DPP papers

In [None]:
from numpy.random import rand, randn
from scipy.linalg import qr
from numpy import linalg as LA
from dppy.finite_dpps import FiniteDPP

In [None]:
def get_diverse_papers(samp_size, total_samp = 10):
    """Sample a diverse set of papers with a k-DPP over the paper embeddings.

    The likelihood kernel is the Gram matrix of the unit-normalised rows of
    the module-level ``embeddings_papers``. Entry (i, j) is therefore the
    cosine similarity of papers i and j, and the matrix is symmetric positive
    semi-definite, as a DPP likelihood kernel has to be. The previous kernel
    was assembled from the real parts of the complex eigenvalues of the QR
    factor Q; those lie in [-1, 1], so the result was neither positive
    semi-definite nor a similarity between papers.

    Parameters
    ----------
    samp_size : int
        Number of papers in each sample.
    total_samp : int, optional
        Number of independent k-DPP samples to draw; one of them is then
        picked uniformly at random and returned.

    Returns
    -------
    pandas.DataFrame
        One row per selected paper with the columns ``'Paper Title'`` and
        ``'index'`` (the paper's row position in ``embeddings_papers``).
    """
    features = embeddings_papers.numpy().astype('float64')
    norms = LA.norm(features, axis=1, keepdims=True)
    # Leave all-zero rows untouched instead of dividing by zero.
    norms[norms == 0] = 1.0
    unit_features = features / norms
    likelihood_kernel = unit_features.dot(unit_features.T)
    DPP = FiniteDPP(kernel_type='likelihood', **{'L': likelihood_kernel})

    DPP.flush_samples()
    for _ in range(total_samp):
        DPP.sample_exact_k_dpp(size=samp_size)
    rand_sample = np.random.choice(len(DPP.list_of_samples))
    diverse = DPP.list_of_samples[rand_sample]
    # Build the frame in one go rather than appending row by row.
    rows = [(nips_2018_dataframe['title'][ix], ix) for ix in diverse]
    return pd.DataFrame(rows, columns=['Paper Title', 'index'])

In [None]:
# Draw a diverse set with the same size as the nearest-neighbour result so
# the two selections can be compared side by side.
diverse_papers_df = get_diverse_papers(num_papers)
diverse_papers_df

In [None]:
plot_nearest_papers(diverse_papers_df, perplexity = 10)

In [None]:
plot_nearest_papers(nearest_papers, perplexity = 10)

In [None]:
diverse_papers_df

In [None]:
nearest_papers