# Project: Semantic Search with Transformers

## Task 1: Import the Libraries

In [1]:
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn import preprocessing
import numpy as np
import json

import os

## Task 2: Load the Data

In [2]:
from torch.utils.data import Dataset, DataLoader

In [3]:
with open("arxivData.json", 'r') as f:
    data = json.load(f)

In [4]:
class ArxivDataset(Dataset):
    
    def __init__(self, fpath, transform=None):
        super().__init__()
        self.transform = transform
        with open(fpath, 'r') as f:
            self.data = json.load(f)
        self.id2idx = preprocessing.LabelEncoder()
        self.id2idx.fit_transform([self.data[idx]['id'] for idx in range(len(self.data))])
    
    def __len__(self):

        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        
        arxiv_id = item['id'] 
        author = item['author']
        year, month, day = item['year'], item['month'], item['day']
        link = item['link']
        summary = item['summary']
        tag = item['tag']

        if self.transform:
            inputs = self.transform(summary)
        else:
            inputs = summary

        return inputs, arxiv_id, author, year, month, day, link, tag

In [5]:
dataset = ArxivDataset(fpath="arxivData.json")
data_loader = DataLoader(dataset, batch_size=16)

## Task 3: Retrieve the Model

In [6]:
model = SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')

In [7]:
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))

print(model.device)

mps:0


## Task 4: Generate or Load the Embeddings

In [8]:
embeddings = model.encode(next(iter(data_loader))[0][0])

In [9]:
embeddings.shape[0]

768

## Task 5: Data Preparation and Helper Methods

In [10]:
#dataset.id2idx.transform(next(iter(data_loader))[1])

In [44]:
nth_b = 0
embed_list = []
idx_list = []

for b in data_loader:
    
    nth_b += 1
    embed_list.append(model.encode(b[0]))
    idx_list.append(dataset.id2idx.transform(b[1]))

    if nth_b % 5000 == 0:
        print(f"current batch number = {nth_b}")
        break

In [45]:
xb = np.concatenate(embed_list)
idxes = np.concatenate(idx_list)

In [46]:
import dill as pkl

#with open('xb.pkl', 'wb') as f:
#    pkl.dump(xb, f)

#with open('xb.pkl', 'rb') as f:
#    xb = pkl.load(f)

In [47]:
xb.shape

(41000, 768)

## Task 6: Set up the Index

In [48]:
d = 768
nb = xb.shape[0]
nq = nb // 100
nlists = 100
metric = faiss.METRIC_INNER_PRODUCT

In [49]:
quantizer = faiss.IndexFlatIP(d)
faiss_index = faiss.IndexIVFFlat(quantizer, d, nlists, metric)
# id map wrap
faiss_index = faiss.IndexIDMap(faiss_index)

# normalize vectors
faiss.normalize_L2(xb)

faiss_index.train(xb)
faiss_index.add_with_ids(xb, idxes)

#faiss_index = faiss.IndexFlatIP(d)
#faiss_index.add(xb[:100])



## Task 7: Search with a Summary

In [50]:
k = 10

D, I = faiss_index.search(xb[4:7], k)

print(D)
print(I)

[[0.9999998  0.98586136 0.77610016 0.7680489  0.7616006  0.75104433
  0.7494104  0.7444728  0.73899674 0.73893905]
 [1.         0.8435317  0.8281011  0.82043254 0.8179097  0.8083361
  0.80728793 0.80710757 0.8039176  0.8035967 ]
 [0.9999999  0.98586136 0.76494646 0.76281214 0.75649136 0.7482132
  0.74329126 0.7393939  0.73532283 0.7311379 ]]
[[31468 36310 23379 36818 32305  7352 32303 40505  8559 16827]
 [32183 13544 23507 17366 21982  9574 29730 14475 40846 38761]
 [36310 31468 36818 23379 32303 32305 40505  7352 18184 16827]]


In [51]:
for ith in I:
    print(dataset.id2idx.inverse_transform(ith))
    print(dataset.__getitem__(ith[0]))

['1709.02349v2' '1801.06700v1' '1612.03929v5' '1802.01433v2'
 '1709.10431v1' '1303.2651v2' '1709.10423v1' 'cs/0605036v1' '1307.3091v1'
 '1511.06342v4']
('Recurrent neural networks (RNNs) have been widely used for processing\nsequential data. However, RNNs are commonly difficult to train due to the\nwell-known gradient vanishing and exploding problems and hard to learn\nlong-term patterns. Long short-term memory (LSTM) and gated recurrent unit\n(GRU) were developed to address these problems, but the use of hyperbolic\ntangent and the sigmoid action functions results in gradient decay over layers.\nConsequently, construction of an efficiently trainable deep network is\nchallenging. In addition, all the neurons in an RNN layer are entangled\ntogether and their behaviour is hard to interpret. To address these problems, a\nnew type of RNN, referred to as independently recurrent neural network\n(IndRNN), is proposed in this paper, where neurons in the same layer are\nindependent of each othe

In [52]:
print(xb[4:7])

[[-0.04065348  0.04045936  0.03203844 ...  0.02620858 -0.03878679
   0.04875945]
 [-0.03931895  0.01726937  0.01561792 ... -0.01546924 -0.03887791
   0.02976324]
 [-0.0304892   0.03507083  0.03469085 ...  0.02206326 -0.03384019
   0.04024854]]


In [58]:
model.encode(dataset.__getitem__(4)[0])[:5]

array([-0.5476006 ,  0.5449859 ,  0.43155602,  0.02201444,  0.20519091],
      dtype=float32)

In [53]:
print(dataset.__getitem__(4)[0])

We present MILABOT: a deep reinforcement learning chatbot developed by the
Montreal Institute for Learning Algorithms (MILA) for the Amazon Alexa Prize
competition. MILABOT is capable of conversing with humans on popular small talk
topics through both speech and text. The system consists of an ensemble of
natural language generation and retrieval models, including template-based
models, bag-of-words models, sequence-to-sequence neural network and latent
variable neural network models. By applying reinforcement learning to
crowdsourced data and real-world user interactions, the system has been trained
to select an appropriate response from the models in its ensemble. The system
has been evaluated through A/B testing with real-world users, where it
performed significantly better than many competing systems. Due to its machine
learning architecture, the system is likely to improve with additional data.


In [54]:
print(dataset.__getitem__(I[0][0])[0])

Recurrent neural networks (RNNs) have been widely used for processing
sequential data. However, RNNs are commonly difficult to train due to the
well-known gradient vanishing and exploding problems and hard to learn
long-term patterns. Long short-term memory (LSTM) and gated recurrent unit
(GRU) were developed to address these problems, but the use of hyperbolic
tangent and the sigmoid action functions results in gradient decay over layers.
Consequently, construction of an efficiently trainable deep network is
challenging. In addition, all the neurons in an RNN layer are entangled
together and their behaviour is hard to interpret. To address these problems, a
new type of RNN, referred to as independently recurrent neural network
(IndRNN), is proposed in this paper, where neurons in the same layer are
independent of each other and they are connected across layers. We have shown
that an IndRNN can be easily regulated to prevent the gradient exploding and
vanishing problems while allowing 

In [55]:
print(dataset.__getitem__(I[0][4])[0])

This paper concerns the distributed training of nonlinear kernel machines on
Map-Reduce. We show that a re-formulation of Nystr\"om approximation based
solution which is solved using gradient based techniques is well suited for
this, especially when it is necessary to work with a large number of basis
points. The main advantages of this approach are: avoidance of computing the
pseudo-inverse of the kernel sub-matrix corresponding to the basis points;
simplicity and efficiency of the distributed part of the computations; and,
friendliness to stage-wise addition of basis points. We implement the method
using an AllReduce tree on Hadoop and demonstrate its value on a few large
benchmark datasets.


In [56]:
print(dataset.__getitem__(I[0][-1])[0])

Idea Density (ID) measures the rate at which ideas or elementary predications
are expressed in an utterance or in a text. Lower ID is found to be associated
with an increased risk of developing Alzheimer's disease (AD) (Snowdon et al.,
1996; Engelman et al., 2010). ID has been used in two different versions:
propositional idea density (PID) counts the expressed ideas and can be applied
to any text while semantic idea density (SID) counts pre-defined information
content units and is naturally more applicable to normative domains, such as
picture description tasks. In this paper, we develop DEPID, a novel
dependency-based method for computing PID, and its version DEPID-R that enables
to exclude repeating ideas---a feature characteristic to AD speech. We conduct
the first comparison of automatically extracted PID and SID in the diagnostic
classification task on two different AD datasets covering both closed-topic and
free-recall domains. While SID performs better on the normative dataset,