In [1]:
import os
import json
import pandas as pd
import numpy as np
from zipfile import ZipFile
import time

# Preprocessing imports
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer

# Used to create the dense document vectors and Faiss index.
import torch
import transformers
import faiss

from sklearn.metrics.pairwise import euclidean_distances

Dataset -->  Collection of 31000+ paper meta data. Data contains all paper related to ML, CL, NER, AI and CV field publish between 1992 to 2018-Feb. (researches about computer science all around the world.)

In [2]:
df = pd.read_json("arxivData.json")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41000 entries, 0 to 40999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   41000 non-null  object
 1   day      41000 non-null  int64 
 2   id       41000 non-null  object
 3   link     41000 non-null  object
 4   month    41000 non-null  int64 
 5   summary  41000 non-null  object
 6   tag      41000 non-null  object
 7   title    41000 non-null  object
 8   year     41000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 2.8+ MB


**Preprocessing**

In [4]:
# List of stop words
add_stop = ['said', 'say', 'like']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

# List of punctuation
punc = list(set(string.punctuation))

In [5]:
# Splits words on white spaces (leaves contractions intact) and splits out
# trailing punctuation
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def process_text(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    #text = [expandContractions(each, c_re=c_re) for each in text]
    text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return str(" ".join(text))

In [6]:
# Funtion to remove duplicate words
def unique_words(text): 
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    return ulist


def word_count(text):
    return len(str(text).split(' '))

In [7]:
# Helper function to create batches to iterate over
def batch(data, batch_size):
    length = len(data)
    for i in range(0, length, batch_size):
        yield data[i:min(i + batch_size, length)]

In [8]:
# apply preprocessing
df['processed_text'] = df['summary'].apply(process_text)

In [9]:
df['summary'] = df['summary'].str.slice(0, 512)
df['processed_text'] = df['processed_text'].str.slice(0, 512)
df = df.dropna()


### **DistilBert and SciBert**

In [10]:
DISTILBERT = 'distilbert-base-uncased'
SCIBERT = 'allenai/scibert_scivocab_uncased'

#load the models
disbert_model = transformers.AutoModel.from_pretrained(DISTILBERT)
scibert_model = transformers.AutoModel.from_pretrained(SCIBERT)
disbert_model.eval()
scibert_model.eval()

#create a transformer tokenizer for BERT using both models
disbert_tokenizer = transformers.AutoTokenizer.from_pretrained(DISTILBERT)
scibert_tokenizer = transformers.AutoTokenizer.from_pretrained(SCIBERT)

# use the cuda if one is available
if torch.cuda.is_available():
  disbert_model = disbert_model.to(torch.device("cuda"))
  scibert_model = scibert_model.to(torch.device("cuda"))

print(disbert_model.device)
print(scibert_model.device)

cuda:0
cuda:0


In [11]:
def get_embeddings(model, tokenizer, data):
    embeddings = []
    for ba in batch(data, 1000):
        # tokenize the summaries and load it to the cuda
        tokenized = tokenizer(ba, padding=True, truncation=True, max_length=512, return_tensors="pt")

        if torch.cuda.is_available():
          tokenized = tokenized.to(torch.device("cuda"))

        # classify the text with the model
        with torch.no_grad():
           output = model(**tokenized)

        # extract the embeddings from the last layer of the model
        for item in output[0]:
          embeddings.append(item[0,:].cpu().detach().numpy())
    return embeddings

In [12]:
def get_embedding(model, tokenizer, text):
    # tokenize the summaries and load it to the cuda
    tokenized = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

    if torch.cuda.is_available():
      tokenized = tokenized.to(torch.device("cuda"))

    # classify the text with the model
    with torch.no_grad():
       output = model(**tokenized)
    # extract the embeddings from the last layer of the model
    return output[0][0][0,:].cpu().detach().numpy().reshape(1, 768)

In [13]:
df['disbert_embeddings'] = get_embeddings(disbert_model, disbert_tokenizer, df['summary'].to_list())

In [14]:
df['scibert_embeddings'] = get_embeddings(scibert_model, scibert_tokenizer, df['summary'].to_list())

In [15]:
df['disbert_embeddings_pre'] = get_embeddings(disbert_model, disbert_tokenizer, df['processed_text'].to_list())

In [16]:
df['scibert_embeddings_pre'] = get_embeddings(scibert_model, scibert_tokenizer, df['processed_text'].to_list())

**Faiss**

In [17]:
def create_faiss_index(embeddings):
    # change the datatype of the embeddings
    embeddings = np.array(embeddings).astype("float32")
    # Initialize the index with the correct dimensions
    index = faiss.IndexFlatL2(embeddings.shape[1])
    # Add embeddings to the faiss index
    index.add(embeddings)
    print(f"Number of vectors in the Faiss index: {index.ntotal}")
    return index

In [18]:
def do_top_k_search(embeddings, query_text, model, tokenizer, top_k, verbose=True):
    #Compute Embedding of query text
    embedding_q = get_embedding(model, tokenizer, query_text)
    
    #Search and time of process
    st = time.time()
    distances = []
    for em in embeddings:
        
        em = np.expand_dims(em, axis = 0)
        distances.append(euclidean_distances(em, embedding_q)[0][0])
        
    top_k_arguments = np.argsort(np.array(distances))[:top_k]
    dis = np.array(distances)
    et = time.time()
    if verbose:
        print(f'L2 distance: {dis[top_k_arguments].tolist()}\n\nPaper Indexes: {top_k_arguments.tolist()}')
    return et - st, top_k_arguments

In [19]:
def do_faiss_search(index, model, tokenizer, query_text, k, verbose=True):
    # Retrieve the 10 nearest neighbours
    # D = distance, I= index
    st = time.time()
    D, I = index.search(get_embedding(model, tokenizer, query_text), k=k)
    et = time.time()
    if verbose:
        print(f'L2 distance: {D.flatten().tolist()}\n\nPaper indexes: {I.flatten().tolist()}')
    return et - st, I

In [20]:
disbert_index = create_faiss_index(df['disbert_embeddings'].to_list())

Number of vectors in the Faiss index: 41000


In [21]:
scibert_index = create_faiss_index(df['scibert_embeddings'].to_list())

Number of vectors in the Faiss index: 41000


In [22]:
disbert_pre_index = create_faiss_index(df['disbert_embeddings_pre'].to_list())

Number of vectors in the Faiss index: 41000


In [23]:
scibert_pre_index = create_faiss_index(df['scibert_embeddings_pre'].to_list())

Number of vectors in the Faiss index: 41000


In [24]:
sample_list = [9005, 25945, 25249, 13395, 1531, 31107, 1268, 18395, 36071, 13256]
models_tokenizer = [("disbert", disbert_model, disbert_tokenizer), ("scibert", scibert_model, scibert_tokenizer)]
embeddings = [[("no", disbert_index, df['disbert_embeddings'].to_list()), ("yes", disbert_pre_index, df['disbert_embeddings_pre'].to_list())], 
              [("no", scibert_index, df['scibert_embeddings'].to_list()), ("yes", scibert_pre_index, df['scibert_embeddings_pre'].to_list())]]

In [25]:
results_by_sample = {}
if len(models_tokenizer) == len(embeddings):
    for sample in sample_list:
        sample_results = {}
        for i, (model_name, model, tokenizer) in enumerate(models_tokenizer):
            pre_results = {}
            for preprocessing, index, embedding_list in embeddings[i]:
                
                faiss_time, I = do_faiss_search(index, model, tokenizer, df.iloc[sample].summary, 6, verbose=False)
                
                top_k_time, indexes_top = do_top_k_search(embedding_list, df.iloc[sample].summary, model, tokenizer, 6, verbose=False)
                
                pre_results[preprocessing] = {
                    "faiss": {
                        "query": sample,
                        "index": I,
                        "time": faiss_time
                    },
                    "top_k": {
                        "query": sample,
                        "index": indexes_top,
                        "time": top_k_time
                    }
                }
            sample_results[model_name] = pre_results
        results_by_sample[sample] = sample_results

In [26]:
def index2details(df, I, column="title", faiss=True):
    # Prints the paper titles based on the paper indexes.
    if faiss:
        I = I[0]
    for i, idx in enumerate(I):
        print("\t\t\t{}. {}".format(i, df.iloc[idx][column]))
    print()

In [27]:
faiss_avg_time = []
top_k_avg_time = []
for sample, sample_values in results_by_sample.items():
    print("\033[1mSample title: {}\033[0m".format(df.iloc[sample]["title"]))
    for model, model_values in sample_values.items():
        print("\033[1m\tModel:", model, "\033[0m")
        for preprocessing, preprocessing_values in model_values.items():
            print("\033[1m\t\tPreprocessing?", preprocessing, "\033[0m")
            
            faiss_avg_time.append(preprocessing_values["faiss"]["time"])
            top_k_avg_time.append(preprocessing_values["top_k"]["time"])
            
            if all(preprocessing_values["faiss"]["index"][0] == preprocessing_values["top_k"]["index"]):
                print("\033[1m\t\t\tFaiss and top k:", "\033[0m")
                index2details(df, preprocessing_values["faiss"]["index"])
            else:
                print("\033[1m\t\t\tFaiss:", "\033[0m")
                index2details(df, preprocessing_values["faiss"]["index"])
                print("\033[1m\t\t\tTop k:", "\033[0m")
                index2details(df, preprocessing_values["top_k"]["index"], faiss=False)
            

faiss_avg_time = sum(faiss_avg_time) / len(faiss_avg_time)
top_k_avg_time = sum(top_k_avg_time) / len(top_k_avg_time)

print("Faiss avg time:", faiss_avg_time)
print("Top k avg time:", top_k_avg_time)

[1mSample title: Temporal Topic Modeling to Assess Associations between News Trends and
  Infectious Disease Outbreaks[0m
[1m	Model: disbert [0m
[1m		Preprocessing? no [0m
[1m			Faiss and top k: [0m
			0. Temporal Topic Modeling to Assess Associations between News Trends and
  Infectious Disease Outbreaks
			1. Identifying Patterns of Associated-Conditions through Topic Models of
  Electronic Medical Records
			2. NegBio: a high-performance tool for negation and uncertainty detection
  in radiology reports
			3. Application of multiview techniques to NHANES dataset
			4. On the Ground Validation of Online Diagnosis with Twitter and Medical
  Records
			5. Fast clustering for scalable statistical analysis on structured images

[1m		Preprocessing? yes [0m
[1m			Faiss and top k: [0m
			0. Fractionally Predictive Spiking Neurons
			1. Comment on "Biologically inspired protection of deep networks from
  adversarial attacks"
			2. TrueLabel + Confusions: A Spectrum of Probabilist