In [3]:
import pandas as pd
import numpy as np
import os

import json
import re

In [1]:
# Artifacts produced and saved in Part 1:
#   - the gzip-compressed CSV holding the dataframe
#   - the .npz archive holding the embeddings array
PATH_TO_DF = 'compressed_dataframe.csv.gz'
PATH_TO_EMBEDS = 'compressed_array.npz'

In [None]:
def run_faiss_search(query_text, top_k):
    """Return the FAISS ids of the stored embeddings nearest to a query.

    The query is embedded with the notebook-level `model` and looked up in
    the notebook-level `faiss_index`.

    Args:
        query_text: The search query as a single string.
        top_k: Number of nearest neighbours to request from the index.

    Returns:
        A 1-D numpy array holding at most `top_k` ids, in the order the
        index returned them. Padding ids (-1) are removed.
    """
    # encode() works on a batch, so wrap the single query in a list
    query_embedding = model.encode([query_text])

    # search() returns (scores, ids) with one row per query; the scores are
    # not needed because the results are re-scored by the cross-encoder later
    _, index_vals = faiss_index.search(query_embedding, top_k)

    # Only one query was submitted, so take the first (and only) row
    index_vals_list = index_vals[0]

    # FAISS fills unused result slots with -1 when it finds fewer than top_k
    # neighbours. Drop them here: used as a Python/pandas position, -1 would
    # silently select the LAST row instead of failing.
    return index_vals_list[index_vals_list >= 0]
    

def run_rerank(index_vals_list, query_text):
    """Re-rank retrieved passages with the cross-encoder.

    Reads the notebook-level `df_data` (columns 'prepared_text', 'id',
    'cat_text', 'title') and scores each retrieved passage against the query
    with the notebook-level `cross_encoder`.

    Args:
        index_vals_list: Row positions into `df_data`, as returned by
            `run_faiss_search`. Negative values (FAISS "no result" padding)
            are ignored.
        query_text: The search query string.

    Returns:
        A list of dicts, best cross-encoder score first, each with the keys
        'arxiv_id', 'link_to_pdf', 'cat_text', 'title' and 'abstract'.
        Returns an empty list when there is nothing to re-rank.
    """
    # Discard FAISS padding ids (-1); as a position, -1 would pick the last row
    hit_positions = [int(pos) for pos in index_vals_list if pos >= 0]

    if not hit_positions:
        return []

    # Fetch only the retrieved rows. Converting the whole 'prepared_text'
    # column to a list on every query is needlessly expensive for a frame
    # with millions of rows.
    hits = df_data.iloc[hit_positions]

    # The cross-encoder expects a list of [query, passage] pairs:
    # [[query_text, pred_text1], [query_text, pred_text2], ...]
    cross_input_list = [[query_text, passage] for passage in hits['prepared_text']]

    # Score every retrieved passage against the query
    cross_scores = cross_encoder.predict(cross_input_list)

    # Attach the scores and order the hits from most to least relevant
    ranked = hits.assign(cross_scores=cross_scores).sort_values(
        by='cross_scores', ascending=False
    )

    pred_list = []

    for arxiv_id, cat_text, title, text in zip(
        ranked['id'], ranked['cat_text'], ranked['title'], ranked['prepared_text']
    ):
        pred_list.append({
            'arxiv_id': arxiv_id,
            # Link to the research paper pdf
            'link_to_pdf': f'https://arxiv.org/pdf/{arxiv_id}',
            'cat_text': cat_text,
            'title': title,
            'abstract': text,
        })

    return pred_list


def print_search_results(pred_list, num_results_to_print):
    """Print the leading entries of a ranked result list to stdout.

    Args:
        pred_list: Sequence of dicts, each with the keys 'title', 'cat_text',
            'abstract' and 'link_to_pdf'.
        num_results_to_print: Maximum number of entries to print. If it
            exceeds len(pred_list), every available entry is printed; zero or
            a negative value prints nothing.
    """
    # Slice instead of indexing 0..num_results_to_print-1 so that asking for
    # more results than exist does not raise IndexError. max(..., 0) keeps a
    # negative count from being read as "all but the last n".
    for pred_dict in pred_list[:max(num_results_to_print, 0)]:

        link_to_pdf = pred_dict['link_to_pdf']
        abstract = pred_dict['abstract']
        cat_text = pred_dict['cat_text']
        title = pred_dict['title']

        print('Title:',title)
        print('Categories:',cat_text)
        print('Abstract:',abstract)
        print('Link to pdf:',link_to_pdf)
        print()
    
   
def run_arxiv_search(query_text, num_results_to_print, top_k=5):
    """Run the full search pipeline for one query and print the results.

    Retrieves `top_k` candidates with `run_faiss_search`, re-ranks them with
    `run_rerank`, then prints the best ones with `print_search_results`.

    Args:
        query_text: The search query string.
        num_results_to_print: Maximum number of re-ranked results to print.
        top_k: Number of candidates to retrieve before re-ranking.
    """
    # Retrieve the candidate row ids from the FAISS index
    pred_index_list = run_faiss_search(query_text, top_k)

    # Re-rank the candidates; this yields one dict per retrieved candidate
    pred_list = run_rerank(pred_index_list, query_text)

    # Never request more entries than were actually retrieved
    # (e.g. when num_results_to_print is larger than top_k)
    num_to_show = min(num_results_to_print, len(pred_list))

    # Print the results
    print_search_results(pred_list, num_to_show)

In [4]:
# Load the compressed embeddings array saved in Part 1.
# For an .npz archive np.load returns a lazily-read NpzFile; the context
# manager closes the underlying file handle once the array has been read.
with np.load(PATH_TO_EMBEDS) as npz_file:
    # 'array_data' is the key the array is stored under in the archive
    embeddings = npz_file['array_data']

# Display (n_rows, embedding_width) as the cell output
embeddings.shape

(2564718, 384)

In [5]:
# Load the compressed DataFrame saved in Part 1.
# Read 'id' explicitly as text: without a dtype pandas infers the column type
# and can turn numeric-looking arXiv ids into floats, which drops leading and
# trailing zeros and breaks the PDF links built from the id.
# NOTE(review): the warning this cell used to emit is presumably a mixed-dtype
# warning for this column; confirm against the full warning text.

df_data = pd.read_csv(PATH_TO_DF, compression='gzip', dtype={'id': str})

print(df_data.shape)

#df_data.head()

(2564718, 6)


  df_data = pd.read_csv(PATH_TO_DF, compression='gzip')


In [6]:
# Initialize FAISS
# Builds the notebook-level `faiss_index` that run_faiss_search queries.

import faiss

# Width of a single embedding vector (second axis of the embeddings array)
embed_length = embeddings.shape[1]

# L2-distance index sized for vectors of that width
faiss_index = faiss.IndexFlatL2(embed_length)

# Add the embeddings to the index
faiss_index.add(embeddings)

# Displayed as the cell output
faiss_index.is_trained

True

In [7]:
# Initialize sentence_transformers
# `model` is the encoder run_faiss_search uses to embed the query text.
# NOTE(review): presumably the same model that produced the Part 1 embeddings,
# so query vectors and stored vectors are comparable - confirm.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [8]:
# Initialize the cross_encoder for reranking
# `cross_encoder` is the notebook-level scorer that run_rerank calls with
# [query, passage] pairs.

from sentence_transformers import CrossEncoder

# We use a cross-encoder to re-rank the results list and improve its quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# *** PLEASE ENTER YOUR SEARCH QUERY HERE ***

query_text = """

I want to read some papers about facial recognition and its social issue

"""


# RUN THE SEARCH
# num_results_to_print: how many re-ranked results are printed
num_results_to_print = 5
# top_k: how many candidates are retrieved from FAISS before re-ranking
top_k = 10
run_arxiv_search(query_text, num_results_to_print, top_k)

Title: Responsible Facial Recognition and Beyond
Categories: Computer Vision and Pattern Recognition, Computers and Society
Abstract: Responsible Facial Recognition and Beyond {title} Facial recognition is changing the way we live in and interact with our society. Here we discuss the two sides of facial recognition, summarizing potential risks and current concerns. We introduce current policies and regulations in different countries. Very importantly, we point out that the risks and concerns are not only from facial recognition, but also realistically very similar to other biometric recognition technology, including but not limited to gait recognition, iris recognition, fingerprint recognition, voice recognition, etc. To create a responsible future, we discuss possible technological moves and efforts that should be made to keep facial recognition (and biometric recognition in general) developing for social good.
Link to pdf: https://arxiv.org/pdf/1909.12935

Title: Human Expression Rec

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [None]:
def calculate_bleu(candidate_abstract, reference_abstract):
    """Compute a smoothed sentence-level BLEU score between two abstracts.

    Both texts are tokenized by whitespace splitting. The candidate is scored
    against the single reference with `sentence_bleu`, using
    `SmoothingFunction().method1` as the smoothing function.

    Args:
        candidate_abstract: The abstract text being evaluated.
        reference_abstract: The abstract text to compare against.

    Returns:
        The value returned by `sentence_bleu`.
    """
    # Whitespace tokenization of both texts
    hypothesis = candidate_abstract.split()
    reference_set = [reference_abstract.split()]

    # Smoothing gives better scores with short sequences
    return sentence_bleu(
        reference_set,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )


In [None]:
def evaluate_bleu(pred_list, references):
    """Score each predicted abstract against its paired reference.

    Predictions and references are paired by position with `zip`, so any
    surplus items in the longer sequence are ignored.

    Args:
        pred_list: Sequence of dicts that each contain an 'abstract' entry.
        references: Sequence of reference abstract strings, one per prediction.

    Returns:
        A list with one `calculate_bleu` score per (prediction, reference) pair.
    """
    return [
        calculate_bleu(prediction['abstract'], reference)
        for prediction, reference in zip(pred_list, references)
    ]
