In [1]:
import pandas as pd
import numpy as np
import os

import json
import re

In [2]:
os.listdir('dataset')

['.DS_Store',
 'dataset2.json',
 'cs_si_dataset.csv',
 'dataset.csv',
 'see.txt',
 'dataset.json']

In [3]:
# All Arxiv category codes
# Source: https://www.kaggle.com/code/artgor/arxiv-metadata-exploration

# https://arxiv.org/category_taxonomy
# https://info.arxiv.org/help/api/user-manual.html#subject_classifications


category_map = {
# These created errors when mapping categories to descriptions
'acc-phys': 'Accelerator Physics',
'adap-org': 'Not available',
'q-bio': 'Not available',
'cond-mat': 'Not available',
'chao-dyn': 'Not available',
'patt-sol': 'Not available',
'dg-ga': 'Not available',
'solv-int': 'Not available',
'bayes-an': 'Not available',
'comp-gas': 'Not available',
'alg-geom': 'Not available',
'funct-an': 'Not available',
'q-alg': 'Not available',
'ao-sci': 'Not available',
'atom-ph': 'Atomic Physics',
'chem-ph': 'Chemical Physics',
'plasm-ph': 'Plasma Physics',
'mtrl-th': 'Not available',
'cmp-lg': 'Not available',
'supr-con': 'Not available',
###

# Added
'econ.GN': 'General Economics', 
'econ.TH': 'Theoretical Economics', 
'eess.SY': 'Systems and Control', 
    
'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics',
'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
'cond-mat.mtrl-sci': 'Materials Science',
'cond-mat.other': 'Other Condensed Matter',
'cond-mat.quant-gas': 'Quantum Gases',
'cond-mat.soft': 'Soft Condensed Matter',
'cond-mat.stat-mech': 'Statistical Mechanics',
'cond-mat.str-el': 'Strongly Correlated Electrons',
'cond-mat.supr-con': 'Superconductivity',
'cs.AI': 'Artificial Intelligence',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CL': 'Computation and Language',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LG': 'Machine Learning',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'econ.EM': 'Econometrics',             
'eess.AS': 'Audio and Speech Processing',
'eess.IV': 'Image and Video Processing',
'eess.SP': 'Signal Processing',               
'gr-qc': 'General Relativity and Quantum Cosmology',
'hep-ex': 'High Energy Physics - Experiment',
'hep-lat': 'High Energy Physics - Lattice',
'hep-ph': 'High Energy Physics - Phenomenology',
'hep-th': 'High Energy Physics - Theory',
'math.AC': 'Commutative Algebra',
'math.AG': 'Algebraic Geometry',
'math.AP': 'Analysis of PDEs',
'math.AT': 'Algebraic Topology',
'math.CA': 'Classical Analysis and ODEs',
'math.CO': 'Combinatorics',
'math.CT': 'Category Theory',
'math.CV': 'Complex Variables',
'math.DG': 'Differential Geometry',
'math.DS': 'Dynamical Systems',
'math.FA': 'Functional Analysis',
'math.GM': 'General Mathematics',
'math.GN': 'General Topology',
'math.GR': 'Group Theory',
'math.GT': 'Geometric Topology',
'math.HO': 'History and Overview',
'math.IT': 'Information Theory',
'math.KT': 'K-Theory and Homology',
'math.LO': 'Logic',
'math.MG': 'Metric Geometry',
'math.MP': 'Mathematical Physics',
'math.NA': 'Numerical Analysis',
'math.NT': 'Number Theory',
'math.OA': 'Operator Algebras',
'math.OC': 'Optimization and Control',
'math.PR': 'Probability',
'math.QA': 'Quantum Algebra',
'math.RA': 'Rings and Algebras',
'math.RT': 'Representation Theory',
'math.SG': 'Symplectic Geometry',
'math.SP': 'Spectral Theory',
'math.ST': 'Statistics Theory',
'math-ph': 'Mathematical Physics',
'nlin.AO': 'Adaptation and Self-Organizing Systems',
'nlin.CD': 'Chaotic Dynamics',
'nlin.CG': 'Cellular Automata and Lattice Gases',
'nlin.PS': 'Pattern Formation and Solitons',
'nlin.SI': 'Exactly Solvable and Integrable Systems',
'nucl-ex': 'Nuclear Experiment',
'nucl-th': 'Nuclear Theory',
'physics.acc-ph': 'Accelerator Physics',
'physics.ao-ph': 'Atmospheric and Oceanic Physics',
'physics.app-ph': 'Applied Physics',
'physics.atm-clus': 'Atomic and Molecular Clusters',
'physics.atom-ph': 'Atomic Physics',
'physics.bio-ph': 'Biological Physics',
'physics.chem-ph': 'Chemical Physics',
'physics.class-ph': 'Classical Physics',
'physics.comp-ph': 'Computational Physics',
'physics.data-an': 'Data Analysis, Statistics and Probability',
'physics.ed-ph': 'Physics Education',
'physics.flu-dyn': 'Fluid Dynamics',
'physics.gen-ph': 'General Physics',
'physics.geo-ph': 'Geophysics',
'physics.hist-ph': 'History and Philosophy of Physics',
'physics.ins-det': 'Instrumentation and Detectors',
'physics.med-ph': 'Medical Physics',
'physics.optics': 'Optics',
'physics.plasm-ph': 'Plasma Physics',
'physics.pop-ph': 'Popular Physics',
'physics.soc-ph': 'Physics and Society',
'physics.space-ph': 'Space Physics',
'q-bio.BM': 'Biomolecules',
'q-bio.CB': 'Cell Behavior',
'q-bio.GN': 'Genomics',
'q-bio.MN': 'Molecular Networks',
'q-bio.NC': 'Neurons and Cognition',
'q-bio.OT': 'Other Quantitative Biology',
'q-bio.PE': 'Populations and Evolution',
'q-bio.QM': 'Quantitative Methods',
'q-bio.SC': 'Subcellular Processes',
'q-bio.TO': 'Tissues and Organs',
'q-fin.CP': 'Computational Finance',
'q-fin.EC': 'Economics',
'q-fin.GN': 'General Finance',
'q-fin.MF': 'Mathematical Finance',
'q-fin.PM': 'Portfolio Management',
'q-fin.PR': 'Pricing of Securities',
'q-fin.RM': 'Risk Management',
'q-fin.ST': 'Statistical Finance',
'q-fin.TR': 'Trading and Market Microstructure',
'quant-ph': 'Quantum Physics',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

In [4]:
# https://www.kaggle.com/code/matthewmaddock/nlp-arxiv-dataset-transformers-and-umap

# This takes about 1 minute.


cols = ['id', 'title', 'abstract', 'categories']
data = []
file_name = 'dataset/dataset2.json'


with open(file_name, encoding='latin-1') as f:
    for line in f:
        doc = json.loads(line)
        lst = [doc['id'], doc['title'], doc['abstract'], doc['categories']]
        data.append(lst)

df_data = pd.DataFrame(data=data, columns=cols)

print(df_data.shape)

df_data.head()

(2586192, 4)


Unnamed: 0,id,title,abstract,categories
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA


In [5]:
def get_cat_text(x):
    
    cat_text = ''
    
    # Put the codes into a list
    cat_list = x.split(' ')
    
    for i, item in enumerate(cat_list):
        
        cat_name = category_map[item]
        
        # If there was no description available
        # for the category code then don't include it in the text.
        if cat_name != 'Not available':
            
            if i == 0:
                cat_text = cat_name
            else:
                cat_text = cat_text + ', ' + cat_name
 
    # Remove leading and trailing spaces
    cat_text = cat_text.strip()
    
    return cat_text
    

df_data['cat_text'] = df_data['categories'].apply(get_cat_text)

df_data.head()

Unnamed: 0,id,title,abstract,categories,cat_text
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,High Energy Physics - Phenomenology
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,"Combinatorics, Computational Geometry"
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,General Physics
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,Combinatorics
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,"Classical Analysis and ODEs, Functional Analysis"


In [6]:
# Print details of one paper

i = 1

print('Id:',df_data.loc[i, 'id'])
print()
print('Title:',df_data.loc[i, 'title'])
print()
print('Categories:',df_data.loc[i, 'cat_text'])
print()
print('Abstract:',df_data.loc[i, 'abstract'])

Id: 0704.0002

Title: Sparsity-certifying Graph Decompositions

Categories: Combinatorics, Computational Geometry

Abstract:   We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use
it obtain a characterization of the family of $(k,\ell)$-sparse graphs and
algorithmic solutions to a family of problems concerning tree decompositions of
graphs. Special instances of sparse graphs appear in rigidity theory and have
received increased attention in recent years. In particular, our colored
pebbles generalize and strengthen the previous results of Lee and Streinu and
give a new proof of the Tutte-Nash-Williams characterization of arboricity. We
also present a new decomposition that certifies sparsity based on the
$(k,\ell)$-pebble game with colors. Our work also exposes connections between
pebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and
Westermann and Hendrickson.



In [7]:
# Replace newline characters ('\n') with a space
# Remove leading and trailing spaces

def clean_text(x):
    
    # Replace newline characters with a space
    new_text = x.replace("\n", " ")
    # Remove leading and trailing spaces
    new_text = new_text.strip()
    
    return new_text

df_data['title'] = df_data['title'].apply(clean_text)
df_data['abstract'] = df_data['abstract'].apply(clean_text)

#df_filtered.head()

In [8]:
# Append the title to the abstract

df_data['prepared_text'] = df_data['title'] + ' {title} ' + df_data['abstract']

#df_data.head()

In [9]:
# Create a list of text chunks

chunk_list = list(df_data['prepared_text'])

# The ids are used to create web links to each paper.
# You can access each paper directly on ArXiv using these links:
# https://arxiv.org/abs/{id}: ArXiv page for the paper
# https://arxiv.org/pdf/{id}: Direct link to download the PDF

arxiv_id_list = list(df_data['id'])
cat_list = list(df_data['cat_text'])

print(len(chunk_list))
print(len(arxiv_id_list))
print(len(cat_list))

2586192
2586192
2586192


In [10]:
chunk_list[0]

'Calculation of prompt diphoton production cross sections at Tevatron and   LHC energies {title} A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced 

In [11]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Sentences are encoded by calling model.encode()
embeddings = model.encode(chunk_list)

print(embeddings.shape)
print('Embedding length', embeddings.shape[1])

  from tqdm.autonotebook import tqdm, trange


(2586192, 384)
Embedding length 384


In [12]:
type(embeddings)

numpy.ndarray

In [13]:
# Save the array in compressed format
np.savez_compressed('compressed_array.npz', array_data=embeddings)

!ls

RAG copy.ipynb
RAG.ipynb
Untitled Diagram.pdf
[ENSF 619.5] Social dimensions of technologies in computer & society articles.pdf
apigemini.txt
arxiv-metadata-oai-snapshot-2.json
compressed_array.npz
compressed_dataframe.csv
compressed_dataframe.csv.gz
[34mdataset[m[m
datasetconvert.py
differences.txt
filter.py
filtered_differences.txt
runsystem.ipynb
see.ipynb
systemapi.py
[34mtest[m[m
testapigemini.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# Check the size of the saved file

import os

# Get the size of the file in bytes
file_size_bytes = os.path.getsize('compressed_array.npz')

# Convert bytes to megabytes
file_size_mb = file_size_bytes / (1024 * 1024)

print("File size:", file_size_mb, "MB")

File size: 3512.1689653396606 MB


In [15]:
# How to load the saved array

# Load the compressed array
loaded_embeddings = np.load('compressed_array.npz')

# Access the array by the name you specified ('my_array' in this case)
loaded_embeddings = loaded_embeddings['array_data']

loaded_embeddings.shape

(2586192, 384)

In [16]:
# Save the DataFrame in compressed format

df_data.to_csv('compressed_dataframe.csv.gz', compression='gzip', index=False)

!ls

RAG copy.ipynb
RAG.ipynb
Untitled Diagram.pdf
[ENSF 619.5] Social dimensions of technologies in computer & society articles.pdf
apigemini.txt
arxiv-metadata-oai-snapshot-2.json
compressed_array.npz
compressed_dataframe.csv
compressed_dataframe.csv.gz
[34mdataset[m[m
datasetconvert.py
differences.txt
filter.py
filtered_differences.txt
runsystem.ipynb
see.ipynb
systemapi.py
[34mtest[m[m
testapigemini.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# How to load the compressed DataFrame

df = pd.read_csv('compressed_dataframe.csv.gz', compression='gzip')

print(df.shape)

df.head(2)

  df = pd.read_csv('compressed_dataframe.csv.gz', compression='gzip')


(2586192, 6)


Unnamed: 0,id,title,abstract,categories,cat_text,prepared_text
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturbati...,hep-ph,High Energy Physics - Phenomenology,Calculation of prompt diphoton production cros...
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,"Combinatorics, Computational Geometry",Sparsity-certifying Graph Decompositions {titl...


# FAISS


In [18]:
import faiss

embed_length = embeddings.shape[1]

index = faiss.IndexFlatL2(embed_length)

# Check if the index is trained.
# No training needed when using greedy search i.e. IndexFlatL2
index.is_trained

True

In [19]:
# Add the embeddings to the index

index.add(loaded_embeddings)

# Check the total number of embeddings in the index
index.ntotal

2586192

In [20]:
# Run a query

# query_text = """
# I want to create an invisibility cloak similar to the one in Harry Potter.
# """
query_text = """ 
I want to read some papers about facial recognition and its social issue
"""
query = [query_text]


# Vectorize the query string
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("all-MiniLM-L6-v2")
query_embedding = model.encode(query)

# Set the number of outputs we want
top_k = 3

# Run the query
# index_vals refers to the chunk_list index values
scores, index_vals = index.search(query_embedding, top_k)

print(index_vals)
print(scores)

[[1182973 1417302  191803]]
[[0.7161845 0.8309304 0.8536004]]


In [21]:
# Let's print the first search result

pred_indexes = index_vals[0]

i = 0
chunk_index = pred_indexes[i]
text = chunk_list[chunk_index]

text

'Responsible Facial Recognition and Beyond {title} Facial recognition is changing the way we live in and interact with our society. Here we discuss the two sides of facial recognition, summarizing potential risks and current concerns. We introduce current policies and regulations in different countries. Very importantly, we point out that the risks and concerns are not only from facial recognition, but also realistically very similar to other biometric recognition technology, including but not limited to gait recognition, iris recognition, fingerprint recognition, voice recognition, etc. To create a responsible future, we discuss possible technological moves and efforts that should be made to keep facial recognition (and biometric recognition in general) developing for social good.'

# Nearest Neigbor Search in investigation

In [22]:
# # How many clusters (voronoid cells) do we want?
# # Example: For 4 centroilds we need at least 156 embeddings in
# # order to train the index.
# num_centroids = 5

# quantizer = faiss.IndexFlatL2(embed_length)

# index = faiss.IndexIVFFlat(quantizer, embed_length, num_centroids)

In [23]:
# # Train the index
# # After the index is trained it's ready to receive data

# index.train(loaded_embeddings)

# index.is_trained

In [24]:
# # Add the embeddings to the index

# index.add(embeddings)

# # Check how many embeddings are in the index
# index.ntotal

In [25]:
# query = [query_text]
# query_embedding = model.encode(query)

# top_k = 5


# # Run the query
# # index_vals refers to the chunk_list index values
# scores, index_vals = index.search(query_embedding, top_k)

# print(index_vals)
# print(scores)

In [26]:
# # Let's print the first search result

# pred_indexes = index_vals[0]

# i = 3
# chunk_index = pred_indexes[i]
# text = chunk_list[chunk_index]

# text

In [27]:
# # So far we've just been searching the cell with 
# # the nearest centroid.
# # Setting nprobe allows us to search more of
# # the nearest cells. e.g. nprobe = 4 means w will search 4 cells.
# # This can be done if we were not getting good results and wanted
# # to improve performance. The time taken also increases as we are
# # comparing to more vectors.

# index.nprobe = 4

In [28]:
# query = [query_text]
# query_embedding = model.encode(query)

# top_k = 5

# # Run the query
# # index_vals refers to the chunk_list index values
# scores, index_vals = index.search(query_embedding, top_k)

# print(index_vals)
# print(scores)

In [29]:
# # Let's print the third search result

# pred_indexes = index_vals[0]

# i = 3
# chunk_index = pred_indexes[i]
# text = chunk_list[chunk_index]

# text

In [30]:
from sentence_transformers import CrossEncoder

# We use a cross-encoder to re-rank the results
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [31]:
# [1] Run a search

query = [query_text]
query_embedding = model.encode(query)

top_k = 10
D, I = index.search(query_embedding, top_k)

list(I[0])

[np.int64(1182973),
 np.int64(1417302),
 np.int64(191803),
 np.int64(1334997),
 np.int64(1574334),
 np.int64(1383220),
 np.int64(1579102),
 np.int64(799967),
 np.int64(1363353),
 np.int64(1754843)]

In [32]:
# [2] Get the text associated with each search result

pred_list = list(I[0])

# Replace the chunk index values with the corresponding strings
pred_strings_list = [chunk_list[item] for item in pred_list]

pred_strings_list[0]

'Responsible Facial Recognition and Beyond {title} Facial recognition is changing the way we live in and interact with our society. Here we discuss the two sides of facial recognition, summarizing potential risks and current concerns. We introduce current policies and regulations in different countries. Very importantly, we point out that the risks and concerns are not only from facial recognition, but also realistically very similar to other biometric recognition technology, including but not limited to gait recognition, iris recognition, fingerprint recognition, voice recognition, etc. To create a responsible future, we discuss possible technological moves and efforts that should be made to keep facial recognition (and biometric recognition in general) developing for social good.'

In [33]:
# Format the input for the cross encoder

# The input to the cross_encoder is a list of lists
# [[query_text, pred_text1], [query_text, pred_text2], ...]

cross_input_list = []

for item in pred_strings_list:
    
    new_list = [query[0], item]
    
    cross_input_list.append(new_list)

In [34]:
cross_input_list[2]

[' \nI want to read some papers about facial recognition and its social issue\n',
 'Facial Recognition Technology: An analysis with scope in India {title} A facial recognition system is a computer application for automatically identifying or verifying a person from a digital image or a video frame from a video source. One of the way is to do this is by comparing selected facial features from the image and a facial database.It is typically used in security systems and can be compared to other biometrics such as fingerprint or eye iris recognition systems. In this paper we focus on 3-D facial recognition system and biometric facial recognision system. We do critics on facial recognision system giving effectiveness and weaknesses. This paper also introduces scope of recognision system in India.']

In [35]:
# Put the pred text into a dataframe

df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])
df['original_index'] = I[0]

df.head()

Unnamed: 0,query_text,pred_text,original_index
0,\nI want to read some papers about facial rec...,Responsible Facial Recognition and Beyond {tit...,1182973
1,\nI want to read some papers about facial rec...,About Face: A Survey of Facial Recognition Eva...,1417302
2,\nI want to read some papers about facial rec...,Facial Recognition Technology: An analysis wit...,191803
3,\nI want to read some papers about facial rec...,Facial Recognition: A cross-national Survey on...,1334997
4,\nI want to read some papers about facial rec...,SoK: Anti-Facial Recognition Technology {title...,1574334


In [36]:
# Now, score all retrieved passages using the cross_encoder

cross_scores = cross_encoder.predict(cross_input_list)

cross_scores

array([ 0.8622672, -3.610136 , -3.691679 ,  0.3292061, -1.1912436,
       -3.9569216, -7.8567657, -5.4584627, -0.8588654, -5.335023 ],
      dtype=float32)

In [37]:
# Add the scores to the dataframe

df['cross_scores'] = cross_scores

df.head()

Unnamed: 0,query_text,pred_text,original_index,cross_scores
0,\nI want to read some papers about facial rec...,Responsible Facial Recognition and Beyond {tit...,1182973,0.862267
1,\nI want to read some papers about facial rec...,About Face: A Survey of Facial Recognition Eva...,1417302,-3.610136
2,\nI want to read some papers about facial rec...,Facial Recognition Technology: An analysis wit...,191803,-3.691679
3,\nI want to read some papers about facial rec...,Facial Recognition: A cross-national Survey on...,1334997,0.329206
4,\nI want to read some papers about facial rec...,SoK: Anti-Facial Recognition Technology {title...,1574334,-1.191244


In [38]:
# Sort the DataFrame in descending order based on the scores

df_sorted = df.sort_values(by='cross_scores', ascending=False)

# Reset the index (*This was missed previously*)
df_sorted = df_sorted.reset_index(drop=True)

df_sorted.head(10)

Unnamed: 0,query_text,pred_text,original_index,cross_scores
0,\nI want to read some papers about facial rec...,Responsible Facial Recognition and Beyond {tit...,1182973,0.862267
1,\nI want to read some papers about facial rec...,Facial Recognition: A cross-national Survey on...,1334997,0.329206
2,\nI want to read some papers about facial rec...,Understanding bias in facial recognition techn...,1363353,-0.858865
3,\nI want to read some papers about facial rec...,SoK: Anti-Facial Recognition Technology {title...,1574334,-1.191244
4,\nI want to read some papers about facial rec...,About Face: A Survey of Facial Recognition Eva...,1417302,-3.610136
5,\nI want to read some papers about facial rec...,Facial Recognition Technology: An analysis wit...,191803,-3.691679
6,\nI want to read some papers about facial rec...,GenderRobustness: Robustness of Gender Detecti...,1383220,-3.956922
7,\nI want to read some papers about facial rec...,Robustness Disparities in Face Detection {titl...,1754843,-5.335023
8,\nI want to read some papers about facial rec...,Automated Inference on Sociopsychological Impr...,799967,-5.458463
9,\nI want to read some papers about facial rec...,Cinderella's shoe won't fit Soundarya: An audi...,1579102,-7.856766


In [39]:
# Compare the orginal predicted index order and 
# the re-ranked index order

print('Original order:',I[0])
print('Reranked order:',list(df_sorted['original_index']))

Original order: [1182973 1417302  191803 1334997 1574334 1383220 1579102  799967 1363353
 1754843]
Reranked order: [1182973, 1334997, 1363353, 1574334, 1417302, 191803, 1383220, 1754843, 799967, 1579102]


In [40]:

# Print the output

# Print three results
num_results = 3

for i in range(0,num_results):
    
    text = df_sorted.loc[i, 'pred_text']
    
    original_index = df_sorted.loc[i, 'original_index']
    arxiv_id = df_data.loc[original_index, 'id']
    cat_text = df_data.loc[original_index, 'cat_text']
    
    # Crete the link to the research paper pdf
    link_to_pdf = f'https://arxiv.org/pdf/{arxiv_id}'
    
    print('Link to pdf:',link_to_pdf)
    print('Categories:',cat_text)
    print('Abstract:',text)
    print()

Link to pdf: https://arxiv.org/pdf/1909.12935
Categories: Computer Vision and Pattern Recognition, Computers and Society
Abstract: Responsible Facial Recognition and Beyond {title} Facial recognition is changing the way we live in and interact with our society. Here we discuss the two sides of facial recognition, summarizing potential risks and current concerns. We introduce current policies and regulations in different countries. Very importantly, we point out that the risks and concerns are not only from facial recognition, but also realistically very similar to other biometric recognition technology, including but not limited to gait recognition, iris recognition, fingerprint recognition, voice recognition, etc. To create a responsible future, we discuss possible technological moves and efforts that should be made to keep facial recognition (and biometric recognition in general) developing for social good.

Link to pdf: https://arxiv.org/pdf/2008.07275
Categories: Computers and Soci

In [41]:
import google.generativeai as genai
import os

genai.configure(api_key="AIzaSyBc44ipiYJ_mU0AWlvbp9OQN5Ntwlcbn_Y")

In [42]:
# Get the top 3 search results
pred_text_list = list(df_sorted['pred_text'])
context = pred_text_list[0:3]

# Create the prompt

# prompt = f"""
# You will be provided with a list of titles and abstracts 
# for research papers: 
# {context}
# Write a one sentence summary of each abstract at the level 
# of a high school student.
# """

prompt = f"""
You will be provided with a list of titles and abstracts 
for research papers: 
{context}
Write a one sentence to represent if the authors arguments about facial recognitions technologies are positive or negative, based on the abstract
"""

In [43]:
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content(prompt)
print(response.text)

The first two abstracts present a largely negative view of facial recognition technology, highlighting risks and concerns, while the third abstract presents a mixed view, acknowledging both potential benefits and significant ethical concerns related to bias and discrimination.



In [44]:
#Next steps

#improve the RAG model
#make a new notebook to just run the query
#UX/UI
#maybe: pdf web-mining the content
#maybe: automatically update the new data available on Kaggle
#security to store the api key file using fernet