This notebook is best run with Python 3.10 inside a conda environment.

from IPython.display import Image
Image(filename='image.png') 


## Install Dependencies

In [None]:
# Install the packages listed in requirements.txt; the %pip magic installs into the environment of the running kernel.
%pip install -r requirements.txt

# Initialize Wikipedia Database + Index
Downloading this dataset takes about twice as long as the arXiv dataset; indexing takes roughly 12 minutes (M3 Max).

In [None]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    """Print the resident set size (RSS) of the current process, converted to MB."""
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 ** 2
    print(f"Current memory usage: {rss_megabytes} MB")

# Fetch the Wikipedia train split, logging process memory before and after the load.
print("Loading dataset...")
print_memory_usage()
dataclysm_wikipedia = load_dataset('somewheresystems/dataclysm-wikipedia', split="train")
print_memory_usage()

# Dump the dataset summary, its column names and its feature schema (one per line)
# so the embedding columns can be inspected before indexing.
print(
    dataclysm_wikipedia,
    dataclysm_wikipedia.column_names,
    dataclysm_wikipedia.features,
    sep="\n",
)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    """Attach a FAISS index to ``column_name``, flattening 2-D embeddings first.

    The rank of the embedding is probed on the first row only:

    * 2-D: every row's embedding is concatenated into a single vector via
      ``dataset.map`` and the flattened column is indexed.
    * 1-D: the column is already a flat vector, so it is indexed as-is.
    * any other rank: nothing is indexed and the dataset is returned unchanged.

    Args:
        dataset: Object supporting ``dataset[0]``, ``.map(fn)`` and
            ``.add_faiss_index(column=...)``.
        column_name: Name of the embedding column to index.

    Returns:
        The dataset returned by ``add_faiss_index``, or the input dataset when
        the embedding rank is unsupported.
    """
    embedding_rank = np.array(dataset[0][column_name]).ndim
    if embedding_rank == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda row: {column_name: np.concatenate(row[column_name])})
    elif embedding_rank == 1:
        # Already flat: previously this case was rejected, leaving the dataset un-indexed.
        print(f"Adding FAISS index for {column_name}...")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
        print_memory_usage()
        return dataset

    dataset = dataset.add_faiss_index(column=column_name)
    print(f"FAISS index for {column_name} added.")
    print_memory_usage()
    return dataset

# Build the FAISS index over the 'title_embedding' column of the Wikipedia dataset
dataclysm_wikipedia_indexed = flatten_and_add_faiss_index(
    dataclysm_wikipedia,
    'title_embedding',
)
print_memory_usage()

print("Datasets loaded.")

# Load the BGE embedding model
print("Initializing model...")
model = FlagModel(
    'BAAI/bge-small-en-v1.5',
    use_fp16=True,
    query_instruction_for_retrieval="Write a representation of the following query which is optimized for using a similarity search for retrieval:",
)
print("Model initialized.")
print_memory_usage()

# Initialize arXiv Abstract + Title Indices
Indexing takes roughly 15 minutes (M3 Max).

In [None]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    """Report how much resident memory (RSS) this process currently holds, in MB."""
    process_info = psutil.Process().memory_info()
    print(f"Current memory usage: {process_info.rss / 1024 ** 2} MB")

# Fetch the arXiv train split, logging process memory before and after the load.
print("Loading dataset...")
print_memory_usage()
dataclysm_arxiv = load_dataset('somewheresystems/dataclysm-arxiv', split="train")
print_memory_usage()

# Dump the dataset summary, its column names and its feature schema (one per line)
# so the 'title_embedding' and 'abstract_embedding' columns can be inspected.
print(
    dataclysm_arxiv,
    dataclysm_arxiv.column_names,
    dataclysm_arxiv.features,
    sep="\n",
)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    """Attach a FAISS index to ``column_name``, flattening 2-D embeddings first.

    The rank of the embedding is probed on the first row only:

    * 2-D: every row's embedding is concatenated into a single vector via
      ``dataset.map`` and the flattened column is indexed.
    * 1-D: the column is already a flat vector, so it is indexed as-is.
    * any other rank: nothing is indexed and the dataset is returned unchanged.

    Args:
        dataset: Object supporting ``dataset[0]``, ``.map(fn)`` and
            ``.add_faiss_index(column=...)``.
        column_name: Name of the embedding column to index.

    Returns:
        The dataset returned by ``add_faiss_index``, or the input dataset when
        the embedding rank is unsupported.
    """
    embedding_rank = np.array(dataset[0][column_name]).ndim
    if embedding_rank == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda row: {column_name: np.concatenate(row[column_name])})
    elif embedding_rank == 1:
        # Already flat: previously this case was rejected, leaving the dataset un-indexed.
        print(f"Adding FAISS index for {column_name}...")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
        print_memory_usage()
        return dataset

    dataset = dataset.add_faiss_index(column=column_name)
    print(f"FAISS index for {column_name} added.")
    print_memory_usage()
    return dataset

# Build one indexed dataset per embedding column: titles first, then abstracts
dataclysm_title_indexed = flatten_and_add_faiss_index(
    dataclysm_arxiv,
    'title_embedding',
)
dataclysm_abstract_indexed = flatten_and_add_faiss_index(
    dataclysm_arxiv,
    'abstract_embedding',
)
print_memory_usage()

print("Datasets loaded.")

# Load the BGE embedding model
print("Initializing model...")
model = FlagModel(
    'BAAI/bge-small-en-v1.5',
    use_fp16=True,
    query_instruction_for_retrieval="Write a representation of the following query which is optimized for using a similarity search for retrieval:",
)
print("Model initialized.")
print_memory_usage()



#  arXiv Composite Search with regex Rerank
Search by both abstract and title similarity, then rank the combined results by score in descending order.
1. If a duplicate (title and abstract hit) is found, it increases the score by a factor of 2. 
2. If regex finds the query in the abstract, it increases the score by 0.1 (additive).

In [None]:
import html
import re

import pandas as pd
from IPython.display import HTML, Markdown, display


def _hits_to_frame(examples, scores, source):
    """Convert one `get_nearest_examples` result into a tidy DataFrame.

    Adds the raw retrieval score and a one-letter `source` tag, then drops the
    two embedding columns and any column that is empty for every row.
    """
    hits = pd.DataFrame(examples)
    hits['similarity_score'] = scores
    hits['source'] = source
    hits = hits.drop(columns=['title_embedding', 'abstract_embedding'])
    return hits.dropna(axis=1, how='all')


query = "Attention Is All You Need"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by abstract similarity...")
scores_abstract, retrieved_examples_abstract = dataclysm_abstract_indexed.get_nearest_examples('abstract_embedding', query_embedding, k=10)
print("Examples retrieved.")

print("Retrieving examples by title similarity...")
scores_title, retrieved_examples_title = dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

# Same post-processing for both result sets; 'A' = abstract hit, 'T' = title hit
df_abstract = _hits_to_frame(retrieved_examples_abstract, scores_abstract, 'A')
df_title = _hits_to_frame(retrieved_examples_title, scores_title, 'T')

# Concatenate the two dataframes (abstract hits first) with a fresh, unique row index
df = pd.concat([df_abstract, df_title], ignore_index=True)

# Normalize the similarity score by the largest score in the combined results
df['similarity_score'] = df['similarity_score'] / df['similarity_score'].max()

# Increase the score by 0.1 if the query text occurs in the abstract.
# The match is done on the raw abstract (before any HTML is added) and the query is
# escaped, so it is treated as literal text rather than as a regular expression.
query_pattern = re.compile(re.escape(query), re.IGNORECASE)
query_in_abstract = df['abstract'].apply(lambda text: bool(query_pattern.search(str(text))))
df.loc[query_in_abstract, 'similarity_score'] += 0.1

# Create a "click to expand" for the abstract so it doesn't take up much space.
# Dataset text is HTML-escaped so characters such as '<' cannot break the table markup.
df['abstract'] = df['abstract'].apply(lambda text: f'<details><summary>Abstract</summary>{html.escape(str(text))}</details>')

# Create a URL field with a hyperlink which is constructed by appending the id onto the end of arxiv.org/abs/
df['URL'] = df['id'].apply(lambda arxiv_id: f'<a href="https://arxiv.org/abs/{html.escape(str(arxiv_id))}">Link</a>')

# Remove duplicates, keeping the first occurrence (the abstract hit, since those rows come first).
# NOTE(review): the section text describes doubling the score of papers found by both
# searches; only de-duplication is performed here — confirm which behaviour is intended.
df = df.drop_duplicates(subset=['id'])

# Sort by descending similarity score.
# NOTE(review): `add_faiss_index` was called without a metric; if that yields an L2 index,
# smaller scores mean closer matches and this order would list the weakest hits first — confirm.
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame (escape=False is required so the <details>/<a> markup renders;
# the remaining text columns are therefore emitted unescaped)
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


# Wikipedia simple search (Title)
Searches for a Wikipedia article based on title similarity to query. Useful for looking up terms.

In [None]:
import html

import pandas as pd
from IPython.display import HTML, Markdown, display

query = "Retrieval Augmented Generation"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by title similarity...")
scores, retrieved_examples = dataclysm_wikipedia_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

# Convert retrieved examples to DataFrame
df = pd.DataFrame(retrieved_examples)

# Attach the raw retrieval score returned by the index
df['similarity_score'] = scores

# Drop the 'title_embedding' column
df = df.drop(columns=['title_embedding'])

# Drop empty columns
df = df.dropna(axis=1, how='all')

# Create a "click to expand" for the article text so it doesn't take up much space.
# Dataset text is HTML-escaped so characters such as '<' cannot break the table markup.
df['text'] = df['text'].apply(lambda article_text: f'<details><summary>Article Text</summary>{html.escape(str(article_text))}</details>')

# Turn each row's own URL into a hyperlink (the lambda previously referenced an
# undefined name `url`, which raised NameError)
df['url'] = df['url'].apply(lambda article_url: f'<a href="{html.escape(str(article_url))}">Link</a>')

# Sort by descending similarity score.
# NOTE(review): `add_faiss_index` was called without a metric; if that yields an L2 index,
# smaller scores mean closer matches and this order would list the weakest hits first — confirm.
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame (escape=False is required so the <details>/<a> markup renders)
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


# Download OpenHermes-2.5-Mistral-7B

In [None]:
%pip install huggingface-cli
!huggingface-cli download TheBloke/OpenHermes-2.5-Mistral-7B-GGUF openhermes-2.5-mistral-7b.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

# Retrieval Augmented Generation

In [None]:
from llama_cpp import Llama
from llama_cpp import LlamaGrammar
import pandas as pd
import json
import httpx

# Path of the GGUF weights fetched by the download cell. It gets its own name so the
# embedding model bound to `model` by the setup cells is not overwritten; otherwise
# re-running a search cell afterwards fails on `model.encode(...)`.
llm_model_path = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"

# NOTE(review): `df` and `query` are whatever the most recently executed search cell left
# behind; the column selection below assumes the arXiv results ('id', 'title', 'abstract')
# — confirm the arXiv search cell was the last one run.
prompt = f"{df[['id', 'title', 'abstract']].to_html(escape=False)} ### Instruction: Use the information above to answer the query: EXPLAIN {query} ### Response:"


llm = Llama(model_path=llm_model_path, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

# Stream the completion and stitch the text chunks together; generation stops at the first newline.
stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["\n"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1)
result = "".join(chunk['choices'][0]['text'] for chunk in stream)

print(result)

# Rerank results using an LLM (experimental)
This uses llama.cpp grammars to constrain the LLM to return a JSON array of IDs, reranked by relevance with irrelevant results dropped. Experimental: it may or may not work.

In [None]:
import json
import re

import httpx
import pandas as pd
from IPython.display import HTML, display
from llama_cpp import Llama
from llama_cpp import LlamaGrammar


def _extract_ids(raw_text):
    """Pull ID tokens out of (possibly malformed) JSON-array text.

    Accepts both quoted items ("1706.03762") and bare items (1706.03762); bare items
    are kept as the literal text the model produced, never parsed as numbers.
    Returns the IDs as strings in first-seen order with repeats removed.
    """
    tokens = re.findall(r'"([^"]*)"|([^\s,\[\]{}:"]+)', raw_text)
    candidates = (quoted or bare for quoted, bare in tokens)
    # dict.fromkeys keeps insertion order while discarding repeated IDs
    return list(dict.fromkeys(item for item in candidates if item))


# Fetch the JSON-array grammar; fail loudly on an HTTP error instead of trying to
# parse an error page as a grammar.
grammar_response = httpx.get("https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json_arr.gbnf")
grammar_response.raise_for_status()
grammar_text = grammar_response.text
grammar = LlamaGrammar.from_string(grammar_text)

# Path of the GGUF weights. It gets its own name so the embedding model bound to
# `model` by the setup cells is not overwritten; otherwise re-running a search cell
# afterwards fails on `model.encode(...)`.
llm_model_path = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"

# NOTE(review): `df` and `query` are whatever the most recently executed search cell left
# behind; the column selection below assumes the arXiv results — confirm.
prompt = f"""You are an expert at generating valid JSON.
###
Instruction:
Return a valid JSON Array containing arXiv ['id'] field reranked according to how relevant the result is to the query based on its other columns at that ['id']. Drop any items that are not relevant to the query. Return just an array of the IDs, like [x,y,z] and so on in the correct order:
        INDEX: {df[['id', 'title', 'abstract']].to_html(escape=False)}
        QUERY: {query}
        Take a deep breath, and solve the problem step-by-step.
###
Response:"""


llm = Llama(model_path=llm_model_path, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["]"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1, grammar=grammar)
result = "".join(chunk['choices'][0]['text'] for chunk in stream)

# Generation stops at "]" and the closing bracket is re-appended here to complete the array text.
result = result + "]"

# Print the raw model output
print(result)

# Extract IDs from the potentially broken string
result_ids = _extract_ids(result)

# Filter the dataframe to only include rows with IDs in the result.
# .copy() makes this an independent frame so the column assignment below is unambiguous.
filtered_df = df[df['id'].isin(result_ids)].copy()

# Create a categorical type for sorting based on the order in result_ids
# (categories must be unique, which _extract_ids guarantees)
filtered_df['id'] = pd.Categorical(filtered_df['id'], categories=result_ids, ordered=True)

# Sort the dataframe based on the 'id' column
filtered_df = filtered_df.sort_values('id')

# Drop the similarity score column
filtered_df = filtered_df.drop(columns=['similarity_score'])

# Display the filtered dataframe as a table with hyperlinks
display(HTML(filtered_df.to_html(escape=False)))
