# Experimenting with Retrieval Augmented Generative AI

This notebook is experimental. It is meant mainly to demonstrate why retrieval-augmented generation is not yet ready to be used in this context.

In [1]:
!pip install transformers faiss-cpu sentence-transformers



In [8]:
import pandas as pd
import numpy as np
import faiss
import pickle
from transformers import AutoTokenizer, AutoModel

# Load the Hugging Face tokenizer and the bare transformer for the
# multilingual sentence-embedding checkpoint.
# NOTE(review): the FAISS indices under vector_stores/ are built elsewhere in
# this notebook from SentenceTransformer(...).encode() output; vectors derived
# from the raw hidden states of `model` may not live in the same space or have
# the same size as those indices -- confirm before searching with them.
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the ID -> name mappings from the .pkl files. The notebook itself writes
# these files as {row position: name} dictionaries.
# Caution: pickle.load can execute arbitrary code, so only load pickle files
# that you produced yourself.
with open('vector_stores/author_map.pkl', 'rb') as f:
    author_map = pickle.load(f)
with open('vector_stores/title_map.pkl', 'rb') as f:
    title_map = pickle.load(f)

def create_embedding(text):
    """Create vector embeddings for the given text.

    The FAISS indices in this notebook are built from
    ``SentenceTransformer(...).encode()`` output, so queries must be encoded
    with the same pipeline. Mean-pooling the raw ``AutoModel`` hidden states
    (the previous implementation) bypasses the checkpoint's own pooling/dense
    head and yields vectors that are not comparable with the indexed ones.

    Args:
        text: A single string, or an iterable of strings.

    Returns:
        A 2-D ``float32`` numpy array with one row per input text.
    """
    # Imported locally because the cell defining this function only imports
    # `transformers`; sentence-transformers is installed at the top of the notebook.
    from sentence_transformers import SentenceTransformer

    # Load the encoder once and keep it on the function object so repeated
    # calls do not reload the weights.
    encoder = getattr(create_embedding, "_encoder", None)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        create_embedding._encoder = encoder

    texts = [text] if isinstance(text, str) else list(text)
    return np.asarray(encoder.encode(texts), dtype=np.float32)

def search_index(index, embedding):
    """Search in the FAISS index for the closest vector.

    Args:
        index: Object exposing ``search(queries, k)`` that returns a
            ``(distances, ids)`` pair, e.g. a FAISS index.
        embedding: One query vector, given either as a 1-D sequence of length
            ``dim`` or as an array whose last axis is ``dim`` (for example the
            ``(1, dim)`` array returned by ``create_embedding``).

    Returns:
        The ID of the nearest neighbour of the first query row.
    """
    # FAISS expects a 2-D float32 matrix. Wrapping an already 2-D embedding in
    # another list (as the old code did) produced a 3-D array, so collapse any
    # leading axes into the row axis instead.
    query = np.asarray(embedding, dtype=np.float32)
    query = np.ascontiguousarray(query.reshape(-1, query.shape[-1]))
    D, I = index.search(query, k=1)
    return I[0][0]

def retrieve_author_name(author_id):
    """Look up the author name stored under ``author_id`` in ``author_map``.

    Falls back to the string "Unknown Author" when the ID has no entry.
    """
    if author_id in author_map:
        return author_map[author_id]
    return "Unknown Author"

def retrieve_title_name(title_id):
    """Look up the title stored under ``title_id`` in ``title_map``.

    Falls back to the string "Unknown Title" when the ID has no entry.
    """
    if title_id in title_map:
        return title_map[title_id]
    return "Unknown Title"

def llama_generate(prompt, max_new_tokens=100):
    """Generate text using LLaMa model for the given prompt.

    Args:
        prompt: Prompt text; it is truncated to at most 512 tokens.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt. (The previous ``max_length=100`` limited prompt and
            output together, leaving little or no room for an answer when the
            prompt was long.)

    Returns:
        Only the newly generated continuation, decoded and stripped of
        surrounding whitespace. The prompt is not echoed back, so the result
        can be embedded without the question text dominating it.
    """
    inputs = llama_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = llama_model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)

    # A causal LM returns prompt + continuation in one sequence; drop the
    # prompt tokens so only the model's own answer is decoded.
    generated_tokens = outputs[0][prompt_length:]
    return llama_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

def process_and_collect_data(input_author, input_title, index_author=None, index_title=None):
    """Generate guesses from LLaMa, validate with FAISS, and collect results.

    Args:
        input_author: Author string as it appears in the source record.
        input_title: Title string as it appears in the source record.
        index_author: Optional preloaded author index. When omitted, the index
            is read from ``vector_stores/author_index.faiss``.
        index_title: Optional preloaded title index. When omitted, the index
            is read from ``vector_stores/title_index.faiss``.

    Returns:
        A dict holding the two inputs, the two LLaMa guesses, and the name and
        ID of the nearest author/title entry found for each guess.
    """
    prompt_author = f"Based on historical records, what is the most likely commonly known name for: {input_author}?"
    prompt_title = f"What is the most likely commonly known title for the work titled: {input_title}?"

    guessed_author = llama_generate(prompt_author)
    guessed_title = llama_generate(prompt_title)

    author_embedding = create_embedding(guessed_author)
    title_embedding = create_embedding(guessed_title)

    # Reading the indices from disk is loop-invariant work; callers that
    # process many records can load them once and pass them in. Without the
    # arguments the function loads them itself, exactly as before.
    if index_author is None:
        index_author = faiss.read_index("vector_stores/author_index.faiss")
    if index_title is None:
        index_title = faiss.read_index("vector_stores/title_index.faiss")

    closest_author_id = search_index(index_author, author_embedding)
    closest_title_id = search_index(index_title, title_embedding)

    faiss_author = retrieve_author_name(closest_author_id)
    faiss_title = retrieve_title_name(closest_title_id)

    return {
        "input_author": input_author,
        "input_title": input_title,
        "llama_guess_author": guessed_author,
        "llama_guess_title": guessed_title,
        "faiss_author": faiss_author,
        "faiss_title": faiss_title,
        "faiss_author_id": closest_author_id,
        "faiss_title_id": closest_title_id
    }



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# A single checkpoint ID feeds both loaders so tokenizer and model always match.
llama_checkpoint = "meta-llama/Llama-3.2-1B"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint)
llama_model = AutoModelForCausalLM.from_pretrained(llama_checkpoint)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
# Two sample catalogue records (author heading + full title statement)
inputs = [
    {
        "input_author": "Cicero, Marcus Tullius",
        "input_title": "M.T. Ciceronis De officiis libri tres : ex editionibus Oliveti et Emesti / accedunt notae anglicae cura C.K. Dillaway",
    },
    {
        "input_author": "Silius Italicus, Tiberius Catius",
        "input_title": "Caji Silii Italici Punicorum libri septemdecim ad optimas editiones collati / praemittitur notitia literaria studiis Societatis Bipontinae",
    },
]

# The record keys match the function's parameter names, so each record can be
# unpacked directly into the call.
results = [process_and_collect_data(**entry) for entry in inputs]
df_results = pd.DataFrame(results)

print(df_results)


In [10]:
import faiss
import pickle

# Load the author index from disk and report how many vectors it contains
author_index = faiss.read_index("vector_stores/author_index.faiss")
print("Total authors in FAISS index:", author_index.ntotal)

# Load the title index from disk and report how many vectors it contains
title_index = faiss.read_index("vector_stores/title_index.faiss")
print("Total titles in FAISS index:", title_index.ntotal)

# Load the ID -> name mappings that accompany the two indices.
# Caution: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("vector_stores/author_map.pkl", "rb") as f:
    author_map = pickle.load(f)
with open("vector_stores/title_map.pkl", "rb") as f:
    title_map = pickle.load(f)

# Print the first (key, value) pair of each map to confirm the expected layout
print("Sample author map entry:", next(iter(author_map.items())))
print("Sample title map entry:", next(iter(title_map.items())))


Total authors in FAISS index: 26415
Total titles in FAISS index: 4500
Sample author map entry: (0, 'herryson joannes floruit15th century ad')
Sample title map entry: (0, 'de signis et symptomatibus aegritudinum')


In [17]:
!pip install unicodedata2

Collecting unicodedata2
  Downloading unicodedata2-15.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading unicodedata2-15.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (468 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/468.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/468.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.0/468.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unicodedata2
Successfully installed unicodedata2-15.1.0


In [26]:
import unicodedata
import re
import pandas as pd
import numpy as np
import faiss

# Hugging Face tokenizer and bare transformer for the multilingual embedding checkpoint
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

# Author variants: read the CSV and switch straight to snake_case column names
authors = pd.read_csv('authors_db.csv', encoding='utf-8', quotechar='"').rename(
    columns={
        'Variant': 'variant_name',
        'Authorized Name': 'authorized_name',
        'DLL Identifier (Author)': 'dll_id_author',
    }
)

# Works: same treatment, read and rename in one chained expression
works = pd.read_csv('works_db.csv', encoding='utf-8', quotechar='"').rename(
    columns={
        'Title': 'title',
        'DLL Identifier (Work)': 'dll_id_work',
        'DLL Identifier (Author)': 'dll_id_author',
    }
)
def normalize_author_name(name):
    """Normalize author names for consistent matching.

    Steps: decompose accented characters (NFKD) and drop everything that is
    not ASCII, lowercase, remove punctuation (anything that is neither a word
    character nor whitespace), collapse whitespace runs to a single space, and
    finally trim the ends.

    Args:
        name: Author name as a string.

    Returns:
        The normalized name, with no leading or trailing whitespace.
    """
    ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    without_punctuation = re.sub(r"[^\w\s]", "", ascii_name.lower())
    # Trim last: removing leading/trailing punctuation can expose whitespace
    # that an earlier strip() would miss (e.g. "- Vergil." -> " vergil").
    return re.sub(r"\s+", " ", without_punctuation).strip()
def prepare_dicts(authors,works):
    """Process author dataframe and works dataframe.

    Args:
        authors: DataFrame with columns ``variant_name``, ``authorized_name``
            and ``dll_id_author``.
        works: DataFrame with columns ``title``, ``dll_id_work`` and
            ``dll_id_author``.

    Returns:
        A ``(variant_to_authorized, title_to_work)`` tuple of dictionaries.
        The first is keyed by the normalized variant name, the second by the
        title as given. When two rows produce the same key, the later row wins.
    """
    # Zipping the columns avoids building a Series object per row, which is
    # what DataFrame.iterrows() does.
    variant_to_authorized = {
        normalize_author_name(variant): {
            "authorized_name": authorized,
            "author_id": author_id
        }
        for variant, authorized, author_id in zip(
            authors["variant_name"], authors["authorized_name"], authors["dll_id_author"]
        )
    }

    # Prepare the lookup dictionary for titles
    title_to_work = {
        title: {
            "dll_id_work": work_id,
            "dll_id_author": author_id
        }
        for title, work_id, author_id in zip(
            works["title"], works["dll_id_work"], works["dll_id_author"]
        )
    }

    return variant_to_authorized, title_to_work
# Build both lookup tables from the renamed frames
variant_to_authorized, title_to_work = prepare_dicts(authors, works)

# Sanity check: show the first two (key, value) pairs of each table
for lookup in (variant_to_authorized, title_to_work):
    print(list(lookup.items())[:2])

[('herryson joannes floruit15th century ad', {'authorized_name': 'herryson, joannes', 'author_id': 'A1868'}), ('joannes herryson', {'authorized_name': 'herryson, joannes', 'author_id': 'A1868'})]
[('de signis et symptomatibus aegritudinum', {'dll_id_work': 'W10655', 'dll_id_author': 'A3919'}), ('de coniuratione porcaria dialogus', {'dll_id_work': 'W10654', 'dll_id_author': 'A3221'})]


In [27]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding pipeline used to vectorize every author variant and title
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Iterating a dict yields its keys, in insertion order
author_names = list(variant_to_authorized)
title_names = list(title_to_work)

# Encode authors first, then titles; each call shows its own progress bar
author_embeddings, title_embeddings = (
    embedding_model.encode(names, show_progress_bar=True)
    for names in (author_names, title_names)
)

# Report the matrix shapes, then one sample vector from each set
print("Author Embeddings Shape:", author_embeddings.shape)
print("Title Embeddings Shape:", title_embeddings.shape)
print("Sample Author Embedding:", author_embeddings[0])
print("Sample Title Embedding:", title_embeddings[0])

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Batches:   0%|          | 0/826 [00:00<?, ?it/s]

Batches:   0%|          | 0/141 [00:00<?, ?it/s]

Author Embeddings Shape: (26415, 512)
Title Embeddings Shape: (4500, 512)
Sample Author Embedding: [-1.75714046e-02  3.16193923e-02 -2.87839510e-02 -1.72990712e-03
  5.18523268e-02  3.23217586e-02 -4.04772861e-03  8.16100463e-02
 -8.41113329e-02 -3.03786527e-03 -4.87713423e-03 -1.58966370e-02
 -6.12474941e-02  5.56859709e-02 -4.24614660e-02 -7.49030011e-03
 -1.67022068e-02  7.67116481e-03 -5.15167005e-02  2.81226058e-02
 -4.67071496e-02 -2.41308138e-02 -7.61923206e-04  2.08121520e-02
  4.00784463e-02 -2.61310562e-02  6.15476025e-03 -1.32802427e-02
 -3.86642031e-02 -1.89551841e-02 -1.71920247e-02 -6.54464029e-03
 -4.08031903e-02 -1.38271134e-02  4.05120961e-02  3.75139937e-02
  1.47976000e-02  1.27074057e-02  9.26530734e-03 -3.40086967e-02
 -2.94636413e-02 -2.00386569e-02 -2.66375928e-03 -4.82722707e-02
  1.07520677e-01  3.04104108e-02 -5.06471656e-02  5.48761487e-02
 -2.11470649e-02  3.24472748e-02 -2.46791374e-02  5.71735241e-02
 -2.85061658e-03  3.77807692e-02 -3.26642022e-02 -3.0179

In [28]:
import faiss
import numpy as np

import os

# faiss.write_index does not create missing parent directories, so make sure
# the output folder exists before anything is written to it.
os.makedirs("vector_stores", exist_ok=True)

# Create and add to author index (exact L2 search; FAISS requires float32 input)
author_index = faiss.IndexFlatL2(author_embeddings.shape[1])
author_index.add(author_embeddings.astype(np.float32))

# Create and add to title index
title_index = faiss.IndexFlatL2(title_embeddings.shape[1])
title_index.add(title_embeddings.astype(np.float32))

# Save indices
faiss.write_index(author_index, "vector_stores/author_index.faiss")
faiss.write_index(title_index, "vector_stores/title_index.faiss")


In [29]:
import pickle

# Persist the row-position -> name lookups next to the FAISS indices, so an ID
# returned by a search can be translated back into text.
with open("vector_stores/author_map.pkl", "wb") as author_map_file:
    pickle.dump(dict(enumerate(author_names)), author_map_file)

with open("vector_stores/title_map.pkl", "wb") as title_map_file:
    pickle.dump(dict(enumerate(title_names)), title_map_file)


In [30]:
import faiss
import pickle

# Round-trip check: reload the saved indices and maps, then confirm that a
# name taken from a map is found again as its own nearest neighbour.

# Load the FAISS indices
author_index = faiss.read_index("vector_stores/author_index.faiss")
title_index = faiss.read_index("vector_stores/title_index.faiss")
print("Total authors in FAISS index:", author_index.ntotal)
print("Total titles in FAISS index:", title_index.ntotal)
# Load the mapping dictionaries (ID -> name).
# Caution: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("vector_stores/author_map.pkl", "rb") as f:
    author_map = pickle.load(f)
with open("vector_stores/title_map.pkl", "rb") as f:
    title_map = pickle.load(f)

# Print sample entries to verify
print("Sample author map entry:", next(iter(author_map.items())))
print("Sample title map entry:", next(iter(title_map.items())))
from sentence_transformers import SentenceTransformer

# Load the embedding model (same checkpoint name as used when encoding the
# author and title lists earlier in the notebook)
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Create a test embedding for a known author and title
test_author_name = next(iter(author_map.values()))  # Use the first author from the map
test_author_embedding = embedding_model.encode([test_author_name])

test_title_name = next(iter(title_map.values()))  # Use the first title from the map
test_title_embedding = embedding_model.encode([test_title_name])

# Query the author index.
# NOTE: `np` is not imported in this cell; it relies on an earlier cell's
# `import numpy as np` having been run.
D, I = author_index.search(test_author_embedding.astype(np.float32), k=1)
print("Author search results -- Distance:", D.flatten(), "Index:", I.flatten())
print("Closest author name:", author_map[I[0][0]])

# Query the title index
D, I = title_index.search(test_title_embedding.astype(np.float32), k=1)
print("Title search results -- Distance:", D.flatten(), "Index:", I.flatten())
print("Closest title name:", title_map[I[0][0]])


Total authors in FAISS index: 26415
Total titles in FAISS index: 4500
Sample author map entry: (0, 'herryson joannes floruit15th century ad')
Sample title map entry: (0, 'de signis et symptomatibus aegritudinum')
Author search results -- Distance: [2.428041e-13] Index: [0]
Closest author name: herryson joannes floruit15th century ad
Title search results -- Distance: [1.0053667e-13] Index: [0]
Closest title name: de signis et symptomatibus aegritudinum


In [None]:
# Assuming `embedding_model` and `tokenizer` are already loaded
test_author_name = next(iter(variant_to_authorized.keys()))  # Take the first author name
# Use the tokenizer to encode the text into token IDs
inputs = tokenizer(test_author_name, return_tensors="pt", padding=True, truncation=True)
# Pass the encoded input to the model to get the embeddings
outputs = embedding_model(**inputs)
# Extract the embeddings
test_author_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Check the actual dimensions of the embedding
print("Embedding shape:", test_author_embedding.shape)
print("Expected FAISS index dimension:", author_index.d)

# Reshape the embedding to match the expected dimension of the FAISS index
# **Important:** Make sure the reshape dimensions are correct based on the output above
# You might need to adjust the reshape or rebuild the FAISS index with the correct dimension.
test_author_embedding = test_author_embedding.reshape(1, -1) # Reshape to (1, index_dimension)

D, I = author_index.search(np.array(test_author_embedding, dtype=np.float32), k=1)
print("Closest author ID:", I[0][0], "with distance:", D[0][0])
print("Mapped author name:", author_map[I[0][0]])

test_title_name = next