# Create Vector Stores for Embeddings

In [1]:
# Import the Pandas library for working with CSV data
import pandas as pd

# Read in the authors data
authors = pd.read_csv('../data/authors_db.csv',encoding='utf-8',quotechar='"')
# Read in the works data
works = pd.read_csv('../data/works_db.csv',encoding='utf-8',quotechar='"')

# Get basic information about the dataframes
authors.info()
works.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27290 entries, 0 to 27289
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Variant                  27290 non-null  object
 1   Authorized Name          27290 non-null  object
 2   DLL Identifier (Author)  27290 non-null  object
dtypes: object(3)
memory usage: 639.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5315 entries, 0 to 5314
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Title                    5315 non-null   object
 1   DLL Identifier (Work)    5315 non-null   object
 2   DLL Identifier (Author)  5315 non-null   object
dtypes: object(3)
memory usage: 124.7+ KB


In [2]:
# Change the names of the columns to be lower case without spaces or punctuation
authors = authors.rename(columns={'Variant':'variant_name','Authorized Name':'authorized_name','DLL Identifier (Author)':'dll_id_author'})
works = works.rename(columns={'Title':'title','DLL Identifier (Work)':'dll_id_work','DLL Identifier (Author)': 'dll_id_author'})

In [5]:
# Import utility functions
import utilities as utilities

variant_to_authorized, title_to_work = utilities.prepare_dicts(authors,works)

In [8]:
title_to_work

{'de signis et symptomatibus aegritudinum': {'dll_id_work': 'W10655',
  'dll_id_author': 'A3919'},
 'de coniuratione porcaria dialogus': {'dll_id_work': 'W10654',
  'dll_id_author': 'A3221'},
 'alda': {'dll_id_work': 'W10653', 'dll_id_author': 'A4844'},
 'de viris illustribus': {'dll_id_work': 'W4469', 'dll_id_author': 'A4936'},
 'de philosophis': {'dll_id_work': 'W10651', 'dll_id_author': 'A4799'},
 'epigrammata super exilio': {'dll_id_work': 'W10650',
  'dll_id_author': 'A4655'},
 'porcaria': {'dll_id_work': 'W10649', 'dll_id_author': 'A3205'},
 'liber de curatione egritudinum partium totius corporis': {'dll_id_work': 'W10648',
  'dll_id_author': 'A3153'},
 'occupatio': {'dll_id_work': 'W1913', 'dll_id_author': 'A5021'},
 'liber senecae de moribus': {'dll_id_work': 'W10636',
  'dll_id_author': 'A4655'},
 'praefationes': {'dll_id_work': 'W10635', 'dll_id_author': 'A3873'},
 'orationes': {'dll_id_work': 'W335', 'dll_id_author': 'A3593'},
 'psyche et cupido': {'dll_id_work': 'W10632', '

In [6]:
# Prepare the dictionaries for embedding the author names and titles
canonical_authors = authors.to_dict("records")
canonical_titles = works.to_dict("records")

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import numpy as np

# Initialize the embedding model
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Extract canonical titles
canonical_titles = list(title_to_work.keys())

# Generate embeddings for canonical titles
title_embeddings = embedding_model.encode(canonical_titles)

# Store the embeddings with their respective titles
title_embeddings_dict = {
    title: embedding for title, embedding in zip(canonical_titles, title_embeddings)
}


## Prepare the FAISS Vector Stores

Since there are many author names to keep track of, I'm going to save them in a vector store for easier and more rapid searching instead of keeping them in memory.

I'm using [FAISS (Facebook AI Similarity Search)](https://faiss.ai/) because it is reliable, open-source, and relatively easy to use. In previous versions of this experiment, I tried using [Chroma](https://www.trychroma.com/) and found that it was too buggy to use.

In [None]:
import faiss

# Generate author embeddings and set up FAISS
author_embeddings = [embedding_model.encode(name) for name in variant_to_authorized.keys()]
author_embeddings = np.array(author_embeddings, dtype=np.float32)

dimension = author_embeddings.shape[1]
author_index = faiss.IndexFlatL2(dimension)
author_index.add(author_embeddings)

# Map index positions to author names
author_map = {i: name for i, name in enumerate(variant_to_authorized.keys())}

Note that it took 21m 36.6s to complete this step on my laptop's CPU.

In [None]:
# Generate title embeddings and set up FAISS
title_embeddings = [embedding_model.encode(title) for title in title_to_work.keys()]
title_embeddings = np.array(title_embeddings, dtype=np.float32)

dimension = title_embeddings.shape[1]
title_index = faiss.IndexFlatL2(dimension)
title_index.add(title_embeddings)

# Map index positions to titles
title_map = {i: title for i, title in enumerate(title_to_work.keys())}

It took 3m 10.7s to complete this step.

### Save the vector stores to disk

In [None]:
# Save author vector store
faiss.write_index(author_index, "../vector_stores/author_index.faiss")

# Save title vector store
faiss.write_index(title_index, "../vector_stores/title_index.faiss")

print("FAISS indices saved to disk.")

import pickle

# Save author_map and title_map
with open("../vector_stores/author_map.pkl", "wb") as f:
    pickle.dump(author_map, f)

with open("../vector_stores/title_map.pkl", "wb") as f:
    pickle.dump(title_map, f)

print("Maps saved to disk.")