Steps to Implement FAISS for Retrieval-Based Search
Instead of Azure Semantic Search, we’ll:

1. Embed the National Code sentences using GIST-Embedding-v0.
2. Index these embeddings in FAISS.
3. Embed the Provincial Code sentences and use FAISS to retrieve the most similar National Code sentence for each one.
4. Classify the difference between each matched pair (Exact Match, Modified, New Addition, Omitted).

In [1]:
pip install sentence-transformers faiss-cpu pandas numpy





[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os

# Set the Hugging Face Hub flag that silences its symlink warning.
# This has to be in the environment before the Hub libraries are imported.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [3]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the encoder and its tokenizer from the local directory only;
# local_files_only=True means no download is attempted.
model_path = "./GIST_Local"
model = AutoModel.from_pretrained(model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
import faiss
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Read the National Code sentences from disk.
national_df = pd.read_csv("national_dataset.csv")

# Keep only the columns that are carried along with every match, and discard
# rows that have no sentence text (there is nothing to embed for them).
national_metadata = national_df.loc[
    :,
    [
        "Division",
        "National Section ID",
        "Section Number",
        "National Subsection ID",
        "Subsection Number",
        "National Article Title",
        "National Sentence Number",
        "National Sentence Text",
    ],
].dropna(subset=["National Sentence Text"])

# Flat (exhaustive) L2 index; its dimension has to match the encoder output width.
embedding_dim = 768  # GIST model outputs 768-dimensional vectors
index = faiss.IndexFlatL2(embedding_dim)

def process_and_index_embeddings(text_list, batch_size=4):
    """Embed ``text_list`` in batches and append the vectors to the global FAISS ``index``.

    Each batch is tokenized (padded, truncated to 512 tokens), run through the
    global ``model`` without gradient tracking, and every text is represented by
    the last-layer hidden state of its first token. Each batch is added to
    ``index`` as soon as it is computed, so the full embedding matrix is never
    accumulated in Python.

    Args:
        text_list: Sequence of strings to embed. An empty sequence adds nothing.
        batch_size: Number of texts per forward pass.

    Returns:
        None. The module-level ``index`` is extended in place, in input order.
    """
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]

        # Tokenization
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # FAISS works on C-contiguous float32 matrices. Casting to float16 only
        # throws precision away (or fails outright on older wrappers), so hand
        # FAISS exactly the dtype and layout it expects.
        first_token_states = outputs.last_hidden_state[:, 0, :].float().cpu().numpy()
        embeddings = np.ascontiguousarray(first_token_states, dtype=np.float32)

        # Add directly to FAISS index (instead of keeping in memory)
        index.add(embeddings)

# Embed every retained national sentence; vectors enter the index in row order,
# so FAISS ids line up with row positions of the metadata saved below.
national_sentences = national_metadata["National Sentence Text"].to_list()
process_and_index_embeddings(text_list=national_sentences, batch_size=4)  # Smaller batch size

# Persist the index and the matching metadata side by side.
faiss.write_index(index, "national_code_index.faiss")
national_metadata.to_csv("national_metadata.csv", index=False)

print("🦅 National Code indexed successfully in FAISS! 🦅")

🦅 National Code indexed successfully in FAISS! 🦅


In [5]:
# Read the provincial (Alberta) sentences from disk.
provincial_df = pd.read_csv("alberta_dataset.csv")

# Keep the columns reported with each match and drop rows without sentence text.
provincial_metadata = provincial_df.loc[
    :,
    [
        "Division",
        "P/T Section ID",
        "Section Number",
        "P/T Subsection ID",
        "Subsection Number",
        "P/T Article Title",
        "P/T Sentence Number",
        "P/T Sentence Text",
    ],
].dropna(subset=["P/T Sentence Text"])

def get_embeddings(text_list, batch_size=4):
    """Embed ``text_list`` in batches and return all vectors as one matrix.

    Each batch is tokenized (padded, truncated to 512 tokens), run through the
    global ``model`` without gradient tracking, and every text is represented by
    the last-layer hidden state of its first token.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A C-contiguous float32 array with one row per input text, in input
        order. For an empty ``text_list`` the result has zero rows and
        ``model.config.hidden_size`` columns.
    """
    batches = []

    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]

        # Tokenization
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        batches.append(outputs.last_hidden_state[:, 0, :].float().cpu().numpy())

    if not batches:
        # np.vstack raises on an empty list; return a well-formed empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # FAISS works on C-contiguous float32 matrices; a float16 cast would only
    # lose precision before the vectors are searched.
    return np.ascontiguousarray(np.vstack(batches), dtype=np.float32)

# Embed every retained provincial sentence, in row order.
provincial_sentences = provincial_metadata["P/T Sentence Text"].tolist()
provincial_embeddings = get_embeddings(provincial_sentences, batch_size=4)  # Smaller batch size

# Reload the persisted index and its metadata; row position in the metadata
# is the FAISS id of the corresponding vector.
index = faiss.read_index("national_code_index.faiss")
national_metadata = pd.read_csv("national_metadata.csv")
assert index.ntotal == len(national_metadata), "FAISS index and national_metadata.csv are out of sync"

# FAISS expects C-contiguous float32 queries, whatever dtype the embeddings arrived in.
query_vectors = np.ascontiguousarray(provincial_embeddings, dtype=np.float32)
D, I = index.search(query_vectors, 1)  # 1 nearest neighbor
best_match_idx = I[:, 0]

# FAISS reports -1 when it has no neighbour to return; iloc[-1] would silently
# pick the last national row, so fail loudly instead.
if (best_match_idx < 0).any():
    raise ValueError("FAISS returned no neighbour for at least one provincial sentence")

# Join each provincial row with its best national match by position.
# NOTE: "Division", "Section Number" and "Subsection Number" exist on both sides,
# so those names appear twice (provincial first, national second); the names are
# kept unchanged to preserve the output schema.
matched_df = pd.concat(
    [
        provincial_metadata.reset_index(drop=True),
        national_metadata.iloc[best_match_idx].reset_index(drop=True),
    ],
    axis=1,
)
# Distances as returned by the index search (squared L2 for an IndexFlatL2),
# stored as float64 so the CSV keeps the same precision as before.
matched_df["Distance"] = D[:, 0].astype(np.float64)
matched_df.to_csv("retrieved_similar_sentences_with_metadata.csv", index=False)

print("🐐 Optimized: Similar sentences retrieved with metadata! 🐐")

🐐 Optimized: Similar sentences retrieved with metadata! 🐐


In [6]:
from difflib import SequenceMatcher
import pandas as pd

# Reload the retrieval results from the CSV written by the previous cell, so
# classification works from the file on disk rather than from kernel state.
matched_df = pd.read_csv("retrieved_similar_sentences_with_metadata.csv")

def classify_change(nat_text, pt_text, similarity_threshold=0.8):
    """Label how a provincial sentence relates to its retrieved national sentence.

    Both texts are converted to ``str`` and stripped once; every comparison
    (equality, similarity ratio, containment) then uses the stripped form, so
    leading/trailing whitespace never influences the label.

    Args:
        nat_text: National sentence text, or a missing value (NaN/None).
        pt_text: Provincial/territorial sentence text, or a missing value.
        similarity_threshold: ``SequenceMatcher`` ratio above which two
            non-identical texts count as a modification of each other.

    Returns:
        One of:
        - "Missing Data": either text is missing.
        - "Exact Match": the texts are equal after stripping.
        - "Modified": the similarity ratio exceeds ``similarity_threshold``.
        - "Omitted": the provincial text is wholly contained in the national text.
        - "New Addition": none of the above.
    """
    if pd.isna(nat_text) or pd.isna(pt_text):
        return "Missing Data"

    # Normalise once so all checks below agree on what is being compared;
    # str() also protects against non-string cells (e.g. numbers parsed from CSV).
    nat_clean = str(nat_text).strip()
    pt_clean = str(pt_text).strip()

    if nat_clean == pt_clean:
        return "Exact Match"

    similarity = SequenceMatcher(None, nat_clean, pt_clean).ratio()
    if similarity > similarity_threshold:
        return "Modified"

    # Low similarity: a provincial text that is a fragment of the national one
    # has dropped content; anything else is treated as new material.
    if pt_clean in nat_clean:
        return "Omitted"
    return "New Addition"

# Label every matched pair; map() walks the two text columns in lockstep.
matched_df["Change Type"] = list(
    map(
        classify_change,
        matched_df["National Sentence Text"],
        matched_df["P/T Sentence Text"],
    )
)

# Write the labelled matches next to the unlabelled retrieval output.
matched_df.to_csv("retrieved_similar_sentences_with_changes.csv", index=False)

print("🐍 Optimized: Changes stored in retrieved_similar_sentences_with_changes.csv! 🐍")

🐍 Optimized: Changes stored in retrieved_similar_sentences_with_changes.csv! 🐍
