In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [1]:
import json
from operator import itemgetter

import faiss
import numpy as np
import psycopg2
from pgvector.psycopg2 import register_vector
from sentence_transformers import SentenceTransformer


# Pgvector
- Uses a Sentence Transformers model to generate the embeddings
- More example code: [pgvector-python sentence-transformers example](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py)

In [14]:
class VectorStore():
    """Small pgvector-backed document store that embeds text with a SentenceTransformer model.

    Rows are written to and read from the ``items`` table (columns ``name``,
    ``content`` and ``embedding384``). The table is only inserted into and
    selected from here, so it has to exist before the store is used.

    The store also works as a context manager, which guarantees the cursor and
    connection are released::

        with VectorStore() as store:
            store.add_document("some text")

    NOTE: the default credentials are only suitable for a local demo database;
    pass real values explicitly instead of editing the defaults.
    """

    _INSERT_SQL = "INSERT INTO items (name, content, embedding384) VALUES (%s, %s, %s)"

    # `<->` is pgvector's L2 (Euclidean) distance operator.
    _L2_SEARCH_SQL = """
        SELECT content, embedding384 <-> %s AS distance
        FROM items
        ORDER BY distance
        LIMIT %s
    """

    # `<=>` is pgvector's cosine distance operator.
    _COSINE_SEARCH_SQL = """
        SELECT content, embedding384 <=> %s AS cosine_distance
        FROM items
        ORDER BY cosine_distance
        LIMIT %s
    """

    def __init__(self, database="rag_db", user="postgres", password="password", model="all-MiniLM-L6-v2",
                 host="localhost", port="5432"):
        """Connect to PostgreSQL, register the vector type and load the embedding model.

        Args:
            database: Name of the database to connect to.
            user: Database user.
            password: Password for ``user``.
            model: SentenceTransformer model name or path.
            host: Database host (previously fixed to ``"localhost"``).
            port: Database port (previously fixed to ``"5432"``).

        Raises:
            Exception: whatever ``psycopg2.connect``, ``register_vector`` or
                ``SentenceTransformer`` raise; the connection is closed first
                if it had already been opened.
        """
        self.conn = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)
        try:
            self.cur = self.conn.cursor()
            register_vector(self.conn)
            self.model = SentenceTransformer(model)
        except Exception:
            # Do not leak the open connection when a later setup step fails.
            self.conn.close()
            raise

    def __enter__(self):
        """Return the store itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the cursor and connection; exceptions are not suppressed."""
        self.terminate()

    def terminate(self):
        """Close the cursor and then the connection.

        The connection is closed even if closing the cursor raises.
        """
        try:
            self.cur.close()
        finally:
            self.conn.close()

    def get_embedding(self, text):
        """Return ``self.model.encode(text)``, the embedding of ``text``."""
        return self.model.encode(text)

    def add_document(self, content):
        """Embed ``content`` and insert it into ``items`` under the name ``"demo"``.

        The insert is committed immediately. If the insert or the commit fails,
        the transaction is rolled back so the connection stays usable, and the
        original exception is re-raised.
        """
        embedding = self.get_embedding(content)
        try:
            self.cur.execute(self._INSERT_SQL, ("demo", content, embedding))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    def _run_search(self, sql, query, limit):
        """Embed ``query``, run ``sql`` with ``(embedding, limit)`` and return all rows."""
        query_embedding = self.get_embedding(query)
        try:
            self.cur.execute(sql, (query_embedding, limit))
            return self.cur.fetchall()
        except Exception:
            # A failed statement aborts the current transaction; without a
            # rollback every later statement on this connection would fail too.
            self.conn.rollback()
            raise

    def search_documents_by_distance(self, query, limit=5):
        """Return up to ``limit`` ``(content, distance)`` rows, nearest first by L2 distance."""
        return self._run_search(self._L2_SEARCH_SQL, query, limit)

    def search_documents_by_cosine_distance(self, query, limit=5):
        """Return up to ``limit`` ``(content, cosine_distance)`` rows, nearest first by cosine distance."""
        return self._run_search(self._COSINE_SEARCH_SQL, query, limit)
    
# Shared store instance used by the cells below
vec = VectorStore(
    database="rag_db",
    user="postgres",
    password="password",
    model="all-MiniLM-L6-v2",
)

In [3]:
# Load the sample documents. The sentences contain non-ASCII characters
# (e.g. "é", "’"), so decode the JSON explicitly as UTF-8 instead of relying
# on the platform's default encoding.
with open("data/sample.json", "r", encoding="utf-8") as f:
    sample_docs = json.load(f)["docs"]
print(sample_docs)

# NOTE: every run issues a fresh INSERT per document, so re-running this cell
# stores the same sentences again (unless the table itself enforces uniqueness).
for doc in sample_docs:
    vec.add_document(doc)

['Ash finally became the Pokémon Champion after years of hard work and determination.', 'Pikachu’s Thunderbolt attack saved the day during the intense battle.', 'The bond between trainers and their Pokémon is truly inspiring and heartwarming.', 'Team Rocket’s schemes often lead to chaos and trouble for innocent Pokémon.', 'A trainer abandoned their Pokémon, leaving it heartbroken and confused.', 'The forest was destroyed after a fierce battle between two powerful legendary Pokémon.', 'Charizard’s wings spread wide as it soared high above the clouds.', 'Trainers gathered in the arena, ready for the upcoming tournament.', 'A Poké Ball rolled to a stop on the ground, the capture uncertain.']


In [12]:
# Nearest documents by L2 (Euclidean) distance; a smaller distance means a closer match
search_query = "Tell me more about pokeball"
results = vec.search_documents_by_distance(search_query)
print(f"Search results for: '{search_query}'")
for i, row in enumerate(results, start=1):
    content, distance = row
    print(f"{i}. {content} (Distance: {distance:.4f})")

Search results for: 'Tell me more about pokeball'
1. A Poké Ball rolled to a stop on the ground, the capture uncertain. (Distance: 0.9112)
2. Team Rocket’s schemes often lead to chaos and trouble for innocent Pokémon. (Distance: 1.0872)
3. Ash finally became the Pokémon Champion after years of hard work and determination. (Distance: 1.1024)
4. The bond between trainers and their Pokémon is truly inspiring and heartwarming. (Distance: 1.1175)
5. A trainer abandoned their Pokémon, leaving it heartbroken and confused. (Distance: 1.1713)


In [18]:
# Nearest documents by cosine distance; a smaller distance means a closer match
search_query = "Tell me more about pokeball"
results = vec.search_documents_by_cosine_distance(search_query)
print(f"Search results for: '{search_query}'")
for i, row in enumerate(results, start=1):
    content, distance = row
    print(f"{i}. {content} (Cosine Distance: {distance:.4f})")

Search results for: 'Tell me more about pokeball'
1. A Poké Ball rolled to a stop on the ground, the capture uncertain. (Cosine Distance: 0.4152)
2. Team Rocket’s schemes often lead to chaos and trouble for innocent Pokémon. (Cosine Distance: 0.5910)
3. Ash finally became the Pokémon Champion after years of hard work and determination. (Cosine Distance: 0.6076)
4. The bond between trainers and their Pokémon is truly inspiring and heartwarming. (Cosine Distance: 0.6244)
5. A trainer abandoned their Pokémon, leaving it heartbroken and confused. (Cosine Distance: 0.6860)


# FAISS (IndexFlatL2, IndexIVFFlat, and IndexIVFPQ)
- [FAISS vector store](https://medium.com/@amrita.thakur/understanding-faiss-vector-store-and-its-advantages-cdc7b54afe47#:~:text=In%20the%20age%20of%20information,of%20dense%20vectors%20at%20scale.)
- [https://www.pinecone.io/learn/series/faiss/faiss-tutorial/](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)
- [Similarity Search](https://www.datacamp.com/blog/faiss-facebook-ai-similarity-search)

In [3]:
# Toy corpus for the FAISS demo: 20 one-sentence descriptions of machine-learning
# concepts. A sentence's position in this list is the id FAISS returns for it.
sentence_list = [
    "Supervised learning involves training a model using labeled data to make predictions.",
    "Unsupervised learning helps identify patterns and structures in data without explicit labels.",
    "Neural networks are designed to mimic the way the human brain processes information.",
    "Decision trees are used to create models that make predictions based on a series of rules.",
    "Reinforcement learning teaches an agent to make optimal decisions through trial and error.",
    "Overfitting occurs when a model performs well on training data but poorly on new data.",
    "Transfer learning allows models to leverage knowledge learned from one task for another.",
    "Gradient descent is a common optimization algorithm for training neural networks.",
    "Convolutional neural networks (CNNs) are widely used for image recognition and classification.",
    "Natural language processing (NLP) enables machines to understand and process human language.",
    "Feature engineering improves model performance by selecting or transforming relevant data attributes.",
    "Support vector machines (SVMs) are used for classification tasks by finding the best separating boundary.",
    "K-means clustering groups similar data points into clusters without predefined labels.",
    "Principal component analysis (PCA) reduces the dimensionality of large datasets.",
    "Anomaly detection identifies unusual patterns or outliers in data using machine learning techniques.",
    "Generative adversarial networks (GANs) create realistic images, text, or audio by training two competing models.",
    "Hyperparameter tuning optimizes model performance by adjusting key training parameters.",
    "Data augmentation enhances training datasets by applying transformations like rotation or flipping.",
    "Bayesian inference provides probabilistic approaches for updating model predictions based on new data.",
    "Explainable AI (XAI) helps interpret and understand complex machine learning models for transparency and accountability."
]

In [4]:
# Create sentence embeddings
# All sentences are encoded in a single call; the displayed shape is
# (number of sentences, embedding dimension).
model = SentenceTransformer("all-MiniLM-L6-v2")
sentence_embeddings = model.encode(sentence_list)
sentence_embeddings.shape

(20, 384)

In [5]:
# Create index store
# The index is sized by the embedding dimension (second axis of the embeddings array).
index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
# Displays whether the index is ready to accept vectors (True for this index right after construction)
index.is_trained

True

In [6]:
# Add to index store
# Clear the index first so re-running this cell does not append the same
# vectors a second time (which would make searches return duplicate hits).
index.reset()
index.add(sentence_embeddings)
index.ntotal

20

In [28]:
k = 4 # top k
# NOTE(review): the query text probably means "neural networks"; it is left
# unchanged because the recorded output below was produced with this exact string.
xq = model.encode(["How does neutral networks works?"])
# D holds the distances (IndexFlatL2 reports squared L2), I the matching
# positions in sentence_list; one row per query.
D, I = index.search(xq, k)

# FAISS pads the ids with -1 when fewer than k vectors are indexed; drop those
# so they are not mistaken for "last element" indices.
hit_ids = [idx for idx in I[0].tolist() if idx >= 0]

# Get vector
if hit_ids:
    print(index.reconstruct(hit_ids[0]).shape)

print(f"Distance: {D}")
print(f"List index: {I}")
# Join outside the f-string: nested double quotes and backslashes inside a
# replacement field are only accepted from Python 3.12 onwards.
matched_sentences = "\n".join(sentence_list[idx] for idx in hit_ids)
print(f"\nSentence: \n{matched_sentences}")

(384,)
Distance: [[1.575367  1.5869488 1.5956731 1.6780944]]
List index: [[ 2 19  7  6]]

Sentence: 
Neural networks are designed to mimic the way the human brain processes information.
Explainable AI (XAI) helps interpret and understand complex machine learning models for transparency and accountability.
Gradient descent is a common optimization algorithm for training neural networks.
Transfer learning allows models to leverage knowledge learned from one task for another.
