# Building AI-powered search engine in PostgreSQL and pgvector

In [12]:
import os
from dotenv import load_dotenv
from langchain.vectorstores.pgvector import PGVector
import psycopg2
import numpy as np
from PIL import Image
import pandas as pd
from fashion_clip.fashion_clip import FashionCLIP


bin C:\Users\rusla\.conda\envs\textgen\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


## Loading Data

Let's now FashionCLIP first. It's going to take a couple of minutes to download the model from the HuggingFace Hub.

In [13]:
%%capture
fclip = FashionCLIP('fashion-clip')

In [14]:
#to get the current working directory
directory = os.getcwd()
# Join various path components
path=os.path.join(directory, "data_for_fashion_clip", "subset_data.csv")

In [15]:
subset = pd.read_csv(path)

In [16]:
subset.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,176754003,176754,2 Row Braided Headband (1),74,Hair/alice band,Accessories,1010016,Solid,17,Yellowish Brown,...,Hair Accessories,C,Ladies Accessories,1,Ladieswear,66,Womens Small accessories,1019,Accessories,Two-strand hairband with braids in imitation s...
2,189634031,189634,Long Leg Leggings,273,Leggings/Tights,Garment Lower body,1010016,Solid,93,Dark Green,...,Basic 1,D,Divided,2,Divided,51,Divided Basics,1002,Jersey Basic,Leggings in stretch jersey with an elasticated...


In [17]:
subset.to_csv("subset_data.csv", index=False)
f"There are {len(subset)} elements in the dataset"

'There are 3104 elements in the dataset'

##  Task 1: Creating a products table with vector data type and ingesting the data using PGVector


In [18]:
# Assuming you have already obtained the image and text embeddings
images = ["data_for_fashion_clip/" + str(k) + ".jpg" for k in subset["article_id"].tolist()]
texts = subset["detail_desc"].tolist()

# Create image embeddings and text embeddings
image_embeddings = fclip.encode_images(images, batch_size=32)
text_embeddings = fclip.encode_text(texts, batch_size=32)

# Normalize the embeddings to unit norm
image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)

100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [06:08<00:00,  3.80s/it]


Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [00:10<00:00,  9.63it/s]


## Task 2 : Connection to the Server

In [None]:
# Load the .env file
load_dotenv()

# Get the values from the .env file
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
database = os.getenv("DB_NAME")
server = "af651cca01b154fe28a0df0167cad5a7-844854289.us-east-2.elb.amazonaws.com"
# Construct the connection string
CONNECTION_STRING = f"postgresql+psycopg://{user}:{password}@{server}:5432/{database}"
# Establish connection to PostgreSQL
conn = psycopg2.connect(CONNECTION_STRING)
    

In [56]:
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

1it [00:00, 21.97it/s]


In [47]:
import psycopg2
# Establish connection to the PostgreSQL Docker container
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="vectordb",
    password="vectordb",
    database="vectordb"
)

In [48]:
# Create a cursor object
cursor = conn.cursor()

# Execute a test query
cursor.execute("SELECT version();")

# Fetch the result
result = cursor.fetchone()
print("Connection successful!")
print("PostgreSQL version:", result[0])

# Close the cursor and connection
cursor.close()
#conn.close()

Connection successful!
PostgreSQL version: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [49]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
#embeddings = OpenAIEmbeddings()
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [59]:
from sentence_transformers import SentenceTransformer

# Define the model
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(embeddings_model_name)

# Sample text for embedding
#text_to_embed = "This is a sample text for embedding."

# Get the embedding for the text
#embedding = model.encode([text_to_embed])

# The 'embedding' variable now contains the vector representation of the input text
#print("Embedding:", embedding)

In [51]:
# Create a PGVector instance with the appropriate embedding function
#pgv = PGVector(conn)

In [52]:
# Define the embedding function
def function_fclip(vector):
    return fclip.encode_images(vector[0], batch_size=32)[0], fclip.encode_text(vector[1], batch_size=32)[0]



In [53]:
image_embeddings.shape

(3104, 512)

In [54]:
text_embeddings.shape

(3104, 512)

In [55]:
# Create a PGVector instance with the embedding function
pgv = PGVector(conn, embedding_function=embeddings)

ArgumentError: Expected string or URL object, got <connection object at 0x00000205CEC23670; dsn: 'user=vectordb password=xxx dbname=vectordb host=localhost port=5432', closed: 0>

In [None]:
# Create a products table with a vector column for embeddings
pgv.create_table("products", {"embedding": "vector"})

# Insert text embeddings into the products table
for text_embedding in text_embeddings:
    pgv.insert("products", {"embedding": text_embedding})


# Insert image embeddings into the products table
for image_embedding in image_embeddings:
    pgv.insert("products", {"embedding": image_embedding})


# Create an index for similarity search
pgv.create_index("products", "embedding", "ivfflat")

# Close the connection
conn.close()

In [None]:
# Task 2: Creating a product search engine

# Establish connection to PostgreSQL
conn = psycopg2.connect(CONNECTION_STRING)

# Create a PGVector instance
pgv = PGVector(conn)

# Embed the textual query
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

# Perform a similarity search on the products table
results = pgv.search("products", text_embedding, limit=10)

# Get the IDs of the matched objects
matched_ids = [result["id"] for result in results]

# Load the corresponding images
images = []
for id in matched_ids:
    image_path = f"data_for_fashion_clip/{id}.jpg"
    image = Image.open(image_path)
    images.append(image)

# Rank the images by similarity with the query vector
similarity_scores = np.dot(results["embedding"], text_embedding)
ranked_images = [image for _, image in sorted(zip(similarity_scores, images), reverse=True)]

# Close the connection
conn.close()