# Building an AI-powered search engine with PostgreSQL and pgvector

In [12]:
import os
from dotenv import load_dotenv
from langchain.vectorstores.pgvector import PGVector
import psycopg2
import numpy as np
from PIL import Image
import pandas as pd
from fashion_clip.fashion_clip import FashionCLIP


bin C:\Users\rusla\.conda\envs\textgen\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


## Loading Data

Let's load FashionCLIP first. Downloading the model from the Hugging Face Hub takes a couple of minutes.

In [13]:
%%capture
# Instantiate FashionCLIP with the 'fashion-clip' model name; %%capture suppresses this cell's output
fclip = FashionCLIP('fashion-clip')

In [14]:
# Locate subset_data.csv inside the data_for_fashion_clip folder of the current working directory
directory = os.getcwd()
path = os.path.join(directory, *("data_for_fashion_clip", "subset_data.csv"))

In [15]:
# Read the product metadata CSV located at `path` into a DataFrame
subset = pd.read_csv(path)

In [16]:
subset.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,176754003,176754,2 Row Braided Headband (1),74,Hair/alice band,Accessories,1010016,Solid,17,Yellowish Brown,...,Hair Accessories,C,Ladies Accessories,1,Ladieswear,66,Womens Small accessories,1019,Accessories,Two-strand hairband with braids in imitation s...
2,189634031,189634,Long Leg Leggings,273,Leggings/Tights,Garment Lower body,1010016,Solid,93,Dark Green,...,Basic 1,D,Divided,2,Divided,51,Divided Basics,1002,Jersey Basic,Leggings in stretch jersey with an elasticated...


In [17]:
# Write a copy of the subset to subset_data.csv in the working directory, without the index column
subset.to_csv("subset_data.csv", index=False)
# Last expression of the cell: the row count is shown as the cell output
f"There are {len(subset)} elements in the dataset"

'There are 3104 elements in the dataset'

##  Task 1: Creating a products table with vector data type and ingesting the data using PGVector


In [18]:
# Build the model inputs: one image path per article id, plus the description text of each row
images = [f"data_for_fashion_clip/{k}.jpg" for k in subset["article_id"].tolist()]
texts = subset["detail_desc"].tolist()

# Encode both modalities with FashionCLIP
image_embeddings = fclip.encode_images(images, batch_size=32)
text_embeddings = fclip.encode_text(texts, batch_size=32)

# Scale every row to unit L2 norm
image_norms = np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_norms = np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
image_embeddings = image_embeddings / image_norms
text_embeddings = text_embeddings / text_norms

100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [06:08<00:00,  3.80s/it]


Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [00:10<00:00,  9.63it/s]


## Task 2 : Connection to the Server

In [None]:
# Load DB_USER / DB_PASSWORD / DB_NAME from the .env file into the environment
load_dotenv()

# Read the credentials from the environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
database = os.getenv("DB_NAME")
server = "af651cca01b154fe28a0df0167cad5a7-844854289.us-east-2.elb.amazonaws.com"

# SQLAlchemy-style URL (note the "+psycopg" driver suffix); this is the form handed to PGVector
CONNECTION_STRING = f"postgresql+psycopg://{user}:{password}@{server}:5432/{database}"

# psycopg2 cannot parse the "+psycopg" URL above (it raises "invalid dsn"),
# so open the raw connection with keyword arguments instead
conn = psycopg2.connect(
    host=server,
    port=5432,
    user=user,
    password=password,
    dbname=database,
)
    

In [56]:
# Embed a sample text query; [0] selects the single resulting vector (not normalized in this cell)
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

1it [00:00, 21.97it/s]


In [90]:
def test_connection(conn):
    # Create a cursor object
    cursor = conn.cursor()
    # Execute a test query
    cursor.execute("SELECT version();")
    # Fetch the result
    result = cursor.fetchone()
    print("Connection successful!")
    print("PostgreSQL version:", result[0])
    # Close the cursor and connection
    cursor.close()
    #conn.close()

In [184]:
import psycopg2

# Open a connection to the PostgreSQL Docker container and verify it
conn = psycopg2.connect(**{
    "host": "localhost",
    "port": 5432,
    "user": "testuser",
    "password": "testpwd",
    "database": "vectordb",
})
test_connection(conn)

Connection successful!
PostgreSQL version: PostgreSQL 15.4 (Debian 15.4-2.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [194]:
import psycopg2

# Settings for the local PostgreSQL Docker container (hard-coded here, not read from .env)
user = "testuser"
password = "testpwd"
database = "vectordb"
server = "localhost"

# Assemble a key=value DSN string and connect with it
CONNECTION_STRING = " ".join([
    f"host={server}",
    "port=5432",
    f"user={user}",
    f"password={password}",
    f"dbname={database}",
])
conn = psycopg2.connect(CONNECTION_STRING)
test_connection(conn)

Connection successful!
PostgreSQL version: PostgreSQL 15.4 (Debian 15.4-2.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [195]:
import os
from dotenv import load_dotenv

load_dotenv()  # loads .env into the environment; the settings below are hard-coded regardless

user, password, database = "testuser", "testpwd", "vectordb"
#server = "af651cca01b154fe28a0df0167cad5a7-844854289.us-east-2.elb.amazonaws.com"
server = "localhost"

# SQLAlchemy-style URL built from the settings above
CONNECTION_STRING = "postgresql+psycopg://{}:{}@{}:5432/{}".format(user, password, server, database)
print(CONNECTION_STRING)  # NOTE: this echoes the password into the notebook output

# Checks the already-open `conn` object, not the URL printed above
test_connection(conn)

postgresql+psycopg://testuser:testpwd@localhost:5432/vectordb
Connection successful!
PostgreSQL version: PostgreSQL 15.4 (Debian 15.4-2.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [134]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
#embeddings = OpenAIEmbeddings()
# Sentence-transformers model wrapped in langchain's HuggingFaceEmbeddings
# NOTE(review): `embeddings` is not referenced by any other cell visible in this notebook — confirm it is still needed
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [135]:
from sentence_transformers import SentenceTransformer

# Define the model
# NOTE(review): `model` is not referenced by any other cell visible in this notebook — confirm it is still needed
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(embeddings_model_name)

# Sample text for embedding
#text_to_embed = "This is a sample text for embedding."

# Get the embedding for the text
#embedding = model.encode([text_to_embed])

# The 'embedding' variable now contains the vector representation of the input text
#print("Embedding:", embedding)

In [136]:
# Create a PGVector instance with the appropriate embedding function
#pgv = PGVector(conn)

In [137]:
# Define the embedding function
def function_fclip(vector):
    """Embed an (images, texts) pair with FashionCLIP.

    ``vector[0]`` is passed to ``fclip.encode_images`` and ``vector[1]`` to
    ``fclip.encode_text``. The first embedding of each result is returned as
    an ``(image_embedding, text_embedding)`` tuple.
    """
    first_image = fclip.encode_images(vector[0], batch_size=32)[0]
    first_text = fclip.encode_text(vector[1], batch_size=32)[0]
    return first_image, first_text

In [138]:
#image_embeddings.shape

In [217]:
# Define the embedding function
def my_embedding_function(text):
    """Return the FashionCLIP text embedding of one string, scaled to unit L2 norm."""
    batch = fclip.encode_text([text], batch_size=1)
    # Divide each row by its own L2 norm
    norms = np.linalg.norm(batch, ord=2, axis=-1, keepdims=True)
    return (batch / norms)[0]
to_find = my_embedding_function("Blue jacket")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.67it/s]


In [219]:
# Shape of the "Blue jacket" query embedding held in to_find
to_find.shape

(512,)

In [149]:
# Create a PGVector instance with the custom embedding function
# NOTE(review): langchain's PGVector documents `embedding_function` as an Embeddings object
# (embed_documents / embed_query); a bare callable may fail once the store is used — confirm.
pgv = PGVector(CONNECTION_STRING, embedding_function=my_embedding_function)

In [150]:
#CONNECTION_STRING = f"host={server} port=5432 user={user} password={password} dbname={database}"

In [151]:
CONNECTION_STRING

'postgresql+psycopg://testuser:testpwd@localhost:5432/vectordb'

In [152]:
#CONNECTION_STRING = f"postgresql+psycopg://{user}:{password}@{server}:5432/{database}"


In [158]:
text_embedding = my_embedding_function("Hello world")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.43it/s]


In [204]:
import psycopg2

# Establish connection to the PostgreSQL Docker container
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="testuser",
    password="testpwd",
    database="vectordb"
)
# Create a cursor object to execute SQL queries
cursor = conn.cursor()

try:
    # Recreate the products table from scratch
    drop_table_query = "DROP TABLE IF EXISTS products;"
    cursor.execute(drop_table_query)
    conn.commit()

    create_table_query = """\
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector,
    text_embedding vector
);
"""
    cursor.execute(create_table_query)
    conn.commit()

    insert_data_query = "INSERT INTO products (image_embedding, text_embedding) VALUES (%s, %s)"

    # Insert the first 100 embedding pairs; each row is its own transaction so one bad row
    # does not discard the rows already stored
    for image_embedding, text_embedding in zip(image_embeddings[:100], text_embeddings[:100]):
        try:
            cursor.execute(insert_data_query, (image_embedding.tolist(), text_embedding.tolist()))
            conn.commit()
        except Exception as e:
            print(f"Error inserting data: {e}")
            # Rollback the transaction in case of an error
            conn.rollback()
finally:
    # Release the cursor and connection even if a statement above raised
    cursor.close()
    conn.close()

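Version 2: the same table setup and insert, but the cursor and connection are left open so the next cell can create the index.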

In [207]:
import psycopg2
from pgvector.psycopg2 import register_vector

# Connect to the PostgreSQL Docker container; cursor and connection stay open after this cell
conn = psycopg2.connect(host="localhost", port=5432, user="testuser", password="testpwd", database="vectordb")
cursor = conn.cursor()

drop_table_query = "DROP TABLE IF EXISTS products;"
create_table_query = """\
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector,
    text_embedding vector
);
"""
insert_data_query = "INSERT INTO products (image_embedding, text_embedding) VALUES (%s, %s)"

# Drop and recreate the table, committing after each statement
for ddl in (drop_table_query, create_table_query):
    cursor.execute(ddl)
    conn.commit()

# Store the first 100 embedding pairs, one committed transaction per row
for image_embedding, text_embedding in zip(image_embeddings[:100], text_embeddings[:100]):
    try:
        row_values = (image_embedding.tolist(), text_embedding.tolist())
        cursor.execute(insert_data_query, row_values)
        conn.commit()
    except Exception as e:
        print(f"Error inserting data: {e}")
        # Undo the failed row so the connection stays usable
        conn.rollback()

In [206]:
# Create an index for similarity search using L2 distance
# NOTE: this attempt fails with "column does not have dimensions" (see the recorded output):
# the index needs text_embedding declared with an explicit size, e.g. vector(512), not plain `vector`.
create_index_query = "CREATE INDEX ON products USING ivfflat (text_embedding vector_l2_ops) WITH (lists = 100);"
cursor.execute(create_index_query)

# Vacuum and analyze the table for optimal performance
vacuum_analyze_query = "VACUUM ANALYZE products;"
cursor.execute(vacuum_analyze_query)

# Close the cursor and connection
cursor.close()
conn.close()

InternalError_: column does not have dimensions


In [209]:
text_embedding.shape

(512,)

In [214]:
image_embedding.shape

(512,)

In [211]:
import psycopg2
from pgvector.psycopg2 import register_vector

# Open a connection to the PostgreSQL Docker container and a cursor for the statements below
conn = psycopg2.connect(host="localhost", port=5432, user="testuser", password="testpwd", database="vectordb")
cursor = conn.cursor()

drop_table_query = "DROP TABLE IF EXISTS products;"
create_table_query = """\
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector(512),
    text_embedding vector(512) -- Specify the dimensions of the vector
);
"""

# Rebuild the table with fixed-size vector columns, committing after each statement
for ddl in (drop_table_query, create_table_query):
    cursor.execute(ddl)
    conn.commit()

In [212]:
# Parameterized INSERT used for every row
insert_data_query = "INSERT INTO products (image_embedding, text_embedding) VALUES (%s, %s)"

# Store the first 100 (image, text) embedding pairs, one committed transaction per row
for image_embedding, text_embedding in zip(image_embeddings[:100], text_embeddings[:100]):
    try:
        row_values = (image_embedding.tolist(), text_embedding.tolist())
        cursor.execute(insert_data_query, row_values)
        conn.commit()
    except Exception as e:
        print(f"Error inserting data: {e}")
        # Undo the failed row so the connection stays usable for the next one
        conn.rollback()

In [213]:
# Create an index for similarity search using L2 distance
create_index_query = "CREATE INDEX ON products USING ivfflat (text_embedding vector_l2_ops) WITH (lists = 100);"
cursor.execute(create_index_query)

# Vacuum and analyze the table for optimal performance
# NOTE: this attempt fails with "VACUUM cannot run inside a transaction block" (see the recorded
# output): the preceding execute leaves a transaction open on this connection. Commit and switch
# the connection to autocommit before issuing VACUUM.
vacuum_analyze_query = "VACUUM ANALYZE products;"
cursor.execute(vacuum_analyze_query)

# Close the cursor and connection
cursor.close()
conn.close()

ActiveSqlTransaction: VACUUM cannot run inside a transaction block


In [237]:
import psycopg2
from pgvector.psycopg2 import register_vector

# Connect to the PostgreSQL Docker container; cursor and connection stay open after this cell
conn = psycopg2.connect(host="localhost", port=5432, user="testuser", password="testpwd", database="vectordb")
cursor = conn.cursor()

drop_table_query = "DROP TABLE IF EXISTS products;"
create_table_query = """\
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector(512), -- Specify the shape of the vector
    text_embedding vector(512) -- Specify the shape of the vector
);
"""
insert_data_query = "INSERT INTO products (image_embedding, text_embedding) VALUES (%s, %s)"

# Rebuild the table with 512-dimensional vector columns, committing after each statement
for ddl in (drop_table_query, create_table_query):
    cursor.execute(ddl)
    conn.commit()

# Store the first 100 embedding pairs, one committed transaction per row
for image_embedding, text_embedding in zip(image_embeddings[:100], text_embeddings[:100]):
    try:
        row_values = (image_embedding.tolist(), text_embedding.tolist())
        cursor.execute(insert_data_query, row_values)
        conn.commit()
    except Exception as e:
        print(f"Error inserting data: {e}")
        # Undo the failed row so the connection stays usable
        conn.rollback()

In [238]:
# Build an IVFFlat index on text_embedding for L2-distance search, then commit it
# NOTE(review): lists = 100 on a table filled with 100 rows is a tuning choice worth confirming
create_index_query = "CREATE INDEX ON products USING ivfflat (text_embedding vector_l2_ops) WITH (lists = 100);"
cursor.execute(create_index_query)
conn.commit()
cursor.close()

# VACUUM cannot run inside a transaction block, so switch the connection to autocommit first
vacuum_analyze_query = "VACUUM ANALYZE products;"
conn.autocommit = True
cursor = conn.cursor()
cursor.execute(vacuum_analyze_query)

# Release the cursor and the connection
cursor.close()
conn.close()

In [239]:
import numpy as np
import psycopg2

# Define the embedding function
def my_embedding_function(text):
    """Embed one string with FashionCLIP and return it scaled to unit L2 norm."""
    encoded = fclip.encode_text([text], batch_size=1)
    unit = encoded / np.linalg.norm(encoded, ord=2, axis=-1, keepdims=True)
    return unit[0]

# Query embedding used by the search cells
data = my_embedding_function("Blue jacket")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 23.26it/s]


In [240]:
# Open a fresh connection to the PostgreSQL Docker container
conn = psycopg2.connect(**{
    "host": "localhost",
    "port": 5432,
    "user": "testuser",
    "password": "testpwd",
    "database": "vectordb",
})

# Cursor used to execute SQL on that connection
cursor = conn.cursor()

In [241]:
type(data)

numpy.ndarray

In [242]:
data.shape

(512,)

In [243]:
import numpy as np
import psycopg2

# Define the embedding function
def my_embedding_function(text):
    """Embed one string with FashionCLIP and return it scaled to unit L2 norm."""
    encoded = fclip.encode_text([text], batch_size=1)
    unit = encoded / np.linalg.norm(encoded, ord=2, axis=-1, keepdims=True)
    return unit[0]

data = my_embedding_function("Blue jacket")
# psycopg2 cannot adapt a numpy array, so keep a plain-list copy for query parameters
data_list = data.tolist()

# Open a connection to the PostgreSQL Docker container
conn = psycopg2.connect(**{
    "host": "localhost",
    "port": 5432,
    "user": "testuser",
    "password": "testpwd",
    "database": "vectordb",
})

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.37it/s]


In [244]:
# Cursor used to inspect the table
cursor = conn.cursor()

# Ask information_schema which columns the products table has
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name = 'products';")
column_names = []
for row in cursor.fetchall():
    column_names.append(row[0])
print("Column Names:", column_names)

Column Names: ['id', 'image_embedding', 'text_embedding']


In [246]:
import numpy as np
import psycopg2

# Define the embedding function
def my_embedding_function(text):
    """Embed one string with FashionCLIP and return it scaled to unit L2 norm."""
    encoded = fclip.encode_text([text], batch_size=1)
    unit = encoded / np.linalg.norm(encoded, ord=2, axis=-1, keepdims=True)
    return unit[0]

data = my_embedding_function("Blue jacket")
# psycopg2 cannot adapt a numpy array, so keep a plain-list copy for query parameters
data_list = data.tolist()

# Open a connection to the PostgreSQL Docker container and a cursor on it
conn = psycopg2.connect(**{
    "host": "localhost",
    "port": 5432,
    "user": "testuser",
    "password": "testpwd",
    "database": "vectordb",
})
cursor = conn.cursor()

# Discard any pending transaction before the query cells run
conn.rollback()

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.70it/s]


In [247]:
# Nearest-neighbour lookup: the two rows whose text_embedding is closest to the query vector
# under the `<->` distance operator; the list parameter is cast to vector explicitly
similarity_query = """
    -- SELECT id, description, text_embedding
    SELECT id, text_embedding
    FROM products
    ORDER BY text_embedding <-> %s::vector
    LIMIT 2;
"""
cursor.execute(similarity_query, (data_list,))

# Materialize the matching rows
results = cursor.fetchall()

In [250]:
# Process and display the results
for result in results:
    product_id = result[0]

    print("Product Item Id:", product_id)
    # The query selects (id, text_embedding), so the second column is the embedding, not a description
    print("Embedding:", result[1])
    print("----------------------")

# Close the cursor and connection
cursor.close()
conn.close()

Product Item Id: 48
Description: [0.023410298,-0.013177148,-0.015617458,0.009955669,-0.01777295,0.030018194,-0.010092812,0.064039744,-0.0076911673,0.017779142,-0.04886508,0.020496665,-0.010309409,-0.02337357,-0.06787995,-0.014375856,-0.040318474,0.022590628,0.004351568,0.00746218,0.008021529,-0.0040085693,0.01916376,-0.00507574,0.013201847,0.023242235,-0.0047811167,-0.026150428,-0.013631476,-0.02263303,-0.0066080065,0.019676488,-0.05809817,0.031445757,0.02649331,-0.036014147,0.053888433,0.06373215,-0.014435454,0.05046232,-0.028136535,-0.021974277,0.038241096,0.013659625,-0.02442254,0.050292015,-0.02547141,0.008108971,0.007807562,-0.02977,-0.055766474,0.034117892,0.028400697,0.026897002,0.09132406,0.043552235,-0.017686795,0.018791791,-0.023812372,-0.0638997,-0.009898719,0.0078092995,0.0046702824,-0.013159851,0.044242352,0.016539695,-0.019756798,-0.0130028585,0.032416962,0.031888053,-0.4391654,-0.023254678,0.0066879787,-0.0005796678,-0.029091904,-0.056399718,-0.022650972,0.026844967,0.02

In [236]:
# Perform similarity search query
# NOTE: the products table built in this notebook only has id, image_embedding and text_embedding;
# the description / descriptions_embeddings columns named here do not exist in it.
# The recorded InFailedSqlTransaction output means an earlier statement on this connection had
# already failed; conn.rollback() clears that state before retrying.
cursor.execute("""
    SELECT id, description, descriptions_embeddings
    FROM products
    ORDER BY descriptions_embeddings <-> %s::vector
    LIMIT 2;
""", (data_list,))

# Fetch the results
results = cursor.fetchall()

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [None]:
# Process and display the results
# Each row is read as a 3-tuple: result[0] is printed as the id, result[1] as the
# description and result[2] as the embedding, so `results` must come from a 3-column query.
for result in results:
    product_id = result[0]
    
    print("Product Item Id:", product_id)
    print("Description:", result[1])
    print("Embedding:", result[2])
    print("----------------------")

# Close the cursor and connection
cursor.close()
conn.close()

In [228]:
from psycopg2.extensions import adapt


In [230]:
data_list = data.tolist()  # Convert numpy.ndarray to list
# Perform similarity search query
# NOTE: fails with UndefinedColumn (see the recorded output): the products table built in this
# notebook has no url, description or descriptions_embeddings columns.
cursor.execute("""
    SELECT id, url, description, descriptions_embeddings
    FROM products
    ORDER BY descriptions_embeddings <-> %s::vector
    LIMIT 2;
""", (data_list,))

# Fetch the results
results = cursor.fetchall()

UndefinedColumn: column "url" does not exist
LINE 2:     SELECT id, url, description, descriptions_embeddings
                       ^


In [229]:

# Perform similarity search query
# NOTE: fails with "can't adapt type 'numpy.ndarray'" (see the recorded output): `data` is a
# numpy array and psycopg2 has no adapter for it. The working query passes data.tolist() instead.
cursor.execute("""
    SELECT id, url, description, descriptions_embeddings
    FROM products
    ORDER BY descriptions_embeddings <-> %s::vector
    LIMIT 2;
""", (adapt(data),))
# Fetch the results
results = cursor.fetchall()


ProgrammingError: can't adapt type 'numpy.ndarray'

In [223]:
# Perform similarity search query
# NOTE: fails with "can't adapt type 'numpy.ndarray'" (see the recorded output): the parameter is
# a numpy array, which psycopg2 cannot adapt. The working query passes a plain list with a
# ::vector cast.
cursor.execute("""
    SELECT id, url, description, descriptions_embeddings
    FROM products
    ORDER BY descriptions_embeddings <-> %s
    LIMIT 2;
""", (np.array(data),))

# Fetch the results
results = cursor.fetchall()

ProgrammingError: can't adapt type 'numpy.ndarray'

In [None]:
# Process and display the results
# Each row is read as (id, url, description, embedding).
# NOTE(review): `requests`, `io` (used as io.imread) and `plt` are not imported in any cell
# visible in this notebook — confirm the imports before running.
for result in results:
    product_id = result[0]
    # Keep only the part of the URL before the query string
    url = result[1].split('?')[0]
    
    # Fetch and display the image
    # NOTE(review): `urldata` is downloaded but not used below; io.imread reads from `url` itself
    urldata = requests.get(url).content
    a = io.imread(url)
    plt.imshow(a)
    plt.axis('off')
    plt.show()
    
    print("Product Item Id:", product_id)
    print("URL:", url)
    print("Description:", result[2])
    print("Embedding:", result[3])
    print("----------------------")

# Close the cursor and connection
cursor.close()
conn.close()

In [None]:
import numpy as np
import psycopg2
from psycopg2.extensions import adapt

# Define the embedding function
def my_embedding_function(text):
    """Embed one string with FashionCLIP and return it scaled to unit L2 norm."""
    # Generate text embeddings using fclip.encode_text
    text_embeddings = fclip.encode_text([text], batch_size=1)
    # Normalize the embeddings to unit norm
    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
    return text_embeddings[0]  # Return the single text embedding

data = my_embedding_function("Blue jacket")

# Establish connection to the PostgreSQL Docker container
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="testuser",
    password="testpwd",
    database="vectordb"
)

# Create a cursor object to execute SQL queries
cursor = conn.cursor()

# Perform similarity search query
# The embedding is passed as a plain list: psycopg2 cannot adapt a numpy array
# ("can't adapt type 'numpy.ndarray'").
# NOTE(review): the products table created in this notebook only has id, image_embedding and
# text_embedding; the url / description / descriptions_embeddings columns queried here must
# exist for this cell to run — confirm the intended schema.
cursor.execute("""
    SELECT id, url, description, descriptions_embeddings
    FROM products
    ORDER BY descriptions_embeddings <-> %s::vector
    LIMIT 2;
""", (data.tolist(),))

# Fetch the results
results = cursor.fetchall()

# Process and display the results
# NOTE(review): `requests`, `io` (used as io.imread) and `plt` are not imported in any cell
# visible in this notebook — confirm the imports before running.
for result in results:
    product_id = result[0]
    url = result[1].split('?')[0]

    # Fetch and display the image
    urldata = requests.get(url).content
    a = io.imread(url)
    plt.imshow(a)
    plt.axis('off')
    plt.show()

    print("Product Item Id:", product_id)
    print("URL:", url)
    print("Description:", result[2])
    print("Embedding:", result[3])
    print("----------------------")

# Close the cursor and connection
cursor.close()
conn.close()


In [None]:
'''

# ... (previous code remains the same)

# Define the SQL query to create the table
create_table_query = """\
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector,
    text_embedding vector,
    similarity float
);
"""

# Execute the SQL query to create the table
cursor.execute(create_table_query)

# Commit the changes to the database
conn.commit()

# Create a function to calculate the similarity between two embeddings
create_similarity_function_query = """\
CREATE OR REPLACE FUNCTION embedding_similarity(embed1 vector, embed2 vector) RETURNS float AS $$
BEGIN
  RETURN 1 - (embed1 <-> embed2);
END;
$$ LANGUAGE plpgsql;
"""

# Execute the SQL query to create the similarity function
cursor.execute(create_similarity_function_query)

# Commit the changes to the database
conn.commit()

# Define the SQL query to create an index using the similarity function
create_similarity_index_query = """\
CREATE INDEX products_similarity_index ON products (embedding_similarity(image_embedding, text_embedding));
"""

# Execute the SQL query to create the similarity index
cursor.execute(create_similarity_index_query)

# Commit the changes to the database
conn.commit()

# ... (the rest of the code remains the same)
'''

In [205]:
import os

import numpy as np
import psycopg2
from dotenv import load_dotenv
from PIL import Image

load_dotenv()  # loads .env into the environment; the settings below are hard-coded regardless

user, password, database = "testuser", "testpwd", "vectordb"
#server = "af651cca01b154fe28a0df0167cad5a7-844854289.us-east-2.elb.amazonaws.com"
server = "localhost"

# SQLAlchemy-style URL for the local database
CONNECTION_STRING = "postgresql+psycopg://{}:{}@{}:5432/{}".format(user, password, server, database)
print(CONNECTION_STRING)  # NOTE: this echoes the password into the notebook output
#test_connection(conn)

postgresql+psycopg://testuser:testpwd@localhost:5432/vectordb


In [189]:
# Cursor for the search query that follows
cursor = conn.cursor()
# Embed the text query, then scale the vector to unit norm
raw_query_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]
text_embedding = raw_query_embedding / np.linalg.norm(raw_query_embedding)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

1it [00:00, 22.98it/s]


In [190]:
# Perform a similarity search on the products table, scoring rows as 1 - (image_embedding <-> query)
# NOTE: fails with "operator does not exist: vector <-> numeric[]" (see the recorded output):
# the Python list is sent as a numeric array, so the parameter needs an explicit %s::vector cast.
# NOTE(review): `<->` is pgvector's L2-distance operator (cosine distance is `<=>`), so this
# score is not a cosine similarity — confirm the intended metric.
search_query = """
SELECT id, image_embedding,
       1 - (image_embedding <-> %s) AS similarity
FROM products
ORDER BY similarity DESC
LIMIT 10;
"""
cursor.execute(search_query, (text_embedding.tolist(),))

UndefinedFunction: operator does not exist: vector <-> numeric[]
LINE 3:        1 - (image_embedding <-> ARRAY[0.011644341051578522,0...
                                    ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.


In [None]:
# Fetch the results
results = cursor.fetchall()

# Get the IDs of the matched objects and their similarity scores
matched_ids = [result[0] for result in results]
similarity_scores = [result[2] for result in results]

# Load the corresponding images
# NOTE(review): products.id is a SERIAL key, while the image files are named after article_id
# where the embeddings are computed — confirm that this lookup finds the right files.
images = []
for matched_id in matched_ids:
    image_path = f"data_for_fashion_clip/{matched_id}.jpg"
    image = Image.open(image_path)
    images.append(image)

# Rank the images by similarity with the query vector.
# Sort on the score only: without a key, tied scores make sorted() compare the Image objects
# themselves, which raises TypeError.
ranked_images = [
    image
    for _, image in sorted(zip(similarity_scores, images), key=lambda pair: pair[0], reverse=True)
]

# Close the cursor and connection
cursor.close()
conn.close()

In [None]:
# Task 2: Creating a product search engine

# Establish connection to PostgreSQL
# NOTE(review): psycopg2 rejects the "postgresql+psycopg://" form of CONNECTION_STRING with
# "invalid dsn" elsewhere in this notebook — confirm which value is in scope here.
conn = psycopg2.connect(CONNECTION_STRING)

# Create a PGVector instance
# NOTE(review): langchain's PGVector raised ArgumentError when given a connection object
# elsewhere in this notebook; it expects the connection string.
pgv = PGVector(conn)

# Embed the textual query
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

# Perform a similarity search on the products table
# NOTE(review): confirm this call against the PGVector / VectorStore.search signature.
results = pgv.search("products", text_embedding, limit=10)

# Get the IDs of the matched objects
matched_ids = [result["id"] for result in results]

# Load the corresponding images
images = []
for id in matched_ids:
    image_path = f"data_for_fashion_clip/{id}.jpg"
    image = Image.open(image_path)
    images.append(image)

# Rank the images by similarity with the query vector
# NOTE(review): `results` is iterated above as a sequence of dict-like rows but indexed here
# with a string key — confirm the return type.
similarity_scores = np.dot(results["embedding"], text_embedding)
ranked_images = [image for _, image in sorted(zip(similarity_scores, images), reverse=True)]

# Close the connection
conn.close()

New code: recompute the embeddings and rebuild the `products` table from scratch.

In [None]:
import psycopg2
import numpy as np
from PIL import Image
from dotenv import load_dotenv
# PGVector is langchain's vector-store class (the one imported at the top of this notebook);
# the `pgvector` package itself does not provide it.
from langchain.vectorstores.pgvector import PGVector

# Build the model inputs: one image path per article id, plus the description text of each row
images = ["data_for_fashion_clip/" + str(k) + ".jpg" for k in subset["article_id"].tolist()]
texts = subset["detail_desc"].tolist()

# Create image embeddings and text embeddings
image_embeddings = fclip.encode_images(images, batch_size=32)
text_embeddings = fclip.encode_text(texts, batch_size=32)

# Normalize the embeddings to unit norm
image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)

In [197]:

# Establish connection to the PostgreSQL Docker container
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="testuser",
    password="testpwd",
    database="vectordb"
)
cursor = conn.cursor()

try:
    # Recreate the products table from scratch
    drop_table_query = "DROP TABLE IF EXISTS products;"
    cursor.execute(drop_table_query)
    conn.commit()

    create_table_query = """
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    image_embedding vector,
    text_embedding vector
);
"""
    cursor.execute(create_table_query)
    conn.commit()

    insert_data_query = "INSERT INTO products (image_embedding, text_embedding) VALUES (%s, %s)"

    # Insert the first 100 embedding pairs; each row is its own transaction so one bad row
    # does not discard the rows already stored
    for image_embedding, text_embedding in zip(image_embeddings[:100], text_embeddings[:100]):
        try:
            cursor.execute(insert_data_query, (image_embedding.tolist(), text_embedding.tolist()))
            conn.commit()
        except Exception as e:
            print(f"Error inserting data: {e}")
            # Rollback the transaction in case of an error
            conn.rollback()
finally:
    # Release the cursor and connection even if a statement above raised
    cursor.close()
    conn.close()

In [203]:

# Hard-coded settings for the local PostgreSQL container
user = "testuser"
password = "testpwd"
database = "vectordb"
server = "localhost"

# Construct the connection string
CONNECTION_STRING = f"postgresql+psycopg://{user}:{password}@{server}:5432/{database}"

# Create a cursor object to execute SQL queries
# NOTE: fails with "invalid dsn" (see the recorded output): psycopg2 does not accept the
# SQLAlchemy-style "postgresql+psycopg://" URL. Use keyword arguments or a key=value DSN.
conn = psycopg2.connect(CONNECTION_STRING)
cursor = conn.cursor()


ProgrammingError: invalid dsn: missing "=" after "postgresql+psycopg://testuser:testpwd@localhost:5432/vectordb" in connection info string


In [200]:
# Task 2: Creating a product search engine
def my_embedding_function(text):
    """Embed one string with FashionCLIP and return it scaled to unit L2 norm."""
    # Generate text embeddings using fclip.encode_text
    text_embeddings = fclip.encode_text([text], batch_size=1)
    # Normalize the embeddings to unit norm
    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
    return text_embeddings[0]  # Return the single text embedding
# Establish connection to PostgreSQL
#conn = psycopg2.connect(CONNECTION_STRING)
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="testuser",
    password="testpwd",
    database="vectordb"
)

import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Hard-coded local settings (the .env values are not read here)
user = "testuser"
password ="testpwd"
database = "vectordb"
#server = "af651cca01b154fe28a0df0167cad5a7-844854289.us-east-2.elb.amazonaws.com"
server="localhost"
# Construct the connection string
CONNECTION_STRING = f"postgresql+psycopg://{user}:{password}@{server}:5432/{database}"

# Print the connection string
print(CONNECTION_STRING)


# Create a PGVector instance
# NOTE: fails with ArgumentError (see the recorded output): PGVector expects the connection
# string / URL as its first argument, not a psycopg2 connection object.
pgv = PGVector(conn,embedding_function=my_embedding_function)

postgresql+psycopg://testuser:testpwd@localhost:5432/vectordb


ArgumentError: Expected string or URL object, got <connection object at 0x00000205CEC239A0; dsn: 'user=testuser password=xxx dbname=vectordb host=localhost port=5432', closed: 0>

In [None]:
# Embed the textual query
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

# Perform a similarity search on the products table
# NOTE(review): confirm this call against the PGVector / VectorStore.search signature; it also
# relies on `pgv` having been created successfully in an earlier cell.
results = pgv.search("products", text_embedding, limit=10)

# Get the IDs of the matched objects
matched_ids = [result["id"] for result in results]

# Load the corresponding images
images = []
for id in matched_ids:
    image_path = f"data_for_fashion_clip/{id}.jpg"
    image = Image.open(image_path)
    images.append(image)

# Rank the images by similarity with the query vector
# NOTE(review): `results` is iterated above as a sequence of dict-like rows but indexed here
# with a string key — confirm the return type.
similarity_scores = np.dot(results["embedding"], text_embedding)
ranked_images = [image for _, image in sorted(zip(similarity_scores, images), reverse=True)]

# Close the connection
conn.close()