# Astra + Vertex GCP Gemini for multimodal ecommerce search

This notebook runs the data loading process

To generate a multimodal embedding, the image embedding is combined with the name + description embedding.

The product data, together with the embedding, will be stored in an Astra collection, in JSON format.

In [None]:
!pip install --upgrade --user google-cloud-aiplatform --quiet


In [1]:
from dotenv import load_dotenv, find_dotenv
import os
# Load variables from the .env file located by find_dotenv(); override=True is passed so
# values from that file take precedence over variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

## GCP Settings

In [23]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image

In [24]:
# Read the GCP project id and region from the environment (a missing variable
# raises KeyError here), then initialise the Vertex AI SDK with them.
GCP_PROJECT_ID = os.environ['GCP_PROJECT_ID']
GCP_REGION = os.environ['GCP_REGION']
vertexai.init(project=GCP_PROJECT_ID, location=GCP_REGION)

In [87]:
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image
import numpy as np

_EMBEDDING_MODEL_NAME = "multimodalembedding@001"
_embedding_model = None  # loaded lazily on first use, then shared by every call


def _get_embedding_model():
    """Return the shared MultiModalEmbeddingModel, loading it on the first call."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = MultiModalEmbeddingModel.from_pretrained(_EMBEDDING_MODEL_NAME)
    return _embedding_model


def get_multimodal_embedding(image_path, text):
    """Generate one embedding that balances an image and its accompanying text.

    Args:
        image_path: Path of the image file to embed.
        text: Text sent as contextual text; only the first 768 characters are used.

    Returns:
        A list of floats: the element-wise mean of the image embedding and the
        text embedding returned by the model.

    Notes:
        ``Image`` must be ``vertexai.preview.vision_models.Image`` (imported in
        this cell). An earlier cell imports a different ``Image`` from
        ``vertexai.preview.generative_models``, which this import shadows.
        The model is loaded once and reused, so calling this function in a loop
        does not reload it for every row.
    """
    image = Image.load_from_file(image_path)
    emb = _get_embedding_model().get_embeddings(image=image, contextual_text=text[:768])
    # Average the two vectors so image and text contribute equally.
    average_embedding = (np.array(emb.image_embedding) + np.array(emb.text_embedding)) / 2
    return average_embedding.tolist()


# Astrapy

In [2]:
from astrapy.db import AstraDB, AstraDBCollection
from astrapy.ops import AstraDBOps

In [3]:
# Read the Astra credentials with os.environ[...] (as done for the GCP settings)
# so a missing variable fails here with a KeyError instead of passing None on.
astra_db = AstraDB(
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

In [30]:
# Create the "ecommerce_products" vector collection.
# dimension=1408 is the embedding size noted for the GCP embeddings used in this
# notebook; it has to match the length of the vectors stored under "$vector".
collection = astra_db.create_collection(collection_name="ecommerce_products", dimension=1408)

ValueError: [{"message": "Cannot have more than 50 indexes, failed to create index on table with CREATE CUSTOM INDEX IF NOT EXISTS ecommerce_products_exists_keys ON \"default_keyspace\".\"ecommerce_products\" (exist_keys) USING 'StorageAttachedIndex'"}]

In [31]:
# Map to the existing "ecommerce_products" collection instead of creating it
# (the collection is expected to hold the 1408-dimensional GCP embeddings).
collection = AstraDBCollection(
    collection_name="ecommerce_products", astra_db=astra_db
)


In [101]:
# Clear the collection before loading: an empty filter {} is passed to delete_many.
# NOTE(review): destructive — presumably this removes every document; confirm before re-running.
collection.delete_many({})

{'status': {'deletedCount': -1}}

# Data Loading

Here, we will read the dataset and prepare it for loading.

Products with a missing value in any column (for example, products without a price) will be discarded.

In [77]:
import pandas as pd
import ast
import json

# Read the Flipkart e-commerce sample CSV, expected next to this notebook.
df = pd.read_csv('./flipkart_com-ecommerce_sample.csv')

def parse_string_to_array(s):
    """Parse a string holding a Python/JSON-style list literal into a list.

    Args:
        s: The raw cell value. Usually a string such as ``'["a", "b"]'``; it may
            also be a missing value (``None``/``NaN``) or an already parsed list.

    Returns:
        A list. Values that are already a list or tuple are returned as a list,
        which keeps the function idempotent when the cell is re-run on an
        already converted column. Missing values, non-string values, strings
        that are not valid literals, and literals that are not a list or tuple
        all yield ``[]``.
    """
    if isinstance(s, (list, tuple)):
        return list(s)
    if not isinstance(s, str):
        # Covers None, NaN and any other non-text value.
        return []
    try:
        parsed = ast.literal_eval(s)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed or overly complex input.
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

def extract_category(full_category):
    """Return the top-level category of a serialized category tree.

    ``full_category`` is handed to ``parse_string_to_array``. When the parsed
    result is empty, ``None`` is returned. Otherwise the first entry is split
    on ``">>"`` and its first segment, stripped of surrounding whitespace, is
    returned.
    """
    category_paths = parse_string_to_array(full_category)
    if len(category_paths) == 0:
        return None
    # Only the root of the first path is needed.
    root_segment = category_paths[0].split(">>")[0]
    return root_segment.strip()

def convert_specification(input_string):
    """Extract the ``product_specification`` entry from a Ruby-hash-style string.

    The raw value uses ``=>`` between keys and values; every ``=>`` is replaced
    with ``:`` so the text can be parsed as JSON.

    Args:
        input_string: The raw specification text, e.g.
            ``'{"product_specification"=>[{"key"=>"K", "value"=>"V"}]}'``.

    Returns:
        The value stored under ``"product_specification"``, or ``[]`` when the
        input is not a string, is not valid JSON after the replacement, or does
        not contain that key.
    """
    try:
        json_string = input_string.replace("=>", ":")
        data_dict = json.loads(json_string)
        return data_dict["product_specification"]
    except (AttributeError, TypeError, ValueError, KeyError, RecursionError):
        # Best effort for malformed rows only. Unlike a bare `except:`, this
        # lets KeyboardInterrupt and SystemExit propagate.
        return []

# Drop rows that have a missing value in any column, then derive the parsed
# image list, the top-level category and the specification list in one chain.
df = df.dropna().assign(
    image=lambda frame: frame['image'].apply(parse_string_to_array),
    category=lambda frame: frame["product_category_tree"].apply(extract_category),
    specification=lambda frame: frame["product_specifications"].apply(convert_specification),
)
df.head(20)

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,category,specification
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,[http://img5a.flixcart.com/image/short/u/4/a/a...,False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",Clothing,[{'key': 'Number of Contents in Sales Package'...
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,[http://img6a.flixcart.com/image/sofa-bed/j/f/...,False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",Furniture,"[{'key': 'Installation & Demo Details', 'value..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,[http://img5a.flixcart.com/image/shoe/7/z/z/re...,False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",Footwear,"[{'key': 'Ideal For', 'value': 'Women'}, {'key..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,[http://img5a.flixcart.com/image/short/6/2/h/a...,False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",Clothing,[{'key': 'Number of Contents in Sales Package'...
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,[http://img5a.flixcart.com/image/pet-shampoo/r...,False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",Pet Supplies,"[{'key': 'Pet Type', 'value': 'Dog'}, {'key': ..."
5,c2a17313954882c1dba461863e98adf2,2016-03-25 22:59:23 +0000,http://www.flipkart.com/eternal-gandhi-super-s...,Eternal Gandhi Super Series Crystal Paper Weig...,"[""Eternal Gandhi Super Series Crystal Paper We...",PWTEB7H2E4KCYUE3,430.0,430.0,[http://img5a.flixcart.com/image/paper-weight/...,False,Key Features of Eternal Gandhi Super Series Cr...,No rating available,No rating available,Eternal Gandhi,"{""product_specification""=>[{""key""=>""Model Name...",Eternal Gandhi Super Series Crystal Paper Weig...,"[{'key': 'Model Name', 'value': 'Gandhi Paper ..."
6,ce5a6818f7707e2cb61fdcdbba61f5ad,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FVVKRBAXHB,1199.0,479.0,[http://img6a.flixcart.com/image/short/p/j/z/a...,False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",Clothing,[{'key': 'Number of Contents in Sales Package'...
7,8542703ca9e6ebdf6d742638dfb1f2ca,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGYGHFUEXN,32157.0,22646.0,[http://img6a.flixcart.com/image/sofa-bed/e/x/...,False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",Furniture,"[{'key': 'Installation & Demo Details', 'value..."
8,29c8d290caa451f97b1c32df64477a2c,2016-03-25 22:59:23 +0000,http://www.flipkart.com/dilli-bazaaar-bellies-...,"dilli bazaaar Bellies, Corporate Casuals, Casuals","[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH3DZBFR88SCK,699.0,349.0,[http://img6a.flixcart.com/image/shoe/b/p/n/pi...,False,"Key Features of dilli bazaaar Bellies, Corpora...",No rating available,No rating available,dilli bazaaar,"{""product_specification""=>[{""key""=>""Occasion"",...",Footwear,"[{'key': 'Occasion', 'value': 'Ethnic, Casual,..."
9,4044c0ac52c1ee4b28777417651faf42,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FVUHAAVH9X,1199.0,479.0,[http://img5a.flixcart.com/image/short/5/z/c/a...,False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",Clothing,[{'key': 'Number of Contents in Sales Package'...


## Data processing

Here, we will read the dataset to load the data into Astra.

For each row, an embedding will be generated.

The first image of the product will be considered for the image embedding. These images are already downloaded and available in the `basepath` directory.

The `skip` and `load` variables are used to limit the loading to a specified range of records.

Initially, I was using insert_many to insert 20 records at a time, but it was complicated to discover which records had problems and couldn't be inserted. So I changed it to insert_one.


In [102]:
%time
from tqdm import tqdm

# Loading all flipcart data to the Vector Table
skip = 5000
load = 6000
batch = 1
docs = []
errors = []
basepath = '../public/img'
for index, row in tqdm(df[skip:load].iterrows(), total=df[skip:load].shape[0], desc=f'Loading with Astrapy'):
    try:
        # Generate the embedding
        emb = get_multimodal_embedding(
                image_path = f"{basepath}/{row['uniq_id']}_{os.path.basename(row['image'][0])}",
                text = f'{row["product_name"]}'
            )
        
        #if an embedding was generated, load the data into Astra
        if emb :
            doc = {
                "_id": row["uniq_id"],
                "product_name": row["product_name"],
                "retail_price": row["retail_price"],
                "discounted_price": row["discounted_price"],
                "images": row["image"],
                "description": row["description"],
                "category": row["category"],
                "specification": row["specification"],
                "$vector": emb}
        
            collection.insert_one(doc)
            
    except Exception as error:
        errors.append(f"Error at IX {index} {error}")


    
print("Finished")
print(f"Errors: {len(errors)}")

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 11.9 µs


Loading with Astrapy: 100%|█████████████| 5000/5000 [5:43:03<00:00,  4.12s/it]

Finished
Errors: 405





In [None]:
# Inspect the error messages collected by the loading loop (one string per failed row)
errors

In [100]:
# Inspect the categories: count how many products fall under each extracted category
grouped_df = df.groupby('category').size().reset_index(name='Count')
# Lift the display row limit so the whole table is rendered
pd.set_option('display.max_rows', None)
grouped_df

Unnamed: 0,category,Count
0,883 Police Full Sleeve Solid Men's Jacket,1
1,"ABEEZ Boys, Men, Girls (Black, Pack of 1)",1
2,ANAND ARCHIES Girls Flats,2
3,ANAND ARCHIES Girls Wedges,1
4,ANASAZI Casual 3/4 Sleeve Solid Women's Top,1
5,ATV Pouch for Acer Liquid Z330 (STEEL BLUE),1
6,Abhinl Fashion Cotton Printed Semi-stitched Sa...,1
7,"Adidas IND PRO THI GUA Thigh Guard (White, Blu...",1
8,Ajaero Slim Fit Women's Dark Blue Jeans,2
9,Amita Home Furnishing Cotton Floral Single Bed...,1
