In [None]:
# !pip install -q --upgrade --user google-cloud-aiplatform==1.36.1
# ! pip install python-docx

In [1]:
import numpy as np
import pandas as pd
import numpy.linalg
import vertexai
from google.api_core import retry
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel
from tqdm.auto import tqdm
import itertools,docx
from google.api_core import retry
tqdm.pandas()

project_id = "genai-demo-409412"
location = "us-central1"
vertexai.init(project=project_id, location=location)


embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

generation_model = TextGenerationModel.from_pretrained("text-bison@001")

In [2]:
user_query = """
                what products do you have for kids? Describe all the products in detail.
            """
generated_answer = generation_model.predict(user_query)
print(generated_answer.text)

We have a variety of products for kids, including toys, games, books, and clothes.

Toys: We have a wide variety of toys for kids of all ages, including dolls, action figures, cars, trucks, and building blocks. We also have a variety of educational toys that help kids learn and develop their skills.

Games: We have a variety of games for kids of all ages, including board games, card games, and video games. We also have a variety of outdoor games, such as basketball, soccer, and baseball.

Books: We have a variety of books for kids of all


In [3]:
def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    extracted_text = [paragraph.text for paragraph in doc.paragraphs]
    return extracted_text

docx_file_path = 'ecommerce_products.docx'
extracted_text = extract_text_from_docx(docx_file_path)
filtered_text = [line for line in extracted_text if line != '']

In [5]:
# filtered_text

In [7]:
# for i in filtered_text:
#     print(i)
#     print("\n")

In [8]:
def create_text_chunks(sequence, chunk_length, overlap_length):
    if len(sequence) <= chunk_length:
        return [sequence]
    return ["".join(parts) for parts in zip(*[sequence[i::chunk_length - overlap_length] for i in range(chunk_length)])]

def calculate_similarity(vector1):
    def similarity_metric(row):
        return np.dot(row, vector1) / (numpy.linalg.norm(row) * numpy.linalg.norm(vector1))
    return similarity_metric

@retry.Retry(timeout=300.0)
def retrieve_embeddings(input_text):
    return embedding_model.get_embeddings([input_text])[0].values

def build_vector_store(input_texts, chunk_size, overlap_size):
    store = pd.DataFrame()
    store["text_chunks"] = list(itertools.chain(*[create_text_chunks(text, chunk_size, overlap_size) for text in input_texts]))
    store["text_embeddings"] = store["text_chunks"].progress_apply(retrieve_embeddings).apply(np.array)
    return store

def derive_context(question, store,top_docs):
    question_vector = np.array(retrieve_embeddings(question))
    matched_indices = store["text_embeddings"].apply(calculate_similarity(question_vector)).sort_values(ascending=False)[:top_docs].index
    matched_texts = store[store.index.isin(matched_indices)]["text_chunks"]
    context_str = " ".join(matched_texts.values)
    return context_str

def generate_answer(query, store, num_documents=50, show_prompt=False):
    context_data = derive_context(query, store, num_documents)
    prompt_structure = f"""
        Your mission is to answer questions based on the given context. 
        Before you give an answer, make sure it is only from information in the context. 
        If the information is not in the context, just reply "I don't know the answer to that". 
            Context: ```{context_data}```
            Question: ***{query}***
            Answer: 
        """
    if show_prompt:
        print(prompt_structure)
    generated_response = generation_model.predict(prompt_structure, temperature=0.7,max_output_tokens=1024)
    return generated_response.text

In [9]:
chunk_size = 36
overlap_size = 0

text_vector_store = build_vector_store(filtered_text, chunk_size, overlap_size)

  0%|          | 0/160 [00:00<?, ?it/s]

In [10]:
text_vector_store.head()

Unnamed: 0,text_chunks,text_embeddings
0,Below is the List of products that a,"[0.020918134599924088, 0.02229158580303192, 0...."
1,re currently live on our mobile app,"[-0.031662553548812866, 0.046923596411943436, ..."
2,Product Name: Aurora Smartwatch for,"[-0.00695442408323288, 0.06290320307016373, 0...."
3,"Men. Priced at $250, Product descrip","[-0.014447714202105999, 0.02792857401072979, 0..."
4,tion: High-tech and stylish smartwat,"[0.0061335074715316296, 0.019488485530018806, ..."


In [11]:
user_query = """
                what products do you have for kids? Tell me about each of them including the price.
            """

generated_answer = generate_answer(user_query, text_vector_store)
print(generated_answer)

We have the following products for kids: Galaxy Night Lamp for Kids priced at $30, Magical Storybook for Kids priced at $25, Robot Building Kit for Kids priced at $45, Interactive Globe for Kids priced at $50, and Aromatherapy Candle Set for Kids priced at $40.


In [12]:
user_query = """
                do you have coffee beans? tell me more about the product.
            """

generated_answer = generate_answer(user_query, text_vector_store)
print(generated_answer)

The Gourmet Coffee Beans are premium quality, freshly roasted coffee beans from the best plantations in the world. They are available in a variety of flavours, so you can find the perfect one to suit your taste.
