# Meet Review Genie, a conversational chatbot for your e-commerce platform's reviews.

### Understanding the Limitation of the LLM

In [1]:
# Importing the OpenAI library to interact with OpenAI's API services.
from openai import OpenAI

In [2]:
import os  # Importing the os module to interact with environment variables
import getpass  # Importing getpass to securely input sensitive information

# Reuse a key already exported as OPENAI_API_KEY so "Restart & Run All" can run unattended;
# otherwise prompt for it with getpass, which keeps the key from being echoed on screen.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [3]:
# The bare question sent to the LLM, with no supporting context attached.
prompt = " What was uber's revenue in 2022? "

In [4]:
# Ask the model the question as-is (no retrieved context) and print its reply.
client = OpenAI(api_key=OPENAI_API_KEY)
openai_response = client.chat.completions.create(
    # An older model is chosen on purpose for this test: per the author's note its
    # cutoff is 2021, whereas the prompt asks about 2022.
    model='gpt-3.5-turbo',
    # A single user-role message carrying the prompt text.
    messages=[dict(role='user', content=prompt)],
)
print(openai_response.choices[0].message.content)

I'm sorry, but as an AI, I do not have access to real-time data. I recommend checking Uber's official website or financial reports for the most recent revenue figures in 2022.


In [5]:
## Let's create the above context for the prompt
# Context string holding the 2022 revenue facts that the model could not supply on its own.
# It is hard-coded here to stand in for text retrieved from an external source.
# NOTE: the line break and the leading spaces inside the triple-quoted literal are part of the string.
retrieved_context = '''Revenue was $37.3 billion, up 17% year-over-year. Mobility revenue increased $5.8 billion primarily attributable to an increase in
               Mobility Gross Bookings of 31% year-over-year.'''

In [6]:
## Let's modify our prompt now
# Augmented prompt: the same question, followed by the retrieved context the model should look in.
# Note: this rebinds `prompt`, replacing the bare question defined earlier.
prompt = "What was Uber's revenue in 2022? Check in {}".format(retrieved_context)

In [7]:
## Let's ask the LLM again
# Same model and message shape as before; only the prompt (now context-augmented) differs.
openai_response = client.chat.completions.create(
    messages=[dict(role='user', content=prompt)],
    model='gpt-3.5-turbo',
)

In [8]:
# Display the text of the first choice in the response.
# As the cell's last expression, the value is rendered as the cell output (no print needed).
openai_response.choices[0].message.content

"Uber's revenue in 2022 was $37.3 billion, with a significant portion of this coming from its Mobility segment which saw a 31% increase in Gross Bookings compared to the previous year."

In [36]:

%%capture output
# Installing the LangChain Hub package to access and manage pre-built AI chains, prompts, and agents.
!pip install langchainhub

# Installing the LangChain OpenAI integration to use OpenAI models within LangChain workflows.
!pip install langchain-openai

# Installing the core LangChain library for building LLM-based applications, including chaining, memory, and retrieval capabilities.
!pip install langchain

# Installing the community version of LangChain, which includes integrations and tools contributed by the community.
!pip install langchain-community

# Installing FAISS (Facebook AI Similarity Search) for efficient similarity-based search on text embeddings.
!pip install faiss-cpu

# Installing Gradio, a framework to create web-based UIs for AI models and applications easily.
!pip install gradio


In [10]:
# Importing the KaggleHub library to interact with datasets and models available on Kaggle.
import kagglehub

# Importing the CSV module for reading and writing CSV files.
import csv

# Importing pandas for data manipulation and analysis.
import pandas as pd

# Importing numpy for numerical operations and handling arrays efficiently.
import numpy as np

# Importing os to interact with the operating system, such as environment variables and file paths.
import os

# Importing getpass to securely handle user input (e.g., API keys or passwords).
import getpass


### STEP 1: Data Preparation

In [11]:
# Colab-specific step: mount Google Drive so the dataset path under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive') # Mount at the default Google Drive location in Colab

Mounted at /content/drive


In [12]:
# Location of the sampled product CSV on the mounted Google Drive.
file_name = os.path.join(
    '/content/drive/MyDrive/IK-Company/ProjectUp/Gene/Project',
    'MyData',
    'dataset',
    'prod_small.csv',
)

In [None]:
# prompt: unzip a file

# !unzip "/content/drive/MyDrive/IK-Company/ProjectUp/Gene/Project/archive.zip" -d "/content/drive/MyDrive/IK-Company/ProjectUp/Gene/Project/MyData"


Archive:  /content/drive/MyDrive/IK-KickStart/ProjectUp/Gene/Project/archive.zip
  inflating: /content/drive/MyDrive/IK-KickStart/ProjectUp/Gene/Project/MyData/dataset/train.csv  


In [13]:
# Read the product CSV into a DataFrame, using the file's first column as the index.
df = pd.read_csv(filepath_or_buffer=file_name, index_col=0)

In [14]:
# Preview the first few rows to sanity-check the load and see which columns are available.
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


**Constructing the text data**

Both the `TITLE` and `DESCRIPTION` columns carry useful information, so we combine them into a single text per product. To help downstream models tell which part is the title and which is the description, each part is prefixed with a label line. Each row should look like:

```
Title
{Title}
Description
{Description}
```

In [15]:
## Let's construct the text data
import math


def _is_missing(value):
    """Return True when ``value`` is None, NaN, or a blank (whitespace-only) string."""
    if value is None:
        return True
    if isinstance(value, float):
        # pandas represents empty CSV cells as float NaN.
        return math.isnan(value)
    return isinstance(value, str) and not value.strip()


def _format_product(title, description):
    """Build the labelled text for one product.

    Each field that is present is emitted as a label line ("Title" / "Description")
    followed by its content. Returns "" when both fields are missing.
    """
    sections = []
    if not _is_missing(title):
        # str() guards against non-string cells (e.g. a purely numeric title).
        sections.append("Title\n" + str(title))
    if not _is_missing(description):
        sections.append("Description\n" + str(description))
    return "\n".join(sections).strip()


# Parallel lists: the formatted text of each product and its length in characters.
product_description = []
product_description_len = []

# Walk the TITLE and DESCRIPTION columns of `df` row by row.
for title, description in zip(df['TITLE'], df['DESCRIPTION']):
    product = _format_product(title, description)

    # Keep the row only if at least one field contributed content. The earlier
    # `title or description` check was truthy for NaN, so rows with both fields
    # missing were appended as empty strings.
    if not product:
        continue

    product_description.append(product)
    product_description_len.append(len(product))


In [16]:
# Report how many product texts were built.
print("Number of elements", len(product_description))

Number of elements 100


In [17]:
# Inspect one formatted entry (index 5) to confirm the "Title" / "Description" layout.
product_description[5]

'Title\nHINS Metal Bucket Shape Plant Pot for Indoor & Outdoor Gardening (Red, Medium) Plant Stands for Indoor Balcony I Plant Bench I Plant Stands I Pot Stand Single I Potted Plant Stand I Big Pots I Metal\nDescription\nHINS Brings you the most Elegant Looking Pot with Stand for durable and long life Pot Stands for your lovely garden space, office and home. HINS is one of the best choice when it comes to indoor plants. It makes a good choice for housewarming gift. This beautiful product will take center stage with its sprawling design when planted with a plant. The metal stands are painted with powder-coated paint that will protect the galvanized iron from rusting. It will also prevent the color from fading. Each planter pot is removable for easy mobility, allowing you to switch out plants depending on your mood Note- Monitors are not calibrated same, item color displayed in photos may be showing slightly different from the real object. Please take the real one as standard. Please all

In [18]:
# Summary statistics over the character lengths of the product texts.
print("Number of items", len(product_description_len))

# Each label is paired with the numpy reducer that produces its value; order matches the output.
for _label, _reducer in (
    ("Min Length of the description:", np.min),
    ("Avg Length of the description:", np.mean),
    ("Median of the description:", np.median),
    ("Max Length of the description:", np.max),
):
    print(_label, _reducer(product_description_len))

Number of items 100
Min Length of the description: 18
Avg Length of the description: 385.9
Median of the description: 120.0
Max Length of the description: 1834


### Interpretation:

What does the above result signify about the data?






In [19]:
# Importing RecursiveCharacterTextSplitter from LangChain for chunking large text into smaller, manageable pieces.
# This helps in optimizing text for processing and retrieval.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Importing OpenAIEmbeddings from LangChain to generate numerical vector representations (embeddings) of text.
# These embeddings capture the semantic meaning of the text for efficient similarity searches.
from langchain_openai import OpenAIEmbeddings

# Importing FAISS (Facebook AI Similarity Search) from LangChain's community package.
# FAISS is used for storing and retrieving embeddings efficiently by finding similar vectors.
from langchain_community.vectorstores import FAISS


In [20]:
# Export the key as the OPENAI_API_KEY environment variable for the cells that follow.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [21]:
# Chunk the product texts with recursive character splitting.
# Sizes are measured with len(), i.e. in characters: chunks of at most 250 with a 20-character overlap.
# Reference: https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=20, length_function=len)

# One Document per chunk, built from the list of product texts.
documents = text_splitter.create_documents(texts=product_description)

In [22]:
# Create the embedding model used to vectorise the chunks (and, later, the user queries).
# OpenAI integration: https://python.langchain.com/docs/integrations/text_embedding/openai/
# Other embedding models available in LangChain: https://python.langchain.com/docs/integrations/text_embedding/
# NOTE(review): no arguments are passed, so the library defaults apply; presumably the key is
# read from the OPENAI_API_KEY environment variable set earlier — confirm.
embeddings = OpenAIEmbeddings()

In [23]:
# Embed every chunk and index the resulting vectors in a FAISS store.
vector = FAISS.from_documents(documents=documents, embedding=embeddings)

In [24]:
# Importing ChatOpenAI from LangChain to interact with OpenAI's language models, such as GPT, for generating responses.
from langchain_openai import ChatOpenAI

# Importing ChatPromptTemplate to create structured prompts for the chatbot, ensuring consistent interactions with the AI model.
from langchain_core.prompts import ChatPromptTemplate

# Importing OpenAIEmbeddings to convert text data into numerical vector representations for similarity search and retrieval.
from langchain_openai import OpenAIEmbeddings

# Importing ChatPromptTemplate again (duplicate import, should be removed to avoid redundancy).
from langchain_core.prompts import ChatPromptTemplate

# Importing create_stuff_documents_chain to combine and process retrieved documents for meaningful AI-generated responses.
from langchain.chains.combine_documents import create_stuff_documents_chain

# Importing create_retrieval_chain to build a chain that retrieves relevant documents from a vector store and generates AI responses.
from langchain.chains import create_retrieval_chain

# Importing StrOutputParser from LangChain to parse the output
from langchain_core.output_parsers import StrOutputParser

#### Code Explanation:
- `ChatOpenAI` – Used to access OpenAI models for chatbot functionality.
- `ChatPromptTemplate` – Helps structure queries to ensure better responses.
- `OpenAIEmbeddings` – Converts text into vector form for similarity-based retrieval.
- `create_stuff_documents_chain` – Combines retrieved documents meaningfully before passing to the LLM.
- `create_retrieval_chain` – Automates the process of retrieving and utilizing relevant content for AI responses.
- `StrOutputParser` – Processes the output of the language model so that it is returned as a plain string.

In [25]:
# Chat model used to generate the answers; the API key is read from the environment.
llm = ChatOpenAI(
    model='gpt-4o-mini',
    api_key=os.environ["OPENAI_API_KEY"],
)

In [26]:
# Parser that converts the chat model's reply message into a plain string.
output_parser = StrOutputParser()

# Prompt template with two placeholders:
#   1. {context} - filled with the retrieved documents.
#   2. {input} - the user's question.
# The model is instructed to base its answer solely on the provided context.
# Note: this rebinds `prompt`, which earlier held a plain string.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

    <context>
    {context}
    </context>

    Question: {input}"""
)

# "Stuff" documents chain: places the retrieved documents into {context}, calls the LLM,
# and parses the reply. The parser is passed to the chain, which is where the model output
# is actually parsed; attaching it to the prompt template had no effect on the chain's output.
document_chain = create_stuff_documents_chain(llm, prompt, output_parser=output_parser)


In [28]:
# Debug aid: print the parser object to confirm it was created.
print(output_parser)




In [29]:
# Expose the FAISS store through the retriever interface so relevant chunks can be fetched per query.
# See https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
retriever = vector.as_retriever()

# Full pipeline: retrieve the relevant documents, then hand them to the document chain to answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)


In [30]:
# Run the full retrieval chain on a sample question.
# The chain first fetches relevant product chunks from the vector store, then passes them to
# the document chain to generate an answer.
# The displayed result is a dict holding the 'input', the retrieved 'context' documents
# and the generated 'answer'.
retrieval_chain.invoke({"input": "what are some of the best shoes available?"})

{'input': 'what are some of the best shoes available?',
 'context': [Document(id='1430ad73-6ef1-4b65-99e3-b76dcf6afdc3', metadata={}, page_content="Title\nadidas Men's Predator 18+ FG Firm Ground Soccer Cleats\nDescription\nadidas Predator 18+ FG- Black 7.5"),
  Document(id='057e59a2-d817-4dc3-b24b-3413ddb6a788', metadata={}, page_content="Title\nPUMA Cali Sport Clean Women's Sneakers White Leather (37540701)"),
  Document(id='c1c524d4-a680-4a68-99a9-163b947a74b2', metadata={}, page_content="Title\nKenneth Cole REACTION Men's Crespo Loafer B Shoe, Cognac, 10 M US"),
  Document(id='78850180-c46c-4f3c-8044-117f5f38c66a', metadata={}, page_content="The Remora Climbing Shoe is Mad Rock's do-it-all slipper for climbers who can't have separate shoes for boulders, sport routes, and gyms. With a moderately stiff, slightly downturned design, the Remora performs on any climb at steep to vertical")],
 'answer': "Based on the provided context, some of the best shoes available include:\n\n1. adidas

In [31]:
# Invoke the chain again with the same question and keep only the generated text:
# the ['answer'] key selects the AI-generated answer from the response dictionary.
# Note: this is a fresh invocation, so retrieval and generation both run again.
retrieval_chain.invoke({"input": "what are some of the best shoes available?"})['answer']

"Based on the provided context, some of the best shoes available include:\n\n1. adidas Men's Predator 18+ FG Firm Ground Soccer Cleats\n2. PUMA Cali Sport Clean Women's Sneakers\n3. Kenneth Cole REACTION Men's Crespo Loafer B Shoe\n4. Mad Rock Remora Climbing Shoe\n\nThese options cater to different activities such as soccer, casual wear, loafers, and climbing."

Now we have an answer! But the formatting is not very good, right? Let's first post-process the response so that only the product names remain, and then create a simple UI for our bot.

In [32]:
# Function to process the user query and return formatted product names
def final_response(user_query):
    """Answer ``user_query`` from the product index and return only the product names.

    Two model calls are made: the retrieval chain produces a context-grounded answer,
    then a second chat completion (gpt-4o-mini) reduces that answer to product names.

    Args:
        user_query: The user's question as free text.

    Returns:
        The text returned by the formatting call, or a short hint asking for a
        question when ``user_query`` is empty, None, or only whitespace.
    """
    # Guard: a UI can submit an empty text box; skip retrieval and both model calls.
    if not user_query or not user_query.strip():
        return "Please enter a question."

    # Invoking the retrieval chain with the user's query to fetch relevant product information
    response = retrieval_chain.invoke({"input": user_query})['answer']

    # Instruction for the second call: extract only the product names from the first answer
    prompt = f"Format the responses properly in {response}. Just return the product names, no other text"

    # Sending the formatting prompt to the GPT-4o-mini model
    openai_response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}]
    )

    # Text of the first choice: the product names only
    return openai_response.choices[0].message.content


In [33]:
# Smoke-test the helper on a sample question; print() renders the newlines in the
# returned text instead of showing them as escaped characters.
print(final_response("what are some of the best shoes available?"))


1. adidas Men's Predator 18+ FG Firm Ground Soccer Cleats  
2. PUMA Cali Sport Clean Women's Sneakers  
3. Kenneth Cole REACTION Men's Crespo Loafer B Shoe  
4. Mad Rock Remora Climbing Shoe


In [34]:
# Importing the Gradio library to create a simple web-based user interface
import gradio as gr

# Creating the Gradio interface for the product recommendation system
app = gr.Interface(
    fn=final_response,          # Called with the text box contents; its return value is displayed
    inputs="text",              # Input component: a text box for the user's query
    outputs="text",             # Output component: a text box for the generated response
    # NOTE(review): the notebook heading calls the bot "Review Genie"; confirm the intended title.
    title="Review Gene",
    description="Type your question to get recommendations",
    theme="Ocean",
    flagging_mode="never",      # Hides the "Flag" button; replaces the deprecated `allow_flagging`
)

# Launching the Gradio app to start the interface and make it accessible via web browser
app.launch()




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://17ca98b327f5e537fe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


