## Notebook demonstrating how to extract content in an object-oriented way
Steps to achieve this:
  1. Identify a list of products from an e-commerce website like Flipkart
  2. Extract the HTML text of the website or URL using Beautiful Soup
  3. Create an object model to capture the data according to your style
  4. Use PydanticOutputParser to create specific instructions to parse the data according to our format
  5. Display the results for confirmation 

In [17]:
# Requirements.txt
# langchain[all]
# openai==0.28
# python-dotenv
# jupyter
# pydantic
# beautifulsoup4
# requests

In [1]:
# Import all the required libraries
import os
import requests
from dotenv import load_dotenv
from langchain.chat_models import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
load_dotenv()
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Point the REQUESTS_CA_BUNDLE environment variable at your certificate bundle.
# The path is relative, so it is resolved against the notebook's working directory.
CA_BUNDLE_PATH = "../../ca-bundle-full.crt"
os.environ["REQUESTS_CA_BUNDLE"] = CA_BUNDLE_PATH

In [3]:
# Build the chat model client (any available deployment works, e.g. GPT-3.5 Turbo).
# Every connection setting is read from the environment; a missing variable raises
# KeyError here rather than failing later during the first request.
azure_settings = {
    "deployment_name": os.environ["AZURE_DEPLOYMENT_NAME"],
    "openai_api_version": os.environ["OPENAI_API_VERSION"],
    "openai_api_base": os.environ["OPENAI_API_BASE"],
    "openai_api_key": os.environ["OPENAI_API_KEY"],
    "openai_api_type": os.environ["OPENAI_API_TYPE"],
}
llm = AzureChatOpenAI(**azure_settings)

In [4]:
# Here we are declaring model classes to capture Mobile related information from an e-commerce website
class ProductDetails(BaseModel):
    '''
        This class is used to capture additional product details related to Mobile
    '''
    # The class docstring and every Field description end up in the JSON schema that the
    # output parser sends to the LLM, so a misspelled `description=` keyword silently
    # drops that field's guidance from the prompt.
    # NOTE(review): the descriptions do not state units; the sample output suggests
    # GB for RAM/ROM and mAh for Battery - consider spelling them out.
    RAM: int = Field(description="RAM of the mobile")
    ROM: int = Field(description="ROM of the mobile")
    Battery: int = Field(description="Battery capacity of the mobile")
        
class Product(BaseModel):
    '''
        This class is used to capture primary product details of Mobile
    '''
    # One listing extracted from the page. All three fields are declared without a
    # default, so each of them must be present in the parsed output.
    Name: str = Field(description="Name of the mobile.")
    Price: int = Field(description="Price of the mobile.")
    # Nested model: the value is validated as a ProductDetails instance.
    Details: ProductDetails = Field(description="Additional features of the Mobile")
    
class Products(BaseModel):
    '''
        This class is used to store the collection/list of Mobiles 
    '''
    # `description=` must be passed by keyword: the first positional argument of Field()
    # is the field's *default value*. Passing the text positionally made the field
    # optional with a string default and left it without a description in the schema.
    # With the keyword form the field is required and documented for the LLM.
    Mobiles: list[Product] = Field(description="List of mobiles listed in the text")

In [5]:
# Search URL to scrape. The literal is split into adjacent string pieces (joined by
# Python at compile time) so that each query parameter sits on its own line.
url = (
    "https://www.flipkart.com/search"
    "?q=samsung+s23"
    "&otracker=search"
    "&otracker1=search"
    "&marketplace=FLIPKART"
    "&as-show=on"
    "&as=off"
    "&p%5B%5D=facets.ram%255B%255D%3D8%2BGB%2Band%2BAbove"
    "&p%5B%5D=facets.battery_capacity%255B%255D%3D5000%2B-%2B5999%2BmAh"
)

In [13]:
# Store the HTML text to a variable
# A timeout keeps the cell from hanging indefinitely if the site never answers, and
# raise_for_status() surfaces HTTP errors here instead of silently passing an error
# page on to the LLM.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# `soup.body` is None when the document has no <body>; fall back to the text of the
# whole document rather than failing with AttributeError.
page_body = soup.body
html_text = page_body.text if page_body is not None else soup.text

In [7]:
# Bind a parser to the Products model. Its format instructions are a prompt snippet
# containing the model's JSON schema, telling the LLM what shape of JSON to return.
output_parser = PydanticOutputParser(pydantic_object=Products)
format_instructions = output_parser.get_format_instructions()

In [15]:
# Assemble a single-message chat prompt, pipe it through the LLM and the Pydantic
# parser, then run the chain on the scraped page text.
human_text = "{instruction}\n{format_instructions}"
message = HumanMessagePromptTemplate.from_template(human_text)
prompt = ChatPromptTemplate.from_messages([message])
chain = prompt | llm | output_parser

# Values for the two template placeholders; the page text is appended to the
# extraction instruction.
chain_inputs = {
    "instruction": "Extract all the products from the below text \n" + html_text,
    "format_instructions": format_instructions,
}
products = chain.invoke(chain_inputs)

In [16]:
# Display the parsed result: the list of Product objects stored in the `Mobiles` field
products.Mobiles

[Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Cream, 256 GB)', Price=124999, Details=ProductDetails(RAM=12, ROM=256, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Green, 256 GB)', Price=124999, Details=ProductDetails(RAM=12, ROM=256, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Phantom Black, 256 GB)', Price=124999, Details=ProductDetails(RAM=12, ROM=256, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Cream, 512 GB)', Price=134999, Details=ProductDetails(RAM=12, ROM=512, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Green, 512 GB)', Price=134999, Details=ProductDetails(RAM=12, ROM=512, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Green, 1 TB)', Price=154999, Details=ProductDetails(RAM=12, ROM=1024, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Cream, 1 TB)', Price=154999, Details=ProductDetails(RAM=12, ROM=1024, Battery=5000)),
 Product(Name='SAMSUNG Galaxy S23 Ultra 5G (Phantom Black, 1 TB)', Price=1549

In [18]:
# Inspect the format instructions (including the JSON schema) that were sent to the LLM
format_instructions

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"Product": {"description": "This class is used to capture primary product details of Mobile", "properties": {"Name": {"description": "Name of the mobile.", "title": "Name", "type": "string"}, "Price": {"description": "Price of the mobile.", "title": "Price", "type": "integer"}, "Details": {"allOf": [{"$ref": "#/$defs/ProductDetails"}], "description": "Additional features of the Mobile"}}, "required": ["Name", "Price", "Details"], "title": "Product", "type": "object"}, "ProductDetails": {"description": "This class is used to 