In [30]:
!pip install langchain langchain-community llama-parse fastembed chromadb beautifulsoup4==4.12.2 python-dotenv langchain-groq fastembed unstructured[pdf]



In [31]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Placeholder fallback only: setdefault leaves a key that was already provided
# by .env or the shell untouched instead of overwriting it with the placeholder.
# Do not commit a real key here.
os.environ.setdefault("GROQ_API_KEY", "<your groq api here>")

# Read the key back so it can be passed explicitly to the chat model.
groq_api_key = os.getenv("GROQ_API_KEY")

def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split text into chunks using a ``CharacterTextSplitter``.

    Args:
        documents: The text to split; it is passed directly to ``split_text``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(documents)

def initialize_embeddings(model_name="BAAI/bge-base-en-v1.5"):
    """Create a FastEmbed embedding model.

    Args:
        model_name: Name of the embedding model, forwarded to
            ``FastEmbedEmbeddings`` as ``model_name``.

    Returns:
        The constructed ``FastEmbedEmbeddings`` instance.
    """
    return FastEmbedEmbeddings(model_name=model_name)

def create_vectorstore(documents, embed_model):
    """Build a FAISS vector store from text chunks.

    Args:
        documents: The texts to index; passed to ``FAISS.from_texts``.
        embed_model: Embedding model used to embed the texts.

    Returns:
        The vector store returned by ``FAISS.from_texts``.
    """
    # Imported here so the function works regardless of whether a later
    # notebook cell has already put FAISS into the global namespace.
    from langchain_community.vectorstores import FAISS

    return FAISS.from_texts(documents, embed_model)

def initialize_chat_model(temperature=0, model_name="llama3-8b-8192", api_key=None):
    """Create a Groq chat model client.

    Args:
        temperature: Forwarded to ``ChatGroq`` as ``temperature``.
        model_name: Forwarded to ``ChatGroq`` as ``model_name``.
        api_key: Forwarded to ``ChatGroq`` as ``api_key``.

    Returns:
        The constructed ``ChatGroq`` instance.
    """
    return ChatGroq(
        temperature=temperature,
        model_name=model_name,
        api_key=api_key,
    )


In [32]:
from langchain.chains.question_answering import load_qa_chain
import os
import requests
import json
import bs4

In [33]:
!pip install faiss-cpu



In [35]:
from langchain.vectorstores import FAISS

In [45]:
def get_info(raw_html):
    """Ask the chat model to extract product attributes from an HTML snippet.

    The HTML is chunked, embedded and indexed in a vector store. The chunks
    most similar to the extraction prompt are then passed, together with the
    prompt, to a "stuff" question-answering chain.

    Args:
        raw_html: HTML of a product page. It is converted with ``str()``
            before being chunked.

    Returns:
        The answer produced by the chain. The answer is also printed, so
        notebook cells that rely on the printed output keep working. In a
        notebook, end the call with ``;`` to avoid echoing the return value.

    Raises:
        ValueError: If the input yields no text chunks (for example an empty
            string), since there would be nothing to index or analyse.
    """
    texts = split_documents(str(raw_html))
    if not texts:
        # Fail early with a clear message instead of an obscure error from
        # building an index over zero texts.
        raise ValueError("raw_html produced no text chunks; nothing to analyse")

    embeddings = initialize_embeddings()
    docsearch = create_vectorstore(texts, embeddings)
    chain = load_qa_chain(initialize_chat_model(api_key=groq_api_key), chain_type="stuff")

    # NOTE(review): this is a plain string, not an f-string, so the
    # "{raw_html}" token near the end is sent to the model literally. The HTML
    # itself reaches the model through the retrieved documents. Confirm that
    # this is intended before changing the prompt.
    query = """I have an HTML code snippet of a product page from an e-commerce website.
    The snippet contains various attributes related to the product.
    Extract the meaningful attributes relevant to e-commerce contexts such as product names, prices, descriptions, and images.
    Identify the CSS selectors or XPaths for each extracted attribute to pinpoint their location within the HTML structure.
    Return the output in JSON Format without giving any explanation.
    Do not say "Here is the extracted data in JSON format:"
    Extract the class and give the sutiable key names based on the class name. For the value of values, extract the value and create the suitable JSON format.
    For the css_selector, write the value as class name in the format ".class-name".
    For the xpath, write the value as class name in the format "//div[@class='product-name']".
    Generate the key-value pairs as much as the class. Do not creat own class name.

    Follow the examples given in the below.

### Example ###

Input HTML:
            <div class="product-name">Sample Product</div>
            <span class="product-price">$19.99</span>
            <div class="product-description">This is a sample product description.</div>
            <img class="product-image" src="sample-product.jpg" />


Output JSON:
{
    "product_name": {
        "value": "Sample Product",
        "css_selector": ".product-name",
        "xpath": "//div[@class='product-name']"
    },
    "price": {
        "value": "$19.99",
        "css_selector": ".product-price",
        "xpath": "//span[@class='product-price']"
    },
    "description": {
        "value": "This is a sample product description.",
        "css_selector": ".product-description",
        "xpath": "//div[@class='product-description']"
    },
    "image_url": {
        "value": "sample-product.jpg",
        "css_selector": ".product-image",
        "xpath": "//img[@class='product-image']/@src"
    }
}

HTML: {raw_html}

JSON:
"""
    # Retrieve the chunks most similar to the prompt and let the chain answer
    # the prompt using only those chunks as context.
    docs = docsearch.similarity_search(query)
    answer = chain.run(input_documents=docs, question=query)
    print(answer)
    return answer

In [49]:
# Demo: run the extraction on a small hand-written product snippet.
get_info(
    """
     <div class="product-name">Addidas shoes</div>
            <span class="product-price">$100.99</span>
            <div class="product-description">This is a Addidas shoes.</div>
            <img class="product-image" src="sample-product.jpg" />

    """
    )

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

{
    "product_name": {
        "value": "Addidas shoes",
        "css_selector": ".product-name",
        "xpath": "//div[@class='product-name']"
    },
    "price": {
        "value": "$100.99",
        "css_selector": ".product-price",
        "xpath": "//span[@class='product-price']"
    },
    "description": {
        "value": "This is a Addidas shoes.",
        "css_selector": ".product-description",
        "xpath": "//div[@class='product-description']"
    },
    "image_url": {
        "value": "sample-product.jpg",
        "css_selector": ".product-image",
        "xpath": "//img[@class='product-image']/@src"
    }
