In [1]:
from tools.html_processor import HTMLProcessor
from tools.prompt_templator import PromptTemplator
from tools.llm_invocator import LLMInvocator
from tools.stealth_scraper import StealthScraper
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# One shared instance of each pipeline tool, reused by every later cell:
# stealth_scraper fetches the product pages, html_processor cleans the HTML,
# prompt_templator builds the extraction prompt, llm_invocator calls the model.
stealth_scraper = StealthScraper()
html_processor = HTMLProcessor()
prompt_templator = PromptTemplator()
llm_invocator = LLMInvocator()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the spec sheet and give every row a 1-based sequential `id`; later cells
# use it as the join key between the spec sheet and the scrape/LLM results.
df = pd.read_csv("01_llmpipeline/specbook.csv").assign(
    id=lambda frame: range(1, len(frame) + 1)
)

In [3]:
# Scrape every product URL concurrently and flatten each result into one record.
# The loop variable is named `row_id` rather than `id` so the builtin `id()` is
# not rebound for the rest of the kernel session.
product_urls = df['product_url'].to_list()

with ThreadPoolExecutor(max_workers=10) as executor:
    # Executor.map yields results in the order the URLs were submitted, so
    # zipping against df['id'] pairs each result with the row it came from.
    scrape_results = executor.map(stealth_scraper.scrape_url, product_urls)
    product_scrape_results = [
        {
            'id': row_id,
            'product_url': result.url,
            'success': result.success,
            # Empty/None content is recorded as length 0 instead of failing on len(None).
            'content_length': len(result.content) if result.content else 0,
            'status_code': result.status_code,
            'final_method': result.final_method,
            'error_reason': result.error_reason,
            'page_issues': result.page_issues,
            'html_content': result.content,
            # Full serialized result kept alongside the flattened fields.
            'full_result': result.model_dump_json()
        }
        for row_id, result in zip(df['id'], scrape_results)
    ]

product_scrape_results_df = pd.DataFrame(product_scrape_results)

2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:390] - Successfully scraped URL with requests (224015 chars)
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:309] - Rotated stealth configuration
2025-07-05 23:28:36 - StealthScraper - INFO - [stealth_scraper.py:390] - Successfully scraped URL with requests (115442 chars)
2025-07-05 23:28:36 - Ste

In [4]:
# Tally scrape outcomes per (success, status_code, final_method) combination.
product_scrape_results_df.value_counts(subset=['success', 'status_code', 'final_method'])

success  status_code  final_method            
True     200          ScrapingMethod.REQUESTS     73
False    500          ScrapingMethod.FIRECRAWL    10
         404          ScrapingMethod.REQUESTS      4
Name: count, dtype: int64

In [5]:
# Attach the scrape outcome to every row of the original spec sheet.
# Only 'id' plus the columns that `df` does not already have are taken from the
# scrape frame, so `product_url` (and anything else owned by `df`) keeps the
# spec-sheet value and no `_x`/`_y` suffixed duplicates are created. Because this
# cell rebinds its own input name, that selection is also what keeps it safe to
# re-run: a second run merges the same scrape-only columns again instead of
# producing `category_x`/`category_y`.
scrape_only_columns = ['id'] + [
    column for column in product_scrape_results_df.columns if column not in df.columns
]
product_scrape_results_df = df.merge(
    product_scrape_results_df[scrape_only_columns], on='id', how='left'
)

# NOTE(review): the disabled checkpoint lines below refer to `product_results_df`,
# a name that is not defined in any visible cell -- presumably a stale name for
# `product_scrape_results_df`; confirm before re-enabling them.
# product_results_df.drop(columns=['html_content', 'full_result']).to_csv("01_llmpipeline/1-specbook_scrape_results.csv", index=False)
# product_results_df[['id', 'product_url', 'full_result']].to_csv("01_llmpipeline/1-specbook_scrape_content.csv", index=False)
# product_results_df.to_csv("01_llmpipeline/1-specbook_scrape.csv", index=False)
# product_results_df = pd.read_csv("01_llmpipeline/1-specbook_scrape.csv")

In [6]:
# Build the cleaned HTML and the extraction prompt for every successfully scraped page.
# `== True` (rather than bare truthiness) so that a missing/NaN flag counts as a failure.
product_scrape_results_df_success = product_scrape_results_df[product_scrape_results_df['success'] == True]

# Work on a copy of the full frame: rows that failed to scrape stay in the output,
# and their new columns are simply left empty.
product_prompts_df = product_scrape_results_df.copy()

# `row_id` instead of `id`, so the builtin `id()` is not rebound in the kernel.
for row_id, product_url, html_content in zip(
    product_scrape_results_df_success['id'],
    product_scrape_results_df_success['product_url'],
    product_scrape_results_df_success['html_content'],
):
    cleaned_html = html_processor.clean_html(str(html_content))
    cleaned_html_json = cleaned_html.model_dump_json()
    prompt = prompt_templator.product_extraction(product_url, cleaned_html_json)

    # Compute the row selector once per product instead of once per column written.
    row_mask = product_prompts_df['id'] == row_id

    product_prompts_df.loc[row_mask, 'cleaned_html'] = cleaned_html_json
    product_prompts_df.loc[row_mask, 'cleaned_html_len'] = len(cleaned_html_json)

    product_prompts_df.loc[row_mask, 'prompt'] = prompt
    product_prompts_df.loc[row_mask, 'prompt_len'] = len(prompt)

In [7]:
# final_df.to_csv("01_llmpipeline/specbook_scrape_results.csv", index=False)
# product_prompts_df = pd.read_csv("01_llmpipeline/llm_results.csv")

In [8]:
# Independent copy, so adding the LLM response column leaves product_prompts_df untouched.
llm_results_df = product_prompts_df.copy(deep=True)

In [10]:
# Ask the LLM to extract product specs for every successfully scraped row and
# store the result (as JSON) in the `llm_response` column. Rows that were not
# scraped, or whose call/validation fails, get an empty extraction so that every
# row ends up with a parseable JSON payload.
# `row_id` instead of `id`, so the builtin `id()` is not rebound in the kernel.
for row_id, success, prompt in zip(llm_results_df['id'], llm_results_df['success'], llm_results_df['prompt']):
    # Fresh all-empty extraction per row: the error paths below mutate it,
    # so it must not be shared between iterations.
    extraction = PromptTemplator.ProductExtractionOutput(
        image_url="",
        type="",
        description="",
        model_no="",
        product_link="",
        qty="",
        key="",
    )

    # Keep the explicit `== True`: a NaN flag is truthy in Python but must be
    # treated as "not scraped" (those rows also have no prompt).
    if success == True:
        try:
            llm_response = llm_invocator.invoke_llm(
                model_provider="openai",
                llm_model_name="gpt-4.1",
                prompt=prompt
            )
        except Exception as e:
            # A single failed API call should not abort the whole batch; record
            # the failure on this row and move on, like validation failures do.
            print(f"Error invoking LLM: {e}")
            extraction.description = "Error invoking LLM"
        else:
            try:
                extraction = PromptTemplator.ProductExtractionOutput.model_validate_json(llm_response)
            except Exception as e:
                print(f"Error validating response: {e}")
                extraction.description = "Error validating response"

    # Written row by row so results gathered so far remain in the frame if the
    # loop is interrupted part-way through.
    llm_results_df.loc[llm_results_df['id'] == row_id, 'llm_response'] = extraction.model_dump_json()

2025-07-05 23:32:30 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:33 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:36 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:38 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:41 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:43 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:45 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2025-07-05 23:32:48 - httpx - INFO - [_client.py:1025] - HTTP Request: POST https://api.op

In [11]:
# Persist the full pipeline frame (scrape fields, prompts and LLM responses), without the index.
llm_results_df.to_csv(path_or_buf="01_llmpipeline/llm_results.csv", index=False)

In [12]:
# Non-null count per column: shows which columns are populated only for some rows.
llm_results_df.count(axis=0)

category            87
product_url         87
id                  87
success             87
content_length      87
status_code         87
final_method        87
error_reason        14
page_issues         87
html_content        73
full_result         87
cleaned_html        73
cleaned_html_len    73
prompt              73
prompt_len          73
llm_response        87
dtype: int64

In [19]:
# Re-validate every stored JSON payload against the extraction schema and turn
# each validated model into a plain dict, giving one spec record per row.
llm_result_dicts = [
    dict(parsed_output)
    for parsed_output in map(
        PromptTemplator.ProductExtractionOutput.model_validate_json,
        llm_results_df['llm_response'].to_list(),
    )
]
product_specs_df = pd.DataFrame(data=llm_result_dicts)

In [20]:
# Write the extracted product specs to disk, without the index.
product_specs_df.to_csv(path_or_buf="01_llmpipeline/product_specs.csv", index=False)