In [191]:
!pip install --upgrade openai requests beautifulsoup4 selenium webdriver-manager pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [192]:
import openai
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set OpenAI API key
openai.api_key = 'placeholder'

# Function to scrape data from a sample restaurant website
def scrape_website(url):
    options = webdriver.ChromeOptions()
    options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
    options.add_argument("--headless")  # Run headless Brave
    service = ChromeService(executable_path='/Users/panxuanen/Downloads/chromedriver-mac-arm64/chromedriver')
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(5)  # Wait for JavaScript to load content
    page_source = driver.page_source
    driver.quit()
    return page_source

def clean_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style", "iframe", "noscript"]):
        script.decompose()

    # Get text from the remaining HTML
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

# Function to split the HTML content into manageable chunks while preserving structure
def chunk_content(content, chunk_size=3000):
    start = 0
    while start < len(content):
        end = start + chunk_size
        if end < len(content):
            end = content.rfind('>', start, end) + 1
            if end == 0:
                end = start + chunk_size
        yield content[start:end]
        start = end

# Function to extract data using OpenAI with retry mechanism and rate limit handling
def extract_basic_data(html_content):
    prompt_template = """
    Extract the following information from the provided HTML content:
    1. Restaurant Name
    2. Address
    3. Phone Number
    4. Email
    If no relevant information is found, indicate 'no data' and move on.
    HTML Content: {}
    """
    max_retries = 5
    retry_count = 0
    results = []

    for chunk in chunk_content(html_content):
        prompt = prompt_template.format(chunk)
        while retry_count < max_retries:
            try:
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "assistant", "content": prompt}
                    ]
                )
                results.append(response.choices[0].message.content)
                break
            except openai.RateLimitError as e:
                retry_count += 1
                wait_time = 2 ** retry_count  # Exponential backoff
                print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            except openai.OpenAIError as e:
                print(f"OpenAI error: {e}")
                raise
        if retry_count == max_retries:
            raise Exception("Exceeded maximum retry attempts.")
    return results

# Function to extract data using OpenAI with retry mechanism and rate limit handling
def extract_dietary_and_reviews(html_content):
    prompt_template = """
    From the provided HTML content, extract:
    1. List all mentioned dietary offerings such as vegan, vegetarian, gluten-free, etc., and combine them into a single consolidated list.
    2. Customer reviews and summarize them into a couple of concise sentences.
    If no relevant information is found, indicate 'no data' and move on.
    HTML Content: {}
    """
    max_retries = 5
    retry_count = 0
    dietary_offerings = []
    review_summaries = []

    for chunk in chunk_content(html_content):
        prompt = prompt_template.format(chunk)
        while retry_count < max_retries:
            try:
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "assistant", "content": prompt}
                    ]
                )
                content = response.choices[0].message.content
                # Processing the response to extract dietary offerings and reviews
                lines = content.split('\n')
                for line in lines:
                    if line.startswith('1.'):
                        dietary_offerings.append(line.split(':')[1].strip())
                    elif line.startswith('2.'):
                        review_summaries.append(line.split(':')[1].strip())
                break
            except openai.RateLimitError as e:
                retry_count += 1
                wait_time = 2 ** retry_count  # Exponential backoff
                print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            except openai.OpenAIError as e:
                print(f"OpenAI error: {e}")
                raise
        if retry_count == max_retries:
            raise Exception("Exceeded maximum retry attempts.")
    
    # Summarize dietary offerings and reviews
    dietary_summary = ' | '.join(set(dietary_offerings)) if dietary_offerings else 'No specific dietary offerings provided.'
    reviews_summary = ' '.join(review_summaries) if review_summaries else 'No detailed reviews available.'

    return dietary_summary, reviews_summary

# Scrape data from website
# url = 'https://www.thefatfowl.com'
url = 'https://www.mitrthainyc.com/'
yelp_url = 'https://www.yelp.com/biz/mitr-thai-new-york'

html_content = clean_html(scrape_website(url))
yelp_html_content = clean_html(scrape_website(yelp_url))

# Extract data using OpenAI
extracted_data = extract_basic_data(html_content)
dietary_info, reviews_info = extract_dietary_and_reviews(yelp_html_content)


In [206]:
print(extracted_data)
print("Dietary Offerings:", dietary_info)
print("Customer Reviews:", reviews_info)

['1. Restaurant Name: Mitr Thai\n2. Address: 37 West 46th St, New York, NY 10036\n3. Phone Number: 212-466-6699\n4. Email: info@mitrthainyc.com', '1. Restaurant Name: no data\n2. Address: no data\n3. Phone Number: no data\n4. Email: no data', 'The provided HTML content does not contain any information related to the restaurant name, address, phone number, or email.', '1. Restaurant Name: No data\n2. Address: No data\n3. Phone Number: No data\n4. Email: No data', '1. Restaurant Name: Mitr Thai\n2. Address: 37 West 46th St, New York, NY 10036\n3. Phone Number: 212-466-6699\n4. Email: info@mitrthainyc.com', '1. Restaurant Name: no data\n2. Address: no data\n3. Phone Number: no data\n4. Email: no data']
Dietary Offerings:  | Based on the provided HTML content, no specific mentions of dietary offerings (vegan, vegetarian, gluten-free, etc.) were found. | The restaurant offers vegan-friendly options based on customer reviews. | Vegan, Vegetarian, Gluten-Free | vegan, vegetarian, gluten-free.

In [208]:
def summarize_dietary_offerings(offerings_str):
    prompt = f"""
    Given the list of mentions about dietary offerings: {offerings_str}
    Summarize this into a list of unique dietary offerings. If no specific dietary offerings are mentioned, ignore.
    """
    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "assistant", "content": prompt}
            ]
        )
        
        return response.choices[0].message.content.strip().split(', ')
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return []
        
def parse_extracted_data(extracted_data, dietary_info, reviews_info):
    # Initialize an empty dictionary with all fields set to 'no data'
    best_data = {
        'Restaurant Name': 'no data',
        'Address': 'no data',
        'Phone Number': 'no data',
        'Email': 'no data',
        'Dietary Offerings': 'no data',
        'Customer Reviews': 'no data'
    }
    
    # Iterate over all extracted data entries
    for entry in extracted_data:
        current_data = {}
        lines = entry.split('\n')
        for line in lines:
            if 'Restaurant Name' in line:
                current_data['Restaurant Name'] = line.split(': ')[1]
            elif 'Address' in line:
                current_data['Address'] = line.split(': ')[1]
            elif 'Phone Number' in line:
                current_data['Phone Number'] = line.split(': ')[1]
            elif 'Email' in line:
                current_data['Email'] = line.split(': ')[1]
            # elif 'Dietary Offerings' in line:
            #     current_data['Dietary Offerings'] = line.split(': ')[1]
            # elif 'Customer Reviews' in line:
            #     current_data['Customer Reviews'] = line.split(': ')[1]
        

        # Update the best_data if current_data has more complete information
        is_more_complete = all(current_data.get(key) != 'no data' for key in current_data)
        if is_more_complete and ('no data' in best_data.values()):
            best_data.update(current_data)
            break  # Stop searching after finding the first complete entry

    
    best_data['Dietary Offerings'] = summarize_dietary_offerings(dietary_info)
    best_data['Customer Reviews'] = reviews_info

    return best_data


# Assuming 'extracted_data' is a list of strings as described
parsed_data = parse_extracted_data(extracted_data, dietary_info, reviews_info)

# Store parsed data in a Pandas DataFrame
import pandas as pd
df = pd.DataFrame([parsed_data])
pd.set_option('display.width', None)        
pd.set_option('display.max_colwidth', None)
df


Unnamed: 0,Restaurant Name,Address,Phone Number,Email,Dietary Offerings,Customer Reviews
0,Mitr Thai,"37 West 46th St, New York, NY 10036",212-466-6699,info@mitrthainyc.com,[- Vegan\n- Vegetarian\n- Gluten-Free],"Reviews are generally positive with a rating of 4.6 out of 5, indicating high satisfaction among customers. Customers generally enjoyed the food at Mitr Thai, with positive comments on the quality of dishes such as pad see ew, crab legs in curry sauce, shrimp with eggplant, pad Thai, and green curry chicken. The restaurant was praised for its ambience and attentive service, although some customers felt slightly rushed during their dining experience. One customer noted a preference for softer batter on the crab legs but still enjoyed the dish overall. Customers also mentioned a lychee martini on tap. ""I can't speak to the dine-in experience, but need to give the food a 5* review!"""
