In [21]:
from zenrows import ZenRowsClient
from bs4 import BeautifulSoup
import re
import json

client = ZenRowsClient("2fb712f035250fa0feba32543c584318e4894544")
url = "https://www.g2.com/products/toast/reviews?order=most_recent&page=31"
params = {"js_render":"true","premium_proxy":"true",}

response = client.get(url, params=params)

# Assuming response.text contains the HTML source
soup = BeautifulSoup(response.text, 'html.parser')

# Remove all spans containing 'Review collected by and hosted on G2.com.'
for span in soup.find_all('span', string=re.compile(r'Review collected by and hosted on G2.com.', re.I)):
    span.decompose()

# Find all review containers
reviews = soup.find_all('div', itemprop='review')

# List to store structured review data
review_list = []

for review in reviews:
    review_data = {}
    
    # Extract metadata from 'data-track-in-viewport-options' attribute
    data_attr = review.get('data-track-in-viewport-options')

    if data_attr:
        metadata = json.loads(data_attr.replace('&quot;', '"'))  # Fix JSON encoding
        review_data.update(metadata)

    review_title = review.find('div', itemprop='name')
    review_data['title'] = review_title.get_text(strip=True)

    review_data['author_name'] =  None
    review_data['user_url'] = None

    # Extract reviewer name (check for anonymous vs named)
    author_name = review.find('meta', itemprop='name')
    author_url = review.find('meta', itemprop='url')

    if author_name:
        review_data['author_name'] = author_name['content']
        review_data['user_url'] = author_url['content']

    # Extract reviewer title and company size using div instead of span
    title_divs = review.find_all('div', class_='mt-4th')
    div_texts = [div.get_text(strip=True) for div in title_divs]
    
    if len(div_texts) == 1:
        review_data['author_company_category'] = div_texts[0]
    elif len(div_texts) > 1:
        review_data['author_title'] = div_texts[0]
        review_data['author_company_category'] = div_texts[-1]

    # Extract rating
    rating_meta = review.find('meta', itemprop='ratingValue')
    review_data['rating'] = rating_meta['content'] if rating_meta else None
    
    # Extract 'like' section using regex
    like_section = review.find('div', string=re.compile(r'What do you like best about', re.I))
    if like_section:
        like_text = like_section.find_next('p', class_='formatted-text')
        review_data['body_like'] = like_text.get_text(strip=True) if like_text else None
    
    # Extract 'dislike' section using regex
    dislike_section = review.find('div', string=re.compile(r'What do you dislike about', re.I))
    if dislike_section:
        dislike_text = dislike_section.find_next('p', class_='formatted-text')
        review_data['body_dislike'] = dislike_text.get_text(strip=True) if dislike_text else None
    
    # Extract 'usage' section using regex
    usage_section = review.find('div', string=re.compile(r'What problems .* solving and how', re.I))
    if usage_section:
        usage_text = usage_section.find_next('p', class_='formatted-text')
        review_data['body_solutions'] = usage_text.get_text(strip=True) if usage_text else None
    
    review_list.append(review_data)

# Print structured review data
for review in review_list:
    print(review)

{'survey_response_id': 60880, 'reviewer_id': 62486, 'video_review': False, 'published_date': '20150904', 'product_id': 16426, 'product_uuid': '04a69f9a-4818-4716-880b-d4cf6fe77ab4', 'product': 'Toast', 'vendor_id': 13799, 'product_type': 'Software', 'name': 'Event::SurveyResponses::Viewed', 'title': '"Excellent results"', 'author_name': None, 'user_url': None, 'author_company_category': 'Mid-Market(51-1000 emp.)', 'rating': '4.5', 'body_like': 'The ability to digitize the paperwork needed to process employee payroll and benefits related items.', 'body_dislike': 'The usage level on our employee side. If we use it more, we could leverage its services more.', 'body_solutions': 'We have increased employee to corporate communication'}
{'survey_response_id': 60871, 'reviewer_id': 62477, 'video_review': False, 'published_date': '20150904', 'product_id': 16426, 'product_uuid': '04a69f9a-4818-4716-880b-d4cf6fe77ab4', 'product': 'Toast', 'vendor_id': 13799, 'product_type': 'Software', 'name': 'E

In [25]:
import re
clean_string = lambda s: re.sub(r'[^a-z0-9]+', '-', s.strip().lower()).strip('-')

s = "Hello, World! + (123)"
print(clean_string(s))  # Output: "HelloWorld123"

hello-world-123
