In [None]:
%pip install requests
%pip install beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup

# Search-results page whose product cards are scraped below.
url = 'https://www.emag.ro/search/sony%20xm5?ref=effective_search'

# requests has no default timeout, so bound the wait explicitly; and fail
# loudly on an HTTP error instead of silently parsing an error page into an
# empty list of product links.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')
# Passing the whole class string matches anchors whose class attribute is
# exactly this value (same classes, same order).
product_links = soup.find_all('a', class_='card-v2-title semibold mrg-btm-xxs js-product-url')

# Raw html content of the page
# print(soup)
# for link in product_links:
#     print(link['href'])

In [None]:
import re

In [None]:
def get_price_from_html(html):
    """Extract the discounted price embedded in a product page's inline scripts.

    Every ``<script>`` whose text mentions ``EM.product_id`` is searched for an
    assignment of the form ``EM.productDiscountedPrice = <number>;``. The first
    such number that parses as a float is returned.

    Args:
        html: Markup of the product page, as accepted by ``BeautifulSoup``.

    Returns:
        The price as a ``float``, or ``None`` when no script contains a
        parseable ``EM.productDiscountedPrice`` assignment.
    """
    # The dot is escaped so it only matches a literal '.', and whitespace
    # around '=' is optional so minified or re-indented scripts still match.
    price_pattern = re.compile(r'EM\.productDiscountedPrice\s*=\s*([0-9.]+)\s*;')

    for script in BeautifulSoup(html, 'html.parser').find_all('script'):
        script_text = script.text
        if 'EM.product_id' not in script_text:
            continue

        match = price_pattern.search(script_text)
        if match is None:
            continue

        try:
            return float(match.group(1))
        except ValueError:
            # The character class also accepts things like "1.2.3"; treat an
            # unparseable capture as "no price here" and keep looking.
            continue

    return None


def _fetch_review_page(endpoint_url, offset, page_size=10, timeout=30):
    """Download one page of the review-list JSON and return its 'reviews' object."""
    params = {
        "source_id": 7,  # value carried over unchanged; its meaning is not documented here
        "page[limit]": page_size,
        "page[offset]": offset,
        "sort[created]": "desc"
    }
    response = requests.get(endpoint_url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()['reviews']


def get_reviews_by_url(product_url):
    """Collect all reviews of one eMAG product as a list of flat dicts.

    The review count and the reviews themselves come from the JSON endpoint
    ``https://www.emag.ro/product-feedback/<product path>/reviews/list``,
    paged 10 at a time. The count is read from that endpoint rather than from
    the product page HTML because the HTML approach only worked for products
    that had at least one review. The product title and price are read from
    the product page itself.

    Args:
        product_url: Absolute URL of the product page. A query string or
            fragment is ignored and a missing trailing slash is added.

    Returns:
        One dict per review with the keys ``product_title``,
        ``product_price``, ``review_title``, ``review_rating``,
        ``review_verified_buyer`` and ``review_content`` (in that order).
        An empty list when the product has no reviews. ``product_title`` /
        ``product_price`` are ``None`` when they cannot be found on the page.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status from either request.
        KeyError: If the endpoint's JSON lacks the expected fields.
    """
    # Local import: keeps this cell runnable without touching the import cell.
    from urllib.parse import urlsplit

    page_size = 10
    timeout_s = 30

    # --------------------- URL normalisation ---------------------

    # Drop any query string / fragment (search-result links may carry tracking
    # parameters) and make sure the path ends with a slash.
    parts = urlsplit(product_url)
    product_path = parts.path
    if not product_path.endswith('/'):
        product_path += '/'
    product_url = parts._replace(path=product_path, query='', fragment='').geturl()

    endpoint_url = 'https://www.emag.ro/product-feedback/' + product_path.lstrip('/') + 'reviews/list'

    # --------------------- number of reviews ---------------------

    first_page = _fetch_review_page(endpoint_url, 0, page_size, timeout_s)
    reviews_number = int(first_page['count'])

    if reviews_number <= 0:
        # Nothing to collect, so the product page is not downloaded at all.
        return []

    # --------------------- product title & price ---------------------

    product_response = requests.get(product_url, timeout=timeout_s)
    product_response.raise_for_status()
    product_html = product_response.text
    product_soup = BeautifulSoup(product_html, 'html.parser')

    title_tag = product_soup.find('h1', class_='page-title')
    if title_tag is not None:
        # get rid of multiple whitespaces and \n
        product_title = re.sub(r'\s+', ' ', title_tag.get_text()).strip()
    else:
        product_title = None

    # The price is read from the inline script rather than from the visible
    # price tag because the tag text comes in too many formats (e.g.
    # 'de la xxx.xx Lei' when multiple offers exist). The helper returns None
    # when no price is found, so product_price is always bound.
    product_price = get_price_from_html(product_html)

    # --------------------- get reviews ---------------------

    merged_list = []
    offset = 0
    page = first_page  # reuse the page already downloaded for the count

    while True:
        for item in page['items']:
            merged_list.append({
                'product_title': product_title,
                'product_price': product_price,
                'review_title': item['title'],
                'review_rating': item['rating'],
                'review_verified_buyer': item['is_bought'],
                'review_content': item['content']
            })

        offset += page_size
        if offset >= reviews_number:
            break
        page = _fetch_review_page(endpoint_url, offset, page_size, timeout_s)

    return merged_list

# product_reviews = get_reviews_by_url('https://www.emag.ro/mouse-wireless-logitech-mx-master-3s-performance-8000-dpi-silent-usb-bt-graphite-910-006559/pd/DZMBWVMBM/')
# for review in product_reviews:
#     print(review)

In [None]:
# Gather the reviews of every product found on the search page into one flat list.
reviews = []
for product_anchor in product_links:
    reviews.extend(get_reviews_by_url(product_anchor['href']))

In [None]:
# Quick visual check: one collected review dict per output line.
for review_row in reviews:
    print(review_row)

In [None]:
import csv

csv_file = 'reviews_output.csv'

if reviews:
    # Every row produced by get_reviews_by_url has the same keys, so the first
    # row defines the column order.
    header = list(reviews[0].keys())

    # Explicit UTF-8 so review text with diacritics is written correctly
    # regardless of the platform's default encoding; newline='' is what the
    # csv module expects for files it writes.
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(reviews)

    print(f"Data has been written to {csv_file}")
else:
    # Without any rows there is no header to derive; skip the export instead
    # of failing on reviews[0].
    print("No reviews were collected; nothing was written.")