In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import json
import sqlite3
import pandas as pd
import hashlib
import time
import csv

### Create base user-agent

In [2]:
# Default request headers: a fixed desktop-Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

____

### Created functions that help me to clean data and have it in good shape
* clean_text - Cleans text of non-letter characters (e.g., spaces, punctuation marks) at the beginning and end of a line.
* add_space - Adds a period with a space between a lowercase letter and an uppercase letter if they follow each other.

In [3]:
def clean_text(text):
    """Strip leading and trailing runs of non-word characters from ``text``.

    Word characters (letters, digits, underscore) and everything between the
    first and the last word character are kept untouched.
    """
    # One "junk" character class, anchored once at the start and once at the end.
    edge_junk = r'[^\wА-Яа-яІіЇїЄє]+'
    return re.sub(rf'^{edge_junk}|{edge_junk}$', '', text)

In [4]:
def add_space(text):
    """Insert ". " between a lowercase Cyrillic letter directly followed by an uppercase one.

    Sentences that were glued together (e.g. "добреПогано") are separated into
    "добре. Погано". The ranges а-я / А-Я do not contain the Ukrainian letters
    і, ї, є, ґ (nor their capitals), so they are listed explicitly; otherwise
    boundaries such as "...оІ..." would be missed on Ukrainian text.
    """
    return re.sub(r'([а-яіїєґ])([А-ЯІЇЄҐ])', r'\1. \2', text)

____

### How scrape_website works
* We get each field by html tags or classes
* By link we can get to item page and get additional info like:
    - Shops where people can buy this item
    - Item characteristics
    - Item description
    - Item image link
    - Item path that can give useful info like category or subcategory
* Characteristics and shops are in json format for comfortable use

In [5]:
pattern = r"Сподобалось:\s*(.*?)\s*Не сподобалося:\s*(.*?)\s*Досвід використання:\s*(.*)"
def scrape_website(review):
    """Scrape one review from the listing page plus details from its product page.

    Parameters
    ----------
    review : element of the reviews listing (a BeautifulSoup tag) that contains
        the product title link, the recommendation badge, the
        liked/disliked/experience row and the free-text comment.

    Returns
    -------
    tuple
        ``(name, ID, link, pic, recommend, liked, disliked, experience, comment,
        shops_json, path, characteristics_json, description)``.
        ``ID`` is the MD5 hex digest of the product name. ``pic``, ``recommend``
        and ``comment`` are ``np.nan`` when missing. ``shops_json`` and
        ``characteristics_json`` are JSON strings.

    Raises
    ------
    ValueError
        If the product title is empty or the liked/disliked/experience text does
        not match ``pattern``. Callers catch ``Exception`` and skip the review.
    Other exceptions (e.g. ``AttributeError`` for a missing page element, or a
    ``requests`` error) propagate unchanged and are likewise meant to be caught
    by the caller.
    """
    # --- Fields available on the listing page itself --------------------------
    title_tag = review.find(class_="reviews-info-product__title")
    name = title_tag.text.strip()
    if not name:
        raise ValueError("Product title is empty.")

    ID = hashlib.md5(name.encode()).hexdigest()
    link = "https://hotline.ua" + title_tag["href"]

    recommend_tag = review.find("div", class_=["review__recommend review__recommend--like",
                                               "review__recommend review__recommend--dislike"])
    recommend = recommend_tag.text.strip() if recommend_tag else np.nan

    review_text = review.find("div", class_="review__row-experience").text.strip().replace("\n", ".")
    match = re.search(pattern, review_text, re.DOTALL)
    if not match:
        # Fail before the product page is requested: an unparsable review is
        # skipped by the caller anyway, so the extra HTTP request would be wasted.
        raise ValueError("Failed to parse the text.")
    liked, disliked, experience = (clean_text(part.strip()) for part in match.groups())

    comment = review.find("div", class_="review__row-comment").text.strip() or np.nan

    # --- Fields that require the product page ---------------------------------
    # A timeout keeps a stalled connection from hanging the whole scraping run.
    response_link = requests.get(link, headers=headers, timeout=30).text
    response_html = BeautifulSoup(response_link, "html.parser")

    img_tag = response_html.find("img", attrs={"data-tracking-id": "product-3"})
    img_src = img_tag.get("src") if img_tag is not None else None
    pic = "https://hotline.ua" + img_src if img_src else np.nan

    shops_list = []
    for shop in response_html.find_all("div", class_="list__item flex content"):
        title = shop.find("a", class_="shop__title")
        shops_list.append({
            "name": title.text.strip(),
            "link": "https://hotline.ua" + title["href"],
            "image": "https://hotline.ua" + shop.find(class_="shop__img-container").find("img")["src"],
        })
    shops_json = json.dumps(shops_list, ensure_ascii=False, indent=4)

    # Breadcrumb items are separated by a newline plus indentation in the page
    # text; turn that separator into "/" to get a category path.
    path = response_html.find("ul", class_="breadcrumbs__list").text.strip().replace("\n          ", "/").replace("//","/")

    characteristics = {}
    characteristics_table = response_html.find(class_="specifications__table")
    for tr in characteristics_table.find_all("tr"):
        try:
            cells = tr.find_all("td")
            characteristics[cells[0].contents[0].strip()] = cells[1].text.strip()
        except Exception:
            # Best effort: rows that are not a plain "label / value" pair are
            # ignored. `Exception` (not a bare except) lets KeyboardInterrupt
            # through so the cell can still be stopped.
            continue
    characteristics_json = json.dumps(characteristics, ensure_ascii=False, indent=4)

    description = response_html.find(class_="html-clamp description__content").text.strip()
    return name, ID, link, pic, recommend, liked, disliked, experience, comment, shops_json, path, characteristics_json, description

_____

### Create DataFrame we want to get data in

In [6]:
# Empty frame for the full dataset; the column order matches the tuple
# returned by scrape_website.
reviews_df = pd.DataFrame(
    columns=[
        "name",
        "ID",
        "link",
        "pic",
        "recommend",
        "liked",
        "disliked",
        "experience",
        "comment",
        "shops_json",
        "path",
        "characteristics_json",
        "description",
    ]
)

In [None]:
!pip install fake_useragent

_____

<p style="color:red; font-size: 36px"><b>IF YOU NEED ONLY REVIEWS READ THE TEXT BELOW AND GO FURTHER</b></p>

# How i process each page (All data for full dataset)
* We use a random User-Agent for every page to reduce the chance of being detected as a bot. It is better practice, but I tested it with the pre-defined User-Agent and it worked the same.
* The code prints the number of the page being processed, and if something goes wrong it prints the error message. In our case every message means that the review's format is not suitable for scraping, so that review is skipped. This way we keep only the reviews we need.
* Hotline serves only 13-14 pages in a row; after that every review produces an error message, which means you should stop the cell and save the DataFrame to .csv. The errors do not stop the cell by themselves, but they are a sign that no more data can be collected right now. You can collect data again later (after 1-2 hours). I think 13-14 pages of reviews is enough, because that is 110-130 reviews.
* To run cell below you need to choose page you want to start from:
  - for page in range("YOUR_START_PAGE", "YOUR_END_PAGE")
  - "YOUR_LINK" should be in this format from Hotline: https://hotline.ua/ua/reviews/products/. You can change categories that are on the right.

In [None]:
from fake_useragent import UserAgent
import random
ua = UserAgent()

# Keys for one scraped row, in the order scrape_website returns its values.
review_columns = ["name", "ID", "link", "pic", "recommend", "liked", "disliked",
                  "experience", "comment", "shops_json", "path", "characteristics_json", "description"]

# Rows are buffered in a list and appended to reviews_df once, instead of
# re-copying the whole frame with pd.concat for every review.
scraped_rows = []
try:
    for page in range("YOUR_START_PAGE", "YOUR_END_PAGE"):
        # scrape_website reads the module-level `headers`, so reassigning it here
        # also changes the User-Agent used for the product-page requests.
        headers = {"User-Agent": ua.random}
        print(page)
        webpage_url = 'YOUR_LINK' + f'&p={page}'
        response = requests.get(webpage_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        reviews = soup.find_all("div", class_="reviews-info__item content")
        for review in reviews:
            try:
                scraped_rows.append(dict(zip(review_columns, scrape_website(review))))
            except Exception as e:
                # A review that cannot be scraped is reported and skipped.
                print(e)
        # Random pause between listing pages.
        time.sleep(random.randint(3, 7))
finally:
    # Runs even when the cell is interrupted by hand, so the rows collected so
    # far are not lost.
    if scraped_rows:
        reviews_df = pd.concat([reviews_df, pd.DataFrame(scraped_rows)], ignore_index=True)

### Statistic of web scraping
* 1-14 before error message
* 15-28 before error message
* 29-42 before error message
* 43-56 before error message
* 57-68 before error message
* 69-80 before error message

### Finally, change the name of the .csv file and it will be saved successfully

In [None]:
# Write the collected reviews to a CSV file without the index column.
# "utf-8-sig" prefixes the file with a UTF-8 byte-order mark, and
# csv.QUOTE_ALL wraps every field in quotes (the JSON columns are produced
# with indent=4, so they contain commas and line breaks).
reviews_df.to_csv("YOUR_CSV.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

# How i process each page (Only reviews)

### Everything is the same as it was but without info about item because it was the reason why we can't get data after 14 pages collecting our data

In [8]:
pattern = r"Сподобалось:\s*(.*?)\s*Не сподобалося:\s*(.*?)\s*Досвід використання:\s*(.*)"
def parse_only_reviews(review):
    """Extract the review fields that are present on the listing page itself.

    No product page is requested.

    Parameters
    ----------
    review : element of the reviews listing (a BeautifulSoup tag) that contains
        the product title link, the recommendation badge, the
        liked/disliked/experience row and the free-text comment.

    Returns
    -------
    tuple
        ``(name, ID, link, recommend, liked, disliked, experience, comment)``.
        ``ID`` is the MD5 hex digest of the product name. ``recommend`` and
        ``comment`` are ``np.nan`` when missing.

    Raises
    ------
    ValueError
        If the product title is empty or the liked/disliked/experience text does
        not match ``pattern``. Callers catch ``Exception`` and skip the review.
    """
    title_tag = review.find(class_="reviews-info-product__title")
    name = title_tag.text.strip()
    if not name:
        raise ValueError("Product title is empty.")

    ID = hashlib.md5(name.encode()).hexdigest()
    link = "https://hotline.ua" + title_tag["href"]

    recommendation = review.find("div", class_=["review__recommend review__recommend--like",
                                                "review__recommend review__recommend--dislike"])
    if recommendation is not None:
        recommend = recommendation.text.strip()
    else:
        # A missing badge is recorded as NaN (as scrape_website does) rather
        # than leaving `recommend` unbound and crashing at the return statement.
        print("Recommend not found")
        recommend = np.nan

    review_text = review.find("div", class_="review__row-experience").text.strip().replace("\n", ".")
    match = re.search(pattern, review_text, re.DOTALL)
    if not match:
        # Explicit error instead of an accidental UnboundLocalError later on.
        raise ValueError("Не вдалося розпарсити текст.")
    liked, disliked, experience = (clean_text(part.strip()) for part in match.groups())

    comment = review.find("div", class_="review__row-comment").text.strip() or np.nan
    return name, ID, link, recommend, liked, disliked, experience, comment

In [9]:
# Empty frame for the reviews-only dataset; the column order matches the
# tuple returned by parse_only_reviews.
only_reviews = pd.DataFrame(
    columns=[
        "name",
        "ID",
        "link",
        "recommend",
        "liked",
        "disliked",
        "experience",
        "comment",
    ]
)

### To run cell below you need to choose page you want to start from:
  - for page in range( "YOUR_START_PAGE", "YOUR_END_PAGE")
  - "YOUR_LINK" should be in this format from Hotline: https://hotline.ua/ua/reviews/products/. You can change categories that are on the right.

In [10]:
from fake_useragent import UserAgent
import os
import random
from urllib.parse import urlparse, parse_qs
ua = UserAgent()
def parse_reviews_url_list(url_list):
    """Scrape the reviews of every listing URL and save one CSV file per URL.

    For each URL the pagination block is read to find the first and last page,
    every page is requested with a random User-Agent, each review is parsed
    with ``parse_only_reviews`` and the result is written to
    ``new_parse/vendor_id_<vendor_id>.csv``.

    Parameters
    ----------
    url_list : iterable of str
        Listing URLs. The ``vendor_id`` query parameter (default ``"tablets"``)
        names the output file, so URLs that share a ``vendor_id`` overwrite
        each other's file.

    Returns
    -------
    None
    """
    columns = ["name", "ID", "link", "recommend", "liked", "disliked", "experience", "comment"]
    for url in url_list:
        headers_pages = {"User-Agent": ua.random}
        response_pages = requests.get(url, headers=headers_pages)
        soup_pages = BeautifulSoup(response_pages.text, "html.parser")

        # Keep only the numeric pager labels; "", "..." and any other
        # non-numeric label are ignored instead of crashing int().
        page_labels = (tag.text.strip() for tag in soup_pages.find_all(class_="page"))
        page_numbers = [int(label) for label in page_labels if label.isdecimal()]
        if page_numbers:
            first_page, last_page = min(page_numbers), max(page_numbers)
        else:
            # No pagination block found: treat the listing as a single page
            # rather than failing on min() of an empty list.
            first_page = last_page = 1

        # Append the page parameter with "?" when the URL has no query string yet.
        separator = "&" if urlparse(url).query else "?"

        # Rows are collected in a list and turned into a DataFrame once,
        # instead of re-copying the frame with pd.concat for every review.
        rows = []
        for page in range(first_page, last_page + 1):
            headers = {"User-Agent": ua.random}
            print(page)
            webpage_url = f"{url}{separator}p={page}"
            response = requests.get(webpage_url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")
            reviews = soup.find_all("div", class_="reviews-info__item content")
            for review in reviews:
                try:
                    rows.append(dict(zip(columns, parse_only_reviews(review))))
                except Exception as e:
                    # A review that cannot be parsed is reported and skipped.
                    print(e)
            # Random pause between pages.
            time.sleep(random.randint(5, 21))

        only_reviews = pd.DataFrame(rows, columns=columns)
        query = urlparse(url).query
        params = parse_qs(query)
        vendor_id = params.get("vendor_id", ["tablets"])[0] #Change default if u need
        # Create the output folder first so a missing directory cannot discard
        # the scraped data at the very last step.
        os.makedirs("new_parse", exist_ok=True)
        only_reviews.to_csv(f"new_parse/vendor_id_{vendor_id}.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

<p style="color: red; font-size: 25px"><b>The link should have at least 2 pages of reviews to work properly!!!</b></p>

In [None]:
url_list = [] # Your url list
parse_reviews_url_list(url_list)