# Ceneo Scraper

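| Component | Selector | Variable |
| --------- | -------- | -------- |
| opinion ID | `data-entry-id` attribute of the opinion element | `opinion_id` |
| opinion’s author | `span.user-post__author-name` | `author` |
| author’s recommendation | `span.user-post__author-recomendation > em` | `recommendation` |
| score expressed in number of stars | `span.user-post__score-count` | `score` |
| opinion’s content | `div.user-post__text` | `content` |
| list of product advantages | `div.review-feature__title--positives ~ div.review-feature__item` | `pros` |
| list of product disadvantages | `div.review-feature__title--negatives ~ div.review-feature__item` | `cons` |
| how many users think that opinion was helpful | `button.vote-yes > span` | `helpful` |
| how many users think that opinion was unhelpful | `button.vote-no > span` | `unhelpful` |
| publishing date | `span.user-post__published > time:nth-child(1)` (`datetime` attribute) | `publish_date` |
| purchase date | `span.user-post__published > time:nth-child(2)` (`datetime` attribute) | `purchase_date` |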

In [2]:
import requests
import os
import json
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

import subprocess

In [3]:
# Mac notifications

# AppleScript handler run by osascript: argv item 1 is the notification
# title, argv item 2 is the notification text.
CMD = '''
on run argv
  display notification (item 2 of argv) with title (item 1 of argv)
end run
'''


def notify(title, text):
    """Show a desktop notification through ``osascript``.

    The notification is best-effort: when ``osascript`` cannot be started
    (for example on a system that is not macOS) the message is printed
    instead, so the script does not crash on its final step.

    Args:
        title: Notification title.
        text: Notification body.
    """
    try:
        # The exit status is ignored, as a failed notification is harmless.
        subprocess.run(['osascript', '-e', CMD, title, text], check=False)
    except OSError:
        # osascript is not available here; fall back to the console.
        print(f"{title}: {text}")

In [4]:
# Product identifier typed by the user; it is appended to the Ceneo base URL
# and used as the output file name, so surrounding whitespace is removed.
product_id = input("Product ID: ").strip()

In [5]:
# Maps each output field of an opinion to the positional arguments that are
# unpacked into extract() after the opinion element:
#     [selector, attribute, return_list]
# Trailing items that are omitted fall back to extract()'s defaults
# (attribute=None -> take the text, return_list=False -> single value).
selectors = {
    # No selector: the attribute is read from the opinion element itself.
    "opinion_id": [None,"data-entry-id"],
    "author": ["span.user-post__author-name"],
    # NOTE(review): "recomendation" (single "m") is presumably the class name
    # used by the page markup itself - confirm before "fixing" the spelling.
    "recommendation": ["span.user-post__author-recomendation > em"],
    "score": ["span.user-post__score-count"],
    "content": ["div.user-post__text"],
    # Lists: every matching item's text is collected.
    "pros": ["div.review-feature__title--positives ~ div.review-feature__item", None, True],
    "cons": ["div.review-feature__title--negatives ~ div.review-feature__item", None, True],
    "helpful": ["button.vote-yes > span"],
    "unhelpful": ["button.vote-no > span"],
    # Dates are taken from the "datetime" attribute of the <time> elements.
    "publish_date": ["span.user-post__published > time:nth-child(1)","datetime"],
    "purchase_date": ["span.user-post__published > time:nth-child(2)","datetime"],
}

In [6]:
def translate(text, from_lang="auto", to_lang="en"):
    """Translate a string, or each string of a list, with GoogleTranslator.

    Args:
        text: A string or a list of strings. Any falsy value (``None``,
            ``""``, ``[]``) is not translated.
        from_lang: Source language passed to the translator.
        to_lang: Target language passed to the translator.

    Returns:
        ``None`` for falsy input; otherwise a dict with two entries:
        ``from_lang`` mapped to the original ``text`` and ``to_lang`` mapped
        to the translation (a list when ``text`` is a list). The keys are
        the language arguments exactly as given, so the default source key
        is ``"auto"``.
    """
    if not text:
        return None

    # One translator serves every item; there is no need to rebuild it for
    # each element of a list.
    translator = GoogleTranslator(source=from_lang, target=to_lang)

    if isinstance(text, list):
        translated = [translator.translate(item) for item in text]
    else:
        translated = translator.translate(text)

    return {
        from_lang: text,
        to_lang: translated,
    }

In [7]:
# mapping functions

def mapRecommendation(value):
    """Convert the recommendation label into a boolean.

    Returns ``True`` for ``"Polecam"``, ``False`` for ``"Nie polecam"`` and
    ``None`` for anything else.
    """
    known_labels = (("Polecam", True), ("Nie polecam", False))
    for label, verdict in known_labels:
        if value == label:
            return verdict
    return None

def mapScore(value):
    """Turn a ``"points/max"`` score string into a fraction.

    Decimal commas are accepted (``"4,5/5"``); the result is the points
    divided by the maximum, e.g. ``0.9``.
    """
    parts = value.replace(",", ".").split("/")
    return float(parts[0]) / float(parts[1])

# Post-processing applied to extracted opinion fields:
#     field name -> [function, per_item]
# When per_item is True the function is mapped over every element of the
# field's list; otherwise it is called once with the whole value. Fields
# whose extracted value is None are left untouched by the scraping loop.
mappingFunctions = {
    "recommendation": [mapRecommendation, False],
    "score": [mapScore, False],
    "helpful": [int, False],
    "unhelpful": [int, False],
    # Translation of the text fields is currently disabled:
    # "content": [translate, False],
    # "pros": [translate, True],
    # "cons": [translate, True],
}

In [8]:
def extract(ancestor, selector=None, attribute=None, return_list=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Args:
        ancestor: Element to search in (or to read from directly when no
            selector is given).
        selector: CSS selector evaluated relative to ``ancestor``. Required
            when ``return_list`` is true.
        attribute: Name of the attribute to read. When omitted, the
            stripped text content is returned instead.
        return_list: When true, collect a value from every element matching
            ``selector`` instead of only the first one.

    Returns:
        A single value, or a list of values when ``return_list`` is true.
        ``None`` is returned whenever nothing was found: no matching
        element, a missing attribute, or an empty list. A missing attribute
        used to raise ``KeyError`` while a missing element returned
        ``None``; both cases now yield ``None``.
    """
    if return_list:
        tags = ancestor.select(selector)
        if attribute:
            # Elements that lack the attribute are skipped instead of
            # aborting the whole extraction.
            values = (tag.get(attribute) for tag in tags)
            result = [value for value in values if value is not None]
        else:
            result = [tag.get_text().strip() for tag in tags]
        return result or None

    target = ancestor.select_one(selector) if selector else ancestor
    if target is None:
        return None

    if attribute:
        # .get() returns None for a missing attribute rather than raising.
        return target.get(attribute)
    return target.get_text().strip()

In [9]:
all_opinions = []
processed_opinions = 0
url = "https://www.ceneo.pl/" + product_id + "#tab-reviews"

# Walk through the review pages by following the "next page" link until a
# page without one is reached.
while url:
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(url, timeout=30)

    page_dom = BeautifulSoup(response.text, "html.parser")
    opinions = page_dom.select("div.js_product-review")

    for opinion in opinions:
        # Raw values for every configured field of this opinion.
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }

        # Convert the raw values; fields that were not found stay None.
        for key, (function, per_item) in mappingFunctions.items():
            raw_value = single_opinion.get(key)
            if raw_value is None:
                continue
            if per_item:
                single_opinion[key] = [function(item) for item in raw_value]
            else:
                single_opinion[key] = function(raw_value)

        all_opinions.append(single_opinion)
        processed_opinions += 1

    # No "next" link (or one without a target) means this was the last page.
    next_href = extract(page_dom, "a.pagination__next", "href")
    url = "https://www.ceneo.pl/" + next_href if next_href else None


# Saving to file

In [10]:
# Build the whole document before opening the file, so a failed lookup cannot
# leave an empty or half-written JSON file behind.
image_src = extract(page_dom, ".js_gallery-media", "src")
output = {
    "meta": {
        "name": extract(page_dom, ".product-top__product-info__name"),
        "product_id": product_id,
        "opinion_count": processed_opinions,
        # NOTE(review): the "https:" prefix presumably completes a
        # protocol-relative src ("//...") - confirm against the page markup.
        # None is stored when the page has no gallery image.
        "image": "https:" + image_src if image_src else None,
    },
    "content": all_opinions,
}

# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs("opinions", exist_ok=True)

# The context manager closes the file even if serialisation fails.
with open(f"opinions/{product_id}.json", "w", encoding="utf-8") as jf:
    json.dump(output, jf, indent=4, ensure_ascii=False)

# The reported number is the count of scraped opinions, not of products.
notify("Finished Scraping", f"Finished scraping product with ID {product_id}!\nOpinions processed: {processed_opinions}")