In [11]:
import requests
from bs4 import BeautifulSoup
import json
import os

from random import choice, randint
import time

## 1. Extracting all the attributes from each product page

- In this case we do not need selenium, as a simple request suffices
- We will only store the relevant attributes as json in the output

In [26]:
output_path = 'output'
output_folder = output_path

# Load json file from ikea_products.json which we will use as refference to know which pages to hit
with open(os.path.join(output_path, 'ikea_products.json'), encoding="utf-8") as f:
    target_products = json.load(f)

# Check if there is a file named full_product_extraction.json if not create it
# We will use this file to keep track of the categories we have already extracted
output_fname = os.path.join(output_path, 'full_product_extraction.json')
if not os.path.isfile(output_fname):
    with open(output_fname, 'w', encoding="utf-8") as f:
        json.dump([], f)
        extracted_products = []
else:
    with open(output_fname, encoding="utf-8") as f:
        extracted_products = json.load(f)

previously_extracted_urls_fname = os.path.join(output_path, 'extracted_product_urls.json')
if not os.path.isfile(previously_extracted_urls_fname):
    with open(previously_extracted_urls_fname, 'w', encoding="utf-8") as f:
        json.dump([], f)
        previously_extracted_urls = []
else:
    with open(previously_extracted_urls_fname, encoding="utf-8") as f:
        previously_extracted_urls = json.load(f)

# Do the same for call_record.json
if not os.path.isfile(os.path.join(output_path, 'request_records.json')):
    with open(os.path.join(output_path, 'request_records.json'), 'w', encoding="utf-8") as f:
        json.dump([], f)
        call_record = []
else:
    with open(os.path.join(output_path, 'request_records.json'), encoding="utf-8") as f:
        call_record = json.load(f)

In [13]:
def read_json_file(file_path):
    with open(file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)
    return data

def write_json_file(file_path, data):
    with open(file_path, 'w', encoding="utf-8") as file:
        json.dump(data, file, indent=4)

def append_to_json_list(file_path, new_elements):
    # Read the existing data
    data = read_json_file(file_path)
    
    # Check if the data is a list
    if isinstance(data, list):
        # Append new elements to the list
        data.extend(new_elements)
    else:
        raise ValueError("JSON data is not a list")
    
    # Write the updated list back to the file
    write_json_file(file_path, data)

In [6]:
# Global variable to store log entries
log_entries = []
unsuccessful_request_count = 0

def log_action(action, result, elapsed_time=None):
    global log_entries, unsuccessful_request_count  # Declare the global variables
    
    log_entry = {
        "timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
        "action": action,
        "result": result,
        "time_taken": f"{elapsed_time:.2f}" if elapsed_time is not None else "N/A"
    }
    log_entries.append(log_entry)  # Append the log entry to the global list

    # Update the unsuccessful request count
    if result.lower() != "success":
        unsuccessful_request_count += 1
    else:
        unsuccessful_request_count = 0
    
    return log_entry

def get_unsuccessful_request_count():
    global unsuccessful_request_count  # Declare the global variable
    return unsuccessful_request_count

In [46]:
def extract_product_attributes(product, cookie, headers):
    """
    Extract product attributes from the product page and return the results as a dictionary
    Only extract one single attribute per function call
    Inputs:
        product_url: Dictionary of the product with keys url, name and category
        cookie: Cookie to be used in the request, should have been previously obtained. It is a good idea to use the same cookie for all requests with same headers
        headers: Headers to be used in the request
    Outputs:
        product_attributes: Dictionary containing the extracted product attributes
    """

    action = "Request to individual product page"
    url = product["url"]
    # Make the GET request
    # We may need to remove headers, or iterate using different profiles
    start_time = time.time()
    response = requests.get(url, headers=headers, cookies=cookie)
    elapsed_time = time.time() - start_time


    # Check the response
    if response.status_code == 200:
        log_action(action, "Success", elapsed_time)

        soup = BeautifulSoup(response.content, 'html.parser')
        # extracting JSON object from a script tag within the HTML
        product_div = soup.find('div', class_='pip-product__subgrid product-pip js-product-pip')
        if product_div:
            # Extract the 'data-hydration-props' attribute, which contains JSON
            json_data_attr = product_div['data-hydration-props']
            
            # Convert HTML entities and escaped quotes to normal form
            json_data_attr = json_data_attr.replace('&quot;', '"')
            
            # Load string into a JSON object
            data = json.loads(json_data_attr)

            price_dict = data["pipPriceModule"]["price"]["mainPriceProps"]["price"]
            integer_price = price_dict["integer"].replace(".", "")
            product_price = f"{integer_price}.{price_dict['decimals']}"
            # Convert price to float
            product_price = float(product_price)
            currency = data["pipPriceModule"]["price"]["mainPriceProps"]["currencySymbol"]
            # We get product dimensions for the full product
            product_measurement_text = data["pipPriceModule"]["measurementText"]

            product_dimensions_json = data["productInformationSection"]["dimensionProps"]["packaging"]["contentProps"]

            product_dictionary = product_dimensions_json
            product_dictionary["price"] = product_price
            product_dictionary["currency"] = currency
            product_dictionary["measurement_ensembled_text"] = product_measurement_text
            product_dictionary["url"] = url
            product_dictionary["name"] = product["name"]
            product_dictionary["category"] = product["category"]
            
            return product_dictionary
            
    else:
        log_action(action, "Failed", elapsed_time)
        print("Failed to retrieve the page, status code:", response.status_code)



In [8]:
cookies = [
    {
        'guest': 'eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6ImVxSFFLR3duR3hfV3dJZkx0RGpaeDA5MTUzS2xSam5fVE1nVUlMYlJ5RncifQ.eyJpc3MiOiJodHRwczovL2FwaS5pbmdrYS5pa2VhLmNvbS9ndWVzdCIsInN1YiI6ImIwMDJmYTBkLTkwMjEtNGQ2My04YzlkLTJhZDNlZjM0ZjE0YiIsInJldGFpbFVuaXQiOiJlcyIsImlhdCI6MTcxNTQ1MTY0MCwiZXhwIjoxNzE4MDQzNjQwfQ.NS0sPhnYbArE-750pTzE4_5I6wsCRgZxlKa0Cfyf0Z4bKi5jsfvcFWRS88jz6d0O1z6wLmfN5XN0tseCMzouaeLW_jmxGQ5qQF1_9J9uJny7R3t37Ku_lT41Psbu1ymQra2cLOZWwbungu41bNClWd3p3k4NaOXWx2fcivZrFUo'
    }
]
headers = [
    {
        'Host': 'www.ikea.com',
        'Sec-Ch-Ua': '"Not-A.Brand";v="99", "Chromium";v="124"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Connection': 'close'
    }
]

In [43]:
target_products = [product for product in target_products if product["url"] not in previously_extracted_urls]

In [47]:
product_attributes = extract_product_attributes(product, cookie, header)

In [45]:
full_product_extraction = []
max_n_profile_uses = 100
for idx, target_product in enumerate(target_products):
    sleep_time = randint(1, 3)
    if (idx+1) % max_n_profile_uses == 0:
        cookie = choice(cookies)
        header = choice(headers)
        append_to_json_list(output_fname, full_product_extraction)
        full_product_extraction = []
        append_to_json_list(os.path.join(output_path, 'request_records.json'), log_entries)
        log_entries = []

    product = target_product
    product_attributes = extract_product_attributes(product, cookie, header)
    print(f"extracted product {idx+1} out of {len(target_products)}")

    time.sleep(sleep_time)

    full_product_extraction.append(product_attributes)
    previously_extracted_urls.append(target_product["url"])

    if unsuccessful_request_count > 4:
        break

extracted product 1 out of 8103
extracted product 2 out of 8103
extracted product 3 out of 8103
extracted product 4 out of 8103
extracted product 5 out of 8103
extracted product 6 out of 8103
extracted product 7 out of 8103
extracted product 8 out of 8103
extracted product 9 out of 8103
extracted product 10 out of 8103
extracted product 11 out of 8103
extracted product 12 out of 8103
extracted product 13 out of 8103
extracted product 14 out of 8103
extracted product 15 out of 8103
extracted product 16 out of 8103
extracted product 17 out of 8103
extracted product 18 out of 8103
extracted product 19 out of 8103
extracted product 20 out of 8103
extracted product 21 out of 8103
extracted product 22 out of 8103
extracted product 23 out of 8103
extracted product 24 out of 8103
extracted product 25 out of 8103
extracted product 26 out of 8103
extracted product 27 out of 8103
extracted product 28 out of 8103
extracted product 29 out of 8103
extracted product 30 out of 8103
extracted product 3

ValueError: could not convert string to float: '1.045.'

In [51]:
append_to_json_list(output_fname, full_product_extraction)
append_to_json_list(os.path.join(output_path, 'request_records.json'), log_entries)
write_json_file(previously_extracted_urls_fname, previously_extracted_urls)
full_product_extraction = []
log_entries = []