In [None]:
import re
from pathlib import Path
from typing import Dict
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Base URL of the site to scrape; passed to save_to_csv in the last cell.
url = "http://books.toscrape.com/"
# Output location for the generated CSV files; also passed to save_to_csv.
# NOTE(review): this is a machine-specific absolute path, so the notebook will
# not run unchanged on another machine — consider a relative/configurable path.
out_path = '/Users/saraevsviatoslav/Documents/FORMATION_DATA_ANALYST/'

In [None]:
class Book:
    """Plain record holding the fields scraped for a single book.

    Attributes are stored exactly as given, in this order: ``title``,
    ``category``, ``price``, ``note``, ``en_stock``, ``count_in_stock``,
    ``description``. The instance ``__dict__`` therefore maps each field name
    to its value and can be used directly as a table row.
    """

    def __init__(self, title, category, price, note, en_stock, count_in_stock, description):
        # Paired assignments run left to right, so attribute insertion order
        # in ``__dict__`` matches the parameter order.
        self.title, self.category = title, category
        self.price, self.note = price, note
        self.en_stock, self.count_in_stock = en_stock, count_in_stock
        self.description = description




In [None]:
def _fetch_soup(session, page_url, timeout=30):
    """Download ``page_url`` with ``session`` and return it parsed by lxml.

    The body is always decoded from the raw bytes as UTF-8 so that listing and
    product pages are handled the same way and no latin-1 round-trip is needed.
    Raises ``requests.HTTPError`` on a non-2xx response instead of parsing an
    error page.
    """
    response = session.get(page_url, timeout=timeout)
    response.raise_for_status()
    html = response.content.decode('utf-8', errors='ignore')
    return BeautifulSoup(html, 'lxml')


def _parse_listing_entry(book, note_to_int):
    """Return ``(href, title, price, note, en_stock)`` for one listing tile."""
    anchor = book.find('h3').find('a')

    # The star rating is encoded as the second CSS class of the <p> that
    # follows the image container (e.g. a class word such as "Three").
    rating_tag = book.find('div', class_='image_container').find_next_sibling('p')
    note = note_to_int[rating_tag['class'][1]]

    price_text = book.find('p', class_='price_color').get_text(strip=True)
    price = float(re.findall(r"\d+(?:\.\d+)?", price_text)[0])

    icon_classes = book.find('p', class_='instock availability').find('i')['class']
    en_stock = icon_classes == ["icon-ok"]

    return anchor['href'], anchor['title'], price, note, en_stock


def _parse_detail_page(detail_soup):
    """Return ``(category, count_in_stock, description)`` from a product page.

    ``count_in_stock`` is 0 when the availability text contains no number and
    ``description`` is ``None`` when the page has no description block.
    """
    main_block = detail_soup.find('div', class_='col-sm-6 product_main')
    stock_text = main_block.find('p', class_='instock availability').get_text(strip=True)
    digits = re.findall(r'\d+', stock_text)
    count_in_stock = int(digits[0]) if digits else 0

    # Reset per book: otherwise a product without a description would either
    # raise UnboundLocalError or inherit the previous book's text.
    description = None
    desc_header = detail_soup.find('div', id='product_description')
    if desc_header:
        desc_tag = desc_header.find_next_sibling('p')
        if desc_tag:
            description = desc_tag.get_text(strip=True)

    # The third breadcrumb item is used as the category.
    breadcrumb = detail_soup.select('ul.breadcrumb li')
    category = breadcrumb[2].get_text(strip=True)

    return category, count_in_stock, description


def scrap_books(url: str, max_pages: int = 50) -> Dict[str, pd.DataFrame]:
    """Scrape the paginated book listing starting at ``url``.

    For every book tile on each listing page, the linked product page is also
    downloaded to read the category, the number of copies in stock and the
    description. Pagination follows the ``li.next`` link until it disappears
    or ``max_pages`` listing pages have been visited.

    Args:
        url: Address of the first listing page.
        max_pages: Upper bound on the number of listing pages to visit.

    Returns:
        A dict with three DataFrames that share the same row index:
        ``'books'`` (title, category, description), ``'shop'`` (price, note)
        and ``'stock'`` (en_stock, count_in_stock). The frames are empty, but
        still have their columns, when nothing was scraped.

    Raises:
        requests.RequestException: On network failures, timeouts or non-2xx
            HTTP responses.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    }
    note_to_int = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    all_books = []
    current_url = url

    # One session reuses the connection across the many product-page requests.
    with requests.Session() as session:
        session.headers.update(headers)

        for _ in tqdm(range(max_pages)):
            soup = _fetch_soup(session, current_url)
            tiles = soup.find_all('li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3')

            for book in tiles:
                href, title, price, note, en_stock = _parse_listing_entry(book, note_to_int)

                # Links are relative to the page they appear on, so resolve
                # them against the current listing URL.
                detail_soup = _fetch_soup(session, urljoin(current_url, href))
                category, count_in_stock, description = _parse_detail_page(detail_soup)

                all_books.append(Book(
                    title=title,
                    category=category,
                    price=price,
                    note=note,
                    en_stock=en_stock,
                    count_in_stock=count_in_stock,
                    description=description,
                ))

            next_item = soup.find('li', class_='next')
            if not next_item:
                break
            current_url = urljoin(current_url, next_item.find('a')['href'])

    # Explicit columns keep the selections below valid even with zero rows.
    columns = ['title', 'category', 'price', 'note', 'en_stock', 'count_in_stock', 'description']
    df_scrapped = pd.DataFrame([book.__dict__ for book in all_books], columns=columns)

    return {
        'books': df_scrapped[['title', 'category', 'description']],
        'shop': df_scrapped[['price', 'note']],
        'stock': df_scrapped[['en_stock', 'count_in_stock']],
    }




In [None]:
def save_to_csv(url: str, out_path: str) -> None:
    """Scrape ``url`` and write each resulting DataFrame to ``<out_path>/<name>.csv``.

    Args:
        url: Address of the first listing page, forwarded to ``scrap_books``.
        out_path: Directory that receives the CSV files. A trailing path
            separator is optional; the directory is created if it is missing.

    The DataFrame index is written to each file (``to_csv`` default).
    """
    out_dir = Path(out_path)
    # Create the directory before scraping so a bad output location fails
    # immediately rather than after all pages have been downloaded.
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, df in scrap_books(url).items():
        df.to_csv(out_dir / f'{name}.csv')


In [None]:
# Run the scrape starting at `url` and write the resulting CSV files using `out_path`.
save_to_csv(url, out_path)