In [1]:
import os
import re
import warnings

import fake_useragent
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Suppress every Python warning from here on (blanket "ignore" filter).
warnings.filterwarnings("ignore")

# Shared user-agent generator; its `.random` value is sent as the
# "user-agent" header of the catalog and product page requests.
ua = fake_useragent.UserAgent()

In [2]:
# to_csv() rewrites the CSV file each time the item counter is a multiple of this.
save_frequency = 50
# get_url() prints a progress line for every catalog page number divisible by this.
log_frequency = 5

In [3]:
def get_url(category, tot_pages):
    """Yield the absolute URL of every product listed in a catalog category.

    Catalog pages 1..tot_pages are fetched in order; a progress line is
    printed for every page number divisible by ``log_frequency``.

    Args:
        category: Category slug inserted into the catalog URL path.
        tot_pages: Number of catalog pages to walk (inclusive upper bound).

    Yields:
        str: Site base URL concatenated with the ``href`` of the first link
        found inside each ``b-catalog__item-info`` element.

    Raises:
        requests.RequestException: If a catalog page cannot be fetched,
            including when the request exceeds the timeout.
    """
    base_url = "https://www.domsporta.com"
    # Without a timeout a stalled connection would block the scrape forever.
    timeout_seconds = 30

    for page in range(1, tot_pages + 1):
        if page % log_frequency == 0:
            print(f"Log: working over page {page}...")

        url_main = f"{base_url}/catalog/{category}/?PAGEN_1={page}"
        response = requests.get(
            url_main,
            headers={"user-agent": ua.random},
            timeout=timeout_seconds,
        )
        soup = BeautifulSoup(response.content, "lxml")

        for item in soup.find_all(class_="b-catalog__item-info"):
            link = item.find("a")
            href = link.get("href") if link is not None else None
            if not href:
                # A card without a usable link is skipped instead of
                # aborting the whole generator with AttributeError/KeyError.
                continue
            yield base_url + href

In [4]:
def get_item(category, tot_pages):
    """Scrape each product page of a category and yield one record per product.

    Product URLs come from ``get_url(category, tot_pages)``. Every field is
    extracted on a best-effort basis: when its element is missing from the
    page the field is an empty string. All per-item state is reset on each
    iteration, so a missing image or breadcrumb never inherits values from
    the previously scraped product.

    Args:
        category: Category slug; also used as the sub-directory of
            ``images/`` in which product images are stored.
        tot_pages: Number of catalog pages to walk.

    Yields:
        tuple: ``(title, article, price, cat_1, cat_2, cat_3, url,
        description, characteristics, image_ref)`` where ``url`` is the
        absolute URL of the downloaded image and ``image_ref`` the local
        path it was written to (both ``''`` when no image was saved).

    Raises:
        requests.RequestException: If a product page cannot be fetched,
            including when the request exceeds the timeout. Image download
            failures are logged and do not raise.
    """
    base_url = 'https://www.domsporta.com'
    # Without a timeout a stalled connection would block the scrape forever.
    timeout_seconds = 30

    count = 0
    for url_item in get_url(category, tot_pages):
        count += 1
        response = requests.get(
            url_item,
            headers={'user-agent': ua.random},
            timeout=timeout_seconds,
        )
        soup = BeautifulSoup(response.content, 'lxml')

        heading = soup.find('h1')
        title = heading.text if heading is not None else ''

        code_node = soup.find(class_='b-detail-wide__top-info-wrap -product-code')
        if code_node is not None:
            article = code_node.text.replace(' ', '').replace('\nКодтовара:', '')
        else:
            article = ''

        # Reset per item so nothing leaks over from the previous product.
        url = ''
        image_ref = ''
        # First <img> whose alt text contains the title and that has a src.
        # (An empty title is a substring of every alt text.)
        image_src = next(
            (
                im.attrs['src']
                for im in soup.find_all('img', alt=True)
                if title in im.attrs['alt'] and im.get('src')
            ),
            None,
        )
        if image_src is None:
            print(f'Log: no image for {article} found...')
        else:
            image_url = base_url + image_src
            image_path = f'images/{category}/{count}_domsporta_{article}.jpeg'
            try:
                # Download before opening the file so that a failed request
                # does not leave an empty .jpeg behind.
                r = requests.get(image_url, timeout=timeout_seconds)
                r.raise_for_status()
                os.makedirs(os.path.dirname(image_path), exist_ok=True)
                with open(image_path, 'wb') as f:
                    f.write(r.content)
            except (requests.RequestException, OSError) as exc:
                print(f'Log: image download failed for {article}: {exc}')
            else:
                url = image_url
                image_ref = image_path

        description = []
        try:
            for child in soup.find(class_='b-detail__tab-description').children:
                text = child.text.replace('\n', '')
                text = text.replace('ОСНОВНАЯ ИНФОРМАЦИЯ', '')
                if text != '':
                    description.append(text.replace('\xa0', '').strip())
        except AttributeError:
            # Description block missing (find() returned None) or a child
            # node without a .text attribute.
            description.append('')
        description_fin = ' '.join(description)

        characteristics = []
        try:
            for ch in soup.find(class_='b-detail__characteristics').children:
                # NOTE(review): '\n?' is removed as a literal three-character
                # sequence, not as a regex - confirm that this is intended.
                text = ch.text.replace('\n?', '')
                text = text.replace('Общие свойства', '')
                text = text.replace('\n\n', ':').strip()
                text = text.replace(':\n', ' ')
                if text != '':
                    characteristics.append(text.strip())
        except AttributeError:
            # Characteristics block missing or a child node without .text.
            characteristics.append('')
        characteristics_fin = ' '.join(characteristics)

        price_node = soup.find(class_='b-detail__price')
        if price_node is not None:
            price = price_node.text.replace('руб.', '').replace(' ', '').replace('\n', '')
        else:
            price = ''

        # Breadcrumb links 1..3 are the category levels; a missing level is ''.
        crumbs_node = soup.find(class_='breadcrumbs')
        crumbs = crumbs_node.find_all('a') if crumbs_node is not None else []
        cat_1, cat_2, cat_3 = (
            crumbs[level].text if level < len(crumbs) else ''
            for level in (1, 2, 3)
        )

        yield (title, article, price, cat_1, cat_2, cat_3, url,
               description_fin, characteristics_fin, image_ref)

In [5]:
# Column order matches the tuple yielded by get_item(); to_csv() copies the
# fields by position.
# NOTE(review): 'caracteristics' looks like a misspelling of 'characteristics',
# but it is the column name written to the CSV - confirm before renaming.
df_columns = ['title', 'article','price', 'cat_1', 'cat_2', 'cat_3','url', 
              'description', 'caracteristics', 'img_ref']

# Module-level frame that to_csv() appends rows to in place.
df = pd.DataFrame(columns = df_columns)

In [6]:
import time 

def to_csv(category, tot_pages):
    """Scrape a category and persist the collected rows to a CSV file.

    Each record produced by ``get_item`` is appended to the module-level
    ``df``; the frame is written to ``<category>_domsporta.csv`` after every
    ``save_frequency`` items and once more when the scrape is finished.

    Args:
        category: Category slug passed to ``get_item`` and used as the CSV
            file name prefix.
        tot_pages: Number of catalog pages to scrape.
    """
    csv_path = f"{category}_domsporta.csv"
    n_fields = len(df_columns)
    saved = 0

    for saved, record in enumerate(get_item(category, tot_pages), start=1):
        # Copy the record field by field into the next free row of df.
        df.loc[len(df.index)] = [record[pos] for pos in range(n_fields)]
        # Short pause between consecutive product requests.
        time.sleep(0.1)

        if saved % save_frequency == 0:
            df.to_csv(csv_path)
            print(f"{saved} items saved in csv..")

    df.to_csv(csv_path)

    print(f'\nTotal items saved: {saved}')

In [7]:
# Scrape 45 catalog pages of this category; rows are written to
# "trenazhery_na_svobodnykh_vesakh_domsporta.csv".
to_csv("trenazhery_na_svobodnykh_vesakh", 45)

Log: working over page 5...
50 items saved in csv..
100 items saved in csv..
Log: working over page 10...
Log: working over page 15...
150 items saved in csv..
Log: working over page 20...
200 items saved in csv..
Log: working over page 25...
250 items saved in csv..
Log: working over page 30...
300 items saved in csv..
Log: working over page 35...
Log: working over page 40...
350 items saved in csv..
400 items saved in csv..
Log: working over page 45...

Total items saved: 420
