In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import fake_useragent
import re
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/bs4 — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Shared user-agent generator; get_url and get_item read `ua.random` for the
# 'user-agent' header of each page request they send.
ua = fake_useragent.UserAgent()

In [2]:
def get_url(category, tot_pages):
    """Yield the absolute URL of every product listed in a catalog category.

    Walks listing pages 1..tot_pages of
    ``https://www.domsporta.com/catalog/<category>/`` and yields one product
    URL for each ``b-catalog__item-info`` element that contains a link.

    Args:
        category: Catalog slug inserted into the listing URL (e.g. 'steppery').
        tot_pages: Number of listing pages to walk, starting from page 1.

    Yields:
        str: Absolute product page URL.

    Raises:
        requests.RequestException: If a listing page cannot be fetched,
            including when the server stays silent longer than the timeout.

    Note:
        Reads the module-level names ``ua`` and ``log_frequency`` while the
        generator is being consumed, so both must be defined by then.
    """
    base_url = 'https://www.domsporta.com'

    for page in range(1, tot_pages + 1):
        if page % log_frequency == 0:
            print(f'Log: working over page {page}...')

        url_main = f'{base_url}/catalog/{category}/?PAGEN_1={page}'
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(url_main,
                                headers={'user-agent': ua.random},
                                timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        for item in soup.find_all(class_='b-catalog__item-info'):
            link = item.find('a')
            # A card without a usable link is skipped instead of raising
            # AttributeError/KeyError and aborting the whole run.
            if link is None or not link.get('href'):
                continue
            yield base_url + link['href']

In [3]:
def get_item(category, tot_pages):
    """Scrape every product page of a category and yield one record per product.

    For each URL produced by ``get_url(category, tot_pages)`` the product page
    is downloaded and parsed. The first ``<img>`` whose ``alt`` text contains
    the product title is also downloaded to ``images/<article>.jpeg``.

    Args:
        category: Catalog slug, forwarded to ``get_url``.
        tot_pages: Number of listing pages, forwarded to ``get_url``.

    Yields:
        tuple: ``(title, article, price, cat_1, cat_2, cat_3, url,
        description, characteristics)``. Every element is a string; a field
        that could not be extracted is ``''``. ``url`` is the image URL, or
        ``''`` when no image was found or saving it failed.

    Raises:
        requests.RequestException: If a product page cannot be fetched,
            including when the server stays silent longer than the timeout.

    Note:
        The ``images/`` directory must already exist; otherwise saving fails,
        the failure is logged as "no image", and ``url`` is ``''``.
    """
    base_url = 'https://www.domsporta.com'

    for url_item in get_url(category, tot_pages):
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(url_item,
                                headers={'user-agent': ua.random},
                                timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        # `except Exception` (not a bare `except`) keeps the best-effort
        # behaviour but still lets Ctrl-C interrupt a long scrape.
        try:
            title = soup.find('h1').text
        except Exception:
            title = ''

        try:
            article = soup.find(class_='b-detail-wide__top-info-wrap -product-code').text
            article = article.replace(' ', '')
            article = article.replace('\nКодтовара:', '')
        except Exception:
            article = ''

        # Reset for every product: previously `url` kept the value from the
        # preceding product (or was undefined) when no image matched.
        url = ''
        try:
            for im in soup.find_all('img', alt=True):
                if re.search(re.escape(title), im.attrs['alt']):
                    url = base_url + im.attrs['src']
                    # Download first, then open the file, so a failed request
                    # does not leave an empty .jpeg behind.
                    r = requests.get(url, timeout=30)
                    with open(f'images/{article}.jpeg', 'wb') as f:
                        f.write(r.content)
                    break
            else:
                # Loop finished without a matching image.
                print(f'Log: no image for {article} found...')
        except Exception:
            print(f'Log: no image for {article} found...')
            url = ''

        description = []
        try:
            for child in soup.find(class_='b-detail__tab-description').children:
                text = child.text.replace('\n', '')
                text = text.replace('ОСНОВНАЯ ИНФОРМАЦИЯ', '')
                if text != '':
                    description.append(text.replace('\xa0', '').strip())
        except Exception:
            description.append('')
        description_fin = ' '.join(map(str, description))

        characteristics = []
        try:
            for ch in soup.find(class_='b-detail__characteristics').children:
                # NOTE(review): '\n?' is replaced as a literal three-character
                # sequence, not as a regex — confirm that is the intent.
                text = ch.text.replace('\n?', '')
                text = text.replace('Общие свойства', '')
                text = text.replace('\n\n', ':').strip()
                text = text.replace(':\n', ' ')
                if text != '':
                    characteristics.append(text.strip())
        except Exception:
            characteristics.append('')
        characteristics_fin = ' '.join(map(str, characteristics))

        try:
            price = soup.find(class_='b-detail__price').text
            price = price.replace('руб.', '')
            price = price.replace(' ', '')
            price = price.replace('\n', '')
        except Exception:
            price = ''

        # Breadcrumb links are looked up fresh for each product; with no
        # breadcrumbs all three levels are '' rather than being read from the
        # previous product's links.
        try:
            cat = soup.find(class_='breadcrumbs').find_all('a')
        except Exception:
            cat = []
        # Index 0 is skipped, exactly as before; levels 1-3 are kept.
        cat_1, cat_2, cat_3 = (cat[i].text if len(cat) > i else '' for i in (1, 2, 3))

        yield title, article, price, cat_1, cat_2, cat_3, url, description_fin, characteristics_fin

In [4]:
# Reporting cadence: to_csv writes a CSV checkpoint every `save_frequency`
# scraped items, get_url prints a log line every `log_frequency` listing pages.
log_frequency = 5
save_frequency = 50

# Output schema, in the same order as the tuple yielded by get_item.
# The spelling 'caracteristics' is kept as-is because it becomes the CSV header.
df_columns = [
    'title', 'article', 'price',
    'cat_1', 'cat_2', 'cat_3',
    'url', 'description', 'caracteristics',
]

# Empty accumulator that to_csv appends one row per scraped product to.
df = pd.DataFrame(columns=df_columns)

In [5]:
def to_csv(category, tot_pages):
    """Scrape one category into the global ``df`` and save it to 'domsporta.csv'.

    Every record yielded by ``get_item(category, tot_pages)`` is appended as a
    new row of the module-level ``df``. The whole frame is written to
    'domsporta.csv' after every ``save_frequency`` items and once more when
    the category is finished.

    Args:
        category: Catalog slug, forwarded to ``get_item``.
        tot_pages: Number of listing pages, forwarded to ``get_item``.
    """
    counter = 0  # stays 0 when the category yields nothing

    for counter, item in enumerate(get_item(category, tot_pages), start=1):
        # Keep exactly one value per declared column, in column order.
        row = [item[pos] for pos in range(len(df_columns))]
        df.loc[len(df.index)] = row

        # Periodic checkpoint so an interrupted run keeps most of its data.
        if counter % save_frequency == 0:
            df.to_csv('domsporta.csv')
            print(f'{counter} items saved in csv..')

    df.to_csv('domsporta.csv')
    print(f'\nTotal items saved: {counter}')

In [6]:
# Scrape listing pages 1-4 of the 'steppery' category; rows are appended to
# the shared `df` and written to domsporta.csv.
to_csv(category='steppery', tot_pages=4)

50 items saved in csv..

Total items saved: 71


In [7]:
# Scrape listing pages 1-27 of the 'begovye_dorozhki' category; rows are added
# to the same shared `df`, so the CSV now also holds rows from earlier calls.
to_csv(category='begovye_dorozhki', tot_pages=27)

50 items saved in csv..
Log: working over page 5...
100 items saved in csv..
150 items saved in csv..
Log: working over page 10...
200 items saved in csv..
250 items saved in csv..
Log: working over page 15...
300 items saved in csv..
350 items saved in csv..
Log: working over page 20...
400 items saved in csv..
450 items saved in csv..
Log: working over page 25...
500 items saved in csv..

Total items saved: 519


In [8]:
# Completeness check: for each key field, list the row labels of `df` whose
# scraped value came back as an empty string.
for field in ('article', 'title', 'price', 'description'):
    empty_rows = df.index[df[field] == '']
    print(f"Items with no {field}: {list(empty_rows)}")

Items with no article: [82, 105, 146, 186, 223, 350, 500]
Items with no title: []
Items with no price: [0, 68, 69, 70, 82, 105, 146, 186, 223, 295, 350, 500, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589]
Items with no description: [82, 105, 146, 186, 223, 350, 500]


In [9]:
#def show_image(val):
    #return '<a href="{}"><img src="{}" width=10000></img></a>'.format(val, val)
#df.iloc[58:60].style.format({'url': show_image, **{'width': '100px'}})

