Собираю все ссылки на товары по каталогу с сайта pop-music.ru

In [70]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm

In [2]:
# Browser-like User-Agent string; passed to get_page_text() for the requests made in this notebook.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

def get_page_text(url, user_agent, error_log=True, timeout=30):
    """Download a page and return its HTML text, or None on failure.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-Agent`` request header.
        error_log: When true, print a diagnostic message if the request
            fails or the response status is not 200.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may wait indefinitely on a stalled
            connection, which would freeze a long crawl.

    Returns:
        The response body as text when the status code is 200,
        otherwise None.
    """
    headers = {'User-Agent': user_agent}

    try:
        response = requests.get(
            url,
            headers=headers,
            allow_redirects=True,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network-level failures (DNS, connection reset, timeout) are reported
        # the same way as a bad status code: log and return None.
        if error_log:
            print(f'Ошибка! {exc}')
            print(f'URL: {url}')
            print('--------------')
        return None

    if response.status_code == 200:
        return response.text

    if error_log:
        print(f'Ошибка! Response code: {response.status_code}')
        print(f'URL: {url}')
        print('--------------')

    return None

In [5]:
# Root URL of the site; later cells also use it as the prefix when building absolute links.
main_url = 'https://pop-music.ru/'
# HTML of the main page (get_page_text returns None if the status code is not 200).
page_text = get_page_text(main_url, USER_AGENT)

In [4]:
page_text[:100]

'<!DOCTYPE html>\n<html lang="ru">\n<head>\n    \n    <meta charset="utf-8">\n    <meta http-equiv="x-ua-c'

In [6]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(page_text, 'html.parser')

In [9]:
def clear_link(page_link):
    """Normalize a link so it can be appended to the site root URL.

    Removes one leading slash and drops everything from the first ``#`` or
    ``?`` onward (fragment and query string), including the delimiter itself
    even when nothing follows it.

    Args:
        page_link: The raw ``href`` value.

    Returns:
        The cleaned link; an empty string if nothing is left.
    """
    # Raw strings: '\/', '\#' and '\?' are invalid string escapes in a normal
    # literal and trigger warnings on recent Python versions.
    page_link = re.sub(r'^/', '', page_link)
    # '.*' (not '.+') so that a bare trailing '#' or '?' is stripped as well.
    page_link = re.sub(r'[#?].*', '', page_link)

    return page_link

In [15]:
soup.find_all('a')[10:20]

[<a class="topline__link" href="/shops/">
 <div class="topline__link-icon">
 <svg class="icon icon-notes">
 <use xlink:href="/bitrix/templates/pop-music/images/sprite.svg#notes"></use>
 </svg>
 </div>
 <div class="topline__link-desc">Наши магазины</div>
 </a>,
 <a href="tel:+78002505500">8 800 250 55 00</a>,
 <a class="header__link" href="/help/">
 <div class="header__link-icon">
 <svg class="icon icon-lifebuoy">
 <use xlink:href="/bitrix/templates/pop-music/images/sprite.svg#lifebuoy"></use>
 </svg>
 </div>
 <div class="header__link-desc">Помощь</div>
 </a>,
 <a class="header__link js-popup-ajax" href="popup-login">
 <div class="header__link-icon">
 <svg class="icon icon-user">
 <use xlink:href="/bitrix/templates/pop-music/images/sprite.svg#user"></use>
 </svg>
 </div>
 <div class="header__link-desc">Войти</div>
 </a>,
 <a class="header__link" href="/favourite/">
 <div class="header__link-icon">
 <svg class="icon icon-heart">
 <use xlink:href="/bitrix/templates/pop-music/images/sprite

In [80]:
links = set()
# Collect links from the main page: only <a> tags that have an href and
# itemprop="url" are considered.

for a in soup.find_all('a'):
    if not a.has_attr('href'):
        continue

    if a.get('itemprop') != 'url':
        continue

    page_link = a.attrs['href']

    if page_link.startswith('http'):
        # Absolute URL: keep it only if it belongs to this site.
        if not page_link.startswith(main_url):
            continue
        # Make it site-relative; otherwise main_url would be prepended a
        # second time below and produce a broken URL.
        page_link = page_link[len(main_url):]

    page_link = clear_link(page_link)

    # Nothing left means the link pointed at the main page itself.
    if not page_link:
        continue

    links.add(main_url + page_link)

# links

In [81]:
# Keep only the links that point into the catalog section of the site.
catalog_links = list(
    filter(re.compile(r'^(https://pop-music.ru/catalog)').search, links)
)

In [75]:
catalog_links[:10]

['https://pop-music.ru/catalog/gitarnaya-furnitura/predusiliteli-i-ekvalayzery/',
 'https://pop-music.ru/catalog/udarnyie/tarelki-bell/',
 'https://pop-music.ru/catalog/mikrofonyi/lampovyie-mikrofonyi/',
 'https://pop-music.ru/catalog/studiynoe-oborudovanie/studiynyie-aksessuaryi/',
 'https://pop-music.ru/catalog/udarnyie/komplektyi-tarelok/',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/',
 'https://pop-music.ru/catalog/svetovoe-oborudovanie/aksessuaryi-dlya-svetovogo-oborudovaniya/',
 'https://pop-music.ru/catalog/gitaryi/gitary-klassicheskie-s-podklyucheniem/',
 'https://pop-music.ru/catalog/udarnyie/tarelki-ride/',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/chehlyi-dlya-akusticheskih-gitar/']

In [82]:
len(catalog_links)

327

Собраны ссылки на все первые страницы разделов каталога.

К ним необходимо добавить страницы, доступные при переключении навигации ("следующие" страницы)

In [78]:
# Save the first-page catalog links to links.txt, one URL per line.
with open('links.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % link for link in catalog_links)

In [45]:
test_link = 'https://pop-music.ru/catalog/aksessuaryi/klavishnye-stoyki/'

In [46]:
page_text = get_page_text(test_link, USER_AGENT)

In [47]:
soup_2 = BeautifulSoup(page_text)

In [87]:
test = 'https://pop-music.ru/catalog/gitaryi/gitaryi-akusticheskie/'

In [103]:
# A div with class "pagination" means the section has navigation at the bottom
# of the page leading to further pages of the same section. For every such
# section, build the URLs of pages 2..N.
new_links = []

for catalog_link in tqdm(catalog_links):
    page_text = get_page_text(catalog_link, USER_AGENT)
    if page_text is None:
        # The request failed (get_page_text returns None when the status is
        # not 200); skip this section instead of passing None to the parser.
        continue

    soup_2 = BeautifulSoup(page_text, 'html.parser')

    pagination = soup_2.find('div', class_='pagination')
    if pagination is None:
        continue

    # Use a dedicated name so the `links` set collected from the main page
    # is not overwritten.
    page_anchors = pagination.find_all('a', class_='pagination__page')

    # Extract the page numbers, ignoring any non-numeric labels.
    page_numbers = [
        int(label)
        for label in (anchor.get_text(strip=True) for anchor in page_anchors)
        if label.isdecimal()
    ]
    if not page_numbers:
        continue

    max_page = max(page_numbers)

    # Page 1 is the section URL itself, so start from page 2.
    new_links.extend(
        f'{catalog_link}?PAGEN_2={page}' for page in range(2, max_page + 1)
    )


100%|████████████████████████████████████████████████████████████████████████████████| 327/327 [10:02<00:00,  1.84s/it]


In [104]:
new_links[:10]

['https://pop-music.ru/catalog/udarnyie/komplektyi-tarelok/?PAGEN_2=2',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=2',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=3',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=4',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=5',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=6',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=7',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/strunyi-dlya-elektrogitar/?PAGEN_2=8',
 'https://pop-music.ru/catalog/gitaryi/gitary-klassicheskie-s-podklyucheniem/?PAGEN_2=2',
 'https://pop-music.ru/catalog/gitarnyie-aksessuaryi/chehlyi-dlya-akusticheskih-gitar/?PAGEN_2=2']

In [100]:
len(new_links)

684

In [105]:
# Save the generated "next page" links to new_links.txt, one URL per line.
with open('new_links.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % link for link in new_links)

In [106]:
# Every catalog page to visit: the first pages followed by the paginated ones.
all_links = [*catalog_links, *new_links]

In [107]:
len(all_links)

1011

In [113]:
test = all_links[100]
test

'https://pop-music.ru/catalog/aksessuaryi/kabeli-mikrofonnyie/'

In [114]:
product_links=set()

In [117]:
# Collect the product links from every gathered catalog page.
product_links = set()

for link in tqdm(all_links):
    page_text = get_page_text(link, USER_AGENT)
    if page_text is None:
        # The request failed (get_page_text returns None when the status is
        # not 200); skip this page rather than crash the whole crawl.
        continue

    soup_3 = BeautifulSoup(page_text, 'html.parser')

    for product in soup_3.find_all('div', class_='products-grid__i'):
        anchor = product.find('a', class_='product-card__name')
        # find() returns None when the card has no matching <a>.
        if anchor is None or not anchor.has_attr('href'):
            continue

        href = anchor.attrs['href']

        if href.startswith('http'):
            # Absolute URL: keep it only if it belongs to this site, and make
            # it site-relative so main_url is not prepended twice below.
            if not href.startswith(main_url):
                continue
            href = href[len(main_url):]

        product_link = clear_link(href)

        if not product_link:
            continue

        product_links.add(main_url + product_link)

100%|██████████████████████████████████████████████████████████████████████████████| 1011/1011 [29:31<00:00,  1.75s/it]


In [118]:
len(product_links)

11717

In [119]:
# Save the collected product links to product_links.txt, one URL per line.
with open('product_links.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % link for link in product_links)

In [121]:
list(product_links)[:10]

['https://pop-music.ru/products/audiointerfeys-focusrite-clarett-4pre-888880029674/',
 'https://pop-music.ru/products/klassicheskaya-gitara-stagg-25th-scl-maho-888880028463/',
 'https://pop-music.ru/products/perehodnik-soundking-cxa004-888880021470/',
 'https://pop-music.ru/products/plastik-evans-bd20gb4s-888880020015/',
 'https://pop-music.ru/products/gitarnyy-kabel-soundking-bjj031-15ft/',
 'https://pop-music.ru/products/bas-gitara-schecter-omen-5-blk-l-h-888880006652/',
 'https://pop-music.ru/products/plastik-evans-b16uv1-888880025353/',
 'https://pop-music.ru/products/akusticheskaya-gitara-stagg-sa20d-red-888880022374/',
 'https://pop-music.ru/products/predusilitel-art-usbdualpre-888880000341/',
 'https://pop-music.ru/products/top-lok-schaller-r1-art-13120701-888880011641/']