*Подключаем библиотеки*

In [113]:
import csv
import json
import os

import numpy as np
import requests
from fake_useragent import UserAgent
from lxml import html
from tqdm import tqdm

*Задаём города*

In [114]:
# City slugs to scrape; each one is substituted into the listing URL path.
CITIES = [
    "kazan",
]

*Получаем ссылку*

In [115]:
def get_url(city, page):
    """Build the auto.ru used-car listing URL for a city and page number.

    Args:
        city: City slug inserted into the URL path (for example "kazan").
        page: Page number placed in the ``page`` query parameter.

    Returns:
        The listing URL as a string.
    """
    listing_url = f"https://auto.ru/{city}/cars/used/?page={page}"
    return listing_url

*Получаем html*

In [116]:
def get_html(url, header, cookies, timeout=30):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address to request.
        header: Mapping of HTTP request headers.
        cookies: Mapping of cookie names to values sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze a long scraping run.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=header, cookies=cookies, timeout=timeout)
    # Force UTF-8 rather than relying on the encoding requests infers.
    response.encoding = "utf-8"

    return response.text

*Получаем количество страниц для конкретного города*

In [117]:
def get_number_page(html_content):
    """Return the number of listing pages advertised by the pagination control.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        The integer shown in the last pagination link, or 1 when the page has
        no pagination control.

    Raises:
        ValueError: If the text of the pagination link is not an integer.
    """
    tree = html.fromstring(html_content)
    page = tree.xpath("//span[@class='ControlGroup ControlGroup_responsive_no ControlGroup_size_s ListingPagination__pages']//a[last()]")

    # No pagination links were found. This is presumably a listing that fits
    # on a single page, so report one page instead of raising IndexError.
    if not page:
        return 1

    return int(page[0].text_content())

*Получаем ссылки на машины*

In [118]:
def get_links(html_content):
    """Collect the unique offer links found on a listing page.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        A set of ``href`` values taken from the offer thumbnail anchors that
        sit inside listing item blocks.
    """
    tree = html.fromstring(html_content)

    # One query scoped to the listing items. An XPath that starts with "//"
    # is evaluated from the document root even when called on a sub-element,
    # so running it once per item would rescan the whole page every time.
    hrefs = tree.xpath(
        "//div[contains(@class, 'ListingItem')]"
        "//a[contains(@class, 'Link OfferThumb')]/@href"
    )

    return set(hrefs)

*Получаем данные об автомобиле с главной страницы*

In [119]:
def get_link_tex_info(html_content):
    """Find the link attached to the "Поколение" row of an offer page.

    Args:
        html_content: HTML text of an offer page.

    Returns:
        The first matching ``href``, or -1 when the page has no such link.
    """
    document = html.fromstring(html_content)
    hrefs = document.xpath("//div[text()='Поколение']//..//a[@class='Link']/@href")

    # -1 is the "not found" sentinel that callers compare against.
    return hrefs[0] if hrefs else -1

*Получаем данные об автомобиле со вспомогательной страницы*

In [120]:
def get_tex_info_extra(html_content):
    """Extract option names, option values and the model title from a page.

    Args:
        html_content: HTML text of the specification page.

    Returns:
        A tuple ``(names, values, model)``. ``names`` and ``values`` are lists
        of ``span`` elements, header options first and info options after
        them. ``model`` is the first title text node, or the empty query
        result when no title is present.
    """
    document = html.fromstring(html_content)

    option_names = (
        document.xpath("//span[contains(@class, 'ModificationHeader__optionName')]")
        + document.xpath("//span[contains(@class, 'ModificationInfo__optionName')]")
    )
    option_values = (
        document.xpath("//span[contains(@class, 'ModificationHeader__optionValue')]")
        + document.xpath("//span[contains(@class, 'ModificationInfo__optionValue')]")
    )

    titles = document.xpath("//div[contains(@class, 'CatalogInStockOfferBaseItem__title')]/text()")
    if isinstance(titles, list) and titles:
        model = titles[0]
    else:
        # Hand back the query result itself (an empty list when nothing matched).
        model = titles

    return option_names, option_values, model

In [121]:
def get_tex_info(html_content):
    """Extract the price and the info-row texts from an offer page.

    Args:
        html_content: HTML text of an offer page.

    Returns:
        A tuple ``(price, data)``. ``price`` is the first text node of the
        price caption, or ``np.nan`` when there is none. ``data`` is the list
        of text nodes found in the info-row cells.
    """
    document = html.fromstring(html_content)

    price_nodes = document.xpath("//span[@class='OfferPriceCaption__price']//text()")
    price = price_nodes[0] if price_nodes else np.nan

    data = document.xpath("//div[@class='CardInfoRow__cell']//a/text() | //div[@class='CardInfoRow__cell']/text() | //div[@class='CardInfoRow__cell']//span/text()")

    return price, data

*Подгружаем cookie*

In [122]:
# Read the saved cookies: a JSON list of objects with "name" and "value" keys.
with open(os.path.join("cookie", "1.json"), mode="r", encoding="utf-8") as file:
    cookies_list = json.load(file)

# Flatten the list into a plain name -> value mapping.
cookie = dict((entry["name"], entry["value"]) for entry in cookies_list)

*Задаём характеристики, которые будем получать*

In [123]:

# Labels looked up among the info-row texts of the offer page.
names = [
    "Год выпуска",
    "Пробег",
    "Налог",
    "Руль",
    "Состояние",
    "Владельцы",
]

# Option labels looked up on the specification page. The order matters:
# values are appended to each CSV row in exactly this sequence.
names_extra = [
    "Объем", "Мощность", "Коробка", "Тип двигателя", "Топливо",
    "Привод", "Разгон", "Расход",
    "Страна марки", "Класс автомобиля", "Количество мест",
    "Длина", "Ширина", "Высота", "Колёсная база",
    "Клиренс", "Размер колёс", "Объём топливного бака", "Полная масса",
    "Количество передач", "Максимальная скорость",
    "Расход топлива, город/трасса/смешанный",
    "Экологический класс", "Выбросы CO2", "Объем двигателя",
    "Тип наддува", "Количество цилиндров", "Число клапанов на цилиндр",
    "Степень сжатия", "Модель двигателя", "ГРМ",
]


*Создаём файл*

In [124]:
# CSV column headers: the price, one column per entry of `names`, the model,
# then one column per entry of `names_extra`, in the same order.
fieldnames = ["price", "year", "mileage", "tax", "steering", "condition", "owners", "model", "volume",
              "power", "transmission", "engine", "fuel", "drive", "acceleration", "consumption", "country",
              "class", "number seats", "length", "width", "height", "wheelbase", "clearance",
              "wheel size", "fuel capacity", "weight", "gears", "maximum speed", "fuel consumption city/highway/combined",
              "ecological", "co2", "engine capacity", "boost type", "cylinders", "cylinder", "compression", "engine model", "timing"]

csv_path = "data/cars.csv"

# Make sure the output directory exists so that open() below cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# The file is opened in append mode so previously collected rows survive a
# re-run of this cell. Write the header only when the file is new or empty,
# otherwise every re-run would insert another header row among the data.
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

with open(csv_path, "a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter=",")
    if needs_header:
        writer.writerow(fieldnames)

*Задаём header и начинаем сбор данных*

In [125]:
# User-agent generator; `user.random` is read later to fill the "user-agent" request header.
user = UserAgent()

In [126]:
# Walk every listing page of every city and append one CSV row per offer.
for city in CITIES:
    print(city.upper())
    # One random User-Agent is picked per city and reused for all its requests.
    header = {"user-agent": user.random}
    max_page = get_number_page(get_html(get_url(city, 1), header, cookie))

    for page in tqdm(range(1, max_page + 1)):
        links = get_links(get_html(get_url(city, page), header, cookie))

        for link in links:
            # Download the offer page once and reuse it for both the
            # specification link and the price/info fields. Fetching it twice
            # doubled the number of requests sent for every offer.
            offer_html = get_html(link, header, cookie)

            link_tex = get_link_tex_info(offer_html)
            # -1 means the offer page has no specification link; skip it.
            if link_tex == -1:
                continue

            price, info_tex = get_tex_info(offer_html)
            array = [price]

            # info_tex is read as alternating label/value strings. Stopping at
            # len - 1 keeps a trailing unpaired label from raising IndexError.
            name_value_map = {
                info_tex[i]: info_tex[i + 1]
                for i in range(0, len(info_tex) - 1, 2)
                if info_tex[i] in names
            }
            array.extend(name_value_map.get(label, np.nan) for label in names)

            option_names, option_values, model = get_tex_info_extra(get_html(link_tex, header, cookie))
            # Only the first space-separated word of the title is stored as the model.
            array.append(model.split(' ')[0] if model else np.nan)

            name_value_extra_map = {n.text: v.text for n, v in zip(option_names, option_values)}
            array.extend(name_value_extra_map.get(extra, np.nan) for extra in names_extra)

            # Append row by row so rows already collected are on disk if the run stops.
            with open("data/cars.csv", "a", newline="", encoding="utf-8") as file:
                csv.writer(file, delimiter=",").writerow(array)

KAZAN


100%|██████████| 99/99 [2:02:49<00:00, 74.44s/it]  
