In [13]:
import os
import csv
import json
import requests
import cfscrape
import traceback
from tqdm import tqdm
from time import sleep
from random import uniform
from fake_useragent import UserAgent
import numpy as np
import pandas as pd

In [5]:
# pip install cfscrape

In [6]:
# pip install requests==2.29.0

In [8]:
# pip install fake-useragent

In [None]:
CSV_NUMBER = 'cian'  # Suffix used in the name of the generated CSV file
CSV_PATH = os.path.normpath(os.path.join(os.getcwd(), 'csv'))  # Path of the 'csv' output folder inside the current working directory
ua = UserAgent()

# Create the output folder if it is not there yet. Attempting the mkdir and
# handling FileExistsError removes the window between an existence check and
# the creation in which another process could create the same folder.
try:
    os.mkdir(CSV_PATH)
except FileExistsError:
    pass
else:
    print(f'Folder {CSV_PATH} has been created!')

# Mapping of city keys to the numeric region ids used when searching listings on Cian
regions = {
    'msk': 1,  # Moscow
}

# Column names (header row) of the future table. Their order must match the
# order of the values collected for each offer in create_table().
# Scraped rows are appended to this same list after the header.
dataset = [[
    # location and price
    'region', 'address', 'price',
    # areas and layout
    'total_area', 'kitchen_area', 'living_area', 'rooms_count',
    # floors and construction status
    'floor', 'floors_number', 'build_date', 'is_complete', 'completion_year',
    # building details
    'house_material', 'parking', 'decoration', 'balcony',
    'passenger_elevator', 'cargo_elevator',
    # metro and district
    'metro', 'metro_distance', 'metro_transport', 'district',
    # offer flags
    'is_apartments', 'from_developer', 'is_auction',
    # link to the listing
    'link',
    # links to the listing photos
    'photos',
]]

# Normalise a single attribute value: booleans become integers, anything else passes through
def add_attr(attr):
    """Return ``attr`` ready for the table.

    ``True``/``False`` are converted to ``1``/``0``; every other value,
    including ``None``, is returned unchanged.
    """
    return int(attr) if isinstance(attr, bool) else attr

# Build the HTTP session object used for all requests
def get_session():
    """Create a cfscrape scraper based on a requests session with browser-like headers.

    The ``user-agent`` value is taken from ``ua.random`` each time the function
    is called. The session's header mapping is replaced by the dictionary
    below rather than merged into it.

    Returns:
        The object returned by ``cfscrape.create_scraper`` for that session.
    """
    # No 'cookie' header is sent.
    browser_headers = {
        'authority': 'www.cian.ru',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'referer': 'https://www.cian.ru/',
        'sec-ch-ua': '"Not(A:Brand";v="24", "Chromium";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
    }

    base_session = requests.Session()
    base_session.headers = browser_headers

    # cfscrape is used to get past Cloudflare's anti-bot protection
    scraper = cfscrape.create_scraper(sess=base_session)
    return scraper

# Dump every accumulated row into a .csv file
def recording_table():
    """Write all rows of ``dataset`` to ``data_<CSV_NUMBER>.csv`` inside ``CSV_PATH``.

    The file is opened in write mode, so an existing file with the same name
    is overwritten. Any exception raised along the way is caught and its
    traceback is printed instead of being propagated.
    """
    try:
        target = os.path.join(CSV_PATH, f'data_{CSV_NUMBER}.csv')
        with open(target, mode='w', newline='', encoding='utf-8') as out_file:
            csv.writer(out_file).writerows(dataset)
        print(f'The dataset is written in file "data_{CSV_NUMBER}.csv"')
    except Exception:
        print('Recording error!\n', traceback.format_exc())

# Fetch one page of search results and decode the JSON response into a Python object
def get_json(session, region_name, cur_page):
    """Request one page of flat-sale offers and return the decoded JSON body.

    Args:
        session: Object exposing a requests-style ``post(url, json=...)`` method.
        region_name: Key of the module-level ``regions`` mapping.
        cur_page: Page number placed into the query.

    Returns:
        * the decoded JSON body when the response carries JSON;
        * a string starting with ``'oops!'`` when the request itself failed or
          the body could not be decoded;
        * ``None`` when the response has status 204 or a content type other
          than ``application/json`` (``create_table`` reports this case as a
          captcha).

    Raises:
        KeyError: If ``region_name`` is not a key of ``regions``.
    """
    json_data = {
        'jsonQuery': {
            '_type': 'flatsale',
            'engine_version': {'type': 'term', 'value': 2},
            'region': {'type': 'terms', 'value': [regions[region_name]]},
            'room': {'type': 'terms', 'value': [1, 2, 3, 4, 5, 6]},
            'page': {'type': 'term', 'value': cur_page},
        }
    }

    try:
        response = session.post('https://api.cian.ru/search-offers/v2/search-offers-desktop/', json=json_data)
    except Exception as error:
        # When the request raises there is no response object to inspect, so
        # the exception itself is reported. Catching Exception (not a bare
        # except) lets KeyboardInterrupt stop a long scraping run.
        return f'oops! Error {error}'

    # A missing content-type header is treated like a non-JSON response.
    content_type = response.headers.get('content-type', '')
    if response.status_code == 204 or not content_type.strip().startswith('application/json'):
        return None

    try:
        return response.json()
    except ValueError:
        return 'oops! ValueError!'

# Private helper: comma-join one field of an offer's geo entries
def _join_geo_field(entries, key):
    """Return ``entry[key]`` of every non-None entry as one comma-separated string.

    A missing, ``None`` or empty ``entries`` value yields an empty string.
    """
    return ','.join(str(entry[key]) for entry in (entries or []) if entry is not None)


# Private helper: flatten one offer into a row ordered like the header row of `dataset`
def _offer_to_row(region_name, item):
    """Build the table row for a single offer dictionary.

    Every value except ``region_name`` is passed through ``add_attr``.

    Raises:
        KeyError: If the offer lacks ``geo``, ``geo.userInput``, ``building``
            or ``bargainTerms.priceRur``.
    """
    geo = item['geo']
    building = item['building']
    deadline = building.get('deadline')
    parking = building.get('parking')
    undergrounds = geo.get('undergrounds')

    # 'photos' may be absent or null; only entries that carry a URL are kept.
    photos = item.get('photos') or []
    photo_links_str = ', '.join(photo['fullUrl'] for photo in photos if 'fullUrl' in photo)

    values = [
        geo['userInput'],
        item['bargainTerms']['priceRur'],
        item.get('totalArea'),
        item.get('kitchenArea'),
        item.get('livingArea'),
        item.get('roomsCount'),
        item.get('floorNumber'),
        building.get('floorsCount'),
        building.get('buildYear'),
        deadline['isComplete'] if deadline is not None else None,
        deadline['year'] if deadline is not None else None,
        building.get('materialType'),
        parking['type'] if parking is not None else None,
        item.get('decoration'),
        item.get('balconiesCount'),
        building.get('passengerLiftsCount'),
        building.get('cargoLiftsCount'),
        _join_geo_field(undergrounds, 'name'),
        _join_geo_field(undergrounds, 'time'),
        _join_geo_field(undergrounds, 'transportType'),
        _join_geo_field(geo.get('districts'), 'name'),
        item.get('isApartments'),
        item.get('fromDeveloper'),
        item.get('isAuction'),
        item.get('fullUrl'),  # link to the listing
        photo_links_str,  # links to the listing photos
    ]
    return [region_name] + [add_attr(value) for value in values]


# Build the table page by page and write it to disk
def create_table(region_name='msk', start_page=1, end_page=55, number_of_samples=2000):
    """Scrape offers page by page, append new rows to ``dataset`` and write the CSV.

    Args:
        region_name: Key of the ``regions`` mapping; also stored in each row.
        start_page: First page to request; values below 1 are raised to 1.
        end_page: Page at which to stop, exclusive; values above 55 are
            lowered to 55.
        number_of_samples: Stop once this many new rows have been added
            during this call.

    Rows already present in ``dataset`` are skipped and not counted.
    ``recording_table()`` is called at the end in every case, including when
    the run is cut short by a captcha, so rows collected so far are written.
    """
    start_page = max(start_page, 1)
    end_page = min(end_page, 55)

    session = get_session()

    cnt_samples = 0
    for cur_page in tqdm(range(start_page, end_page)):
        if cnt_samples >= number_of_samples:
            break

        # Random pause of 25 to 35 seconds before every request
        sleep(uniform(25, 35))

        data = get_json(session, region_name, cur_page)
        if data is None:
            print('oops! Captcha!')
            # Stop requesting pages but still fall through to recording_table()
            # so the rows gathered so far are not left unwritten.
            break
        if isinstance(data, str):
            # get_json reports failures as text; show it and try the next page.
            print(data)
            continue

        # A payload without offers is treated as an empty page.
        offers = (data.get('data') or {}).get('offersSerialized') or []
        for item in offers:
            cur_item = _offer_to_row(region_name, item)
            if cur_item in dataset:
                continue

            dataset.append(cur_item)
            cnt_samples += 1
            if cnt_samples >= number_of_samples:
                break

        print(f'{cnt_samples} / {number_of_samples} | page: {cur_page}')

    recording_table()
    return

# Run the scraper with its default arguments
create_table()


  2%|▏         | 1/54 [00:30<27:02, 30.62s/it]

28 / 2000 | page: 1


  4%|▎         | 2/54 [00:56<23:54, 27.59s/it]

56 / 2000 | page: 2


  6%|▌         | 3/54 [01:24<23:48, 28.02s/it]

84 / 2000 | page: 3


  7%|▋         | 4/54 [01:53<23:38, 28.38s/it]

112 / 2000 | page: 4


  9%|▉         | 5/54 [02:17<21:53, 26.80s/it]

140 / 2000 | page: 5


 11%|█         | 6/54 [02:49<22:56, 28.68s/it]

167 / 2000 | page: 6


 13%|█▎        | 7/54 [03:14<21:25, 27.35s/it]

195 / 2000 | page: 7


 15%|█▍        | 8/54 [03:41<20:54, 27.28s/it]

223 / 2000 | page: 8


 17%|█▋        | 9/54 [04:06<19:58, 26.64s/it]

250 / 2000 | page: 9


 19%|█▊        | 10/54 [04:32<19:14, 26.25s/it]

278 / 2000 | page: 10


 20%|██        | 11/54 [05:02<19:43, 27.52s/it]

306 / 2000 | page: 11


 22%|██▏       | 12/54 [05:25<18:10, 25.98s/it]

334 / 2000 | page: 12


 24%|██▍       | 13/54 [05:47<17:03, 24.95s/it]

362 / 2000 | page: 13


 26%|██▌       | 14/54 [06:18<17:51, 26.79s/it]

390 / 2000 | page: 14


 28%|██▊       | 15/54 [06:42<16:53, 25.99s/it]

418 / 2000 | page: 15


 30%|██▉       | 16/54 [07:09<16:32, 26.11s/it]

446 / 2000 | page: 16


 31%|███▏      | 17/54 [07:36<16:14, 26.35s/it]

474 / 2000 | page: 17


 33%|███▎      | 18/54 [08:06<16:30, 27.52s/it]

502 / 2000 | page: 18


 35%|███▌      | 19/54 [08:34<16:04, 27.56s/it]

530 / 2000 | page: 19


 37%|███▋      | 20/54 [08:57<14:51, 26.22s/it]

558 / 2000 | page: 20


 39%|███▉      | 21/54 [09:28<15:15, 27.75s/it]

586 / 2000 | page: 21


 41%|████      | 22/54 [09:57<14:58, 28.09s/it]

614 / 2000 | page: 22


 43%|████▎     | 23/54 [10:22<14:07, 27.34s/it]

642 / 2000 | page: 23


 44%|████▍     | 24/54 [10:51<13:54, 27.82s/it]

670 / 2000 | page: 24


 46%|████▋     | 25/54 [11:18<13:12, 27.34s/it]

698 / 2000 | page: 25


 48%|████▊     | 26/54 [11:48<13:09, 28.19s/it]

726 / 2000 | page: 26


 50%|█████     | 27/54 [12:16<12:42, 28.23s/it]

754 / 2000 | page: 27


 52%|█████▏    | 28/54 [12:38<11:21, 26.20s/it]

782 / 2000 | page: 28


 54%|█████▎    | 29/54 [13:04<10:57, 26.28s/it]

810 / 2000 | page: 29


 56%|█████▌    | 30/54 [13:29<10:21, 25.90s/it]

838 / 2000 | page: 30


 57%|█████▋    | 31/54 [13:59<10:22, 27.06s/it]

865 / 2000 | page: 31


 59%|█████▉    | 32/54 [14:24<09:41, 26.44s/it]

893 / 2000 | page: 32


 61%|██████    | 33/54 [14:48<08:59, 25.69s/it]

921 / 2000 | page: 33


 63%|██████▎   | 34/54 [15:15<08:44, 26.23s/it]

949 / 2000 | page: 34


 65%|██████▍   | 35/54 [15:39<08:06, 25.62s/it]

977 / 2000 | page: 35


 67%|██████▋   | 36/54 [16:09<08:04, 26.90s/it]

1005 / 2000 | page: 36


 69%|██████▊   | 37/54 [16:31<07:11, 25.37s/it]

1033 / 2000 | page: 37


 70%|███████   | 38/54 [16:59<06:56, 26.02s/it]

1061 / 2000 | page: 38
