In [1]:
# Mount Google Drive inside the Colab VM; later cells read and write JSON
# snapshots under 'drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import requests
from multiprocessing.dummy import Pool as ThreadPool
from bs4 import BeautifulSoup
import random
import time
import gc
import re
import datetime
import math
import json

In [3]:
# Shared pool of 50 worker threads (multiprocessing.dummy) that the scraping
# helpers below use to issue HTTP requests concurrently via pool.map.
pool = ThreadPool(50)

In [4]:
def check_amount(lower, upper, limit = 510):
    """Count the listings the bn.ru catalog reports for a price range.

    Requests the 'kvartiry-vtorichka' catalog filtered by ``priceFrom=lower``
    and ``priceTo=upper`` and reads the count from the first ``<strong>``
    element of the returned page.

    Returns:
        ``(fits, amount)`` where ``amount`` is the reported count and
        ``fits`` is True when ``amount <= limit``.
    """
    response = requests.get(f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={lower}&priceTo={upper}')
    page = BeautifulSoup(response.text, 'html.parser')
    found = int(page.find('strong').contents[0])
    return found <= limit, found


def check_bounds(price_bounds, limit = 510):
    """Validate a list of price intervals against the live catalog.

    Args:
        price_bounds: iterable of ``(lower, upper, _)`` triples; the third
            element is ignored here.
        limit: maximum number of listings allowed in a single interval. It is
            forwarded to ``check_amount`` (previously it was accepted but
            silently ignored, so the default was always used).

    Returns:
        True when every interval holds at most ``limit`` listings and the
        per-interval counts add up to the total reported by the unfiltered
        catalog page. Otherwise prints the offending interval (or the
        counted/total mismatch) and returns False.
    """
    r = requests.get('https://www.bn.ru/kvartiry-vtorichka')
    soup = BeautifulSoup(r.text, 'html.parser')
    total = int(soup.find('strong').contents[0])
    counted = 0
    for lower, upper, _ in price_bounds:
        fits, count = check_amount(lower, upper, limit)
        if not fits:
            print(lower, upper)
            return False
        counted += count
    if counted != total:
        print("Counted: ", counted, " Total: ", total)
        return False
    return True


def calc_borders():
    '''Compute a list of suitable price bounds.

    Walks the price axis upward from 0, widening each interval in ``step``
    increments while ``check_amount`` reports that it still fits under the
    default limit, and records ``(lower, upper, amount)`` triples. Above
    20 million the intervals are hard-coded.

    Returns:
        list of ``(lower, upper, amount)`` tuples, where ``amount`` is the
        listing count reported for the inclusive range ``lower..upper``.
    '''
    answer = []
    lower = 0
    upper = 3000000
    step = 100000
    while upper < 20 * 10 ** 6:
        state, _ = check_amount(lower, upper)
        # Widen the interval while it still fits under the limit.
        while state:
            if upper > 20 * 10 ** 6:
                break
            upper += step
            state, _ = check_amount(lower, upper)
        # Step back by one increment from the bound that stopped the loop.
        # NOTE(review): if the very first probe already failed, this steps
        # back to a bound that was never checked against the limit.
        upper -= step
        # The recorded interval ends at upper - 1 so that the next interval
        # can start exactly at `upper` without overlap.
        _, amount = check_amount(lower, upper - 1)
        answer.append((lower, upper - 1, amount))
        print((lower, upper - 1, amount))
        lower = upper
    # --- fixed, hard-coded intervals for the upper price tail
    _, amount = check_amount(lower, 24999999)
    answer.append((lower, 24999999, amount))
    print((lower, 24999999, amount))
    _, amount = check_amount(25000000, 29999999)
    answer.append((25000000, 29999999, amount))
    print((25000000, 29999999, amount))
    _, amount = check_amount(30000000, 39999999)
    answer.append((30000000, 39999999, amount))
    print((30000000, 39999999, amount))
    _, amount = check_amount(40000000, 5785000000)
    answer.append((40000000, 5785000000, amount))
    print((40000000, 5785000000, amount))             
    return answer


def get_ids_from_page(soup):
    """Return the first child of every ``div.catalog-item__id`` on the page, as strings."""
    collected = []
    for id_div in soup.find_all("div", {"class": "catalog-item__id"}):
        collected.append(str(id_div.contents[0]))
    return collected


def get_prices_from_page(soup):
    """Return the first child of every ``div.catalog-item__price`` on the page, as strings."""
    price_divs = soup.find_all("div", {"class": "catalog-item__price"})
    return list(map(lambda price_div: str(price_div.contents[0]), price_divs))


def get_additional_info1(soup):
    """Look for information about either the owner or online showings.

    For every ``div.catalog-item__mark-container`` collects the text of its
    "online" mark span and of its first link (whichever are present).

    Returns:
        One entry per container: a sorted list of the strings found, or
        None when the container has neither element.
    """
    online_class = "catalog-item__mark-item catalog-item__mark-item-online"
    marks = []
    for container in soup.find_all("div", {"class": "catalog-item__mark-container"}):
        found = []
        online_span = container.find('span', {"class": online_class})
        if online_span:
            found.append(str(online_span.contents[0]))
        link = container.find('a')
        if link:
            found.append(str(link.contents[0]))
        marks.append(sorted(found) if len(found) != 0 else None)
    return marks


def get_additional_info2(soup):
    """Look for information about how the listing is promoted.

    For every ``div.catalog-item__vas-container`` checks which of the known
    promotion icons are present.

    Returns:
        One entry per container: a sorted list of the labels found
        ('top', 'up', 'color', 'day', 'warn'), or None when no icon is present.
    """
    # (CSS class of the icon div, label recorded for it), probed in this order.
    icon_labels = (
        ("catalog-item__vas-icon catalog-item__vas-icon-top", 'top'),
        ("catalog-item__vas-icon catalog-item__vas-icon-up", 'up'),
        ("catalog-item__vas-icon catalog-item__vas-icon-color", 'color'),
        ("catalog-item__vas-icon catalog-item__vas-icon-object-day", 'day'),
        ("catalog-item__vas-icon catalog-item__vas-icon-warn", 'warn'),
    )
    promotions = []
    for container in soup.find_all("div", {"class": "catalog-item__vas-container"}):
        labels = [label for css_class, label in icon_labels
                  if container.find('div', {"class": css_class})]
        promotions.append(sorted(labels) if labels else None)
    return promotions


def get_dates_from_page(soup):
    """Look for date information in the listings.

    Returns the first child of every ``span.catalog-item__date-value`` on the
    page, as strings.
    """
    found_dates = []
    for date_span in soup.find_all('span', {'class':'catalog-item__date-value'}):
        found_dates.append(str(date_span.contents[0]))
    return found_dates


def get_addresses_from_page(soup):
    """Return one address per ``div.catalog-item__address`` on the page.

    Each entry is the div's first child as a string, or None when the div
    has no children.
    """
    found_addresses = []
    for address_div in soup.find_all('div', {'class':'catalog-item__address'}):
        if len(address_div.contents) != 0:
            found_addresses.append(str(address_div.contents[0]))
        else:
            found_addresses.append(None)
    return found_addresses


def parse_ids_from_price(bounds):
    """Collect listing ids from every catalog page of one price range.

    Args:
        bounds: ``(lower, upper)`` price pair used as ``priceFrom``/``priceTo``.

    Returns:
        list of id strings as extracted by ``get_ids_from_page``, first page
        first, then pages 2..N in order.
    """
    l, u = bounds
    url = f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}'
    # The session is used for this single request only; the context manager
    # makes sure its connection pool is released (it was previously leaked).
    with requests.Session() as session:
        first_response = session.get(url)
    soup = BeautifulSoup(first_response.text, 'html.parser')
    amount = int(soup.find('strong').contents[0])
    # Page count derived from the reported total; assumes 30 listings per page.
    npages = math.ceil(amount/30)
    ids_from_price = list(get_ids_from_page(soup))
    urls = [f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}&page={page}' for page in range(2, npages + 1)]
    # Remaining pages are fetched concurrently on the module-level thread pool
    # (read-only access, so no `global` declaration is needed).
    for response in pool.map(requests.get, urls):
        page_soup = BeautifulSoup(response.text, 'html.parser')
        ids_from_price.extend(get_ids_from_page(page_soup))
    return ids_from_price


def parse_ids_and_prices_from_price(bounds):
    """Collect listing ids and prices from every catalog page of one price range.

    Args:
        bounds: ``(lower, upper)`` price pair used as ``priceFrom``/``priceTo``.

    Returns:
        ``(ids, prices)``: two lists built page by page with
        ``get_ids_from_page`` and ``get_prices_from_page``.
    """
    l, u = bounds
    url = f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}'
    # The session is used for this single request only; the context manager
    # makes sure its connection pool is released (it was previously leaked).
    with requests.Session() as session:
        first_response = session.get(url)
    soup = BeautifulSoup(first_response.text, 'html.parser')
    amount = int(soup.find('strong').contents[0])
    # Page count derived from the reported total; assumes 30 listings per page.
    npages = math.ceil(amount/30)
    ids_from_price = list(get_ids_from_page(soup))
    prices_from_price = list(get_prices_from_page(soup))
    urls = [f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}&page={page}' for page in range(2, npages + 1)]
    # Remaining pages are fetched concurrently on the module-level thread pool.
    for response in pool.map(requests.get, urls):
        page_soup = BeautifulSoup(response.text, 'html.parser')
        ids_from_price.extend(get_ids_from_page(page_soup))
        prices_from_price.extend(get_prices_from_page(page_soup))
    return ids_from_price, prices_from_price


def _scrape_listing_columns(soup):
    """Extract the six per-listing columns (ids, prices, info1, info2, dates, addresses) from one page."""
    return (
        get_ids_from_page(soup),
        get_prices_from_page(soup),
        get_additional_info1(soup),
        get_additional_info2(soup),
        get_dates_from_page(soup),
        get_addresses_from_page(soup),
    )


def parse_info_from_price(bounds):
    """Collect all catalog-level listing fields for one price range.

    Args:
        bounds: ``(lower, upper)`` price pair used as ``priceFrom``/``priceTo``.

    Returns:
        A 6-tuple of lists ``(ids, prices, info1, info2, dates, addresses)``,
        each accumulated over the first page and then pages 2..N in order.
    """
    l, u = bounds
    url = f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}'
    # The session is used for this single request only; the context manager
    # makes sure its connection pool is released (it was previously leaked).
    with requests.Session() as session:
        first_response = session.get(url)
    soup = BeautifulSoup(first_response.text, 'html.parser')
    amount = int(soup.find('strong').contents[0])
    # Page count derived from the reported total; assumes 30 listings per page.
    npages = math.ceil(amount/30)

    # One accumulator list per column, in the order returned by the helper.
    columns = ([], [], [], [], [], [])

    def absorb(page_soup):
        for column, values in zip(columns, _scrape_listing_columns(page_soup)):
            column.extend(values)

    absorb(soup)
    urls = [f'https://www.bn.ru/kvartiry-vtorichka/?priceFrom={l}&priceTo={u}&page={page}' for page in range(2, npages + 1)]
    # Remaining pages are fetched concurrently on the module-level thread pool.
    for response in pool.map(requests.get, urls):
        absorb(BeautifulSoup(response.text, 'html.parser'))
    return columns


def parse_ids_from_bounds(list_of_bounds):
    """Collect listing ids for every ``(lower, upper, _)`` interval, in order.

    Prints each ``(lower, upper)`` pair after it has been scraped and returns
    the concatenated list of ids.
    """
    collected = []
    for lower, upper, _ in list_of_bounds:
        price_range = (lower, upper)
        collected += parse_ids_from_price(price_range)
        print(price_range)
    return collected


def parse_ids_and_prices_from_bounds(list_of_bounds):
    """Collect listing ids and prices for every ``(lower, upper, _)`` interval.

    Prints each ``(lower, upper)`` pair after it has been scraped and returns
    ``(ids, prices)`` as two concatenated lists.
    """
    all_ids, all_prices = [], []
    for lower, upper, _ in list_of_bounds:
        price_range = (lower, upper)
        range_ids, range_prices = parse_ids_and_prices_from_price(price_range)
        all_ids += range_ids
        all_prices += range_prices
        print(price_range)
    return all_ids, all_prices


def parse_info_from_bounds(list_of_bounds):
    """Collect all catalog-level fields for every ``(lower, upper, _)`` interval.

    Prints each ``(lower, upper)`` pair after it has been scraped and returns
    ``(ids, prices, info1, info2, dates, addresses)`` as concatenated lists.
    """
    all_ids, all_prices = [], []
    all_info1, all_info2 = [], []
    all_dates, all_addresses = [], []
    for lower, upper, _ in list_of_bounds:
        price_range = (lower, upper)
        (range_ids, range_prices, range_info1,
         range_info2, range_dates, range_addresses) = parse_info_from_price(price_range)
        all_ids += range_ids
        all_prices += range_prices
        all_info1 += range_info1
        all_info2 += range_info2
        all_dates += range_dates
        all_addresses += range_addresses
        print(price_range)
    return all_ids, all_prices, all_info1, all_info2, all_dates, all_addresses


def collect_ids(list_of_bounds):
    """Scrape listing ids twice (intervals in forward, then reverse order) and merge them.

    Prints the difference between the total reported by the unfiltered
    catalog page and the number of distinct ids collected, then returns the
    distinct ids as a list.
    """
    response = requests.get('https://www.bn.ru/kvartiry-vtorichka')
    page = BeautifulSoup(response.text, 'html.parser')
    expected = int(page.find('strong').contents[0])
    forward_ids = set(parse_ids_from_bounds(list_of_bounds))
    backward_ids = set(parse_ids_from_bounds(list_of_bounds[::-1]))
    unique_ids = list(forward_ids.union(backward_ids))
    print('lost obs: ', expected - len(unique_ids))
    return unique_ids


def collect_ids_and_prices(list_of_bounds):
    """Scrape ids and prices twice (forward, then reverse interval order) and merge them.

    Returns a dict mapping id -> price; for ids seen in both passes the value
    from the reverse pass is kept. Prints the difference between the total
    reported by the unfiltered catalog page and the number of ids collected.
    """
    response = requests.get('https://www.bn.ru/kvartiry-vtorichka')
    page = BeautifulSoup(response.text, 'html.parser')
    expected = int(page.find('strong').contents[0])
    forward_ids, forward_prices = parse_ids_and_prices_from_bounds(list_of_bounds)
    price_by_id = dict(zip(forward_ids, forward_prices))
    backward_ids, backward_prices = parse_ids_and_prices_from_bounds(list_of_bounds[::-1])
    # Entries from the reverse pass overwrite those from the forward pass.
    price_by_id.update(dict(zip(backward_ids, backward_prices)))
    print('lost obs: ', expected - len(price_by_id))
    return price_by_id


def _records_by_id(ids, prices, info1, info2, dates, addresses):
    """Zip six parallel columns into ``{id: record}``, asserting they have equal length."""
    columns = (prices, info1, info2, dates, addresses)
    assert all(len(column) == len(ids) for column in columns), \
        'scraped columns have different lengths'
    return {
        listing_id: {'price': price, 'info1': mark, 'info2': promo, 'dates': date, 'address': address}
        for listing_id, price, mark, promo, date, address in zip(ids, *columns)
    }


def collect_info(list_of_bounds):
    """Scrape all catalog-level listing fields twice and merge the results.

    The intervals are scraped once in the given order and once reversed;
    for ids seen in both passes the record from the reverse pass is kept.
    Both passes are now checked for equal column lengths (previously only the
    first was), and the unused request for the catalog total was dropped.

    Args:
        list_of_bounds: iterable of ``(lower, upper, _)`` price intervals.

    Returns:
        dict mapping listing id -> ``{'price', 'info1', 'info2', 'dates',
        'address'}``.

    Raises:
        AssertionError: if the columns scraped in a pass differ in length.
    """
    forward = _records_by_id(*parse_info_from_bounds(list_of_bounds))
    backward = _records_by_id(*parse_info_from_bounds(list_of_bounds[::-1]))
    return {**forward, **backward}


def parse_flat_page(soup):
    """Gather the raw tag lists of interest from one flat detail page.

    Returns a dict whose values are the unprocessed ``soup.find_all`` results
    for each field; ``delete_tags`` turns them into plain strings.
    """
    # (result key, tag name, attribute filter), queried in this order.
    selectors = (
        ('name', "h1", {"class": "object-2019__header-headline"}),
        ('price', "div", {"class": "object-2019__header-price"}),
        ('how_selled', "div", {"class": "object-2019__header-price-unit"}),
        ('metro', "span", {"class": "object__header-metro-name"}),
        ('address', "a", {"class": "object-2019__header-address", "href": "#map-yandex"}),
        ('date', "div", {"class": "object__id"}),
        ('author', "div", {"class": "object__user"}),
        ('characteristics', "div", {"class": "object__param-item-value"}),
        ('desc', "div", {"class": "object__comment"}),
        ('close_metro', "span", {"class": "object__transport-metro-name"}),
        ('close_railway', "span", {"class": "object__transport-station-name"}),
        ('transport_dist', "span", {"class": "object__transport-distance"}),
    )
    return {field: soup.find_all(tag_name, attrs) for field, tag_name, attrs in selectors}


def parse_flats(list_of_ids):
    """Take a list of listing ids and return flat information as a dict of dicts.

    Detail pages are downloaded in batches of 100 on the shared thread pool;
    each page goes through ``parse_flat_page`` and ``delete_tags``.
    """
    flats_by_id = {}
    for batch in divide_chunks(list_of_ids, 100):
        detail_urls = [f'https://www.bn.ru/detail/flats/{flat_id}/' for flat_id in batch]
        responses = pool.map(requests.get, detail_urls)
        for flat_id, response in zip(batch, responses):
            page = BeautifulSoup(response.text, 'html.parser')
            flats_by_id[flat_id] = delete_tags(parse_flat_page(page))
    return flats_by_id


def divide_chunks(l, n):
    """Lazily yield consecutive slices of ``l`` of length ``n`` (the last may be shorter)."""
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)


def _first_content(tag):
    """Return ``str()`` of the tag's first child, or None if the tag is missing or empty."""
    if tag is None or len(tag.contents) == 0:
        return None
    return str(tag.contents[0])


def _first_contents(tags):
    """Apply ``_first_content`` to every tag; None (not an empty list) when there are no tags."""
    if not tags:
        return None
    return [_first_content(tag) for tag in tags]


def _head(tags):
    """Return the first tag of a result list, or None when the list is empty."""
    return tags[0] if tags else None


def delete_tags(dicti):
    """Convert the raw tag lists from ``parse_flat_page`` into plain values.

    Args:
        dicti: dict with the keys produced by ``parse_flat_page``; every value
            is a (possibly empty) list of tags.

    Returns:
        dict with the keys 'address', 'company', 'agent', 'characteristics',
        'close_metro', 'close_railway', 'date', 'desc', 'how_selled', 'metro',
        'name', 'price' and 'transport_dist'. Every key is always present;
        a value is None when the page did not contain the element.
        ('company'/'agent' used to be omitted when there was no author block,
        and single-value fields used to raise on empty tags.)
    """
    result = dict()
    # address
    result['address'] = _first_content(_head(dicti['address']))
    # author
    author = _head(dicti['author'])
    if author is None:
        result['company'] = None
        result['agent'] = None
    else:
        result['company'] = _first_content(author.find('span'))
        agent = None
        # NOTE(review): str(author) is the tag's full markup, so this comparison
        # with 'частное' is unlikely to ever match; it is kept as-is because the
        # intended check cannot be determined from this file.
        if str(author) != 'частное':
            links = author.find_all('a')
            # The agent name is taken from the second link, and only when
            # there are exactly two links in the author block.
            if len(links) == 2:
                agent = _first_content(links[1])
        result['agent'] = agent
    # char
    result['characteristics'] = _first_contents(dicti['characteristics'])
    # close_metro
    result['close_metro'] = _first_contents(dicti['close_metro'])
    # close_railway
    result['close_railway'] = _first_contents(dicti['close_railway'])
    # date
    date_tag = _head(dicti['date'])
    result['date'] = _first_content(date_tag.find('span')) if date_tag is not None else None
    # desc: all children of the comment block, concatenated as markup/text
    desc_tag = _head(dicti['desc'])
    if desc_tag is not None and len(desc_tag.contents) != 0:
        result['desc'] = ''.join(str(piece) for piece in desc_tag.contents)
    else:
        result['desc'] = None
    # how_selled: the value is the third child of the price-unit block
    how_selled_tag = _head(dicti['how_selled'])
    if how_selled_tag is not None and len(how_selled_tag.contents) > 2:
        result['how_selled'] = str(how_selled_tag.contents[2])
    else:
        result['how_selled'] = None
    # metro
    result['metro'] = _first_content(_head(dicti['metro']))
    # name
    result['name'] = _first_content(_head(dicti['name']))
    # price
    result['price'] = _first_content(_head(dicti['price']))
    # transport_dist
    result['transport_dist'] = _first_contents(dicti['transport_dist'])
    return result


def normal_round(n):
    """Round to the nearest integer, with a fractional part of exactly .5 going up (toward +inf)."""
    base = math.floor(n)
    fraction = n - base
    if fraction < 0.5:
        rounded = base
    else:
        rounded = math.ceil(n)
    return rounded


def send_requests(list_of_ids):
    """Request the detail page of every id and return ``{id: HTTP status code}``.

    Pages are requested in batches of 100 on the shared thread pool.
    """
    status_by_id = {}
    for batch in divide_chunks(list_of_ids, 100):
        detail_urls = [f'https://www.bn.ru/detail/flats/{flat_id}/' for flat_id in batch]
        responses = pool.map(requests.get, detail_urls)
        for flat_id, response in zip(batch, responses):
            status_by_id[flat_id] = response.status_code
    return status_by_id

In [32]:
# Load the accumulated flats database (dict keyed by listing id) from Drive.
# NOTE(review): this cell's execution count (32) is later than the save cell's,
# and it reads the same 'full_db(18-05-21).json' that the save cell writes.
# On a fresh Run All this would load an already-updated snapshot; confirm
# which file is the intended input.
# NOTE(review): `dir` shadows the builtin of the same name.
dir = 'drive/My Drive/Colab Notebooks/CSC/Projects/REZ_8/'
with open(dir + 'full_db(18-05-21).json') as json_file:
    flats = json.load(json_file)

In [None]:
# Recompute the price intervals from the live site (issues many sequential
# requests); the next cell assigns a hard-coded list to the same name.
price_bounds7 = calc_borders()

In [6]:
# Hard-coded price intervals as (lower, upper, listing count) triples, in the
# same shape calc_borders() returns; the counts are a past snapshot.
price_bounds7 = [(0, 2899999, 478), (2900000, 3599999, 433), (3600000, 3899999, 366), (3900000, 4099999, 283), (4100000, 4299999, 331), 
                 (4300000, 4599999, 450), (4600000, 4799999, 320), (4800000, 4999999, 346), (5000000, 5199999, 303), (5200000, 5399999, 424), 
                 (5400000, 5499999, 248), (5500000, 5699999, 394), (5700000, 5899999, 301), (5900000, 6099999, 377), (6100000, 6299999, 363), 
                 (6300000, 6499999, 326), (6500000, 6699999, 404), (6700000, 6899999, 277), (6900000, 7199999, 414), (7200000, 7499999, 390), 
                 (7500000, 7799999, 406), (7800000, 8099999, 392), (8100000, 8499999, 362), (8500000, 8899999, 425), (8900000, 9399999, 471), 
                 (9400000, 9899999, 488), (9900000, 10499999, 382), (10500000, 11299999, 459), (11300000, 11899999, 402), (11900000, 12599999, 474), 
                 (12600000, 13799999, 486), (13800000, 15399999, 484), (15400000, 17799999, 490), (17800000, 19999999, 325), (20000000, 24999999, 397), 
                 (25000000, 29999999, 277), (30000000, 39999999, 329), (40000000, 5785000000, 431)]

In [7]:
# Time a validation of the intervals against the live site.
# NOTE(review): all_ok is not consulted by any later cell visible here, so a
# failed check does not stop the run.
start_time = time.time()
all_ok = check_bounds(price_bounds7)
print("--- %s seconds ---" % (time.time() - start_time))

Counted:  14172  Total:  14158
--- 58.33199191093445 seconds ---


In [8]:
# Scrape today's catalog-level info for every interval (forward and reverse
# pass) and time it. Keys of ids_info are the id strings as shown in the
# catalog; later cells split them on a space and build them as 'id ' + id.
start_time = time.time()
ids_info = collect_info(price_bounds7)
print("--- %s seconds ---" % (time.time() - start_time))

(0, 2899999)
(2900000, 3599999)
(3600000, 3899999)
(3900000, 4099999)
(4100000, 4299999)
(4300000, 4599999)
(4600000, 4799999)
(4800000, 4999999)
(5000000, 5199999)
(5200000, 5399999)
(5400000, 5499999)
(5500000, 5699999)
(5700000, 5899999)
(5900000, 6099999)
(6100000, 6299999)
(6300000, 6499999)
(6500000, 6699999)
(6700000, 6899999)
(6900000, 7199999)
(7200000, 7499999)
(7500000, 7799999)
(7800000, 8099999)
(8100000, 8499999)
(8500000, 8899999)
(8900000, 9399999)
(9400000, 9899999)
(9900000, 10499999)
(10500000, 11299999)
(11300000, 11899999)
(11900000, 12599999)
(12600000, 13799999)
(13800000, 15399999)
(15400000, 17799999)
(17800000, 19999999)
(20000000, 24999999)
(25000000, 29999999)
(30000000, 39999999)
(40000000, 5785000000)
(40000000, 5785000000)
(30000000, 39999999)
(25000000, 29999999)
(20000000, 24999999)
(17800000, 19999999)
(15400000, 17799999)
(13800000, 15399999)
(12600000, 13799999)
(11900000, 12599999)
(11300000, 11899999)
(10500000, 11299999)
(9900000, 10499999)
(94000

In [9]:
# Bare listing ids seen today: the second space-separated token of each
# ids_info key (later cells rebuild the key as 'id ' + id).
today_ids = set(i.split(' ')[1] for i in ids_info.keys())

In [10]:
# Number of distinct listings seen in today's catalog scrape.
len(today_ids)

14125

In [12]:
# Ids present in the stored database but absent from today's catalog scrape.
to_check = list(set(flats.keys()) - today_ids)

In [13]:
# Number of stored listings that did not appear in today's catalog scrape.
len(to_check)

10300

In [14]:
# Request the detail page of every missing listing; maps id -> HTTP status code.
checked_ids = send_requests(to_check)

In [15]:
# Missing-from-catalog listings whose detail page still answers HTTP 200.
existing_past_ids = [key for key in checked_ids.keys() if checked_ids[key] == 200]

In [16]:
# Number of stored listings whose detail page still responds with 200.
len(existing_past_ids)

177

In [17]:
# Ids seen in today's catalog scrape that are not yet in the stored database.
new_ids = list(today_ids - set(flats.keys()))

In [18]:
# Number of listings that are new today.
len(new_ids)

309

In [19]:
# Download and parse detail pages: for stored listings that are still
# reachable, and for listings that are new today.
parsed_past = parse_flats(existing_past_ids)
parsed_new = parse_flats(new_ids)

In [21]:
# Append today's observation to every flat's history lists and add new flats.
# The snapshot date is defined once here instead of being repeated in every
# append; it has to match the date used in the file names of the save cells.
# NOTE: this cell is not idempotent - re-running it appends a second entry
# for the same date to every history list.
SNAPSHOT_DATE = '18-05-21'

# (history key in flats, field in an ids_info record, field in a parsed_past
# record or None when the detail page does not provide that field)
HISTORY_SOURCES = (
    ('update_history', 'dates', 'date'),
    ('info1_history', 'info1', None),
    ('info2_history', 'info2', None),
    ('price_history', 'price', 'price'),
    ('address_history', 'address', 'address'),
)

counter1 = 0  # flats found in today's catalog scrape
counter2 = 0  # flats missing from the catalog whose detail page was parsed
counter3 = 0  # flats not found at all today
for flat_id in flats.keys():
    key = 'id ' + flat_id
    if key in ids_info:
        counter1 += 1
        for history_key, catalog_field, _ in HISTORY_SOURCES:
            flats[flat_id][history_key].append((ids_info[key][catalog_field], SNAPSHOT_DATE))
    elif flat_id in parsed_past:
        counter2 += 1
        for history_key, _, detail_field in HISTORY_SOURCES:
            value = parsed_past[flat_id][detail_field] if detail_field is not None else None
            flats[flat_id][history_key].append((value, SNAPSHOT_DATE))
    else:
        counter3 += 1
        for history_key, _, _ in HISTORY_SOURCES:
            flats[flat_id][history_key].append((None, SNAPSHOT_DATE))
# New flats: store the parsed detail record and start each history with
# today's catalog values.
for flat_id in parsed_new.keys():
    key = 'id ' + flat_id
    flats[flat_id] = parsed_new[flat_id]
    for history_key, catalog_field, _ in HISTORY_SOURCES:
        flats[flat_id][history_key] = [(ids_info[key][catalog_field], SNAPSHOT_DATE)]

In [26]:
# Sanity check after the merge: every history list is non-empty, holds at most
# 34 entries, and its last entry carries today's snapshot date.
for id in flats.keys():
    record = flats[id]
    for history_key in ('update_history', 'info1_history', 'info2_history', 'price_history', 'address_history'):
        assert 0 < len(record[history_key]) <= 34
    for history_key in ('update_history', 'info1_history', 'info2_history', 'price_history', 'address_history'):
        assert record[history_key][-1][1] == '18-05-21'

In [27]:
# Spot-check the histories of up to 200 randomly chosen flats.
# random.sample needs a sequence: passing a dict keys view is deprecated since
# Python 3.9 and raises TypeError from 3.11, so the keys are copied to a list.
# The sample size is capped so that a small database does not raise ValueError.
sample_size = min(200, len(flats))
for flat_id in random.sample(list(flats.keys()), sample_size):
    print(flat_id, flats[flat_id]['update_history'])
    print(flat_id, flats[flat_id]['info1_history'])
    print(flat_id, flats[flat_id]['info2_history'])
    print(flat_id, flats[flat_id]['price_history'])
    print(flat_id, flats[flat_id]['address_history'])
    print('-------')

3647019 [['30.03.2021', '14-04-21'], ['30.03.2021', '15-04-21'], ['30.03.2021', '16-04-21'], ['30.03.2021', '17-04-21'], ['30.03.2021', '18-04-21'], ['30.03.2021', '19-04-21'], ['30.03.2021', '20-04-21'], ['30.03.2021', '21-04-21'], ['30.03.2021', '22-04-21'], ['30.03.2021', '23-04-21'], ['30.03.2021', '24-04-21'], ['30.03.2021', '25-04-21'], ['30.03.2021', '26-04-21'], ['30.03.2021', '27-04-21'], ['30.03.2021', '28-04-21'], ['30.03.2021', '29-04-21'], ['30.03.2021', '30-04-21'], ['30.03.2021', '01-05-21'], ['30.03.2021', '02-05-21'], ['30.03.2021', '03-05-21'], ['30.03.2021', '05-05-21'], ['30.03.2021', '06-05-21'], ['30.03.2021', '07-05-21'], ['30.03.2021', '08-05-21'], ['30.03.2021', '09-05-21'], ['30.03.2021', '10-05-21'], ['30.03.2021', '11-05-21'], ['30.03.2021', '12-05-21'], ['30.03.2021', '13-05-21'], ['30.03.2021', '14-05-21'], ['30.03.2021', '15-05-21'], ['30.03.2021', '16-05-21'], ['30.03.2021', '17-05-21'], ('30.03.2021', '18-05-21')]
3647019 [[None, '14-04-21'], [None, '15

In [None]:
# Show the ids of the listings that were added today.
parsed_new.keys()

dict_keys(['3697396', '3697369', '3697152', '3697402', '3697397', '3697188', '3697348', '3697370', '3697181', '3472617', '3697416', '3697352', '3697400', '3697362', '3697272', '3697298', '3697389', '3697154', '3697363', '3697415', '3697419', '3697387', '3697347', '3697143', '3668674', '3697190', '3697290', '3697289', '3697417', '3697180', '3697182', '3697334', '3697271', '3697361', '3697147', '3697183', '3697067', '3697355', '3697299', '3697356', '3697276', '3697296', '3697386', '3697297', '3697279', '3697153', '3696606', '3697327', '3697338', '3697146', '3697288', '3697179', '3697332', '3697295', '3697155', '3697359', '3697291', '3697283', '3697328', '3697398', '3697357', '3697176', '3697371', '3697187', '3697294', '3697275', '3697351', '3697358', '3697368', '3697278', '3697391', '3697360', '3697353', '3697376', '3697354', '3697274', '3697156', '3697273', '3697329', '3697337', '3697399', '3697390', '3697375', '3697388', '3697293', '3697412', '3697372', '3697401', '3697277', '3697292',

In [28]:
# Inspect the full stored record of one flat by id.
flats['3689147']

{'address': 'Санкт-Петербург, Красносельский район, Ленинский проспект, 55 2',
 'address_history': [['Ленинский проспект, 55 2', '29-04-21'],
  ['Ленинский проспект, 55 2', '30-04-21'],
  ['Ленинский проспект, 55 2', '01-05-21'],
  ['Ленинский проспект, 55 2', '02-05-21'],
  ['Ленинский проспект, 55 2', '03-05-21'],
  ['Ленинский проспект, 55 2', '05-05-21'],
  ['Ленинский проспект, 55 2', '06-05-21'],
  ['Ленинский проспект, 55 2', '07-05-21'],
  ['Ленинский проспект, 55 2', '08-05-21'],
  ['Ленинский проспект, 55 2', '09-05-21'],
  ['Ленинский проспект, 55 2', '10-05-21'],
  ['Ленинский проспект, 55 2', '11-05-21'],
  ['Ленинский проспект, 55 2', '12-05-21'],
  ['Ленинский проспект, 55 2', '13-05-21'],
  ['Ленинский проспект, 55 2', '14-05-21'],
  ['Ленинский проспект, 55 2', '15-05-21'],
  ['Ленинский проспект, 55 2', '16-05-21'],
  ['Ленинский проспект, 55 2', '17-05-21'],
  ('Ленинский проспект, 55 2', '18-05-21')],
 'agent': None,
 'characteristics': ['55.90 кв.м.',
  '29.00 кв.м

In [29]:
# Write the updated flats database to Drive as JSON.
# NOTE(review): this is the same path the load cell reads, so the input
# snapshot is overwritten; `dir` also shadows the builtin of the same name.
dir = 'drive/My Drive/Colab Notebooks/CSC/Projects/REZ_8/'
with open(dir + 'full_db(18-05-21).json', 'w') as f:
    json.dump(flats, f)

In [30]:
# Write today's raw catalog-level scrape (ids_info) to Drive as JSON.
dir = 'drive/My Drive/Colab Notebooks/CSC/Projects/REZ_8/'
with open(dir + 'ids_info(18-05-21).json', 'w') as f:
    json.dump(ids_info, f)

In [31]:
# Total number of flats in the database.
len(flats)

24425

In [None]:
#today_ids = set(i.split(' ')[1] for i in ids_info.keys())

In [None]:
#to_check = list(set(flats.keys()) - today_ids)

In [None]:
#print(len(to_check))

8194
