__применение парсера объявлений авито.ru__

_Evgeny S. Borisov <parser@mechanoid.su>_

In [1]:
# !pacman -S firefox firefox-i18n-r  geckodriver
# !pip install seleniuam

In [2]:
# import re
import sys
import logging
from datetime import datetime as dtm
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.2f}'.format)
tqdm.pandas()

logging.basicConfig(
        format=u'[%(levelname)-8s] %(asctime)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        # level=logging.DEBUG,
        stream=sys.stdout,
    )

---

In [5]:
# url = (
# 'https://www.avito.ru/yaroslavskaya_oblast/avtomobili/chevrolet/niva'
# '?bt=0'
# '&i=1'
# '&pmax=150000'
# '&pmin=10000'
# )

In [6]:
profile_path = '/home/mechanoid/.mozilla/firefox/p144xo2m.default-release'

base_url = 'https://www.avito.ru/'
avito_path = 'sevastopol/kvartiry/prodam'

raw_data_path = 'data/raw/' # каталог с файлами объявлений
loc_file_path = 'data/location.pkl' # таблица адресов
result_file_path = 'data/data.pkl'

In [7]:
ts = dtm.now().strftime('%Y-%m-%d_%H-%M')
ts

'2022-09-06_20-08'

## качаем новые данные

In [None]:
%%time 

from lib.downloader import DownloaderSeleniumFirefox
from lib.avito import AvitoDownloaderRealty

df = AvitoDownloaderRealty(
        driver=DownloaderSeleniumFirefox(profile_path)
    ).load(avito_path, show_pbar=True, )

print(len(df))

In [None]:
assert len(df)>0
df.to_excel(f'{raw_data_path}/avito_{ts}_raw.xlsx',index=False)

In [None]:
df.sample(2)

## собираем и чистим данные

In [8]:
import re
from os import listdir

raw_data_files = sorted([ raw_data_path+'/'+f for f in listdir(raw_data_path) if re.match(r'.+\.xlsx$',f) ])
raw_data_files

['data/raw//avito_2022-08-26_17-29_raw.xlsx',
 'data/raw//avito_2022-08-29_13-16_raw.xlsx',
 'data/raw//avito_2022-08-31_17-42_raw.xlsx',
 'data/raw//avito_2022-09-01_14-07_raw.xlsx',
 'data/raw//avito_2022-09-02_13-04_raw.xlsx',
 'data/raw//avito_2022-09-05_14-02_raw.xlsx',
 'data/raw//avito_2022-09-06_13-09_raw.xlsx']

In [9]:
from lib.avito import AvitoDataCleanerRealty

data = pd.concat(
        [ pd.read_excel(f) for f in raw_data_files ]
    ).drop_duplicates().reset_index(drop=True)

data = AvitoDataCleanerRealty.transform( data )

print(len(data))

25677


In [10]:
assert len(data)>0

In [11]:
data.info(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25677 entries, 0 to 25676
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   avito_id       25677 non-null  int64         
 1   title          25677 non-null  object        
 2   price          25677 non-null  int64         
 3   adr            25677 non-null  object        
 4   description    25657 non-null  object        
 5   obj_name       16754 non-null  object        
 6   ts             25677 non-null  datetime64[ns]
 7   avito_page     25677 non-null  int64         
 8   nrooms         25677 non-null  int64         
 9   floor          25677 non-null  int64         
 10  nfloors        25677 non-null  int64         
 11  area           25677 non-null  float64       
 12  is_studio      25677 non-null  bool          
 13  is_apartment   25677 non-null  bool          
 14  is_part        25677 non-null  bool          
 15  is_auction     2567

### обновляем таблицу адресов

In [12]:
from lib.locator import LocationUpdater
from lib.locator import AddressTransformerSev

loc = LocationUpdater(
        address_transformer=AddressTransformerSev(),
    ).transform(
        adr=data['adr'],
        loc=pd.read_pickle(loc_file_path),
        show_pbar=True,
    )

[INFO    ] 2022-09-06 20:09:15 | LocationUpdater: 1706 addresses in location table
[INFO    ] 2022-09-06 20:09:15 | LocationUpdater: 1882 addresses total
[INFO    ] 2022-09-06 20:09:15 | LocationUpdater: 1706 addresses defined
[INFO    ] 2022-09-06 20:09:15 | LocationUpdater: 176 addresses undefined


  0%|          | 0/176 [00:00<?, ?it/s]

[INFO    ] 2022-09-06 20:11:32 | LocationUpdater: 131 new addresses found


In [13]:
loc.sample(3)

Unnamed: 0,adr,latitude,longitude,truncated
1191,"село Орловка, Качинское шоссе , 33/3",44.73,33.58,True
979,"улица Комбрига Потапова, 29В",44.57,33.47,False
572,"село Орловка, Качинское шоссе , 35/41",44.73,33.58,True


In [14]:
assert len(loc)>0
loc.to_pickle(loc_file_path)

### дополняем данные геометкой

In [15]:
data = data.merge(loc[['adr','latitude','longitude',]],on=['adr'],how='left')

print('всего записей:', len( data) )
print('записей без геометки:', len( data[ data['latitude'].isnull() ] ) )

всего записей: 25677
записей без геометки: 1200


In [16]:
assert len(data)>0
data.to_pickle(result_file_path)

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25677 entries, 0 to 25676
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   avito_id       25677 non-null  int64         
 1   title          25677 non-null  object        
 2   price          25677 non-null  int64         
 3   adr            25677 non-null  object        
 4   description    25657 non-null  object        
 5   obj_name       16754 non-null  object        
 6   ts             25677 non-null  datetime64[ns]
 7   avito_page     25677 non-null  int64         
 8   nrooms         25677 non-null  int64         
 9   floor          25677 non-null  int64         
 10  nfloors        25677 non-null  int64         
 11  area           25677 non-null  float64       
 12  is_studio      25677 non-null  bool          
 13  is_apartment   25677 non-null  bool          
 14  is_part        25677 non-null  bool          
 15  is_auction     2567

In [None]:
# for f in raw_data_files:
#     ts = dtm.strptime( re.sub(r'.*/avito_','',f), '%Y-%m-%d_%H-%M_raw.xlsx')
#     df = pd.read_excel(f)
#     df['ts'] = ts
#     df.to_excel(f,index=False)
#     print(ts)