__собираем статистики по объявлениям__

_Evgeny S. Borisov <parser@mechanoid.su>_

In [1]:
# import re
import numpy as np
# from tqdm.notebook import tqdm
import pandas as pd
import geopandas as gpd
# import contextily as ctx
from shapely.geometry import Polygon

In [2]:
# Notebook-wide pandas display settings.
pd.options.display.max_colwidth = None  # do not truncate long text cells
pd.options.display.float_format = '{:.2f}'.format  # render floats with two decimals
# tqdm.pandas()

In [3]:
# !ls -1 data

## считываем данные

In [4]:
# place = 'sudak'
# place = 'sevastopol'
# place = 'bakchisaray'

In [5]:
# Path to the pickled listings table that is loaded with pd.read_pickle.
data_file = 'data/data_house.pkl'

In [6]:
# Columns shown in the quick previews of the listings table.
cols = ['title','adr','latitude','longitude','priceM','ts']
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle(data_file)
# Calendar day of each record: truncate the timestamp to midnight with the
# vectorized accessor instead of round-tripping through Python `date` objects.
df['dt'] = df['ts'].dt.normalize()
print(len(df))
df.sample(3)[cols]

2432


Unnamed: 0,title,adr,latitude,longitude,priceM,ts
1433,"дом 80 м² на участке 9,9 сот.","Охотская ул., 34",44.59,33.53,6.2,2022-09-15 11:53:29.976
1302,дом 204 м² на участке 4 сот.,"Севастополь, улица Мечникова",44.6,33.51,39.0,2022-09-15 11:53:29.976
863,"дом 86,1 м² на участке 6 сот.","ул. Сладкова, 10",44.6,33.48,13.0,2022-09-15 11:53:29.976


In [None]:
# Print the frame's column names, dtypes and non-null counts.
df.info()

In [None]:
# Show three random rows, restricted to the preview columns listed in `cols`.
df.sample(3)[cols]

In [None]:
# Keep only listings that carry a geotag. Both coordinates are required,
# because the point geometry is later built from longitude AND latitude.
print(len(df))
df = df.dropna(subset=['latitude','longitude']).reset_index(drop=True)
print(len(df))

In [None]:
# Show two random rows with every column.
df.sample(2)

In [None]:
# # берём самые свежие объявления
# print(len(df))
# df = df[ (df['ts']==df['ts'].max()) ].reset_index(drop=True)
# print(len(df))

# cols = ['title','adr','latitude','longitude','priceM']
# df[cols]

### ограничиваем тип объекта 

In [None]:
print(len(df))
# Drop "defective" listings: exclude rows flagged is_part, is_SNT or is_townhouse,
# and keep only rows with house_area > 40 and priceM < 15
# (presumably square metres and millions of currency units — TODO confirm).

df = df.query('~(is_part|is_SNT|is_townhouse) & (house_area>40.) & (priceM<15.)')

print(len(df))

In [None]:
# Turn the listings table into a GeoDataFrame with one point per listing (x = longitude, y = latitude).
point_array = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf = gpd.GeoDataFrame(df, geometry=point_array, crs='epsg:4326')
# Preview three random rows without the geometry column.
preview_cols = ['title','adr','latitude','longitude','priceM']
gdf.sample(3)[preview_cols]

In [None]:
# Remove the plain DataFrame name; the same rows stay reachable through `gdf`.
del df

In [None]:
# gdf[['title','priceM','area_size_category','geometry']].sample(100).explore('area_size_category', legend=True) 
# Interactive map of a random subset of listings, coloured by house size category.
# The sample size is capped at the number of available rows, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
(
    gdf[['title','priceM','house_size_category','geometry']]
    .sample(min(100, len(gdf)))
    .explore('house_size_category', legend=True)
)

## общая статистика изменения цены

In [None]:
ts = '2022-08-01' # date cutoff; rows are later filtered with gdf['dt'] > ts

In [None]:
# Price distribution per house-size category and day, ignoring listings with priceM <= 1.
priced_listings = gdf.query('priceM>1.')
price_by_size_and_day = priced_listings.groupby(['house_size_category','dt'])['priceM']
stat = price_by_size_and_day.describe(percentiles=[.1,.25,.5,.75,.9])
stat

In [None]:
# stat.loc['30-50',['min','25%','50%']].plot(grid=True, title='цена на маленькие дома')

In [None]:
# stat.loc['50-70',['min','25%','50%']].plot(grid=True, title='цена на средние дома')

In [None]:
# stat.loc['70-150',['min','25%','50%']].plot(grid=True, title='цена на большие дома')

## гексагональная сетка

In [None]:
from shapely.geometry import MultiPoint

# Convex hull around every listing point; used as the area to cover with the grid.
hull_points = list(gdf['geometry'])
frame_geometry = MultiPoint(hull_points).convex_hull
frame_geometry

In [None]:
import h3
    
def cover_grid_hexagonal(gmtr,cell_size=7,crs='EPSG:4326'):
    """Cover a geometry with H3 hexagons and return them as a GeoDataFrame.

    Parameters
    ----------
    gmtr : object exposing ``__geo_interface__``
        Area to cover; its GeoJSON-like mapping is handed to ``h3.polyfill``.
    cell_size : int, default 7
        Passed to ``h3.polyfill`` as its second (resolution) argument.
    crs : str, default 'EPSG:4326'
        CRS assigned to the resulting GeoDataFrame.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``hex_id`` and ``geometry``, one Polygon per hexagon.
    """
    # NOTE(review): `polyfill` / `h3_to_geo_boundary` look like h3-py v3 names —
    # confirm the pinned h3 version before upgrading.
    hex_ids = list(h3.polyfill(gmtr.__geo_interface__, cell_size, geo_json_conformant=True))
    hex_polygons = [
        Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True))
        for hex_id in hex_ids
    ]
    return gpd.GeoDataFrame({'hex_id': hex_ids, 'geometry': hex_polygons}, crs=crs)

# grid = cover_grid_hexagonal( frames.query('area_id in (5,)').geometry.values[0] )
# Build the hexagonal grid over the convex hull of all listing points,
# then report the number of cells and preview two random ones.
grid = cover_grid_hexagonal( frame_geometry )
print(len(grid))
grid.sample(2)

In [None]:
# grid.explore('hex_id')

In [None]:
# ctx.add_basemap( 
#     grid.plot(alpha=.5,color='yellow',edgecolors='blue',figsize=(31,31),), 
#     source=ctx.providers.OpenStreetMap.Mapnik,
#     crs=grid.crs, 
#     zoom=16,
# )

### медиана актуальной цены

In [None]:
# Median price per hexagon, using only listings newer than the `ts` cutoff
# and with priceM below 15.
recent_listings = gdf[(gdf['dt'] > ts) & (gdf['priceM'] < 15.)]
listings_per_hex = grid.sjoin(recent_listings, how='inner')
grid_values = listings_per_hex.groupby(['hex_id'])['priceM'].median().reset_index()

# Attach the medians to the hexagon geometries and colour the map by price.
grid.merge(grid_values, on='hex_id').explore('priceM') #'area_name', legend=True)

In [None]:
# import folium
# start_pos = (44.578330, 33.516509)
# # Create a map
# my_map = folium.Map(location=start_pos, zoom_start=14,)

# # Add the data
# folium.Choropleth(
#     name='choropleth',
#     geo_data=grid,
#     data=grid_values,
#     columns=['hex_id','priceM',],
#     key_on='feature.properties.hex_id',
#     fill_color='YlOrRd',
#     fill_opacity=.7,
#     line_opacity=.2,
#     legend_name='priceM median'
# ).add_to(my_map)
# my_map

In [None]:
### количество предложений

In [None]:
# %%javascript
# IPython.OutputArea.prototype._should_scroll = function(lines) {  return false; }

In [None]:
# # количество предложений
# from folium import Map
# from folium.plugins import HeatMap

# start_pos = (44.578330, 33.516509)

# Map(
#     location=start_pos,
#     zoom_start=12,
#     # width=500,
#     #height=2000,
# ).add_child( 
#     HeatMap( 
#         gdf[ gdf['dt']>ts ]
#         .sjoin(frames.query('area_id in (5,)'),how='inner',predicate='within')
#         [['latitude','longitude',]],
#         #radius=10.,
#         #min_opacity=3. 
#     )
# )


In [None]:
# df = pd.read_pickle(data_file)
# df['priceMU'] = df['priceM']/df['area']

# area_bins = [ 0., 1., 2., 4., 8., 20., 1e6, ]
# labels =    [ '<1', '1-2','2-4', '4-8', '8-20', '20+' ]
# df['area_size_category'] = pd.cut( df['area'], bins = area_bins, labels=labels)

# df= df.drop(columns=['area_cut'])

# df.to_pickle(data_file)
# df[['title','adr','area','area_size_category','price','priceM','priceMU']]

In [None]:
## ограничиваем район поиска

In [None]:
# # загружаем области поиска
# frames_path = 'data/frames/'
# frames_index = pd.read_csv(f'{frames_path}/_index.tsv',sep='\t')
# # frames_index

# swap_coo = lambda coo : [ (c[1],c[0]) for c in coo ]
# df2poly = lambda df : Polygon(swap_coo(df.values))

# frames = gpd.GeoDataFrame([ 
#     { 'area_name':nm, 'geometry': df2poly( pd.read_csv(f'{frames_path}/{f}',header=None) ) } 
#     for nm,f in frames_index.values
# ],crs='epsg:4326',)

# frames['area_id'] = range(len(frames))

# frames

In [None]:
# frames.plot('area_name', legend=True, cmap='GnBu', alpha=.5, edgecolor='k',figsize=(10,10),)  

In [None]:
# # фильтруем по области
# print(len(gdf))
# gdf_in_frame = gdf.sjoin( frames.query('area_id in (5,)'), how='inner', predicate='within') 
# print(len(gdf_in_frame))

In [None]:
# gdf_in_frame[['title','priceM','adr','area_size_category','geometry']].explore('area_size_category')
#,legend=False)

In [None]:
# ctx.add_basemap(
#     gdf_in_frame[ (gdf_in_frame['dt']==gdf_in_frame['dt'].max()) ]
#       .plot(figsize=(21,21), color='r',marker='*'), 
#     crs=gdf.crs, 
#     zoom=16,
#     # source=ctx.providers.Stamen.TonerLite,
#     source=ctx.providers.OpenStreetMap.Mapnik,
# )

In [None]:
### изменение цены

In [None]:
# stat = (
#     gdf_in_frame.query('priceM>1.')
#     .groupby(['area_size_category','dt'])
#     ['priceM'].describe(percentiles=[.1,.25,.5,.75,.9])
# )
# stat

In [None]:
# stat.loc['2-4',['min','25%','50%']].plot(grid=True, title='цена на 2-4 сот')

In [None]:
# stat.loc['4-8',['min','25%','50%']].plot(grid=True, title='цена на 4-8 сот')

In [None]:
# stat.loc['8-20',['min','25%','50%']].plot(grid=True, title='цена на 8-20 сот')