__собираем статистики по объявлениям__

_Evgeny S. Borisov <parser@mechanoid.su>_

In [1]:
# import re
import numpy as np
# from tqdm.notebook import tqdm
import pandas as pd
import geopandas as gpd
# import contextily as ctx
from shapely.geometry import Polygon

In [2]:
# Display settings: never truncate long text cells, render floats with two decimals.
pd.options.display.max_colwidth = None
pd.options.display.float_format = '{:.2f}'.format
# tqdm.pandas()

In [3]:
# !ls -1 data

## считываем данные

In [4]:
# place = 'sudak'
# place = 'sevastopol'
# place = 'bakchisaray'

In [5]:
# Pickled listings table, read below with pd.read_pickle (path is relative to the working directory).
data_file = 'data/data_house.pkl'

In [6]:
# Columns shown in the quick previews of the listings table.
cols = ['title','adr','latitude','longitude','priceM','ts']

# NOTE(review): unpickling can execute arbitrary code; only load files produced by a trusted pipeline.
df = pd.read_pickle(data_file)

# Calendar day of the `ts` timestamp. normalize() truncates to midnight and keeps the
# datetime64 dtype, avoiding a round trip through Python `date` objects.
df['dt'] = df['ts'].dt.normalize()

print(len(df))
df.sample(3)[cols]

2821


Unnamed: 0,title,adr,latitude,longitude,priceM,ts
2545,дом 70 м² на участке 6 сот.,"Бахчисарай, пос. городского типа Куйбышево, ул. Ленина, 36",44.63,33.87,3.45,2022-09-15 14:42:27.254
707,"коттедж 183 м² на участке 3,5 сот.","Севастополь, ул. Николая Кисляка, 32А",44.58,33.58,18.0,2022-09-15 11:53:29.976
825,коттедж 200 м² на участке 5 сот.,"Севастополь, садоводческое товарищество Радар-С, 6-я Радарная ул.",44.58,33.58,21.0,2022-09-15 11:53:29.976


In [7]:
# Column names, dtypes and non-null counts of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2821 entries, 0 to 2820
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   avito_id             2821 non-null   int64         
 1   title                2821 non-null   object        
 2   price                2821 non-null   int64         
 3   obj_name             2821 non-null   object        
 4   adr                  2821 non-null   object        
 5   description          2821 non-null   object        
 6   avito_page           2821 non-null   int64         
 7   ts                   2821 non-null   datetime64[ns]
 8   place                2821 non-null   object        
 9   house_area           2821 non-null   float64       
 10  is_part              2821 non-null   bool          
 11  is_townhouse         2821 non-null   bool          
 12  is_SNT               2821 non-null   bool          
 13  land_area            2821 non-nul

In [8]:
# Three random rows, restricted to the preview columns listed in `cols`.
df.sample(3)[cols]

Unnamed: 0,title,adr,latitude,longitude,priceM,ts
2346,дом 45 м² на участке 6 сот.,"Севастополь, садоводческое товарищество Дружный",44.57,33.49,5.0,2022-09-15 11:53:29.976
154,дом 70 м² на участке 10 сот.,"Севастополь, с. Береговое",44.9,33.62,5.0,2022-09-15 11:53:29.976
2010,"дом 191 м² на участке 9,5 сот.","Севастополь, аэродром Херсонес",44.57,33.4,16.0,2022-09-15 11:53:29.976


In [9]:
# Keep only listings with a complete geotag: both coordinates are needed
# later to build point geometries, so a row missing either one is dropped.
print(len(df))
df = df.dropna(subset=['latitude', 'longitude']).reset_index(drop=True)
print(len(df))

2821
2821


In [10]:
# df.sample(2)

In [11]:
# # берём самые свежие объявления
# print(len(df))
# df = df[ (df['ts']==df['ts'].max()) ].reset_index(drop=True)
# print(len(df))

# cols = ['title','adr','latitude','longitude','priceM']
# df[cols]

### ограничиваем тип объекта 

In [12]:
print(len(df))

# Drop the "defective" variants: house parts, SNT plots and townhouses,
# and keep only houses larger than 40 (house_area) and cheaper than 15 (priceM).
df = df[
    ~(df['is_part'] | df['is_SNT'] | df['is_townhouse'])
    & (df['house_area'] > 40.)
    & (df['priceM'] < 15.)
]

print(len(df))

2821
906


In [13]:
# Attach a point geometry (x = longitude, y = latitude) to every listing.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df['longitude'], y=df['latitude']),
    crs='epsg:4326',
)

# Preview three random rows.
gdf.sample(3)[['title', 'adr', 'latitude', 'longitude', 'priceM']]

Unnamed: 0,title,adr,latitude,longitude,priceM
1235,"дом 52,5 м² на участке 4,2 сот.","Севастополь, Коммунистическая ул., 48А",44.6,33.51,8.4
778,"дом 108,5 м² на участке 6 сот.","Севастополь, товарищество собственников недвижимости Бодрость",44.57,33.58,11.8
2588,дом 48 м² на участке 10 сот.,"Бахчисарай, с. Плодовое, Симферопольская ул.",44.87,33.86,2.5


In [14]:
# The filtered table is now held by `gdf`; release the `df` name.
del df

In [15]:
cols = ['title', 'priceM', 'house_size_category', 'place', 'geometry']

# Interactive map of the listings, coloured by house size category.
gdf[cols].explore(
    column='house_size_category',
    cmap='rainbow',
    legend=True,
    marker_type='circle_marker',
)

In [16]:
cols = ['title', 'adr', 'priceM', 'house_size_category', 'place', 'geometry']

# Interactive map of the listings, coloured by place.
gdf[cols].explore(
    column='place',
    cmap='rainbow',
    legend=True,
)

## общая статистика изменения цены

In [17]:
ts = '2022-08-01' # date cutoff: the per-hexagon medians below keep only rows with dt > ts

In [18]:
# Price distribution (count, mean, std, min, selected percentiles, max)
# per place, house size category and day, for listings with priceM above 1.
stat = (
    gdf.loc[gdf['priceM'] > 1.]
    .groupby(['place', 'house_size_category', 'dt'])['priceM']
    .describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
)
stat

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,10%,25%,50%,75%,90%,max
place,house_size_category,dt,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
bahchisaray,30-50,2022-09-15,29.0,3.53,1.46,1.7,2.0,2.5,3.5,4.0,4.86,7.9
bahchisaray,50-70,2022-09-15,59.0,4.62,1.83,1.5,2.66,3.27,4.5,5.7,6.84,10.5
bahchisaray,70-150,2022-09-15,64.0,6.26,2.83,1.4,3.15,4.07,5.5,7.78,10.21,14.25
bahchisaray,150-300,2022-09-15,49.0,8.62,3.01,1.79,4.6,6.5,8.0,11.5,12.5,14.99
bahchisaray,300+,2022-09-15,6.0,9.02,3.8,4.8,4.88,5.71,8.95,12.22,13.25,13.5
sevastopol,30-50,2022-09-15,45.0,5.22,1.95,2.5,2.88,3.9,4.99,6.3,8.03,10.0
sevastopol,50-70,2022-09-15,87.0,6.7,2.86,1.68,3.59,4.5,6.2,8.5,10.12,14.9
sevastopol,70-150,2022-09-15,366.0,8.46,2.78,1.5,5.2,6.5,8.2,10.15,12.42,14.91
sevastopol,150-300,2022-09-15,123.0,10.63,2.83,3.75,6.6,8.5,10.96,12.87,13.99,14.9
sevastopol,300+,2022-09-15,8.0,11.41,2.72,6.0,8.8,10.38,11.8,12.98,13.92,14.9


In [19]:
# stat.loc[('bahchisaray','30-50'),['min','25%','50%']].plot(grid=True, title='цена на маленькие дома')

In [20]:
# stat.loc['30-50',['min','25%','50%']].plot(grid=True, title='цена на маленькие дома')

In [21]:
# stat.loc['50-70',['min','25%','50%']].plot(grid=True, title='цена на средние дома')

In [22]:
# stat.loc['70-150',['min','25%','50%']].plot(grid=True, title='цена на большие дома')

## гексагональная сетка

In [23]:
# gdf['place'].unique()

In [24]:
# Distinct values of the `place` column.
places = gdf['place'].unique()
places

array(['sevastopol', 'bahchisaray', 'sudak'], dtype=object)

In [25]:
from shapely.geometry import MultiPoint

# One row per place: the convex hull of all listing points labelled with that place.
frame_geometry = gpd.GeoDataFrame(
    [
        {
            'place': place_name,
            'geometry': MultiPoint(
                list(gdf.loc[gdf['place'] == place_name, 'geometry'])
            ).convex_hull,
        }
        for place_name in places
    ],
    crs='EPSG:4326',
)

In [26]:
# Interactive map of the per-place convex hulls, coloured by place.
frame_geometry.explore('place',legend=True,cmap='rainbow')

In [27]:
import h3
    
def cover_grid_hexagonal(gmtr,place,cell_size=8,crs='EPSG:4326'):
    """Cover a geometry with H3 hexagons and return them as a GeoDataFrame.

    Parameters
    ----------
    gmtr : object exposing ``__geo_interface__``
        Area to cover; its GeoJSON-like mapping is handed to ``h3.polyfill``.
    place : scalar
        Value written to the ``place`` column of every returned row.
    cell_size : int, default 8
        Passed to ``h3.polyfill`` as its resolution argument.
    crs : str, default 'EPSG:4326'
        Coordinate reference system assigned to the result.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``place``, ``hex_id`` and ``geometry`` (one polygon per hexagon).

    Notes
    -----
    ``polyfill`` and ``h3_to_geo_boundary`` are the h3-py 3.x function names.
    """
    hex_ids = list(
        h3.polyfill(gmtr.__geo_interface__, cell_size, geo_json_conformant=True)
    )
    # geo_json=True is requested to match the geo_json_conformant input above.
    hex_polygons = [
        Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True))
        for hex_id in hex_ids
    ]
    return gpd.GeoDataFrame(
        {'place': place, 'hex_id': hex_ids, 'geometry': hex_polygons},
        crs=crs,
    )

In [33]:
# grid

In [42]:
# cell_size value used for each place when building its hexagonal cover.
places = {
    'sevastopol': 6,
    'sudak': 8,
    'bahchisaray': 6,
}

# Build one hexagonal cover per place from its convex hull and stack them into a single grid.
grid = gpd.GeoDataFrame(
    pd.concat(
        [
            cover_grid_hexagonal(
                frame_geometry.loc[frame_geometry['place'] == place_name, 'geometry'].iloc[0],
                place_name,
                cell_size=resolution,
            )
            for place_name, resolution in places.items()
        ],
        ignore_index=True,
    ),
    crs='EPSG:4326',
)

grid.explore('place', cmap='rainbow')

### медиана актуальной цены

In [43]:
place = ['bahchisaray']

# Hexagons belonging to the selected place(s).
grid_ = grid.loc[grid['place'].isin(place)]

# Median priceM per hexagon, over listings dated after `ts` with priceM below 15.
grid_values = (
    grid_
    .sjoin(gdf.loc[(gdf['dt'] > ts) & (gdf['priceM'] < 15.)], how='inner')
    .groupby('hex_id')['priceM']
    .median()
    .reset_index()
)

# Only hexagons that contain at least one matching listing are drawn.
grid_.merge(grid_values, on='hex_id').explore('priceM', cmap='rainbow')

In [44]:
place = ['sevastopol']

# Hexagons belonging to the selected place(s).
grid_ = grid.loc[grid['place'].isin(place)]

# Median priceM per hexagon, over listings dated after `ts` with priceM below 15.
grid_values = (
    grid_
    .sjoin(gdf.loc[(gdf['dt'] > ts) & (gdf['priceM'] < 15.)], how='inner')
    .groupby('hex_id')['priceM']
    .median()
    .reset_index()
)

# Only hexagons that contain at least one matching listing are drawn.
grid_.merge(grid_values, on='hex_id').explore('priceM', cmap='rainbow')

In [45]:
place = ['sudak']

# Hexagons belonging to the selected place(s).
grid_ = grid.loc[grid['place'].isin(place)]

# Median priceM per hexagon, over listings dated after `ts` with priceM below 15.
grid_values = (
    grid_
    .sjoin(gdf.loc[(gdf['dt'] > ts) & (gdf['priceM'] < 15.)], how='inner')
    .groupby('hex_id')['priceM']
    .median()
    .reset_index()
)

# Only hexagons that contain at least one matching listing are drawn.
grid_.merge(grid_values, on='hex_id').explore('priceM', cmap='rainbow')

---