In [5]:
import shapely.geometry
import pyproj
import json
import statistics
import numpy as np
import pandas as pd


from entrie import Entrie, EntrieType
from utils import *




In [6]:
# Per-city settings keyed by the abbreviation used in the data file names.
# 'center' is the reference point that prepare_table passes to distance()
# together with each flat's (latitude, longitude) pair.
city_dict = dict(
    spb=dict(center=(59.9343, 30.3351)),
    smr=dict(center=(53.2415, 50.2212)),
)

def prepare_table(city_abbr):
    """Load the flats CSV for ``city_abbr`` and derive the modelling columns.

    Reads ``./data/flats/<city_abbr>.csv``, drops the ``adress`` and
    ``subway_station`` columns, moves the last two remaining columns to the
    front (in reversed order), replaces ``price`` with ``price_per_m``
    (``price / area`` truncated with ``int``) and appends
    ``distance_to_center``: ``distance`` applied to each row's
    ``(latitude, longitude)`` pair and the city's ``center`` from ``city_dict``.

    Returns the resulting DataFrame.
    """
    table = pd.read_csv(f'./data/flats/{city_abbr}.csv')
    table = table.drop(columns=['adress', 'subway_station'])

    # New column order: last, second-to-last, then the rest as they were.
    names = list(table.columns)
    table = table[[names[-1], names[-2]] + names[:-2]]

    per_metre = table['price'] / table['area']
    table['price_per_m'] = per_metre.map(int)
    table = table.drop(columns=['price'])

    points = zip(table.latitude, table.longitude)
    table['distance_to_center'] = [
        distance(point, city_dict[city_abbr]['center']) for point in points
    ]
    return table

In [7]:
def add_distances(city_abbr):
    """Parse ``./data/entries/parsed/<city_abbr>.json``.

    NOTE(review): the parsed data is neither used nor returned, so the
    function currently always returns ``None`` -- it looks like an unfinished
    stub.
    """
    entries_path = f'./data/entries/parsed/{city_abbr}.json'
    with open(entries_path) as handle:
        parsed_entries = json.load(handle)

In [8]:
def _haversine_m(lat, lon, lats, lons, radius_km=6371):
    """Great-circle distance in metres from one point to an array of points.

    ``lat``/``lon`` are scalars and ``lats``/``lons`` are 1-D arrays, all in
    degrees. Uses the haversine formula
    ``a = sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlambda/2)`` and
    ``d = 2 * R * atan2(sqrt(a), sqrt(1 - a))``.
    """
    phi1 = np.radians(lat)
    phi2 = np.radians(lats)
    half_dphi = (phi2 - phi1) / 2
    half_dlam = np.radians(lons - lon) / 2
    a = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    # Rounding can push `a` marginally outside [0, 1]; keep the square roots real.
    a = np.clip(a, 0.0, 1.0)
    return radius_km * 1000 * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


def assemble_dataset(city_abbr):
    """Build the per-flat feature table for a city and write it to CSV.

    Reads ``./data/entries/parsed/<city_abbr>.json`` (a mapping of entry type
    to a list of entries whose first item is a coordinate pair and, for
    ``'vacancy'``, whose second item is a salary) and the flats table from
    ``prepare_table``. For every flat it records the flat's own attributes
    plus, for each entry type, how many entries lie within 500 m and 1000 m;
    for ``'vacancy'`` it also records the mean and median salary inside each
    radius (0 when there are none).

    Flats with no ``'shop'`` within 1000 m are dropped. The result is written
    to ``./datasets/<city_abbr>.csv``; nothing is returned.
    """
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        entry_data = json.load(file)

    flats = prepare_table(city_abbr)

    # Entry coordinates do not depend on the flat, so filter and convert them
    # once. The filtered entry list is kept next to its coordinate arrays so
    # that indices into the distance array also index the right entry.
    prepared = []
    for entry_type, entries in entry_data.items():
        located = [entry for entry in entries if type(entry[0][0]) is float]
        coords = np.array([entry[0] for entry in located], dtype=float)
        if coords.size:
            lats, lons = coords[:, 0], coords[:, 1]
        else:
            # No usable coordinates for this type: every count becomes 0.
            lats = lons = np.empty(0)
        prepared.append((entry_type, located, lats, lons))

    feature_list = []
    for i, (_idx, row) in enumerate(flats.iterrows(), start=1):
        row_dict = {
            'price_per_m': int(row['price_per_m']),
            'rooms': int(row['rooms']),
            'area': row['area'],
            'floor': int(row['floor']),
            'floors_total': int(row['floors_total']),
            'distance_to_subway': int(row['distance to subway']),
            'distance_to_center': int(row['distance_to_center'])
        }
        lat, lon = row['latitude'], row['longitude']
        for entry_type, located, lats, lons in prepared:
            distances = _haversine_m(lat, lon, lats, lons)
            # A flat without any shop within 1 km is skipped entirely: the
            # break bypasses the for-else that stores the row.
            if entry_type == 'shop' and not np.any(distances <= 1000):
                break
            for r in [500, 1000]:
                indices = np.nonzero(distances <= r)[0]
                if entry_type in ['vacancy']:
                    salaries = [located[idx][1] for idx in indices] or [0]
                    row_dict[f'mean_salaries_{r}'] = int(statistics.mean(salaries))
                    row_dict[f'median_salaries_{r}'] = int(statistics.median(salaries))
                row_dict[f'{entry_type}_{r}'] = indices.shape[0]
        else:
            feature_list.append(row_dict)
        if i % 1000 == 0:
            print(i)

    dataset = pd.DataFrame(feature_list)
    dataset.to_csv(f'./datasets/{city_abbr}.csv', index=False)


In [9]:
# Notebook-level inputs for the exploratory cells that follow: the parsed
# entries and the prepared flats table for the 'smr' city key.
city_abbr = 'smr'
with open(f'./data/entries/parsed/{city_abbr}.json') as file:
    entry_data = json.loads(file.read())

flats = prepare_table(city_abbr)

In [31]:
# Scratch array for the dtype check in the next cell.
a = np.arange(4)

In [32]:
# Scratch check: astype(float) yields a float array (see the output below).
a.astype(float)

array([0., 1., 2., 3.])

In [35]:
# Prototype of the feature-assembly loop on the first 10 flats, using a
# vectorised haversine distance from each flat to every entry of a type.
DEG_TO_RAD = np.pi / 180       # degrees -> radians
EARTH_DIAMETER_M = 12742000    # 2 * 6371 km, expressed in metres

feature_list = []
for i, (_idx, row) in enumerate(flats[:10].iterrows(), start=1):
    row_dict = {
        'price_per_m': int(row['price_per_m']),
        'rooms': int(row['rooms']),
        'area': row['area'],
        'floor': int(row['floor']),
        'floors_total': int(row['floors_total']),
        'distance_to_subway': int(row['distance to subway']),
        'distance_to_center': int(row['distance_to_center'])
    }
    lat1, lon1 = row['latitude'], row['longitude']
    for entry_type, entries in entry_data.items():
        # Keep the filtered entries alongside their coordinates so that the
        # indices found below point at the matching entry (and its salary).
        located = [entry for entry in entries if type(entry[0][0]) is float]
        coords = np.array([entry[0] for entry in located], dtype=float)
        if coords.size:
            lat2, lon2 = coords[:, 0], coords[:, 1]
        else:
            # No usable coordinates for this type: counts will be 0.
            lat2 = lon2 = np.empty(0)

        # Haversine term; named `hav` so the scratch array `a` defined in an
        # earlier cell is not overwritten.
        hav = (0.5 - np.cos((lat2 - lat1) * DEG_TO_RAD) / 2
               + np.cos(lat1 * DEG_TO_RAD) * np.cos(lat2 * DEG_TO_RAD)
               * (1 - np.cos((lon2 - lon1) * DEG_TO_RAD)) / 2)
        hav = np.clip(hav, 0.0, 1.0)  # guard sqrt/arcsin against rounding
        distances = (EARTH_DIAMETER_M * np.arcsin(np.sqrt(hav))).astype(int)

        # A flat without any shop within 1 km is skipped: the break bypasses
        # the for-else that stores the row.
        if entry_type == 'shop' and not np.any(distances <= 1000):
            break

        if entry_type in ['school', 'bus_stop']:
            # None (missing) when the type has no located entries at all.
            row_dict[f'distance_to_{entry_type}'] = (
                int(distances.min()) if distances.size else None)
        for r in [500, 1000]:
            indices = np.nonzero(distances <= r)[0]
            if entry_type in ['vacancy']:
                salaries = [located[idx][1] for idx in indices] or [0]
                row_dict[f'mean_salaries_{r}'] = int(statistics.mean(salaries))
                row_dict[f'median_salaries_{r}'] = int(statistics.median(salaries))
            row_dict[f'{entry_type}_{r}'] = indices.shape[0]
    else:
        feature_list.append(row_dict)

    if i % 1000 == 0:
        print(i)

In [36]:
# Display the rows collected in feature_list by the loop cell above.
feature_list

[{'price_per_m': 63063,
  'rooms': 2,
  'area': 44.4,
  'floor': 5,
  'floors_total': 9,
  'distance_to_subway': 3800,
  'distance_to_center': 1712,
  'shop_500': 4,
  'shop_1000': 21,
  'distance_to_bus_stop': 106,
  'bus_stop_500': 8,
  'bus_stop_1000': 18,
  'tram_stop_500': 2,
  'tram_stop_1000': 5,
  'subway_station_500': 0,
  'subway_station_1000': 0,
  'cafe_500': 1,
  'cafe_1000': 3,
  'office_500': 1,
  'office_1000': 2,
  'point_of_interest_500': 0,
  'point_of_interest_1000': 0,
  'distance_to_school': 448,
  'school_500': 1,
  'school_1000': 3,
  'hospital_500': 0,
  'hospital_1000': 0,
  'bank_500': 2,
  'bank_1000': 5,
  'univercity_500': 0,
  'univercity_1000': 0,
  'cinema_500': 0,
  'cinema_1000': 0,
  'nightlife_500': 0,
  'nightlife_1000': 0,
  'government_500': 1,
  'government_1000': 1,
  'leisure_500': 0,
  'leisure_1000': 0,
  'hotel_500': 0,
  'hotel_1000': 0,
  'mean_salaries_500': 0,
  'median_salaries_500': 0,
  'vacancy_500': 0,
  'mean_salaries_1000': 64,
 

In [26]:
# Cross-check against utils.distance: distances from one flat to a hard-coded
# list of [lat, lon] points. `lat`/`lon` are never defined at notebook scope
# (they are only locals of assemble_dataset), so take the point from `flats`
# explicitly.
# NOTE(review): the first flat is assumed here because the sorted output below
# (449, 678, 701 under 1000 m) lines up with the first row's school features
# in feature_list -- confirm.
first_flat = flats.iloc[0]
p = (first_flat['latitude'], first_flat['longitude'])
loc_list = [[53.209791, 50.119632], [53.2255334, 50.2065899], [53.2257518, 50.2067186], [53.1854754, 50.0888691], [53.1851968, 50.0851277], [53.1844514, 50.0915613], [53.2021505, 50.1261779], [53.202022, 50.1260036], [53.2048135, 50.1280141], [53.2434004, 50.2424113], [53.2352408, 50.2493883], [53.2352375, 50.2488545], [53.1899902, 50.1271579], [53.2170603, 50.1731793], [53.1989219, 50.1118837], [53.2122188, 50.1352057], [53.1968171, 50.1688869], [53.2020622, 50.1971705], [53.2128207, 50.1749329], [53.2039186, 50.212989], [53.2143275, 50.1997608], [53.2230922, 50.2539185], [53.2432563, 50.2367658], [53.2212365, 50.2296546], [53.2316979, 50.2491872], [53.2084214, 50.2771666], [53.1943079, 50.2187004], [53.2082647, 50.2371019], [53.2448454, 50.2649155], [53.2523423, 50.2503027], [53.2073922, 50.1585215], [53.19692, 50.2138518], [53.2413452, 50.1867492]]
sorted([distance(p, point) for point in loc_list])

[449,
 678,
 701,
 1343,
 1346,
 1350,
 1735,
 2736,
 3057,
 3494,
 3517,
 3947,
 4355,
 4740,
 4808,
 5262,
 5851,
 5947,
 5962,
 6042,
 6134,
 7277,
 7565,
 8302,
 9126,
 9351,
 9386,
 9404,
 10122,
 10388,
 12402,
 12488,
 12714]

In [22]:
import folium

# Centre the map on the reference point `p` so the markers are in view when
# the map opens (the previous hard-coded centre was far from every marker).
map_ = folium.Map(location=list(p), zoom_start=10)

# Green markers: the points from loc_list; red marker: the reference point.
for point in loc_list:
    folium.Marker(point, icon=folium.Icon(color='green')).add_to(map_)

folium.Marker(p, icon=folium.Icon(color='red')).add_to(map_)

map_

In [5]:
# Scratch check. NOTE(review): `a` is assigned in more than one cell, so the
# value shown here depends on which cell ran last -- confirm on a fresh run.
min(a)

0