In [1]:
import shapely.geometry
import pyproj
import json
import statistics
import numpy as np
import pandas as pd


#from grid import Grid
#from cell import Cell
from entrie import Entrie, EntrieType 
from utils import *

In [2]:
"""import shapely.geometry
import pyproj

from typing import List
from math import ceil

from utils import midpoint, distance
from entrie import Entrie, EntrieType
from cell import Cell


class Grid:
    def __init__(self, cells: List[List[Cell]], nw=(60.080844, 30.175307), se=(59.833302, 30.525166)):
        self.cells = cells
        self.dx = distance(cells[0][0].center, cells[0][1].center)
        self.dy = distance(cells[0][0].center, cells[1][0].center)
        self.dlat = abs(cells[0][0].center[0] - cells[1][1].center[0])
        self.dlon = abs(cells[0][0].center[1] - cells[1][1].center[1])
        self.nw = nw
        self.se = se
        self.shape = [len(self.cells), len(self.cells[0])]

    def find_suitable_position(self, lat: float, lon: float):
        try:
            if self.nw[0] >= lat >= self.se[0] or self.se[1] >= lon >= self.nw[1]:
                return (int((lon - self.nw[1]) // self.dlon), 
                        int((self.nw[0] - lat) // self.dlat))
            return
        except(TypeError):
                return
            
        '''
        for i in range(len(self.cells) - 1):
            m = midpoint(self.cells[i][0].center, self.cells[i + 1][0].center)
            try:
                if lat > m[0]:
                    for j in range(len(self.cells[i]) - 1):
                        m = midpoint(self.cells[i][j].center,
                                    self.cells[i][j + 1].center)
                        if m[1] > lon:
                            return (i, j)
            
        '''

    def get_all_entries_in_radius(self, coordinates, r: float):
        pos = self.find_suitable_position(*coordinates)
        if pos is None:
            return {}

        i0, j0 = pos
        aggrigated_cell = self.cells[i0][j0]
        i_steps, j_steps = ceil(r / self.dy), ceil(r / self.dx)

        for i in range(max(0, i0 - i_steps), min(len(self.cells), i0 + i_steps + 1)):
            for j in range(max(0, j0 - j_steps), min(len(self.cells[0]), j0 + j_steps + 1)):
                if distance((self.cells[i][j].center), coordinates) - self.dy / 2 <= r:
                    aggrigated_cell += self.cells[i][j]
                    #print(i, j)

        return aggrigated_cell.entries

    def add_entrie(self, entrie: Entrie, entrie_type: EntrieType):
        pos = self.find_suitable_position(*entrie.coorditates)
        try:
            if not pos is None:
                #print(pos, (len(self.cells), len(self.cells[0])))
                self.cells[pos[0]][pos[1]].add_entrie(entrie, entrie_type)
        except IndexError:
            pass
        
    def save_cell_centers(self, filename: str):
        with open(f'./tabula-rasa/{filename}', 'w') as file:
            file.write('\n'.join(['; '.join([str(cell.center)
                                             for cell in row]) for row in self.cells]))

    def load_cell_centers(filename: str):
        with open(f'./tabula-rasa/{filename}', 'r') as file:
            rows = file.read().replace('(', '').replace(')', '').split('\n')
        cells = [[Cell(tuple(float(i) for i in c.split(', ')))
                  for c in row.split('; ')] for row in rows]
        return Grid(cells)

    def make_grid_in_degrees(top_left, bottom_right, step=0.0005):
        '''Makes a Grid instance regulary spaced in degrees'''

        lat_max, lon_min = top_left
        lat_min, lon_max = bottom_right

        cells = []
        lon = lon_min
        while lon < lon_max:
            lat, row = lat_min, []
            while lat < lat_max:
                row.append(
                    Cell(midpoint((lat, lon), (lat + step, lon + step * 2))))
                lat += step
            cells.append(row)
            lon += step * 2
        g = Grid(cells)
        g.dlat = step
        g.dlon = step * 2
        return g

    def make_grid_in_meters(top_left, bottom_right, step=100):
        '''Makes a Grid instance regulary spaced in meters'''

        # Set up projections
        p_ll = pyproj.Proj(init='epsg:4326')
        p_mt = pyproj.Proj(init='epsg:3857')  # metric; same as EPSG:900913

        # Create corners of rectangle to be transformed to a grid
        nw = shapely.geometry.Point(top_left)
        se = shapely.geometry.Point(bottom_right)

        stepsize = step

        # Project corners to target projection
        # Transform NW point to 3857
        s = pyproj.transform(p_ll, p_mt, nw.x, nw.y)
        e = pyproj.transform(p_ll, p_mt, se.x, se.y)  # .. same for SE

        # Iterate over 2D area
        cells = []
        x = s[0]
        while x > e[0]:
            y = s[1]
            row = []
            while y < e[1]:
                nw_point = shapely.geometry.Point(
                    pyproj.transform(p_mt, p_ll, x, y))
                se_point = shapely.geometry.Point(
                    pyproj.transform(p_mt, p_ll, x + stepsize, y + stepsize))
                row.append(
                    Cell(midpoint((nw_point.x, nw_point.y), (se_point.x, se_point.y))))
                y += stepsize * 1.314
            cells.append(row)
            x -= stepsize

        return Grid(cells)
"""
True

True

In [59]:
from enum import Enum

from utils import midpoint
from entrie import Entrie, EntrieType


class CellTypes(Enum):
    URBAN = 1
    PARK = 2
    WATER = 3


class Cell:
    def __init__(self, nw, se):
        self.nw = nw
        self.se = se
        self.center = midpoint(nw, se)
        self.type = CellTypes.URBAN
        self.entries = {
            EntrieType.VACANCY: [],
            EntrieType.APARTMENT: [],
            EntrieType.SHOP: [],
            EntrieType.BUS_STOP: [],
            EntrieType.TRAM_STOP: [],
            EntrieType.SUBWAY_STATION: [],
            EntrieType.CAFE: [],
            EntrieType.OFFICE: [],
            EntrieType.POINT_OF_INTEREST: [],
            EntrieType.SCHOOL: [],
            EntrieType.HOSPITAL: [],
            EntrieType.BANK: [],
            EntrieType.UNIVERCITY: [],
            EntrieType.CINEMA: [],
            EntrieType.NIGHTLIFE: [],
            EntrieType.GOVERNMENT: [],
            EntrieType.LEISURE: [],
            EntrieType.HOTEL: [],
            EntrieType.VACANCY: [],
        }

    def add_entrie(self, entrie: Entrie, entrie_type: EntrieType):
        self.entries[entrie_type].append(entrie)


    def __add__(self, other):
        res = Cell(self.nw, self.se)
        for type in self.entries:
            res.entries[type] = self.entries[type] + other.entries[type]
        return res

    def __repr__(self):
        return f"{self.center[0]}, {self.center[1]}"


In [150]:
import shapely.geometry
import pyproj

import numpy as np

from typing import List
from math import ceil

from utils import midpoint, distance
from entrie import Entrie, EntrieType



class Grid:
    def __init__(self, nw, se, step=0.0005):
        self.lat_step = step
        self.lon_step = step * 2
        self.x_step = distance(nw, (nw[0], nw[1] + self.lon_step))
        self.y_step = distance(nw, (nw[0] + self.lat_step, nw[1]))
        self.cells = np.array([[Cell((lat, lon), (lat + self.lat_step, lon + self.lon_step)) for lon in np.linspace(nw[1], se[1],
                                                                                                            (se[1] - nw[1]) // self.lon_step)] for lat in np.linspace(nw[0], se[0], (nw[0] - se[0]) // self.lat_step)])
        self.nw = self.cells[0][0].nw
        self.se = self.cells[-1][-1].se

    def cell_indexes(self, coordinates):        
        if self.nw[0] >= coordinates[0] >= self.se[0] or self.se[1] >= coordinates[1] >= self.nw[1]:
            i = (self.nw[0] - coordinates[0]) // self.lat_step
            j = self.cells.shape[1] - (self.se[1] - coordinates[1]) // self.lon_step
            return (int(i), int(j))
        return

    def get_all_entries_in_radius(self, coordinates, r: float):
        pos = self.cell_indexes(coordinates)
        if pos is None:
            return {}

        i0, j0 = pos
        aggrigated_cell = self.cells[i0][j0]
        i_steps, j_steps = ceil(r / self.y_step), ceil(r / self.x_step)

        for i in range(max(0, i0 - i_steps), min(len(self.cells), i0 + i_steps + 1)):
            for j in range(max(0, j0 - j_steps), min(len(self.cells[0]), j0 + j_steps + 1)):
                if distance((self.cells[i][j].center), coordinates) - self.y_step / 2 <= r:
                    aggrigated_cell += self.cells[i][j]
        return aggrigated_cell.entries

    def add_entrie(self, entrie: Entrie, entrie_type: EntrieType):
        pos = self.cell_indexes(entrie.coorditates)
        if not pos is None:
            self.cells[pos[0]][pos[1]].add_entrie(entrie, entrie_type)



In [151]:
city_dict = {
    'spb': {
        'borders': ((60.080844, 30.175307), (59.833302, 30.525166)),
        'center': (59.9343, 30.3351),
        'grid': None
    },
    'smr': {
        'borders': ((53.277885, 50.056253), (53.174675, 50.319944)),
        'center': (53.2415, 50.2212),
        'grid': None
    }
}

In [152]:
'''
def f(grid, coordinates, r):
    pos = grid.cell_indexes(coordinates)
    if pos is None:
        return {}

    i0, j0 = pos
    print((i0, j0), grid.cells.shape)
    aggrigated_cell = grid.cells[i0][j0]
    i_steps, j_steps = ceil(r / grid.y_step), ceil(r / grid.x_step)

    print([i0, j0])
    print(c, (grid.cells[i0][j0].center))
    print(distance(c, (grid.cells[i0][j0].center)))
    for i in range(max(0, i0 - i_steps), min(len(grid.cells), i0 + i_steps + 1)):
        for j in range(max(0, j0 - j_steps), min(len(grid.cells[0]), j0 + j_steps + 1)):
            if distance((grid.cells[i][j].center), coordinates) - grid.y_step / 2 <= r:
                aggrigated_cell += grid.cells[i][j]
                print(i, j)

    return aggrigated_cell.entries

g = Grid(*city_dict['spb']['borders'], 0.001)
c = (59.939811999999996, 30.391174)
res = f(g, c, 1000)
'''
1

1

In [153]:
g = Grid(*city_dict['spb']['borders'], 0.001)



In [154]:
g.cells.shape

(247, 174)

In [155]:
g.cell_indexes((60.080844, 30.175307))

In [156]:
g.cell_indexes((59.833302, 30.525166))

(247, 174)

In [157]:
def make_grid(city_abbr):
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        etrie_data = json.load(file)
        
    grid = Grid(*city_dict[city_abbr]['borders'], 0.001)
    grid.nw = city_dict[city_abbr]['borders'][0]
    grid.se = city_dict[city_abbr]['borders'][1]
    
    #print(abs(grid.nw[0] - grid.se[0]) // grid.dlat, abs(grid.nw[1] - grid.se[1]) // grid.dlon)
    
    i = 0
    for entrie_type_value, raw_entries in zip(etrie_data.keys(), etrie_data.values()):
        entrie_type = EntrieType(entrie_type_value)
        j = 0
        for raw_entrie in raw_entries:
            
            processed_entrie = Entrie(*raw_entrie)

            if type(processed_entrie.coorditates[0]) != type(1.2):
                continue
                     
            grid.add_entrie(processed_entrie, entrie_type)
            j += 1
            if j  % 100 == 0:
                pass
                #print(f'{j}/{len(raw_entries)}')
        i += 1
        print(f'{j} ({i} / {len(etrie_data.keys())})')
    
    return grid

In [158]:
def extract_public_trasport_stops(city_abbr):
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        etrie_data = json.load(file)
    return [e[0] for e in etrie_data['subway_station']] + [e[0] for e in etrie_data['tram_stop']] + [e[0] for e in etrie_data['bus_stop']]
    

In [159]:
for city_abbr in city_dict.keys(): 
    city_dict[city_abbr]['grid'] = make_grid(city_abbr)



9074 (1 / 17)
2895 (2 / 17)
427 (3 / 17)
45 (4 / 17)
3152 (5 / 17)
2145 (6 / 17)
1007 (7 / 17)
75 (8 / 17)
111 (9 / 17)
502 (10 / 17)
39 (11 / 17)
30 (12 / 17)
624 (13 / 17)
138 (14 / 17)
128 (15 / 17)
656 (16 / 17)


IndexError: index 247 is out of bounds for axis 0 with size 247

In [77]:
def prepare_table(city_abbr):
    flats = pd.read_csv(f'./data/flats/{city_abbr}.csv').drop(columns=['adress', 'subway_station'])
    cols = flats.columns.tolist()
    cols[-1], cols[-2] = cols[-2], cols[-1]
    cols = cols[-2:] + [cols[0]] + cols[1:-2]
    flats = flats[cols]
    flats['price_per_m'] = (flats['price'] / flats['area']).map(lambda x: int(x))
    flats = flats.drop(columns=['price'])
    flats['distance_to_center'] = [distance(x, city_dict[city_abbr]['center']) for x in zip(flats.latitude, flats.longitude)]
    return flats

In [78]:
prepared_smr = prepare_table('smr')
prepared_spb = prepare_table('spb')

In [100]:
def cook_table(city_abbr, flats):
    
    i = 0
    feature_list = []
    grid = city_dict[city_abbr]['grid']

    for row in flats.iterrows():
        row = row[1]
        d = {'price_per_m': int(row['price_per_m']),
             'rooms': int(row['rooms']),
             'area': row['area'], 
             'floor': int(row['floor']),
             'floors_total': int(row['floors_total']),
             'distance_to_subway': int(row['distance to subway']),
             'distance_to_center': int(row['distance_to_center'])
            }

        lat, lon = row["latitude"], row["longitude"]
        
        print(grid.cell_indexes((lat, lon)))
        entries = {
            '500': grid.get_all_entries_in_radius((lat, lon), 500),
            '1000': grid.get_all_entries_in_radius((lat, lon), 1000),
        }
        
        return entries
        
        try:
            salaries_1000 = [e.weight for e in entries['1000'][EntrieType.VACANCY]]
            salaries_500 = [e.weight for e in entries['500'][EntrieType.VACANCY]]
            d['mean_salary_1000'] = int(statistics.mean(salaries_1000))
            d['median_salary_1000'] = int(statistics.median(salaries_1000))
            d['mean_salary_500'] = int(statistics.mean(salaries_500))
            d['median_salary_500'] = int(statistics.median(salaries_500))
        except KeyError:
            continue
        for key in entries['500'].keys():
            for dist in entries.keys():
                d[f'{key.value}_{dist}'] = len([e for e in entries[dist][key]])
        feature_list.append(d)
        
        if i % 100 == 0:
            print(i)
        i += 1
        return pd.DataFrame(feature_list)
    return pd.DataFrame(feature_list)

In [101]:
e = cook_table('smr', prepared_smr)

(30, 94)


In [99]:
e

{'500': {<EntrieType.VACANCY: 'vacancy'>: [],
  <EntrieType.APARTMENT: 'apartement'>: [],
  <EntrieType.SHOP: 'shop'>: [],
  <EntrieType.BUS_STOP: 'bus_stop'>: [],
  <EntrieType.TRAM_STOP: 'tram_stop'>: [],
  <EntrieType.SUBWAY_STATION: 'subway_station'>: [],
  <EntrieType.CAFE: 'cafe'>: [],
  <EntrieType.OFFICE: 'office'>: [],
  <EntrieType.POINT_OF_INTEREST: 'point_of_interest'>: [],
  <EntrieType.SCHOOL: 'school'>: [],
  <EntrieType.HOSPITAL: 'hospital'>: [],
  <EntrieType.BANK: 'bank'>: [],
  <EntrieType.UNIVERCITY: 'univercity'>: [],
  <EntrieType.CINEMA: 'cinema'>: [],
  <EntrieType.NIGHTLIFE: 'nightlife'>: [],
  <EntrieType.GOVERNMENT: 'government'>: [],
  <EntrieType.LEISURE: 'leisure'>: [],
  <EntrieType.HOTEL: 'hotel'>: []},
 '1000': {<EntrieType.VACANCY: 'vacancy'>: [],
  <EntrieType.APARTMENT: 'apartement'>: [],
  <EntrieType.SHOP: 'shop'>: [],
  <EntrieType.BUS_STOP: 'bus_stop'>: [],
  <EntrieType.TRAM_STOP: 'tram_stop'>: [],
  <EntrieType.SUBWAY_STATION: 'subway_station'>

In [94]:
smr_table = cook_table('smr', prepared_smr)

<class '__main__.Grid'>


StatisticsError: mean requires at least one data point

In [100]:
spb_table = cook_table('spb', prepared_spb)

NameError: name 'coord_dict' is not defined

In [None]:

spb_table.to_csv('./datasets/spb.csv', index=False)

In [9]:
smr_table = cook_table('smr', prepared_smr)
smr_table.to_csv('./datasets/smr.csv', index=False)

KeyboardInterrupt: 

In [109]:
len(res[EntrieType.VACANCY])

29

In [233]:
g.cells[0][0]

59.833802000000006, 30.176307

In [236]:
g.nw, g.se

((60.080844, 30.175307), (59.833302, 30.525166))

In [95]:
smr_table.to_csv('./datasets/smr.csv', index=False)

In [83]:
df = 

In [84]:
df

Unnamed: 0,area,bank_1000,bank_500,bus_stop_1000,bus_stop_500,cafe_1000,cafe_500,cinema_1000,cinema_500,distance_to_center,...,shop_1000,shop_500,subway_station_1000,subway_station_500,tram_stop_1000,tram_stop_500,univercity_1000,univercity_500,vacancy_1000,vacancy_500
0,109.4,3,0,61,20,6,1,0,0,4929,...,19,3,0,0,0,0,0,0,92,1


In [4]:
entries = Cell([0, 0]).entries

405.4873611929934

In [25]:
smr_grid.find_suitable_position(50.1995781, 53.2009557)

In [47]:
r = smr_grid.get_all_entries_in_radius((53.277885, 50.056253), 10000)

In [91]:
distance((53.277885, 50.056253), (50.1995781, 53.2009557))

405396.3914606943

In [88]:
grid.cells[0][0].center, grid.cells[0][1].center

((60.14131115764205, 29.9328822447388), (60.14131115764205, 29.93366073117118))

In [26]:
midpoint(grid.cells[0][2].center, grid.cells[0][3].center)

(60.16331988210298, 30.04919449246976)

In [83]:
# Set up projections
p_ll = pyproj.Proj(init='epsg:4326')
p_mt = pyproj.Proj(init='epsg:3857') # metric; same as EPSG:900913

# Create corners of rectangle to be transformed to a grid
nw = shapely.geometry.Point((-5.0, 40.0))
se = shapely.geometry.Point((-4.999, 40.03439880201911))

stepsize = 50

# Project corners to target projection
s = pyproj.transform(p_ll, p_mt, nw.x, nw.y) # Transform NW point to 3857
e = pyproj.transform(p_ll, p_mt, se.x, se.y) # .. same for SE

# Iterate over 2D area
gridpoints = []
x = s[0]
while x < e[0]:
    y = s[1]
    row = []
    while y < e[1]:
        p = shapely.geometry.Point(pyproj.transform(p_mt, p_ll, x, y))
        row.append([p.x, p.y])
        y += stepsize * 1.314
    gridpoints.append(row)
    x += stepsize

In [85]:
distance([-5.0, 40.00045211267976], [-5.0, 40.0])

50.137436430313635

In [None]:
gridpoints

In [63]:
distance((-5.0, 40.03439880201911), [-5.0, 40.0])

3814.68564478778