In [1]:
import shapely.geometry
import pyproj
import json
import statistics
import numpy as np
import pandas as pd


#from grid import Grid
#from cell import Cell
from entrie import Entrie, EntrieType 
from utils import *

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [48]:
def clean_avito_data(column_order, column_types):
    for city_abbr in ['smr', 'spb']:
        avito_data = pd.read_csv(f'./data/flats/{city_abbr}.csv')
        avito_data.drop_duplicates(inplace=True)
        avito_data['year'] = 0       
        columns_to_drop = ['adress', 'distance to subway', 'subway_station']
        #avito_data.drop(columns_to_drop, inplace=True, axis=1)
        #avito_data = avito_data.fillna(0).astype(int)
        avito_data = avito_data[column_order].astype(column_types)
        avito_data = avito_data        
        avito_data.to_csv(f'./data/flats/{city_abbr}_avito.csv', index=False)

In [114]:
def save_domofond_data(city_abbr, domofond_data):
    clone = pd.DataFrame(columns=domofond_data.columns)
    new_rows = []
    for i, row in domofond_data.iterrows():
        floor, floors_total = [int(f) for f in row["floor"].split('/')]
        row['floor'] = floor
        row['floors_total'] = floors_total
        row['year'] = int(row['year'].split('.')[0]) if type(row['year']) == str and row['year'] != 'None' else 0   
        new_rows.append(row.values)
    clone.append(pd.DataFrame(new_rows, columns=domofond_data.columns)).to_csv(f'./data/flats/{city_abbr}_domofond.csv', index=False)

In [115]:
def clean_domofond_data(column_order, column_types):
    for city_abbr in ['smr']:
        domofond_data = pd.read_csv(f'./data/flats/{city_abbr}_domofond.csv')
        domofond_data.drop_duplicates(inplace=True)
        columns_to_drop = ['adress']
        #avito_data.drop(columns_to_drop, inplace=True, axis=1)
        save_domofond_data(city_abbr, domofond_data)

In [117]:
def merge_domofond_and_avito():
    for city_abbr in ['smr']:
        domofond_data = pd.read_csv(f'./data/flats/{city_abbr}_domofond.csv')
        avito_data = pd.read_csv(f'./data/flats/{city_abbr}_avito.csv.csv')
        merged = pd.concat([domofond_data, avito_data])
        merged.to_csv(f'./data/flats/{city_abbr}.csv', index=False)        

In [119]:
column_order = ["price","longitude","latitude","rooms","area","floor","floors_total","year"]
column_types = {"floor": int, "year": int}
clean_avito_data(column_order, column_types)
clean_domofond_data(column_order, column_types)


AttributeError: 'numpy.float64' object has no attribute 'split'

In [120]:
merge_domofond_and_avito()


In [87]:
clone = pd.DataFrame(columns=pd.read_csv(f'./data/flats/{city_abbr}_domofond.csv').columns)

Unnamed: 0,price,longitude,latitude,rooms,area,floor,floors_total,year


In [53]:
domofond_data = pd.read_csv(f'./data/flats/{city_abbr}_domofond.csv')


In [109]:
clone

Unnamed: 0,price,longitude,latitude,rooms,area,floor,floors_total,year


In [13]:
domofond_data = pd.read_csv(f'./data/flats/{city_abbr}_domofond.csv')

In [None]:
domofond_data = domofond_data

In [279]:
def prepare_table(city_abbr):
    flats = pd.read_csv(f'./data/flats/{city_abbr}.csv').drop(columns=['adress', 'subway_station'])
    cols = flats.columns.tolist()
    cols[-1], cols[-2] = cols[-2], cols[-1]
    cols = cols[-2:] + [cols[0]] + cols[1:-2]
    flats = flats[cols]
    flats['price_per_m'] = (flats['price'] / flats['area']).map(lambda x: int(x))
    flats = flats.drop(columns=['price'])
    flats['distance_to_center'] = [distance(x, city_dict[city_abbr]['center']) for x in zip(flats.latitude, flats.longitude)]
    return flats

In [284]:
smr_flats = prepare_table('smr')
spb_flats = prepare_table('spb')

In [416]:
def assemble_dataset(city_abbr):
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        entry_data = json.load(file)

    flats = prepare_table(city_abbr) 

    R = 6371
    feature_list = []
    i = 0
    for row in flats.iterrows():
        i += 1
        row = row[1]
        row_dict = {
             'price_per_m': int(row['price_per_m']),
             'rooms': int(row['rooms']),
             'area': row['area'], 
             'floor': int(row['floor']),
             'floors_total': int(row['floors_total']),
             'distance_to_subway': int(row['distance to subway']),
             'distance_to_center': int(row['distance_to_center'])
            }
        lat, lon = row['latitude'], row['longitude']
        for entry_type, entries in zip(entry_data.keys(), entry_data.values()):
            c = np.array([entry[0] for entry in entries if type(entry[0][0]) is float]).T
            d = c.copy()
            d[0] -= lat
            d[0] *= (np.pi / 180)
            d[1] -= lon
            d[1] *= (np.pi / 180)
            a = np.sin(d[0] / 2) * np.sin(d[0] / 2) 
            c = 2 * arctan2(np.sqrt(a), np.sqrt(1 - a)) + np.cos(lat * np.pi / 180) * np.cos(c[0] * np.pi / 180) * np.sin(d[1] / 2) * np.sin(d[1] / 2) 
            distances = R * c * 1000
            if entry_type == 'shop' and np.nonzero(distances<=1000)[0].shape[0] == 0:
                break
            for r in [500, 1000]:
                indices = np.nonzero(distances<=r)[0]
                if entry_type in ['vacancy']:
                    salaries = [entries[i][1] for i in indices] or [0]
                    median = int(statistics.median(salaries))
                    mean = int(statistics.mean(salaries))
                    row_dict[f'mean_salaries_{r}'] = mean
                    row_dict[f'median_salaries_{r}'] = median
                row_dict[f'{entry_type}_{str(r)}'] = indices.shape[0]
        else:
            feature_list.append(row_dict)
        if i % 1000 == 0:
            print(i)

    dataset = pd.DataFrame(feature_list)
    dataset.to_csv(f'./datasets/{city_abbr}.csv', index=False)

In [417]:
assemble_dataset('smr')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000


In [418]:
assemble_dataset('spb')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000


In [413]:
entry_data.keys()

dict_keys(['shop', 'bus_stop', 'tram_stop', 'subway_station', 'cafe', 'office', 'point_of_interest', 'school', 'hospital', 'bank', 'univercity', 'cinema', 'nightlife', 'government', 'leisure', 'hotel', 'vacancy'])

In [273]:
from enum import Enum

from utils import midpoint
from entrie import Entrie, EntrieType


class CellTypes(Enum):
    URBAN = 1
    PARK = 2
    WATER = 3


class Cell:
    def __init__(self, nw, se):
        self.nw = nw
        self.se = se
        self.center = midpoint(nw, se)
        self.type = CellTypes.URBAN
        self.entries = {
            EntrieType.VACANCY: [],
            EntrieType.APARTMENT: [],
            EntrieType.SHOP: [],
            EntrieType.BUS_STOP: [],
            EntrieType.TRAM_STOP: [],
            EntrieType.SUBWAY_STATION: [],
            EntrieType.CAFE: [],
            EntrieType.OFFICE: [],
            EntrieType.POINT_OF_INTEREST: [],
            EntrieType.SCHOOL: [],
            EntrieType.HOSPITAL: [],
            EntrieType.BANK: [],
            EntrieType.UNIVERCITY: [],
            EntrieType.CINEMA: [],
            EntrieType.NIGHTLIFE: [],
            EntrieType.GOVERNMENT: [],
            EntrieType.LEISURE: [],
            EntrieType.HOTEL: [],
            EntrieType.VACANCY: [],
        }

    def add_entrie(self, entrie: Entrie, entrie_type: EntrieType):
        self.entries[entrie_type].append(entrie)


    def __add__(self, other):
        res = Cell(self.nw, self.se)
        for type in self.entries:
            res.entries[type] = self.entries[type] + other.entries[type]
        return res

    def __repr__(self):
        return f"{self.center[0]}, {self.center[1]}"


In [272]:
import shapely.geometry
import pyproj

import numpy as np

from typing import List
from math import ceil

from utils import midpoint, distance
from entrie import Entrie, EntrieType



class Grid:
    def __init__(self, nw, se, step=0.0005):
        self.lat_step = step
        self.lon_step = step * 2
        self.x_step = distance(nw, (nw[0], nw[1] + self.lon_step))
        self.y_step = distance(nw, (nw[0] + self.lat_step, nw[1]))
        self.cells = np.array([[Cell((lat, lon), (lat + self.lat_step, lon + self.lon_step)) for lon in np.linspace(nw[1], se[1], (se[1] - nw[1]) / self.lon_step)] for lat in np.linspace(nw[0], se[0], (nw[0] - se[0]) / self.lat_step)])
        self.nw = self.cells[0][0].nw
        self.se = self.cells[-1][-1].se
#indices
    def cell_indexes(self, coordinates):        
        if self.nw[0] >= coordinates[0] >= self.se[0] or self.se[1] >= coordinates[1] >= self.nw[1]:
            i = (self.nw[0] - coordinates[0]) // self.lat_step
            j = self.cells.shape[1] - 1 - (self.se[1] - coordinates[1]) // self.lon_step
            #i = int(min(max(i, 0), self.cells.shape[0] - 1))
            #j = int(min(max(j, 0), self.cells.shape[1] - 1))
            return (int(i), int(j))
        return

    def get_all_entries_in_radius(self, coordinates, r: float):
        pos = self.cell_indexes(coordinates)
        if pos is None:
            return {}

        i0, j0 = pos
        aggrigated_cell = self.cells[i0][j0]
        i_steps, j_steps = ceil(r / self.y_step), ceil(r / self.x_step)

        for i in range(max(0, i0 - i_steps), min(len(self.cells), i0 + i_steps + 1)):
            for j in range(max(0, j0 - j_steps), min(len(self.cells[0]), j0 + j_steps + 1)):
                if distance((self.cells[i][j].center), coordinates) - self.y_step / 2 <= r:
                    aggrigated_cell += self.cells[i][j]
        return aggrigated_cell.entries

    def add_entrie(self, entrie: Entrie, entrie_type: EntrieType):
        pos = self.cell_indexes(entrie.coorditates)
        if not pos is None:
            self.cells[pos[0]][pos[1]].add_entrie(entrie, entrie_type)



In [266]:
'''
def f(grid, coordinates, r):
    pos = grid.cell_indexes(coordinates)
    if pos is None:
        return {}

    i0, j0 = pos
    print((i0, j0), grid.cells.shape)
    aggrigated_cell = grid.cells[i0][j0]
    i_steps, j_steps = ceil(r / grid.y_step), ceil(r / grid.x_step)

    print([i0, j0])
    print(c, (grid.cells[i0][j0].center))
    print(distance(c, (grid.cells[i0][j0].center)))
    for i in range(max(0, i0 - i_steps), min(len(grid.cells), i0 + i_steps + 1)):
        for j in range(max(0, j0 - j_steps), min(len(grid.cells[0]), j0 + j_steps + 1)):
            if distance((grid.cells[i][j].center), coordinates) - grid.y_step / 2 <= r:
                aggrigated_cell += grid.cells[i][j]
                print(i, j)

    return aggrigated_cell.entries

g = Grid(*city_dict['spb']['borders'], 0.001)
c = (59.939811999999996, 30.391174)
res = f(g, c, 1000)
'''
1

1

In [267]:
g = Grid(*city_dict['spb']['borders'], 0.001)



In [268]:
g.cells.shape

(247, 174)

In [269]:
g.cell_indexes((60.080844, 30.175307))

(0, -2)

In [271]:
g.cell_indexes((59.833303, 30.525164))

(247, 172)

In [243]:
def make_grid(city_abbr):
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        etrie_data = json.load(file)
        
    grid = Grid(*city_dict[city_abbr]['borders'], 0.001)
    grid.nw = city_dict[city_abbr]['borders'][0]
    grid.se = city_dict[city_abbr]['borders'][1]
    
    #print(abs(grid.nw[0] - grid.se[0]) // grid.dlat, abs(grid.nw[1] - grid.se[1]) // grid.dlon)
    
    i = 0
    for entrie_type_value, raw_entries in zip(etrie_data.keys(), etrie_data.values()):
        entrie_type = EntrieType(entrie_type_value)
        j = 0
        for raw_entrie in raw_entries:
            
            processed_entrie = Entrie(*raw_entrie)

            if type(processed_entrie.coorditates[0]) != type(1.2):
                continue
                     
            grid.add_entrie(processed_entrie, entrie_type)
            j += 1
            if j  % 100 == 0:
                pass
                #print(f'{j}/{len(raw_entries)}')
        i += 1
        print(f'{round(j/len(raw_entries), 2)} ({i} / {len(etrie_data.keys())})')
    
    return grid

In [244]:
def extract_public_trasport_stops(city_abbr):
    with open(f'./data/entries/parsed/{city_abbr}.json') as file:
        etrie_data = json.load(file)
    return [e[0] for e in etrie_data['subway_station']] + [e[0] for e in etrie_data['tram_stop']] + [e[0] for e in etrie_data['bus_stop']]
    

In [245]:
for city_abbr in city_dict.keys(): 
    city_dict[city_abbr]['grid'] = make_grid(city_abbr)



1.0 (1 / 17)
1.0 (2 / 17)
1.0 (3 / 17)
1.0 (4 / 17)
1.0 (5 / 17)
1.0 (6 / 17)
1.0 (7 / 17)
0.95 (8 / 17)
1.0 (9 / 17)
1.0 (10 / 17)
0.78 (11 / 17)
1.0 (12 / 17)
1.0 (13 / 17)
0.99 (14 / 17)
0.99 (15 / 17)
1.0 (16 / 17)
1.0 (17 / 17)
1.0 (1 / 17)
1.0 (2 / 17)
1.0 (3 / 17)
1.0 (4 / 17)
1.0 (5 / 17)
1.0 (6 / 17)
1.0 (7 / 17)
1.0 (8 / 17)
1.0 (9 / 17)
1.0 (10 / 17)
1.0 (11 / 17)
1.0 (12 / 17)
1.0 (13 / 17)
1.0 (14 / 17)
1.0 (15 / 17)
1.0 (16 / 17)
1.0 (17 / 17)


In [246]:
def prepare_table(city_abbr):
    flats = pd.read_csv(f'./data/flats/{city_abbr}.csv').drop(columns=['adress', 'subway_station'])
    cols = flats.columns.tolist()
    cols[-1], cols[-2] = cols[-2], cols[-1]
    cols = cols[-2:] + [cols[0]] + cols[1:-2]
    flats = flats[cols]
    flats['price_per_m'] = (flats['price'] / flats['area']).map(lambda x: int(x))
    flats = flats.drop(columns=['price'])
    flats['distance_to_center'] = [distance(x, city_dict[city_abbr]['center']) for x in zip(flats.latitude, flats.longitude)]
    return flats

In [247]:
prepared_smr = prepare_table('smr')
prepared_spb = prepare_table('spb')

In [248]:
def cook_table(city_abbr, flats):
    
    i = 0
    feature_list = []
    grid = city_dict[city_abbr]['grid']

    for row in flats.iterrows():
        row = row[1]
        d = {'price_per_m': int(row['price_per_m']),
             'rooms': int(row['rooms']),
             'area': row['area'], 
             'floor': int(row['floor']),
             'floors_total': int(row['floors_total']),
             'distance_to_subway': int(row['distance to subway']),
             'distance_to_center': int(row['distance_to_center'])
            }

        lat, lon = row["latitude"], row["longitude"]

        indexes = grid.cell_indexes((lat, lon)) 
        
                    
        if not (indexes):
            continue
    

        i, j = indexes
        dist = (distance((lat, lon), (grid.cells[i][j].center)))
        if dist > grid.y_step:
            print(f'{dist} / {(grid.y_step ** 2 + grid.x_step ** 2) ** 0.5}', [i, j])

        entries = {
            '500': grid.get_all_entries_in_radius((lat, lon), 500),
            '1000': grid.get_all_entries_in_radius((lat, lon), 1000),
        }
        
        
        try:
            salaries_1000 = [e.weight for e in entries['1000'][EntrieType.VACANCY]] or [0]
            salaries_500 = [e.weight for e in entries['500'][EntrieType.VACANCY]] or [0]
            
            d['mean_salary_1000'] = int(statistics.mean(salaries_1000))
            d['median_salary_1000'] = int(statistics.median(salaries_1000))
            d['mean_salary_500'] = int(statistics.mean(salaries_500))
            d['median_salary_500'] = int(statistics.median(salaries_500))
        except KeyError:
            continue
        for key in entries['500'].keys():
            for dist in entries.keys():
                d[f'{key.value}_{dist}'] = len([e for e in entries[dist][key]])
        feature_list.append(d)
        
        i += 1
        if i % 100 == 0:
            print(i)
            return pd.DataFrame(feature_list)
        
        pd.DataFrame(feature_list)
        
    return pd.DataFrame(feature_list)

In [249]:
smr_table = cook_table('smr', prepared_smr)

12539 / 173.23394586512194 [30, 0]
9037 / 173.23394586512194 [56, 0]
3899 / 173.23394586512194 [82, 0]
8426 / 173.23394586512194 [81, 0]
12722 / 173.23394586512194 [31, 0]
1768 / 173.23394586512194 [87, 0]
13271 / 173.23394586512194 [0, 0]
2976 / 173.23394586512194 [82, 0]
7195 / 173.23394586512194 [66, 0]
4027 / 173.23394586512194 [89, 0]
9325 / 173.23394586512194 [81, 0]
14430 / 173.23394586512194 [0, 0]
10436 / 173.23394586512194 [16, 0]
8514 / 173.23394586512194 [37, 0]
15158 / 173.23394586512194 [0, 0]
8296 / 173.23394586512194 [40, 0]
9037 / 173.23394586512194 [56, 0]
13271 / 173.23394586512194 [0, 0]
12539 / 173.23394586512194 [30, 0]
3899 / 173.23394586512194 [82, 0]
8373 / 173.23394586512194 [36, 0]
12722 / 173.23394586512194 [31, 0]
8514 / 173.23394586512194 [37, 0]
3417 / 173.23394586512194 [102, 0]
15158 / 173.23394586512194 [0, 0]
9325 / 173.23394586512194 [81, 0]
2976 / 173.23394586512194 [82, 0]
14430 / 173.23394586512194 [0, 0]
8876 / 173.23394586512194 [29, 0]
7195 / 1

KeyboardInterrupt: 

In [224]:
smr_table

Unnamed: 0,apartement_1000,apartement_500,area,bank_1000,bank_500,bus_stop_1000,bus_stop_500,cafe_1000,cafe_500,cinema_1000,...,shop_1000,shop_500,subway_station_1000,subway_station_500,tram_stop_1000,tram_stop_500,univercity_1000,univercity_500,vacancy_1000,vacancy_500
0,0,0,44.4,7,2,22,6,5,1,0,...,28,5,0,0,5,1,0,0,2,0


In [100]:
spb_table = cook_table('spb', prepared_spb)

NameError: name 'coord_dict' is not defined

In [9]:
smr_table = cook_table('smr', prepared_smr)
smr_table.to_csv('./datasets/smr.csv', index=False)

KeyboardInterrupt: 

In [109]:
len(res[EntrieType.VACANCY])

29

In [233]:
g.cells[0][0]

59.833802000000006, 30.176307

In [236]:
g.nw, g.se

((60.080844, 30.175307), (59.833302, 30.525166))

In [95]:
smr_table.to_csv('./datasets/smr.csv', index=False)

In [83]:
df = 

In [84]:
df

Unnamed: 0,area,bank_1000,bank_500,bus_stop_1000,bus_stop_500,cafe_1000,cafe_500,cinema_1000,cinema_500,distance_to_center,...,shop_1000,shop_500,subway_station_1000,subway_station_500,tram_stop_1000,tram_stop_500,univercity_1000,univercity_500,vacancy_1000,vacancy_500
0,109.4,3,0,61,20,6,1,0,0,4929,...,19,3,0,0,0,0,0,0,92,1


In [4]:
entries = Cell([0, 0]).entries

405.4873611929934

In [25]:
smr_grid.find_suitable_position(50.1995781, 53.2009557)

In [47]:
r = smr_grid.get_all_entries_in_radius((53.277885, 50.056253), 10000)

In [91]:
distance((53.277885, 50.056253), (50.1995781, 53.2009557))

405396.3914606943

In [88]:
grid.cells[0][0].center, grid.cells[0][1].center

((60.14131115764205, 29.9328822447388), (60.14131115764205, 29.93366073117118))

In [26]:
midpoint(grid.cells[0][2].center, grid.cells[0][3].center)

(60.16331988210298, 30.04919449246976)

In [83]:
# Set up projections
p_ll = pyproj.Proj(init='epsg:4326')
p_mt = pyproj.Proj(init='epsg:3857') # metric; same as EPSG:900913

# Create corners of rectangle to be transformed to a grid
nw = shapely.geometry.Point((-5.0, 40.0))
se = shapely.geometry.Point((-4.999, 40.03439880201911))

stepsize = 50

# Project corners to target projection
s = pyproj.transform(p_ll, p_mt, nw.x, nw.y) # Transform NW point to 3857
e = pyproj.transform(p_ll, p_mt, se.x, se.y) # .. same for SE

# Iterate over 2D area
gridpoints = []
x = s[0]
while x < e[0]:
    y = s[1]
    row = []
    while y < e[1]:
        p = shapely.geometry.Point(pyproj.transform(p_mt, p_ll, x, y))
        row.append([p.x, p.y])
        y += stepsize * 1.314
    gridpoints.append(row)
    x += stepsize

In [85]:
distance([-5.0, 40.00045211267976], [-5.0, 40.0])

50.137436430313635

In [None]:
gridpoints

In [63]:
distance((-5.0, 40.03439880201911), [-5.0, 40.0])

3814.68564478778