<a href="https://colab.research.google.com/github/siva4iov/FlatsPrice_DS_Proj/blob/master/Geocoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Geocoding

### Dependencies

In [1]:
!pip install geopy
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from tqdm import tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import os

tqdm.pandas()

In [3]:
if not os.path.exists("/content/dataset_clean.csv"):
  from google.colab import files
  files.upload()

Saving dataset_clean.csv to dataset_clean.csv


### Addresses


In [4]:
df = pd.read_csv("/content/dataset_clean.csv", sep=";")
ads = df.loc[:, "Address"]
ads.head()


0    Республика Татарстан, Казань, ул. Разведчика А...
1    Республика Татарстан, Казань, Вахитовский райо...
2    Республика Татарстан, Казань, ул. Мидхата Була...
3    Республика Татарстан, Казань, пр-т Альберта Ка...
4    Республика Татарстан, Казань, ул. Сибгата Хаки...
Name: Address, dtype: object

### I will use only free geocoding services

In [5]:
geolocator = Nominatim(user_agent="my_request")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
data = ads.progress_apply(geocode)

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/geopy/geocoders/base.py", line 344, in _call_geocoder
    page = requester(req, timeout=timeout, **kwargs)
  File "/usr/lib/python3.7/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/usr/lib/python3.7/urllib/request.py", line 543, in _open
    '_open', req)
  File "/usr/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.7/urllib/request.py", line 1393, in https_open
    context=self._context, check_hostname=self._check_hostname)
  File "/usr/lib/python3.7/urllib/request.py", line 1353, in do_open
    r = h.getresponse()
  File "/usr/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/usr/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python3.7/http/client.py", line 280, in _read_status
    line = str(self.fp.re

In [22]:
data[:3]

0    (Газпромбанк, 3, улица Разведчика Ахмерова, Ar...
1    (53, улица Татарстан, Старо-Татарская слобода,...
2    (5, улица Мидхата Булатова, 33-й Военный город...
Name: Address, dtype: object

In [20]:
location = data.rename("location")

In [23]:
coord = pd.Series(list(map(lambda x: tuple(x.point)[:2] if x else None, data)), name="coord")

In [15]:
lat_n_lon = pd.DataFrame(data=list(map(lambda x: [x.latitude, x.longitude] if x else [None, None], data)), columns=["latitude", "longitude"])

In [24]:
geodata = pd.concat([location, coord, lat_n_lon], axis=1)

I will drop addresses which were not found

In [25]:
df = df.loc[geodata.notna().iloc[:, 0]].reset_index(drop=True)

geodata = geodata.dropna().reset_index(drop=True)


### Extract district

In [26]:
def extr_distr(x):
  objs = str(x).split(",")
  for obj in objs:
    if "район" in obj:
      result = obj
      result = result.replace("район", "").strip()
      return result

geodata.loc[:, "district"] = geodata["location"].apply(extr_distr)

In [27]:
geodata

Unnamed: 0,location,coord,latitude,longitude,district
0,"(Газпромбанк, 3, улица Разведчика Ахмерова, Ar...","(55.7978535, 49.1793271)",55.797854,49.179327,Советский
1,"(53, улица Татарстан, Старо-Татарская слобода,...","(55.77585095, 49.1077901578817)",55.775851,49.107790,Вахитовский
2,"(5, улица Мидхата Булатова, 33-й Военный город...","(55.731757599999995, 49.169174993589834)",55.731758,49.169175,Приволжский
3,"(31, улица Сибгата Хакима, Ново-Савиновский ра...","(55.816227350000005, 49.13680551681148)",55.816227,49.136806,Ново-Савиновский
4,"(Ozon, 13, улица Адоратского, Ново-Савиновский...","(55.8252666, 49.1468802)",55.825267,49.146880,Ново-Савиновский
...,...,...,...,...,...
1318,"(улица Чернышевского, Вахитовский район, Казан...","(55.7930993, 49.1107986)",55.793099,49.110799,Вахитовский
1319,"(12, улица Волкова, Аки, Советский район, Каза...","(55.84985115, 49.25372233475784)",55.849851,49.253722,Советский
1320,"(Пятёрочка, 14В, улица Юлиуса Фучика, Горки-3,...","(55.7420827, 49.2053247)",55.742083,49.205325,Приволжский
1321,"(61Б, Чистопольская улица, Ново-Савиновский ра...","(55.82022285, 49.13268965)",55.820223,49.132690,Ново-Савиновский


## Also we need to get rid of cyrillic to avoid future difficulties
But, i will leave column location for now, because it could be useful in future.

I will drop it later

In [28]:
condition = {dist: num+1 for num, dist in enumerate(geodata['district'].unique())}
display(condition)
districts = \
pd.get_dummies(geodata.replace({"district": condition})["district"], prefix="district")
geodata = pd.concat([geodata.drop(columns=["district"]), districts], axis=1)

{'Советский': 1,
 'Вахитовский': 2,
 'Приволжский': 3,
 'Ново-Савиновский': 4,
 'Московский': 5,
 'Кировский': 6,
 'Авиастроительный': 7,
 'микро М-14': 8,
 'РУВД Московского а': 9}

In [29]:
df = pd.concat([df.drop(columns=["Address"]), geodata], axis=1)

# Saving dataset and new features meaning

In [30]:
df.to_csv("dataset_clean_full.csv", sep=";", index=False)

In [39]:
features_decode = pd.read_csv("/content/features_decode.csv", sep=";")

In [32]:
meanings = {'Советский': 'district_1',
 'Вахитовский': 'district_2',
 'Приволжский': 'district_3',
 'Ново-Савиновский': 'district_4',
 'Московский': 'district_5',
 'Кировский': 'district_6',
 'Авиастроительный': 'district_7',
 'микро М-14': 'district_8',
 'РУВД Московского а': 'district_9'}

meanings = {'name': list(meanings.values()), 'meaning': list(meanings.keys())}
meanings = pd.DataFrame(meanings)
meanings

Unnamed: 0,name,meaning
0,district_1,Советский
1,district_2,Вахитовский
2,district_3,Приволжский
3,district_4,Ново-Савиновский
4,district_5,Московский
5,district_6,Кировский
6,district_7,Авиастроительный
7,district_8,микро М-14
8,district_9,РУВД Московского а


In [35]:
features_decode = pd.concat([features_decode, meanings]).reset_index(drop=True)
features_decode

In [38]:
features_decode.to_csv("features_decode.csv", sep=";", index=False)