In [278]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import geopandas as gpd
import requests
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from geopy.geocoders import Nominatim
import overpass
from keplergl import KeplerGl
from shapely.geometry import Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon
from geopy import distance
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [2]:
# !pip3 install pandas
# !pip3 install numpy
# !pip3 install geopandas
# !pip3 install sklearn
# !pip3 install geopy
# !pip3 install overpass
# !pip3 install keplergl
# !pip3 install shapely
# !pip3 install pygeos

In [None]:
# !apt install libspatialindex-dev

### Задача:

Прогноз продаж одной из популярных моделей [фичерфонов](https://ru.wikipedia.org/wiki/%D0%A4%D0%B8%D1%87%D0%B5%D1%80%D1%84%D0%BE%D0%BD) (на картинке ниже пример похожего устройства) в салонах МегаФона
![](https://39.img.avito.st/640x480/8468720439.jpg)

### Исходные данные:

Датасет содержит следующие поля:

1. `point_id` - Индентификатор салона
2. `lon` - Долгота точки
3. `lat` - Широта точки
4. `target` - Значение таргета, усредненное за несколько месяцев и отнормированное

### Требования к решению и советы:

Ниже приведен список из нескольких важных пунктов, необходимых для решения задания. Выполнение каждого из пунктов влияет на итоговую оценку. Вы можете выполнить каждый из пунктов разными способами, самым лучшим будет считаться вариант, когда всё получение и обработка данных будут реализованы на Питоне (пример: вы можете скачать данные из OSM через интерфейс на сайте overpass-turbo или с помощью библиотек `overpass`/`requests`. Оба варианта будут зачтены, но больше баллов можно заработать во втором случае)



1. Салоны расположены в нескольких разных городах, вам необходимо **определить город для каждого салона** (это понадобится во многих частях задания). К этому есть разные подходы. Вы можете провести [обратное геокодирование](https://en.wikipedia.org/wiki/Reverse_geocoding) с помощью геокодера [Nominatim](https://nominatim.org/), доступного через библиотеку `geopy` примерно вот так:
```python
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.reverse("52.509669, 13.376294")
print(location.address)
```
В таком случае, вам придется обрабатывать полученную строку адреса, чтобы извлечь название города. Также вы можете скачать из OSM или найти в любом другом источнике границы административно территориальных границ России и пересечь с ними датасет с помощью `geopandas.sjoin` (этот вариант более надежный, но нужно будет разобраться с тем, как устроены границы АТД в OSM, обратите внимание на [этот тег](https://wiki.openstreetmap.org/wiki/Key:admin_level))


2. **Используйте данные OSM**: подумайте, какие объекты могут влиять на продажи фичерфонов. Гипотеза: такие телефоны покупают люди, приезжающие в город или страну ненадолго, чтобы вставить туда отдельную симкарту для роуминга. Можно попробовать использовать местоположения железнодорожных вокзалов (изучите [этот тег](https://wiki.openstreetmap.org/wiki/Tag:railway%3Dstation)). Необходимо использовать хотя бы 5 разных типов объектов из OSM. Скорее всего, вам придется качать данные OSM отдельно для разных городов (см. пример для Нью-Йорка из лекции)


3. **Используйте разные способы генерации признаков**: описать положение салона МегаФона относительно станций метро можно разными способами - найти ***расстояние до ближайшей станции***, или же посчитать, сколько станций попадает в ***500 метровую буферную зону*** вокруг салона. Такие признаки будут нести разную информацию. Так же попробуйте поэкспериментировать с размерами буферных зон (представьте, что значат в реальности радиусы 100, 500, 1000 метров). Попробуйте посчитать расстояние до центра города, до других объектов.

4. **Сделайте визуализации**: постройте 2-3 карты для какого нибудь из городов - как распределен в пространстве таргет, где находятся объекты, полученные вами из OSM. Можете использовать любой инструмент - обычный `plot()`, `folium`, `keplergl`. Если выберете Кеплер, обязательно сохраните в файл конфиг карты, чтобы ее можно было воспроизвести. Сделать это можно вот так:

```python
import json
json_data = kepler_map.config
with open('kepler_config.json', 'w') as outfile:
    json.dump(json_data, outfile)
```
5. Задание не ограничено приведенными выше пунктами, попробуйте нагенерировать интересных признаков, найти в интернете дополнительные данные (в таком случае в комментарии к коду укажите ссылку на ресурс, откуда взяли данные)



6. Это довольно сложная задача - датасет очень маленький, данные по своей природе довольно случайны. Поэтому место и скор на Kaggle не будут играть решающую роль в оценке, но позволят заработать дополнительные баллы

### Read data

In [None]:
# from google.colab import files
# uploaded = files.upload()

Saving boundaries4.geojson to boundaries4.geojson
Saving boundaries.geojson to boundaries.geojson
Saving mf_geo_test.csv to mf_geo_test (1).csv
Saving mf_geo_train.csv to mf_geo_train (1).csv


In [None]:
# train = pd.read_csv('data/mf_geo_train.csv')
# test = pd.read_csv('data/mf_geo_test.csv')
# !mkdir ./data
# !wget https://raw.githubusercontent.com/andreytyu/mf-geo-hw/main/data/mf_geo_train.csv -O ./data/train.csv
# !wget https://raw.githubusercontent.com/andreytyu/mf-geo-hw/main/data/mf_geo_test.csv -O ./data/test.csv
# train = pd.read_csv('mf_geo_train.csv')
# test = pd.read_csv('mf_geo_test.csv')

--2020-12-03 10:43:35--  https://raw.githubusercontent.com/andreytyu/mf-geo-hw/main/data/mf_geo_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23272 (23K) [text/plain]
Saving to: ‘./data/train.csv’


2020-12-03 10:43:35 (16.5 MB/s) - ‘./data/train.csv’ saved [23272/23272]

--2020-12-03 10:43:35--  https://raw.githubusercontent.com/andreytyu/mf-geo-hw/main/data/mf_geo_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4462 (4.4K) [text/plain]
Saving to: ‘./data/test.csv’


2020-12-03 10:43:35 (38.5 MB/s) - ‘./data/test.csv’ 

In [130]:
train = pd.read_csv('data/mf_geo_train.csv')
test = pd.read_csv('data/mf_geo_test.csv')

In [131]:
train.head(2)

Unnamed: 0,point_id,lon,lat,target
0,ommNZCUV,37.590776,55.84863,-0.348157
1,nMe2LHPb,37.78421,55.750271,1.294206


In [6]:
train.shape

(425, 4)

In [7]:
test.head(2)

Unnamed: 0,point_id,lon,lat,target
0,F4lXR1cG,37.681242,55.74804,0.0091
1,4LJu4GTf,60.58091,56.79586,0.0091


In [8]:
test.shape

(107, 4)

### Fit model

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train.drop('target', axis=1), train[['target']])
model = LinearRegression().fit(X_train.drop('point_id', axis=1), y_train)

In [None]:
mean_absolute_error(y_valid, model.predict(X_valid.drop('point_id', axis=1)))

0.6399230367128608

### Make submission

In [None]:
submission = pd.read_csv('data/sample_submission.csv')
submission['target'] = model.predict(X_valid.drop('point_id', axis=1))
submission.to_csv('data/my_submission_01.csv', index=False)

### Improvements

## 1)

In [9]:
# boundaries = gpd.read_file('boundaries4.geojson') #данные скачаны с сайта boundaries.osm ,  admin_level=4
# boundaries.head(2)

DriverError: boundaries4.geojson: No such file or directory

In [None]:
# boundaries[boundaries.geometry.isnull()] #отсутствуют границы важных нам городов, поэтому использовать будем Nominatim

Unnamed: 0,id,admin_level,parents,name,local_name,name_en,geometry
5,-1574364,4,"-1059500,-60189",Sevastopol,Севастополь,Sevastopol,
6,-3788485,4,"-1059500,-60189",Sevastopol,Севастополь,Sevastopol,
9,-253252,4,"-389344,-60189",Ingushetia,Ингушетия,Ingushetia,
16,-103906,4,"-1216601,-60189",Kaliningrad,Калининградская область,Kaliningrad,
22,-72223,4,"-1029256,-60189",Kursk Oblast,Курская область,Kursk Oblast,
23,-72224,4,"-1029256,-60189",Oryol Oblast,Орловская область,Oryol Oblast,
34,-337422,4,"-1216601,-60189",Saint Petersburg,Санкт-Петербург,Saint Petersburg,
37,-85617,4,"-1029256,-60189",Ivanovo Oblast,Ивановская область,Ivanovo Oblast,
42,-80513,4,"-1075831,-60189",Chuvashia,Чувашия,Chuvashia,
46,-115114,4,"-1075831,-60189",Mari El,Марий Эл,Mari El,


In [None]:
# train['geometry'] = [Point(x, y) for x, y in zip(train.lat, train.lon)]
# train = gpd.GeoDataFrame(train)
# train = gpd.sjoin(train, boundaries[['name', 'geometry']], op='within')
# train = train.rename(columns={'name':'region'})

Unnamed: 0,point_id,lon,lat,target,geometry
0,ommNZCUV,37.590776,55.84863,-0.348157,POINT (55.84863 37.59078)
1,nMe2LHPb,37.78421,55.750271,1.294206,POINT (55.75027 37.78421)


In [298]:
train = pd.read_csv('data/mf_geo_train.csv')
test = pd.read_csv('data/mf_geo_test.csv')

In [299]:
train['geometry'] = [Point(x, y) for x, y in zip(train.lat, train.lon)]
test['geometry'] = [Point(x, y) for x, y in zip(test.lat, test.lon)]

In [300]:
geolocator = Nominatim(user_agent="65")
location = geolocator.reverse('52.5094982,13.3765983')
loc_dict = location.raw
print(loc_dict['address'])
def get_city(lon, lat):
    geolocator = Nominatim(user_agent="65")
    location = geolocator.reverse(str(lat) + ', ' + str(lon))
    location = location.raw['address']
    result = ''
    try:
        result = location['city']
    except:
        result = location['state']
    if len(result.split(' ')) > 1:
        result = location['state']
    return result

{'road': 'Potsdamer Platz', 'suburb': 'Tiergarten', 'borough': 'Mitte', 'city': 'Berlin', 'district': 'Rixdorf', 'state': 'Berlin', 'postcode': '10117', 'country': 'Deutschland', 'country_code': 'de'}


In [303]:
%%time
train['city'] = train.apply(lambda x: get_city(x.lon, x.lat), axis=1)
test['city'] = test.apply(lambda x: get_city(x.lon, x.lat), axis=1)

CPU times: user 10.3 s, sys: 652 ms, total: 10.9 s
Wall time: 4min 32s


In [304]:
train['city'].value_counts()

Москва                   157
Санкт-Петербург           78
Самарская область         22
Екатеринбург              22
Ростов-на-Дону            21
Нижегородская область     21
Красноярск                20
Уфа                       19
Татарстан                 16
Новосибирская область     14
Новосибирск               12
Казань                     9
Самара                     5
Колпино                    4
Зеленоград                 3
Балашиха                   1
Пушкин                     1
Name: city, dtype: int64

In [305]:
train.loc[train['city'] == 'Нижегородская область', 'city'] = 'Нижний Новгород'
train.loc[train['city'] == 'Колпино', 'city'] = 'Санкт-Петербург'
train.loc[train['city'] == 'Зеленоград', 'city'] = 'Москва'
train.loc[train['city'] == 'Пушкин', 'city'] = 'Санкт-Петербург'
train.loc[train['city'] == 'Балашиха', 'city'] = 'Москва'
train.loc[train['city'] == 'Новосибирская область', 'city'] = 'Новосибирск'
train.loc[train['city'] == 'Татарстан', 'city'] = 'Казань'

In [306]:
train['city'].value_counts()

Москва               161
Санкт-Петербург       83
Новосибирск           26
Казань                25
Самарская область     22
Екатеринбург          22
Нижний Новгород       21
Ростов-на-Дону        21
Красноярск            20
Уфа                   19
Самара                 5
Name: city, dtype: int64

In [307]:
test['city'].value_counts()

Москва                   39
Санкт-Петербург          21
Самарская область         7
Красноярск                5
Ростов-на-Дону            5
Уфа                       5
Екатеринбург              5
Нижегородская область     5
Новосибирск               4
Казань                    4
Татарстан                 3
Новосибирская область     3
Зеленоград                1
Name: city, dtype: int64

In [308]:
test.loc[test['city'] == 'Нижегородская область', 'city'] = 'Нижний Новгород'
test.loc[test['city'] == 'Зеленоград', 'city'] = 'Москва'
test.loc[test['city'] == 'Новосибирская область', 'city'] = 'Новосибирск'
test.loc[test['city'] == 'Татарстан', 'city'] = 'Казань'

## 2)

In [309]:
#добавим признак - вокзал в радиусе 1000 м

In [310]:
lon_min = train.lon.min(axis=0) - 1
lon_max = train.lon.max(axis=0) + 1
lat_min = train.lat.min(axis=0) - 1
lat_max = train.lat.max(axis=0) + 1

In [311]:
bbox = lat_min,lon_min,lat_max,lon_max

In [312]:
bbox

(46.208802, 28.76554, 61.093366, 94.023803)

In [313]:
overpass_url = "http://overpass-api.de/api/interpreter"


# node
#   ["railway"="station"]
#   ["station"!="subway"]
#   ["transport"!="subway"]
  
#   ({{bbox}});
# out;

overpass_query = '''
[out:json];
(node
  ["railway"="station"]
  ["station"!="subway"]
  ["transport"!="subway"]({},{},{},{});
);
out center;
'''.format(*bbox)

response = requests.get(overpass_url, 
                        params={'data': overpass_query})
train_stations_json = response.json()
train_stations = pd.DataFrame(train_stations_json['elements'])
train_stations = train_stations.join(
    pd.DataFrame([x['tags'] for x in train_stations_json['elements']])).drop('tags', axis=1)

In [314]:
def get_geometry(x):
 if x.type == 'node':
   return Point(x.lat, x.lon)
 elif x.type == 'way' or x.type == 'relation':
   return Point(x.center['lat'], x.center['lon'])
train_stations['geometry'] = train_stations.apply(lambda x: get_geometry(x) , axis=1)

In [315]:
print(train_stations.shape)
train_stations.head(2)

(6122, 241)


Unnamed: 0,type,id,lat,lon,esr:user,name,railway,uic_ref,name:ru,name:fi,...,layer,level,name_1,bin,status,disused:railway,esr:usr,railway:station,historic:railway,geometry
0,node,285075,60.749383,33.711172,49207,Янега,station,2004226,,,...,,,,,,,,,,POINT (60.7493828 33.7111717)
1,node,285173,60.622206,33.314252,49021,Заостровье,station,2004223,Заостровье,,...,,,,,,,,,,POINT (60.6222059 33.3142519)


In [316]:
def is_near(shop, df, buffer=1):
  temp = df.apply(lambda x: distance.great_circle(shop.coords, x.geometry.coords), axis=1)
  return (temp < buffer).any()

In [317]:
%%time
train['train_stations'] = train.apply(lambda x: is_near(x.geometry, train_stations), axis=1)
test['train_stations'] = test.apply(lambda x: is_near(x.geometry, train_stations, buffer=1), axis=1)

CPU times: user 5min 42s, sys: 4.44 s, total: 5min 46s
Wall time: 6min 8s


In [318]:
train.loc[421] # Савеловский Вокзал рядом

point_id                             uNw6t6xk
lon                                   37.5868
lat                                   55.7942
target                             -0.0023966
geometry          POINT (55.794233 37.586825)
city                                   Москва
train_stations                           True
Name: 421, dtype: object

In [319]:
# добавим признак - коммерческое здание в радиусе 500 м
# (
#   node
#   ["building"="office"]
#   ({{bbox}});
#   way
#   ["building"="office"]
#   ({{bbox}});
#  rel
#   ["building"="office"]
#   ({{bbox}});
#  );
# /*added by auto repair*/
# /*end of auto repair*/
# out center;


overpass_query = '''
[out:json];
(
  node
  ["building"="office"]({},{},{},{});
  way
  ["building"="office"]({},{},{},{});
  rel
  ["building"="office"]({},{},{},{});
);
out center;
'''.format(*bbox, *bbox, *bbox)

response = requests.get(overpass_url, 
                        params={'data': overpass_query})
business_centers_json = response.json()
business_centers = pd.DataFrame(business_centers_json['elements'])
business_centers = business_centers.drop('tags', axis=1)

In [320]:
business_centers['geometry'] = business_centers.apply(lambda x: get_geometry(x) , axis=1)

In [321]:
%%time
train['business_centers'] = train.apply(lambda x: is_near(x.geometry, business_centers, buffer=0.5), axis=1)
test['business_centers'] = test.apply(lambda x: is_near(x.geometry, business_centers, buffer=0.5), axis=1)

CPU times: user 7min 26s, sys: 3.85 s, total: 7min 29s
Wall time: 7min 53s


In [322]:
#добавим признаки - какое  кол-во станций метро в радиусе 100, 200, 300, 500, 1000 м
overpass_query = '''
[out:json];
(node
  ["railway"="station"]
  ["station"="subway"]({},{},{},{});
);
out center;
'''.format(*bbox)

response = requests.get(overpass_url, 
                        params={'data': overpass_query})
subway_stations_json = response.json()
subway_stations = pd.DataFrame(subway_stations_json['elements'])
subway_stations = subway_stations.join(
    pd.DataFrame([x['tags'] for x in subway_stations_json['elements']])).drop('tags', axis=1)

In [323]:
subway_stations = subway_stations.iloc[:, :7]
subway_stations.drop('colour', axis=1, inplace=True)

In [324]:
subway_stations['geometry'] = subway_stations.apply(lambda x: Point(x.lat, x.lon), axis=1)

In [325]:
subway_stations.head(2)

Unnamed: 0,type,id,lat,lon,layer,name,geometry
0,node,60660466,55.887177,37.66155,-1,Медведково,POINT (55.8871767 37.66155)
1,node,60660469,55.869625,37.664184,-2,Бабушкинская,POINT (55.8696254 37.6641842)


In [326]:
def is_near_count(shop, df, buffer=1):
  temp = df.apply(lambda x: distance.great_circle(shop.coords, x.geometry.coords), axis=1)
  return (temp < buffer).sum()

In [327]:
%%time
train['subway_stations_100'] = train.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.1), axis=1)
test['subway_stations_100'] = test.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.1), axis=1)
train['subway_stations_200'] = train.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.2), axis=1)
test['subway_stations_200'] = test.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.2), axis=1)
train['subway_stations_300'] = train.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.3), axis=1)
test['subway_stations_300'] = test.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.3), axis=1)
train['subway_stations_500'] = train.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.5), axis=1)
test['subway_stations_500'] = test.apply(lambda x: is_near(x.geometry, subway_stations, buffer=0.5), axis=1)
train['subway_stations_1000'] = train.apply(lambda x: is_near(x.geometry, subway_stations, buffer=1), axis=1)
test['subway_stations_1000'] = test.apply(lambda x: is_near(x.geometry, subway_stations, buffer=1), axis=1)

CPU times: user 1min 50s, sys: 349 ms, total: 1min 51s
Wall time: 1min 51s


In [328]:
city_coords = {}
for city in np.unique(train.city):
  response = requests.get("http://nominatim.openstreetmap.org/search?q={}&format=json".format(city))
  city_coords[city] = Point(float(response.json()[0]['lat']), float(response.json()[0]['lon']))

In [329]:
#посчитаем расстояние до центра города
train['distance_to_center'] = train.apply(lambda x: distance.great_circle(x.geometry.coords, city_coords[x.city].coords).km, axis=1)
test['distance_to_center'] = test.apply(lambda x: distance.great_circle(x.geometry.coords, city_coords[x.city].coords).km, axis=1)
train['distance_to_center'] = train.apply(lambda x: round(x.distance_to_center, 3), axis=1)
test['distance_to_center'] = test.apply(lambda x: round(x.distance_to_center, 3), axis=1)

In [330]:
test.head(2)

Unnamed: 0,point_id,lon,lat,target,geometry,city,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,distance_to_center
0,F4lXR1cG,37.681242,55.74804,0.0091,POINT (55.74804 37.681242),Москва,True,True,False,True,True,True,True,3.998
1,4LJu4GTf,60.58091,56.79586,0.0091,POINT (56.79586 60.58091),Екатеринбург,False,False,False,False,False,False,False,5.088


In [331]:
train.head(2)

Unnamed: 0,point_id,lon,lat,target,geometry,city,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,distance_to_center
0,ommNZCUV,37.590776,55.84863,-0.348157,POINT (55.84863000000001 37.590776),Москва,False,False,False,True,True,True,True,11.045
1,nMe2LHPb,37.78421,55.750271,1.294206,POINT (55.750271 37.78421),Москва,False,True,False,True,True,True,True,10.433


In [332]:
kg_map = KeplerGl()
kg_map.add_data(train.drop('geometry', axis=1)[train['city'] == 'Москва'])
kg_map

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'unnamed': {'index': [0, 1, 3, 4, 7, 11, 14, 15, 19, 21, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36…

In [333]:
json_data = kg_map.config
with open('kepler_config.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [167]:
kg_map2 = KeplerGl()
kg_map2.add_data(train_stations.iloc[:, :4])
kg_map2

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'unnamed': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,…

In [168]:
json_data = kg_map2.config
with open('kepler_config2.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [169]:
kg_map3 = KeplerGl()
kg_map3.add_data(subway_stations.iloc[:, :4])
kg_map3

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'unnamed': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,…

In [170]:
json_data = kg_map3.config
with open('kepler_config3.json', 'w') as outfile:
    json.dump(json_data, outfile)

In [336]:
X_train = train

In [337]:
X_train['city'].value_counts()

Москва               161
Санкт-Петербург       83
Новосибирск           26
Казань                25
Самарская область     22
Екатеринбург          22
Нижний Новгород       21
Ростов-на-Дону        21
Красноярск            20
Уфа                   19
Самара                 5
Name: city, dtype: int64

In [338]:
ohe = OneHotEncoder()
city_train = ohe.fit_transform(np.array(X_train['city']).reshape(-1, 1))

In [339]:
X_train.drop(columns=['city', 'point_id', 'geometry'], inplace=True)
X_train = pd.concat([X_train, pd.DataFrame(city_train.todense(), columns=ohe.get_feature_names())], axis=1)

Unnamed: 0,lon,lat,target,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,...,x0_Казань,x0_Красноярск,x0_Москва,x0_Нижний Новгород,x0_Новосибирск,x0_Ростов-на-Дону,x0_Самара,x0_Самарская область,x0_Санкт-Петербург,x0_Уфа
0,37.590776,55.848630,-0.348157,False,False,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.784210,55.750271,1.294206,False,True,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39.635721,47.213330,-1.039679,False,True,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,37.704570,55.782020,-1.169339,False,True,True,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37.643983,55.730188,-0.088837,True,True,False,False,False,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,37.628765,55.740664,0.213704,False,True,True,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421,37.586825,55.794233,-0.002397,True,True,False,False,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422,37.678790,55.772910,-0.910019,False,True,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
423,37.752130,55.619640,-0.326547,False,False,False,False,False,False,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [340]:
X_train.head(2)

Unnamed: 0,lon,lat,target,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,...,x0_Казань,x0_Красноярск,x0_Москва,x0_Нижний Новгород,x0_Новосибирск,x0_Ростов-на-Дону,x0_Самара,x0_Самарская область,x0_Санкт-Петербург,x0_Уфа
0,37.590776,55.84863,-0.348157,False,False,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.78421,55.750271,1.294206,False,True,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [341]:
y_train = X_train.target
X_train.drop(columns='target', inplace=True)

In [343]:
y_train

0     -0.348157
1      1.294206
2     -1.039679
3     -1.169339
4     -0.088837
         ...   
420    0.213704
421   -0.002397
422   -0.910019
423   -0.326547
424   -1.234169
Name: target, Length: 425, dtype: float64

In [344]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [345]:
mean_absolute_error(y_train, model.predict(X_train))

0.6082089170787052

In [346]:
np.mean(cross_val_score(model, X_train, y_train))

0.096672604582781

In [347]:
test

Unnamed: 0,point_id,lon,lat,target,geometry,city,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,distance_to_center
0,F4lXR1cG,37.681242,55.748040,0.0091,POINT (55.74804 37.681242),Москва,True,True,False,True,True,True,True,3.998
1,4LJu4GTf,60.580910,56.795860,0.0091,POINT (56.79586 60.58091),Екатеринбург,False,False,False,False,False,False,False,5.088
2,kLuAAN3s,37.598614,55.781357,0.0091,POINT (55.78135699999999 37.59861400000001),Москва,False,True,False,True,True,True,True,3.634
3,OxQHvaNu,37.794051,55.717468,0.0091,POINT (55.717468 37.794051),Москва,False,False,True,True,True,True,True,11.646
4,paQsTa1K,49.213026,55.748290,0.0091,POINT (55.74829 49.213026),Казань,False,False,False,False,False,True,True,6.724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,y8oQuX5v,30.353777,60.049792,0.0091,POINT (60.049792 30.353777),Санкт-Петербург,False,False,False,False,False,False,False,12.525
103,4nmfqUw0,92.928927,56.116262,0.0091,POINT (56.116262 92.928927),Красноярск,False,False,False,False,False,False,False,12.420
104,N9O45mAh,93.015993,56.023697,0.0091,POINT (56.023697 93.01599299999999),Красноярск,False,False,False,False,False,False,False,9.064
105,h2InCLKa,30.381172,59.871149,0.0091,POINT (59.871149 30.381172),Санкт-Петербург,False,False,False,False,True,True,True,8.342


In [348]:
X_test = test

In [349]:
city_test = ohe.transform(np.array(X_test['city']).reshape(-1, 1))

In [353]:
X_test.drop(columns=['city', 'point_id', 'geometry', 'target'], inplace=True)
X_test = pd.concat([X_test, pd.DataFrame(city_test.todense(), columns=ohe.get_feature_names())], axis=1)
X_test.head(2)

Unnamed: 0,lon,lat,target,train_stations,business_centers,subway_stations_100,subway_stations_200,subway_stations_300,subway_stations_500,subway_stations_1000,...,x0_Казань,x0_Красноярск,x0_Москва,x0_Нижний Новгород,x0_Новосибирск,x0_Ростов-на-Дону,x0_Самара,x0_Самарская область,x0_Санкт-Петербург,x0_Уфа
0,37.681242,55.74804,0.0091,True,True,False,True,True,True,True,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,60.58091,56.79586,0.0091,False,False,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [359]:
submission = pd.read_csv('sample_submission.csv')
submission.head(2)

Unnamed: 0,point_id,target
0,F4lXR1cG,0.0091
1,4LJu4GTf,0.0091


In [368]:
submission['target'] = model.predict(X_test)
submission.to_csv('submission_1.csv', index=False)
# score 0.65511