In [1]:
# Install the third-party packages this notebook imports. No versions are pinned,
# so the installed releases depend on when the cell is run.
!pip install keplergl
!pip3 install pickle5
!pip install pygeohash
!pip install lightgbm
!pip install h3
# NOTE(review): no destination argument follows the glob in this `cp` call —
# confirm where the cianparser sources were meant to be copied.
!cp -r /kaggle/input/cian-parser-v1/cianparser-main/* 
# Editable install of cianparser straight from the Kaggle input directory.
!pip install -e /kaggle/input/cian-parser-v1/cianparser-main
!jupyter nbextension install --py --sys-prefix keplergl # can be skipped for notebook 5.3 and above
!jupyter nbextension enable --py --sys-prefix keplergl # can be skipped for notebook 5.3 and above

In [2]:
import sys
import pandas as pd
import os
from keplergl import KeplerGl
import numpy as np

import pickle5 as pickle
from h3 import h3
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
import math
from sklearn.model_selection import KFold

from  sklearn.metrics import mean_absolute_error

import pygeohash as pgh
import seaborn as sns

def load_pickle(file_path):
    """Read the pickle file at `file_path` and return the deserialized object.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded
        
def save_pickle(obj, filepath):
    """Serialize `obj` into the file at `filepath` with the highest pickle protocol.

    An existing file at `filepath` is overwritten.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    with open(filepath, 'wb') as sink:
        pickle.dump(obj, sink, protocol=protocol)
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [6]:
# Maps the Russian business-category column names to short ASCII feature names.
# The mapping is used to rename DataFrame columns while preparing model inputs,
# and its values also define the numeric feature list below.
column_mapper = {
    'Автозапчасти для иномарок': 'autoparts',
    'Авторемонт и техобслуживание (СТО)': 'autoremont',
    'Алкогольные напитки': 'alcohols',
    'Аптеки': 'pharmacies',
    'Банки': 'banks',
    'Быстрое питание': 'fastfood',
    'Доставка готовых блюд': 'delivery',
    'Женская одежда': 'female_clothes',
    'Кафе': 'cafe',
    'Косметика / Парфюмерия': 'cosmetics',
    'Ногтевые студии': 'nails',
    'Овощи / Фрукты': 'vegetables',
    'Парикмахерские': 'hairs',
    'Платёжные терминалы': 'pay_terminals',
    'Постаматы': 'mails',
    'Продуктовые магазины': 'products',
    'Пункты выдачи интернет-заказов': 'internet_orders',
    'Рестораны': 'restaurants',
    'Страхование': 'insurance',
    'Супермаркеты': 'supermarkets',
    'Цветы': 'flowers',
    'Шиномонтаж': 'tires'
}
# No categorical features are configured, so the one-hot encoding steps that
# iterate over this list currently add no columns.
cat_features = [] 
# Numeric model inputs: population plus one column per mapped category.
num_features = ['population'] + list(column_mapper.values())
# Name of the regression target column.
target = 'target'
# Combined feature list (categorical first, then numeric).
features = cat_features + num_features

In [7]:
def population_metric(hexs, top_n=20, city=None):
    """Return the population uplift (in percent) gained from the best uncovered hexagons.

    The first `top_n` entries of `hexs` that are currently uncovered are used as
    candidate locations. Every uncovered population cell whose nearest candidate
    is closer than `mean_radius` counts as newly covered.

    The function reads the module-level names `uncovered`, `total_peoples`,
    `df_isochrones`, `df_population`, `geo_id` and `mean_radius`.

    Parameters
    ----------
    hexs : iterable
        Hexagon ids (values of the `geo_id` column), best candidates first.
    top_n : int, default 20
        Maximum number of uncovered candidates to keep.
    city : hashable, optional
        Key into the per-city `uncovered` / `total_peoples` dicts built later in
        the notebook. When omitted, both globals are used as-is (a flat
        collection of ids and a scalar total), which is the legacy behaviour.

    Returns
    -------
    float
        ``newly_covered_population / total_peoples * 100``; ``0.0`` when there
        is no uncovered candidate or no uncovered population cell.
    """
    # `uncovered` and `total_peoples` are rebound to {city: ...} dicts further
    # down the notebook; pick the per-city entry when a city is given.
    uncovered_ids = uncovered if city is None else uncovered[city]
    covered_total = total_peoples if city is None else total_peoples[city]

    best_hexs = set([hex_id for hex_id in hexs if hex_id in uncovered_ids][:top_n])
    candidate_points = df_isochrones.loc[df_isochrones[geo_id].isin(best_hexs), ['lon', 'lat']].values
    df_uncovered = df_population[df_population[geo_id].isin(uncovered_ids)]

    # KDTree cannot be built from / queried with an empty array.
    if len(candidate_points) == 0 or df_uncovered.empty:
        return 0.0

    small_tree = KDTree(candidate_points)
    dist, _ = small_tree.query(df_uncovered[['lon', 'lat']].values, k=1)
    # `query` returns an (n, 1) array; flatten it to a plain row mask.
    new_peoples = df_uncovered.loc[dist[:, 0] < mean_radius, 'population'].sum()
    uplift = new_peoples / covered_total * 100

    return uplift

In [8]:

def drop_dups(df):
    """Return `df` with only the first row kept for each value of the `geo_id` column."""
    key_columns = [geo_id]
    deduplicated = df.drop_duplicates(subset=key_columns)
    return deduplicated

# Location of the competition data and the hexagon-id column shared by all tables.
ROOT = '/kaggle/input/geo-branch-data/train/train'
geo_id = 'geo_h3_10'

# Every table is reduced to one row per hexagon id right after loading.
df_population = drop_dups(pd.read_csv(f"{ROOT}/rosstat_population_all_cities.csv"))
df_isochrones = drop_dups(pd.read_csv(f"{ROOT}/isochrones_walk_dataset.csv"))
df_companies = drop_dups(pd.read_csv(f"{ROOT}/osm_amenity.csv"))
df_target = drop_dups(
    pd.read_csv(
        '/kaggle/input/bankmachinesrussia/target_hakaton_spb.csv',
        sep=';',
        encoding='Windows-1251',
    )
)

# Lookup: hexagon id -> (lat, lon), taken from the isochrone table.
geo_id_mapper = dict(
    zip(df_isochrones[geo_id], zip(df_isochrones['lat'], df_isochrones['lon']))
)

    
def _scale_and_encode(_df):
    """Rename the category columns, z-score numeric features and one-hot encode categorical ones.

    Shared by `prepare_df` and `prepare_df2`. Statistics are computed on the
    frame that is passed in, so each call is standardized independently.
    """
    _df = _df.rename(columns=column_mapper)
    for feature in num_features:
        centered = _df[feature] - _df[feature].mean()
        spread = _df[feature].std()
        # A constant column has std 0 and a single-row frame has std NaN;
        # dividing by either fills the feature with NaN/inf and breaks the
        # downstream model, so such columns are only centred (all zeros).
        if not np.isfinite(spread) or spread == 0:
            spread = 1.0
        _df[feature] = centered / spread

    for feature in cat_features:
        _df[f'{feature}_ohe'] = _df[feature]
    return pd.get_dummies(_df, columns=[f'{col}_ohe' for col in cat_features], prefix=cat_features)


def prepare_df(df):
    """Join population and company counts onto `df` and return the scaled feature frame.

    `df` is left-joined with `df_population` and `df_companies` on `geo_id`;
    the unsuffixed `lat`/`lon`/`city` columns are dropped after each join and
    missing values are replaced with 0 before scaling.

    Returns
    -------
    pandas.DataFrame
        All remaining columns of the joined frame, with `num_features`
        standardized and `cat_features` one-hot encoded.
    """
    _df = df.merge(df_population, on=geo_id, suffixes=(None, '_y'), how='left').drop(['lat', 'lon', 'city'], axis=1)
    _df = _df.merge(df_companies, on=geo_id, suffixes=(None, '_y'), how='left').drop(['city', 'lat', 'lon'], axis=1).fillna(0)

    return _scale_and_encode(_df)


def prepare_df2(df):
    """Build the model input matrix for a frame that already carries `population`.

    `df` is left-joined with `df_companies` on `geo_id` (row order of `df` is
    kept), missing values become 0, and the features are scaled and encoded.

    Returns
    -------
    numpy.ndarray
        One row per row of `df`, columns ordered as `num_features` followed by
        the one-hot columns.
    """
    _df = df.merge(df_companies, on=geo_id, suffixes=(None, '_y'), how='left').drop(['city', 'lat', 'lon'], axis=1).fillna(0)
    _df = _scale_and_encode(_df)

    ohe_cols = [col for col in _df.columns if col.startswith(tuple([item + '_' for item in cat_features]))]
    X = _df[num_features + ohe_cols].values

    return X

def prepare_target(_df):
    """Normalize the target per ATM, z-score it and drop high outliers.

    The `target` column is divided by `atm_cnt`, standardized to zero mean and
    unit standard deviation, and rows lying 3 or more standard deviations above
    the mean are removed.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with the `target` column and an `atm_cnt` column. It is not
        modified; the transformation is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with the transformed `target` column.
    """
    # Work on a copy so the column assignments below do not overwrite the
    # caller's data.
    _df = _df.copy()
    _df[target] = _df[target] / _df['atm_cnt']
    _df[target] = (_df[target] - _df[target].mean()) / _df[target].std()

    # NOTE(review): the filter is one-sided — only values far above the mean are
    # removed; confirm whether low outliers should be dropped as well.
    _df = _df[_df[target] - _df[target].mean() < 3 * _df[target].std()]
    return _df

def get_regression(df_target):
    """Cross-validate a LassoCV model on the labelled hexagons and return it fitted on all rows.

    Features come from `prepare_df`, the target from `prepare_target`; the
    target is additionally min-max scaled to [0, 1]. The mean absolute error
    over a 5-fold split is printed.

    Parameters
    ----------
    df_target : pandas.DataFrame
        Labelled rows; must contain `geo_id`, the `target` column and `atm_cnt`.

    Returns
    -------
    sklearn.linear_model.LassoCV
        Estimator fitted on the complete prepared data set.
    """
    df_target = prepare_df(df_target)
    df_target = prepare_target(df_target)

    ohe_cols = [col for col in df_target.columns if col.startswith(tuple([item + '_' for item in cat_features]))]
    X = df_target[num_features + ohe_cols].values
    y = df_target[target].values
    y = (y - y.min()) / (y.max() - y.min())

    k_fold = KFold(5)
    scores = []
    lr = linear_model.LassoCV(max_iter=1000)
    for train, test in k_fold.split(X, y):
        lr.fit(X[train], y[train])
        scores.append(mean_absolute_error(y[test], lr.predict(X[test])))
    print(f'Mean linear regression MAE: {float(np.mean(scores))}')

    # The folds above are only for the error estimate. Refit on every row so
    # the returned model is not just whatever the last fold happened to train.
    lr.fit(X, y)

    return lr
# Fit the scoring model on the labelled hexagons; `lr` is used further down to
# score the uncovered hexagons of each city.
lr = get_regression(df_target)
# --- Disabled experiment, kept for reference ---
# Joins population onto the isochrone table and fills missing population values
# with the mean population of the 3 nearest population cells.
# df_isochrones = df_isochrones.merge(df_population, on=geo_id, how='left', suffixes=[None, '_y']).drop(['lat_y', 'lon_y', 'city_y'], axis=1)
# pop_values = df_population['population'].tolist()

# pop_tree = KDTree(df_population[['lon', 'lat']].values)
# def replace_na(row, tree):
#     if not math.isnan(row['population']):
#         return row
#     lon = float(row['lon'])
#     lat = float(row['lat'])
#     dist, ind = tree.query([[lon, lat]], k=3)
#     row['population'] = float(np.mean([pop_values[i] for i in ind[0]]))
#     return row
    
# df_isochrones = df_isochrones.apply(lambda row: replace_na(row, pop_tree), axis=1)

def get_mean_radius(df_isochrones, column='walk_15min'):
    """Return the average isochrone "radius" over all rows of `df_isochrones`.

    For each row the radius is the mean Euclidean distance, in the units of the
    `lat`/`lon` columns, from the row's (`lat`, `lon`) point to the vertices of
    the first ring of its WKT polygon.

    Parameters
    ----------
    df_isochrones : pandas.DataFrame
        Must contain `lat`, `lon` and the WKT polygon column named by `column`.
    column : str, default 'walk_15min'
        Name of the column holding the polygon as WKT text with
        ``lon lat`` vertex order.

    Returns
    -------
    float
        Mean of the per-row radii.
    """
    def get_radius(row):
        lat = float(row['lat'])
        lon = float(row['lon'])
        wkt = row[column]
        # Keep only the first ring: skip the geometry tag and opening
        # parentheses, then cut at the first closing parenthesis. This also
        # copes with interior rings (holes) and with "POLYGON((" spacing.
        ring = wkt[wkt.index('('):].lstrip('(').split(')', 1)[0]
        points = ring.split(',')
        dist = 0
        for point_str in points:
            items = point_str.split()
            _lon = float(items[0])
            _lat = float(items[1])
            dist += np.sqrt((lat - _lat) ** 2 + (lon - _lon) ** 2)
        return float(dist) / len(points)

    mean_radius = (df_isochrones.apply(get_radius, axis=1)).mean()

    return mean_radius

# Average isochrone radius; used below as the coverage distance threshold.
mean_radius = get_mean_radius(df_isochrones)
print(f'Mean isochrone radius: {mean_radius}')

cities = df_isochrones['city'].drop_duplicates().tolist()
print('Available cities: ', cities)

# Hexagons that appear in the labelled target data.
filled_hexs = set(df_target[geo_id].drop_duplicates().tolist())

# Per-city state, keyed by city name.
filled_trees = {}      # KDTree over the city's filled hexagons
uncovered = {}         # hexagon ids farther than mean_radius from any filled hexagon
covered = {}           # hexagon ids closer than mean_radius to a filled hexagon
total_peoples = {}     # population living in the covered hexagons
uncovered_df = {}      # population rows of uncovered hexagons, with a model `score`
uncovered_trees = {}   # KDTree over the uncovered population rows

for city in cities:
    city_isochrones = df_isochrones[df_isochrones['city'] == city]
    city_population = df_population[df_population['city'] == city]

    filled_trees[city] = KDTree(city_isochrones.loc[city_isochrones[geo_id].isin(filled_hexs), ['lon', 'lat']].values)
    dist, ind = filled_trees[city].query(city_isochrones[['lon', 'lat']].values, k=1)
    # `query` returns an (n, 1) array; flatten it to a plain row mask.
    nearest_filled = dist[:, 0]
    uncovered[city] = set(city_isochrones.loc[nearest_filled > mean_radius, geo_id])
    covered[city] = set(city_isochrones.loc[nearest_filled < mean_radius, geo_id])

    peoples = city_population['population'].sum()
    total_peoples[city] = city_population.loc[city_population[geo_id].isin(covered[city]), 'population'].sum()
    print(f'City - {city}, peoples  - {peoples}, peoples covered - {total_peoples[city]}')

    # Take an explicit copy: adding the `score` column to a filtered slice of
    # df_population triggers pandas' SettingWithCopyWarning.
    df = city_population[city_population[geo_id].isin(uncovered[city])].copy()
    df['score'] = lr.predict(prepare_df2(df))
    uncovered_df[city] = df
    uncovered_trees[city] = KDTree(uncovered_df[city][['lon', 'lat']].values)
    

def get_predictions(city, uncovered_df, total_peoples, uncovered_trees, mean_radius, geo_id='geo_h3_10', n=20):
    """Greedily pick up to `n` uncovered hexagons that maximise score-weighted population lift.

    In every round each candidate hexagon is valued as
    ``(not-yet-covered population within mean_radius) / total_peoples[city] * 100 * score``
    and the best candidate is selected; its neighbours are then marked covered.

    Parameters
    ----------
    city : hashable
        Key into `uncovered_df`, `total_peoples` and `uncovered_trees`.
    uncovered_df : dict
        city -> DataFrame with `population`, `score`, `lon`, `lat` and `geo_id` columns.
    total_peoples : dict
        city -> population total used as the denominator of the lift.
    uncovered_trees : dict
        city -> spatial index exposing ``query_radius(points, r=...)`` built on
        the rows of ``uncovered_df[city]``.
    mean_radius : float
        Neighbourhood radius passed to ``query_radius``.
    geo_id : str, default 'geo_h3_10'
        Name of the hexagon-id column.
    n : int, default 20
        Number of greedy rounds.

    Returns
    -------
    lifts : list
        Cumulative weighted lift after each round; always `n` entries.
    selected : list
        Hexagon ids in selection order. Contains fewer than `n` entries when
        no remaining candidate adds a positive gain.
    """
    city_df = uncovered_df[city]
    uncovered_pops = city_df['population'].tolist()
    uncovered_hexs = city_df[geo_id].tolist()
    uncovered_scores = city_df['score'].tolist()

    indices = uncovered_trees[city].query_radius(city_df[['lon', 'lat']].values, r=mean_radius)
    # Neighbour ids never change between rounds, so build the sets once.
    neighbours = [set(items) for items in indices]

    tmp_covered = set()
    selected = []
    total_lift = 0
    lifts = []
    for _ in range(n):
        best_lift = 0
        best_item = None
        for item_id, items in enumerate(neighbours):
            added = sum([uncovered_pops[item] for item in items if item not in tmp_covered])
            lift = added / total_peoples[city] * 100
            gain = lift * uncovered_scores[item_id]
            if gain > best_lift:
                best_lift = gain
                best_item = item_id

        if best_item is None:
            # Nothing left adds a positive gain. Do not report a bogus pick
            # (previously hexagon 0 was appended again); the lift curve simply
            # stays flat for the remaining rounds.
            lifts.extend([total_lift] * (n - len(lifts)))
            break

        total_lift += best_lift
        lifts.append(total_lift)
        selected.append(uncovered_hexs[best_item])
        tmp_covered.update(neighbours[best_item])

    return lifts, selected

# Top-5 recommendation for the second city in `cities`, with the chosen
# hexagons translated to (lat, lon) pairs.
total_lift, selected = get_predictions(
    cities[1], uncovered_df, total_peoples, uncovered_trees, mean_radius, geo_id, 5
)
coords = [geo_id_mapper[hex_id] for hex_id in selected]
print(f'Total lift: {total_lift[-1]}')
print('Hexagon indices: ', selected)
print(coords)

In [9]:
import seaborn as sns
city_id = 1
# Number of greedy rounds to evaluate for the lift curve.
n_terminals = 30
lifts, selected = get_predictions(cities[city_id], uncovered_df, total_peoples, uncovered_trees, mean_radius, geo_id, n_terminals)
print(len(lifts))
print(cities[city_id])
# Derive the x axis from the result instead of repeating the literal 30, so the
# plot keeps working when `n_terminals` changes.
ax = sns.lineplot(x=np.arange(1, len(lifts) + 1), y=lifts)
ax.set(xlabel='Number of selected hexagons', ylabel='Cumulative score-weighted lift');

In [10]:
# Cumulative lift after the first greedy round, i.e. the gain of the first pick.
lifts[0]