# Инструкция
Используя исходные или очищенные данные, сформируйте предсказание класса объявления из множества exposition_test.tsv.gz

Обязательно нужно использовать одну или несколько моделей кластеризации. Дополнительно можно использовать решающие деревья, CatBoost, LightGBM и XGBoost.

Подсказка: для использования day_mean в классификации/кластеризации потребуется его сформировать для тестовых данных. Это можно сделать либо при помощи других моделей (два этапа классификации), либо построив линейную модель прогноза day_mean от count_day.

Данные:
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_train.tsv.gz
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz
* https://video.ittensive.com/machine-learning/hacktherealty/data/metro.utf8.json
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_sample_submisson.tsv

Итоговый файл с кодом (.py или .ipynb) выложите в github с портфолио.
Ответ

In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from transliterate import translit
from outliers import smirnov_grubbs as grubbs
from sklearn.ensemble import ExtraTreesClassifier
pd.set_option('display.max_columns', 200)


def data_preproccesing (data):
# add total items per day
    data_day_count = data.groupby("day").count()["build_year"]
    data["day_count"] = data["day"].apply(lambda x:data_day_count.loc[x])
# approximate values (clean-up)
    data.loc[data.build_year == 0, 'build_year'] = np.NaN
    data['build_year'] = data['build_year'].fillna((data.groupby(['building_series_id'])['build_year'].transform('median')))
    data.loc[data['build_year'].isna(), 'build_year'] = data['build_year'].mean()
    data['build_year'] = data['build_year'].astype(np.int16)
    if 'has_elevator' in data.columns:
# elevator for 6+ floors
        data.loc[(data.has_elevator==0) & (data.floor>5), 'has_elevator'] = 1
# fix living area
    data.loc[data.living_area == 0, 'living_area'] = np.NaN
    data['living_area'] = data['living_area'].fillna((data.groupby(['rooms'])['living_area'].transform('median')))
# fix price
    data.loc[data.price<100, 'price'] *= 1000
    data.loc[data.price<1000, 'price'] *= 60
    if 'floors_total' in data.columns:
# fix celing height
        data.loc[(data.ceiling_height<2) | (data.ceiling_height>5), 'ceiling_height'] = np.NaN
        data['ceiling_height'] = data['ceiling_height'].fillna(data.groupby(['building_series_id'])['ceiling_height'].transform('median'))
        data.loc[data['ceiling_height'].isna(), 'ceiling_height'] = data['ceiling_height'].mean()
# enrich data, % floor
        data['floor'] = data['floor'] / data["floors_total"]
# locality, village/region/moscow/metro
    if 'locality_name' in data.columns:
        data['loctype_village'] = (data['locality_name'].str.match(pat = 'городок|деревня|ДНП|поселок|посёлок|село|СНТ|товарищество|хутор')).astype(np.uint8)
        data['loctype_moscow'] = (data.locality_name == 'Москва').astype(np.uint8)
        data['loctype_region'] = ((data.loctype_village == 0) & (data.loctype_moscow == 0)).astype(np.uint8)
    if "site_id" in data.columns:
        data = data.drop(['site_id', 'main_image', 'area', 'building_id', 'unified_address'], axis=1)
    if 'target_string' in data.columns:
        data = data.drop(['target_string'], axis=1)
# processing date
    if 'day' in data.columns:
        data['day'] = pd.to_datetime(data['day'])
        data['year'] = data['day'].dt.year
        data['month'] = data['day'].dt.month
        data['week'] = data['day'].dt.week
        data['dow'] = data['day'].dt.dayofweek
        data['dom'] = data['day'].dt.day
        data['doy'] = data['day'].dt.dayofyear
        data = data.drop(["day"], axis=1)
# adding holydays, 1-7 Jan, 8 Mar, 1 May, 9 May, 12 Jun, 4 Nov
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2017/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2018/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2019/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2020/
        data['is_holyday'] = ((data['year'] == 2017 &
                                (((data['dom'] > 0) & (data['dom'] < 8) & data['month'] == 1) | 
                                (((data['dom'] == 23) | data['dom'] == 24)) & (data['month'] == 2)) |
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 8) | (data['dom'] == 9)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 6) & (data['month'] == 11))) |
                              ((data['year'] == 2018) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 23) & (data['month'] == 2)) |
                                (((data['dom'] == 8) | (data['dom'] == 9)) & (data['month'] == 3)) |
                                ((data['dom'] == 30) & (data['month'] == 4)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 9)) & data['month'] == 5) |
                                (((data['dom'] == 11) | (data['dom'] == 12)) & (data['month'] == 6)) |
                                ((data['dom'] == 5) & (data['month'] == 11)) |
                                ((data['dom'] == 31) & (data['month'] == 12)))) |
                              ((data['year'] == 2019) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 3) | (data['dom'] == 9) | (data['dom'] == 10)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11)))) |
                              ((data['year'] == 2020) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 24) & (data['month'] == 2)) |
                                ((data['dom'] == 9) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 4) | (data['dom'] == 5) | (data['dom'] == 11)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11))))).astype(np.uint8)
# one-hot vectors
    if 'year' in data.columns:
        for label in ['year', 'month', 'week', 'dow', 'doy', 'dom', 'renovation',
                      'balcony', 'building_type', 'parking', 'floors_total', 'locality_name']:
            for l in data[label].unique():
                data[label + "_" + translit(str(l), "ru", reversed=True)] = (data[label] == l).astype(np.uint8)
# boolean -> int
    if 'studio' in data.columns:
        for label in ['studio', 'has_elevator', 'expect_demolition', 'is_apartment']:
            data[label] = data[label].astype(np.uint8)
# index (remove id from columns)
    if 'id' in data.columns:
        data = data.set_index(['id'])
    return data


def calc_price (data, group="", label=""):
    if data[group] in price_groups[group][label]:
        return data["price"] / price_groups[group][label][data[group]]
    else:
        return 1

# Сопоставление "кластер -> класс" на обучающих данных

In [2]:
train_df = pd.read_csv(
    "https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz"
).drop("doy_108", axis=1)

scaler = StandardScaler()
train_x_df = pd.DataFrame(scaler.fit_transform(train_df[train_df.columns[:-1]]), columns=train_df.columns[:-1])
train_x_df.shape

(356500, 7)

#### Фильтрация выборосов с помощью теста Смирнова-Граббса

In [3]:
%%time
anomalies = []
for column in train_x_df.columns:
    print("Тест Смирнова-Граббса, обработка столбца:", column)
    anomaly = grubbs.two_sided_test_indices(train_x_df[column].to_numpy(), alpha=0.01)
    anomalies.extend(anomaly)

Тест Смирнова-Граббса, обработка столбца: total_area
Тест Смирнова-Граббса, обработка столбца: ceiling_height
Тест Смирнова-Граббса, обработка столбца: rooms
Тест Смирнова-Граббса, обработка столбца: living_area
Тест Смирнова-Граббса, обработка столбца: price
Тест Смирнова-Граббса, обработка столбца: day_mean
Тест Смирнова-Граббса, обработка столбца: price_locality_name_median
CPU times: total: 1min 45s
Wall time: 1min 45s


In [4]:
train_df = train_df[~train_df.index.isin(anomalies)]
train_x_df = pd.DataFrame(scaler.transform(train_df[train_df.columns[:-1]]), columns=train_df.columns[:-1])
train_y_df = pd.DataFrame(train_df["target"])

#### Кластеризация без учёта выбросов

In [5]:
%%time
kmeans = KMeans(n_clusters=100, random_state=11, max_iter=200, n_init=10).fit(train_x_df)

CPU times: total: 31min 53s
Wall time: 2min 41s


In [6]:
train_y_df["cluster"] = kmeans.labels_
train_y_df.head()

Unnamed: 0,target,cluster
0,1,80
1,2,19
2,2,91
3,2,52
4,3,80


In [7]:
cluster_to_target = np.round(train_y_df.groupby("cluster")["target"].mean()).astype("int8")
cluster_to_target[:5]

cluster
0    3
1    3
2    3
3    3
4    4
Name: target, dtype: int8

# Построение модели для восстановления day_mean

In [8]:
train_df = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_train.tsv.gz', sep='\t')
train_df.head()

Unnamed: 0,building_series_id,site_id,target,parking,target_string,build_year,expect_demolition,main_image,latitude,total_area,ceiling_height,rooms,floors_total,id,living_area,floor,is_apartment,building_id,has_elevator,studio,unified_address,area,kitchen_area,day,longitude,price,flats_count,building_type,balcony,locality_name,renovation
0,1564812,0,1,OPEN,LESS_7,2005,False,//avatars.mds.yandex.net/get-realty/903734/add...,55.645313,105.0,3.0,3,20,5677548107212057955,50.0,14,False,7969879732878112812,True,False,"Россия, Москва, Пролетарский проспект, 7",105.0,15.0,2018-07-15,37.65749,95000,407,MONOLIT,BALCONY,Москва,EURO
1,1564812,0,2,CLOSED,7_14,2010,False,//avatars.mds.yandex.net/get-realty/1702013/ad...,55.537102,40.0,3.0,1,3,155646401125694364,0.0,1,False,7667415960903930340,False,False,"Россия, Москва, посёлок Первомайское, Централь...",40.0,10.0,2019-01-18,37.155632,25000,40,MONOLIT,UNKNOWN,посёлок Первомайское,COSMETIC_DONE
2,663302,0,2,OPEN,7_14,1995,False,//avatars.mds.yandex.net/get-realty/924080/add...,55.662956,37.599998,2.64,0,17,9186198458182518100,0.0,4,False,7166215405310646476,True,True,"Россия, Москва, улица Намёткина, 13к1",37.599998,0.0,2018-04-24,37.555466,26000,472,PANEL,LOGGIA,Москва,GOOD
3,1564812,0,2,OPEN,7_14,2018,False,//avatars.mds.yandex.net/get-realty/1521999/ad...,55.669151,80.0,0.0,3,27,10844743366553352344,49.0,23,False,2039402855860137453,True,False,"Россия, Московская область, Одинцово, Верхне-П...",80.0,20.0,2019-02-19,37.285,35000,156,PANEL,UNKNOWN,Одинцово,GOOD
4,1564812,0,3,UNKNOWN,14_30,2004,False,//avatars.mds.yandex.net/get-realty/50286/f5c8...,55.828518,100.0,3.0,3,4,3712912186792420056,0.0,3,False,4638454967482853510,True,False,"Россия, Москва, улица Рословка, 12к1",100.0,0.0,2017-08-08,37.361897,80000,31,MONOLIT,UNKNOWN,Москва,EURO


In [9]:
DAY_MODEL_FEATURES = ["day_count", "rooms", "longitude", "latitude"]

day_to_tickets_number = train_df.groupby("day")["target"].count()
day_to_tickets_mean_number = np.round(train_df.groupby("day")["target"].mean()).astype(int)
train_df["day_count"] = train_df["day"].map(lambda day: day_to_tickets_number[day])
train_df["day_mean"] = train_df["day"].map(lambda day: day_to_tickets_mean_number[day])

day_model = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=11).fit(
    train_df[DAY_MODEL_FEATURES], train_df["day_mean"]
)

# Подготовка тестовых данных

In [10]:
test_df = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz', sep='\t')
day_to_tickets_number = test_df.groupby("day")["total_area"].count()
test_df["day_count"] = test_df["day"].map(lambda day: day_to_tickets_number[day])
test_df["day_mean"] = day_model.predict(test_df[DAY_MODEL_FEATURES])

In [12]:
test_df = data_preproccesing(test_df)
test_df.head()

Unnamed: 0_level_0,building_series_id,parking,build_year,expect_demolition,latitude,total_area,ceiling_height,rooms,floors_total,living_area,floor,is_apartment,has_elevator,studio,kitchen_area,public,longitude,price,flats_count,building_type,balcony,locality_name,renovation,day_count,day_mean,loctype_village,loctype_moscow,loctype_region,year,month,week,dow,dom,doy,is_holyday,year_2020,year_2019,month_1,month_11,month_3,month_12,month_2,week_4,week_47,week_2,week_5,week_10,week_52,week_3,week_45,week_6,week_7,week_11,week_12,week_51,week_46,week_49,week_50,week_48,week_13,week_1,week_14,week_44,dow_5,dow_1,dow_0,dow_2,dow_6,dow_3,dow_4,doy_25,doy_323,doy_11,doy_27,doy_64,doy_62,doy_357,doy_359,doy_19,doy_7,doy_313,doy_36,doy_310,doy_44,doy_8,doy_74,doy_77,doy_352,doy_37,doy_16,doy_319,doy_338,doy_346,doy_40,doy_15,doy_360,doy_363,doy_348,doy_355,doy_351,...,locality_name_selo Dubna,locality_name_derevnja Chuprjakovo,locality_name_selo Luzhniki,locality_name_derevnja Beljaninovo,locality_name_derevnja Trubacheevka,locality_name_derevnja Hljupino,locality_name_Roshal',locality_name_selo Strokino,locality_name_selo Ostaf'evo,locality_name_selo Orud'evo,locality_name_derevnja Golikovo,locality_name_poselok Remmash,locality_name_poselok Zverosovhoza,locality_name_derevnja Koptelino,locality_name_kottedzhnyj poselok Nikolina Poljana,locality_name_derevnja Isakovo,locality_name_poselok doma otdyha Gorki,locality_name_poselok Kolychevo,locality_name_derevnja Vatutinki,locality_name_derevnja Zhitnevo,locality_name_poselok Gazoprovodsk,locality_name_derevnja Skolkovo,locality_name_derevnja Zhuchki,locality_name_poselok Povedniki,locality_name_derevnja Meshkovo,locality_name_SNT Poljana Veteranov,locality_name_poselok Zheleznodorozhnyj,locality_name_SNT Veteran-2,locality_name_derevnja Levoshevo,locality_name_derevnja Krivtsovo,locality_name_poselok sanatorija imeni Gertsena,locality_name_derevnja Timonovo,locality_name_Ozery,locality_name_SNT Dudkino-1,locality_name_selo Sokol'nikovo,locality_name_derevnja Strelino,locality_name_derevnja Mamonovo,locality_name_poselok Zhukovo,locality_name_hutor Il'ichevka,locality_name_kottedzhnyj poselok Beljaninovo,locality_name_SNT Gavrikovo-1,locality_name_DPK Cheremushki,locality_name_derevnja Suharevo,locality_name_derevnja Ol'javidovo,locality_name_selo Akulovo,locality_name_poselok Shevljakovo,locality_name_poselok Agrogorodok,locality_name_kottedzhnyj poselok Izumrudnyj,locality_name_SNT Druzhba,locality_name_kottedzhnyj poselok Bungalo Klub Kurovo,locality_name_derevnja Klisheva,locality_name_SNT Dubki,locality_name_derevnja Pogorelki,locality_name_selo Kamenskoe,locality_name_poselok Kuznechiki,locality_name_poselok Usady,locality_name_poselok Instituta Poliomielita,locality_name_derevnja Bol'shie Zherebtsy,locality_name_poselok Junost',locality_name_poselok Vinogradovo,locality_name_selo Novoe,locality_name_poselok Kosmodem'janskij,locality_name_SNT Berezka-Kommunarka-1,locality_name_dachnoe nekommercheskoe obschestvo Lunevo,locality_name_derevnja Mar'ino,locality_name_rabochij poselok Zhilevo,locality_name_poselok pansionata Polushkino,locality_name_selo Shemetovo,locality_name_derevnja Evseevo,locality_name_poselok Svetlye Gory,locality_name_poselok Lugovoj,locality_name_derevnja L'jalovo,locality_name_derevnja Grebnevo,locality_name_derevnja Rajsemenovskoe,locality_name_poselok Novoe Grishino,locality_name_sadovoe tovarischestvo Anis,locality_name_derevnja Izmalkovo,locality_name_derevnja Shohovo,locality_name_derevnja Dokukino,locality_name_selo Bylovo,locality_name_selo Nepetsino,locality_name_SNT Malyj Kar'er,locality_name_SNT Berezka-Kommunarka,locality_name_SNT fabriki imeni 1 Maja,locality_name_poselok Novoselki,locality_name_derevnja Kostino,locality_name_kottedzhnyj poselok Varezhki-2,locality_name_poselok Pirogovo,locality_name_derevnja Leont'evo,locality_name_derevnja Svatkovo,locality_name_derevnja Kabanovo,locality_name_derevnja Ivojlovo,locality_name_selo Vozdvizhenskoe,locality_name_derevnja Dolgoe Ledovo,locality_name_derevnja Martem'janovo,locality_name_poselok Veshki,locality_name_poselok Radiotsentr,locality_name_derevnja Zhilino,locality_name_poselok Shuvoe,locality_name_derevnja Vorschikovo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1
13762887891614807236,663294,UNKNOWN,1971,0,55.795704,36.0,2.64,1,12,19.0,0.833333,0,1,0,0.0,True,37.602478,40000,80,PANEL,UNKNOWN,Москва,UNKNOWN,352,3,0,1,0,2020,1,4,5,25,25,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14654451946329972059,712125,UNKNOWN,1986,0,55.605583,40.0,2.48,1,16,20.0,0.5625,0,1,0,10.0,True,37.743679,25000,222,PANEL,LOGGIA,Москва,COSMETIC_DONE,553,3,0,1,0,2019,11,47,1,19,323,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17449292585625593873,0,UNKNOWN,2014,0,55.92556,25.0,2.7,0,16,12.0,1.0,0,1,1,0.0,True,37.862965,19000,179,MONOLIT,LOGGIA,Королёв,COSMETIC_DONE,381,3,0,0,1,2020,1,2,5,11,11,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15597282206699587329,0,UNKNOWN,2001,0,55.432522,42.0,2.7,1,10,20.0,0.4,0,1,0,10.0,True,37.544224,20000,0,PANEL,LOGGIA,Подольск,COSMETIC_DONE,501,3,0,0,1,2020,1,5,0,27,27,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3718201047023531068,1564812,UNKNOWN,2019,0,55.91753,73.300003,2.8,3,16,45.799999,0.375,0,1,0,10.2,False,37.411098,68000,0,MONOLIT,TWO_LOGGIA,Химки,EURO,464,2,0,0,1,2020,3,10,2,4,64,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
price_data = test_df[["locality_name", "price"]]
price_groups = {
    "locality_name": {
        "median": price_data.groupby(["locality_name"])["price"].median()
    }
}
for group in price_groups:
    print("Processing:", group, end=" ")
    for label in price_groups[group]:
        print (label, end=" ")
        test_df["price_" + group + "_" + label] = test_df.apply(calc_price, axis=1, group=group, label=label)
    print ("")

Processing: locality_name median 


In [14]:
train_df = pd.read_csv(
    "https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz"
).drop("doy_108", axis=1)
test_df[train_df.columns[:-1]].to_csv("exposition_test.basic.csv", index=False)

# Формирование предсказания

In [15]:
test_df = pd.read_csv('exposition_test.basic.csv')
x_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
x_df.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,price_locality_name_median
0,-0.618155,-0.645106,-0.861151,-0.690542,-0.089888,0.240201,-0.168322
1,-0.483202,-1.427399,-0.861151,-0.635189,-0.217544,0.240201,-0.276751
2,-0.989277,-0.351746,-2.026671,-1.078012,-0.268606,0.240201,-0.199947
3,-0.415725,-0.351746,-0.861151,-0.635189,-0.260095,0.240201,-0.174608
4,0.640286,0.137186,1.469889,0.792913,0.148401,-3.161258,0.277401


In [16]:
predictions = pd.DataFrame(kmeans.predict(x_df), columns=["cluster"])
predictions["target"] = predictions["cluster"].map(lambda cluster: cluster_to_target.loc[cluster])
predictions.head(30)

Unnamed: 0,cluster,target
0,24,3
1,85,3
2,91,3
3,1,3
4,78,2
5,19,3
6,95,1
7,95,1
8,39,3
9,55,1


# Загрузка решения

In [17]:
submission = pd.read_csv(
    'https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_sample_submission.tsv', sep='\t'
)
submission["target"] = predictions["target"]
submission.to_csv('submission_last.tsv', sep='\t', index=False)