In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import os
mdir = Path(Path(os.path.abspath('')).parent)
sys.path.append(str(mdir))
from copy import deepcopy
from copy import copy

import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

In [None]:
mdir/'data'/'train.csv'

In [None]:
train = pd.read_csv(mdir/'data'/'train.csv')

In [None]:
train['city'].unique()

In [None]:
test = pd.read_csv(mdir/'data'/'test.csv')

In [None]:
train['is_train'] = 1
test['is_train'] = 0

full_df = pd.concat([train, test])

In [None]:
set(test.city) - set(train.city)

In [None]:
city = train[['city','id']].groupby(by='city').id.count().reset_index()
city.sort_values('id', ascending=False)

In [None]:
city = test[['city','id']].groupby(by='city').id.count().reset_index()
city.sort_values('id', ascending=False)

In [None]:
train[train['osm_city_nearest_name'].isin(['饶河县', '绥芬河市'])].sample(5) # osm_city_closest_dist туда же - поменять

In [None]:
train.date.min(), train.date.max()

In [None]:
test.date.min(), test.date.max()

In [None]:
train.price_type.unique(), test.price_type.unique()

In [None]:
def get_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Формирует time-признаки
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = train.copy()
    
    df_new['date'] = pd.to_datetime(df_new['date'], format='%Y-%m-%d')

    df_new['year'] = df_new['date'].dt.year 
    df_new['month'] = df_new['date'].dt.month 
    df_new['day'] = df_new['date'].dt.day

    df_new['dayofweek_num'] = df_new['date'].dt.dayofweek  
    df_new['quarter'] = df_new['date'].dt.quarter

    df_new['dayofyear'] = df_new['date'].dt.dayofyear  
    df_new['weekofyear'] = df_new['date'].dt.weekofyear
    
    return df_new

In [None]:
full_df = get_time_features(full_df)

In [None]:
def change_target_inflation(df: pd.DataFrame) -> pd.DataFrame:
    """
    Изменяет таргет в зависимости от инфляции 
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = df.copy()
    
    for mnth in TRAIN_INFLATION_M:
        df_new[TARGET] = df_new.apply(lambda x:  x[TARGET] + x[TARGET]*TRAIN_INFLATION[mnth-2]/100 if x['month'] >= mnth else x[TARGET],axis=1)
    
    return df_new

In [None]:
TRAIN_INFLATION = [0.40, 0.33, 0.55, 0.83, 0.27, 0.22, 0.35]
TRAIN_INFLATION_M = [2,   3,    4,     5,    6,   7,     8]
                 # фев    мар   апр.  май. июн    июль. авг.  сент
TARGET = 'per_square_meter_price'

In [None]:
full_df['month'].describe() #!= full_df['month']]

In [None]:
full_df = change_target_inflation(full_df)


In [None]:
full_df.sample(5)

In [None]:
def get_territory_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Формирует территориалььные признаки
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = train.copy()
    
    other_001 =  ['osm_amenity_points_in_0.001', 'osm_catering_points_in_0.001',  'osm_shops_points_in_0.001',                                                                                                       'osm_culture_points_in_0.001']
    other_005 =  ['osm_amenity_points_in_0.005', 'osm_catering_points_in_0.005',  'osm_shops_points_in_0.005',  'osm_healthcare_points_in_0.005',  'osm_leisure_points_in_0.005',   'osm_historic_points_in_0.005',  'osm_culture_points_in_0.005']
    other_0075 = ['osm_amenity_points_in_0.0075','osm_catering_points_in_0.0075', 'osm_shops_points_in_0.0075', 'osm_healthcare_points_in_0.0075', 'osm_leisure_points_in_0.0075',  'osm_historic_points_in_0.0075', 'osm_culture_points_in_0.0075']
    other_01 =   ['osm_amenity_points_in_0.01',  'osm_catering_points_in_0.01',   'osm_shops_points_in_0.01',   'osm_healthcare_points_in_0.01',   'osm_leisure_points_in_0.01',    'osm_historic_points_in_0.01',   'osm_culture_points_in_0.01']
     
    df_new['sum_other_001'] = df_new[other_001].sum(axis=1)
    df_new['sum_other_005'] = df_new[other_005].sum(axis=1)
    df_new['sum_other_0075'] = df_new[other_0075].sum(axis=1)
    df_new['sum_other_01'] = df_new[other_01].sum(axis=1)
    
    all_mean_001 = df_new['sum_other_001'].mean()
    all_mean_005 = df_new['sum_other_005'].mean()
    all_mean_0075 = df_new['sum_other_0075'].mean()
    all_mean_01 = df_new['sum_other_01'].mean()
    
    df_new['sum_other_001_diff'] = df_new['sum_other_001'] - all_mean_001
    df_new['sum_other_005_diff'] = df_new['sum_other_005'] - all_mean_005
    df_new['sum_other_0075_diff'] = df_new['sum_other_0075'] - all_mean_0075
    df_new['sum_other_01_diff'] = df_new['sum_other_01'] - all_mean_01
    
    df_new['sum_other_001_share'] = df_new['sum_other_001'] / all_mean_001
    df_new['sum_other_005_share'] = df_new['sum_other_005'] / all_mean_005
    df_new['sum_other_0075_share'] = df_new['sum_other_0075'] / all_mean_0075
    df_new['sum_other_01_share'] = df_new['sum_other_01'] / all_mean_01
    
    # regional
    
    all_mean_001 = df_new.groupby('region')['sum_other_001'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_other_001':'sum_other_001_region'})
    all_mean_005 = df_new.groupby('region')['sum_other_005'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_other_005':'sum_other_005_region'})
    all_mean_0075 = df_new.groupby('region')['sum_other_0075'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_other_0075':'sum_other_0075_region'})
    all_mean_01 = df_new.groupby('region')['sum_other_01'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_other_01':'sum_other_01_region'})
    
    df_new = df_new.merge(all_mean_001, how='inner', on='region')
    df_new = df_new.merge(all_mean_005, how='inner', on='region')
    df_new = df_new.merge(all_mean_0075, how='inner', on='region')
    df_new = df_new.merge(all_mean_01, how='inner', on='region')
    
    
    df_new['sum_other_001_diff_regional'] = df_new['sum_other_001'] - df_new['sum_other_001_region']
    df_new['sum_other_005_diff_regional'] = df_new['sum_other_005'] - df_new['sum_other_005_region']
    df_new['sum_other_0075_diff_regional'] = df_new['sum_other_0075'] - df_new['sum_other_0075_region']
    df_new['sum_other_01_diff_regional'] = df_new['sum_other_01'] - df_new['sum_other_01_region']
    
    df_new['sum_other_001_share_regional'] = df_new['sum_other_001'] / df_new['sum_other_001_region']
    df_new['sum_other_005_share_regional'] = df_new['sum_other_005'] / df_new['sum_other_005_region']
    df_new['sum_other_0075_share_regional'] = df_new['sum_other_0075'] / df_new['sum_other_0075_region']
    df_new['sum_other_01_share_regional'] = df_new['sum_other_01'] / df_new['sum_other_01_region']
    
    ###
    
    
    df_new['sum_other_001_diff'] = df_new['sum_other_001'] - df_new['sum_other_001'].mean()
    df_new['sum_other_005_diff'] = df_new['sum_other_005'] - df_new['sum_other_005'].mean()
    df_new['sum_other_0075_diff'] = df_new['sum_other_0075'] - df_new['sum_other_0075'].mean()
    df_new['sum_other_01_diff'] = df_new['sum_other_01'] - df_new['sum_other_01'].mean()
    
    df_new['sum_other_001_share'] = df_new['sum_other_001'] / df_new['sum_other_001'].mean()
    df_new['sum_other_005_share'] = df_new['sum_other_005'] / df_new['sum_other_005'].mean()
    df_new['sum_other_0075_share'] = df_new['sum_other_0075'] / df_new['sum_other_0075'].mean()
    df_new['sum_other_01_share'] = df_new['sum_other_01'] / df_new['sum_other_01'].mean()
     
    
    build_001 =  ['osm_building_points_in_0.001',  'osm_finance_points_in_0.001',                                 'osm_offices_points_in_0.001']
    build_005 =  ['osm_building_points_in_0.005',  'osm_finance_points_in_0.005',  'osm_hotels_points_in_0.005',  'osm_offices_points_in_0.005']
    build_0075 = ['osm_building_points_in_0.0075', 'osm_finance_points_in_0.0075', 'osm_hotels_points_in_0.0075', 'osm_offices_points_in_0.0075']
    build_01 =   ['osm_building_points_in_0.01',   'osm_finance_points_in_0.01',   'osm_hotels_points_in_0.01',   'osm_offices_points_in_0.01']
     
    df_new['sum_build_001'] = df_new[build_001].sum(axis=1)
    df_new['sum_build_005'] = df_new[build_005].sum(axis=1)
    df_new['sum_build_0075'] = df_new[build_0075].sum(axis=1)
    df_new['sum_build_01'] = df_new[build_01].sum(axis=1)
    
    df_new['sum_build_001_diff'] = df_new['sum_build_001'] - df_new['sum_build_001'].mean()
    df_new['sum_build_005_diff'] = df_new['sum_build_005'] - df_new['sum_build_005'].mean()
    df_new['sum_build_0075_diff'] = df_new['sum_build_0075'] - df_new['sum_build_0075'].mean()
    df_new['sum_build_01_diff'] = df_new['sum_build_01'] - df_new['sum_build_01'].mean()
    
    df_new['sum_build_001_share'] = df_new['sum_build_001'] / df_new['sum_build_001'].mean()
    df_new['sum_build_005_share'] = df_new['sum_build_005'] / df_new['sum_build_005'].mean()
    df_new['sum_build_0075_share'] = df_new['sum_build_0075'] / df_new['sum_build_0075'].mean()
    df_new['sum_build_01_share'] = df_new['sum_build_01'] / df_new['sum_build_01'].mean()
    
    # regional
    
    all_mean_001 = df_new.groupby('region')['sum_build_001'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_build_001':'sum_build_001_region'})
    all_mean_005 = df_new.groupby('region')['sum_build_005'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_build_005':'sum_build_005_region'})
    all_mean_0075 = df_new.groupby('region')['sum_build_0075'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_build_0075':'sum_build_0075_region'})
    all_mean_01 = df_new.groupby('region')['sum_build_01'].mean().apply(lambda x: max(1, x)).reset_index().rename(columns={'sum_build_01':'sum_build_01_region'})
    
    df_new = df_new.merge(all_mean_001, how='inner', on='region')
    df_new = df_new.merge(all_mean_005, how='inner', on='region')
    df_new = df_new.merge(all_mean_0075, how='inner', on='region')
    df_new = df_new.merge(all_mean_01, how='inner', on='region')
    
    
    df_new['sum_build_001_diff_regional'] = df_new['sum_build_001'] - df_new['sum_build_001_region']
    df_new['sum_build_005_diff_regional'] = df_new['sum_build_005'] - df_new['sum_build_005_region']
    df_new['sum_build_0075_diff_regional'] = df_new['sum_build_0075'] - df_new['sum_build_0075_region']
    df_new['sum_build_01_diff_regional'] = df_new['sum_build_01'] - df_new['sum_build_01_region']
    
    df_new['sum_build_001_share_regional'] = df_new['sum_build_001'] / df_new['sum_build_001_region']
    df_new['sum_build_005_share_regional'] = df_new['sum_build_005'] / df_new['sum_build_005_region']
    df_new['sum_build_0075_share_regional'] = df_new['sum_build_0075'] / df_new['sum_build_0075_region']
    df_new['sum_build_01_share_regional'] = df_new['sum_build_01'] / df_new['sum_build_01_region']
    
  
    
    return df_new

In [None]:
full_df = get_territory_features(full_df)

In [None]:
len(full_df.columns)

In [None]:
def get_random_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Формирует территориалььные признаки
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = df.copy()
    
    df_new['randNumCol'] = np.random.randint(1, 6, df_new.shape[0])
    
    return df_new

In [None]:
full_df = get_random_feature(full_df)

In [None]:
len(full_df.columns)

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import re


def preproc_floors(old_df):
    '''
    Функция энкодинга признака floor (этажи)
    В результате создаёт перезаписывает признак floor (-1 если много этажей или этаж не цифрой),
        создаёт признаки basement (наличие подвала или цоколя), mezzanine (наличие антресоли и мансарды)
        tech (наличие тех этажа)
    '''
    df = old_df.copy()
    
    floor = df['floor'].str.lower().to_numpy()
    num_floors = []
    res_floors = []
    basement = [] #наличие подвала или цоколя
    mezzanine = [] #наличие антресоли и мансарды
    tech = [] # тех этаж
    for item in floor:
        count_floors = 0
        
        if item != item:
            basement.append(0)
            mezzanine.append(0)
            tech.append(0)
            num_floors.append(1)
            res_floors.append(np.nan)
            continue
            
        if 'подв' or 'цок' in item:
            basement.append(1)
            count_floors += 1 # или 2
        else:
            basement.append(0)
            
        if 'манса' or 'антре' or 'мезо' in item:
            mezzanine.append(1)
            count_floors += 1 # или 2
            
        else:
            mezzanine.append(0)
            
        if 'тех' in item:
            tech.append(1)
            count_floors += 1 
        else:
            tech.append(0)
            
        item = re.sub('[^\d\. - :]',' ', item)
        item = item.replace('-', '.').replace(':', '.')
        new_item = item.split()
        if len(new_item) == 0:
            num_floors.append(count_floors)
            res_floors.append(-1)
            continue
        elif len(new_item) == 1:
            if '.' in new_item[0]:
                last_item = new_item[0].split('.')
                if last_item[-1] == '0':
                    count_floors += 1 
                    num_floors.append(count_floors)
                    res_floors.append(int(''.join(last_item[:-1])))
                    continue
                else:
                    count_floors += len(range(int(last_item[0]), int(last_item[-1]) + 1))
                    num_floors.append(count_floors)
                    res_floors.append(-1)
                    continue
            else:
                count_floors += 1 
                num_floors.append(count_floors)
                res_floors.append(int(float(new_item[0])))
                continue
        else:
            count_floors += len(new_item)
            num_floors.append(count_floors)
            res_floors.append(-1)
            
    df = df.drop(['floor'], axis = 1)
    df['floor'] = res_floors
    df['num_floors'] = num_floors
    df['basement'] = basement
    df['mezzanine'] = mezzanine
    df['tech'] = tech
            
    return df


def fill_na(old_df):
    '''
    Функция заполянет null значения в датафрейме
    '''
    df = old_df.copy()
    
    #floor
    # заполнение null = -1, дабавляется в num_floors площадь / медина площади на этаж
    # по факту заполянет всё 1 :(((
    square = df[df['floor'].isna() == False]['total_square'].to_numpy() 
    n_floor = df[df['floor'].isna() == False]['num_floors'].to_numpy() 
    square_per_floor = square / n_floor
    mean_square_per_floor = np.mean(square_per_floor)
    df[df['floor'].isna()]['num_floor'] = np.around(df[df['floor'].isna()]['total_square'].to_numpy() / mean_square_per_floor)
    df['floor'] = df['floor'].fillna(-1)
    
    # reform_house_population_1000 reform_house_population_500 
    # reform_mean_floor_count_1000 reform_mean_floor_count_500
    # reform_mean_year_building_1000 reform_mean_year_building_500
    # заполненяет null значения на среднее по региону
    group1 =  df[~df['reform_house_population_1000'].isna()].groupby(['region'])['reform_house_population_1000']
    group2 =  df[~df['reform_house_population_500'].isna()].groupby(['region'])['reform_house_population_500']
    group3 =  df[~df['reform_mean_floor_count_1000'].isna()].groupby(['region'])['reform_mean_floor_count_1000']
    group4 =  df[~df['reform_mean_floor_count_500'].isna()].groupby(['region'])['reform_mean_floor_count_500']
    group5 =  df[~df['reform_mean_year_building_1000'].isna()].groupby(['region'])['reform_mean_year_building_1000']
    group6 =  df[~df['reform_mean_year_building_500'].isna()].groupby(['region'])['reform_mean_year_building_500']
    
    df['reform_house_population_1000'] = df.apply(lambda x: np.mean(group1.groups[x['region']]) \
                                                  if x['reform_house_population_1000'] != x['reform_house_population_1000'] else \
                                                  x['reform_house_population_1000'], axis = 1)
    df['reform_house_population_500'] = df.apply(lambda x: np.mean(group1.groups[x['region']]) \
                                                  if x['reform_house_population_500'] != x['reform_house_population_500'] else \
                                                  x['reform_house_population_500'], axis = 1)
    df['reform_mean_floor_count_1000'] = df.apply(lambda x: np.mean(group1.groups[x['region']]) \
                                                  if x['reform_mean_floor_count_1000'] != x['reform_mean_floor_count_1000'] else \
                                                  x['reform_mean_floor_count_1000'], axis = 1)
    df['reform_mean_floor_count_500'] = df.apply(lambda x:  np.mean(group1.groups[x['region']]) \
                                                  if x['reform_mean_floor_count_500'] != x['reform_mean_floor_count_500'] else \
                                                  x['reform_mean_floor_count_500'], axis = 1)
    df['reform_mean_year_building_1000'] = df.apply(lambda x: np.mean(group1.groups[x['region']]) \
                                                  if x['reform_mean_year_building_1000'] != x['reform_mean_year_building_1000'] else \
                                                  x['reform_mean_year_building_1000'], axis = 1)
    df['reform_mean_year_building_500'] = df.apply(lambda x: np.mean(group1.groups[x['region']]) \
                                                  if x['reform_mean_year_building_500'] != x['reform_mean_year_building_500'] else \
                                                  x['reform_mean_year_building_500'], axis = 1)
    
    # дропаем street
    df = df.dropna(subset=['street'])
    
    return df


def number_encode_features(df):
    result = df.copy() 
    result['street'] += result['city']
    encoders = {}
    for column in result.columns:
        if result.dtypes[column] == 'object':
            encoders[column] = preprocessing.LabelEncoder() 
            result[column] = encoders[column].fit_transform(result[column])
    return result, encoders

In [None]:
full_df = preproc_floors(full_df)
len(full_df.columns)

In [None]:
full_df = fill_na(full_df)
len(full_df.columns)

In [None]:
full_df, _ = number_encode_features(full_df)
len(full_df.columns)

In [None]:
len(full_df.columns)

In [None]:
full_df

In [None]:
from raif_hack.settings import MODEL_PARAMS, LOGGING_CONFIG, NUM_FEATURES, CATEGORICAL_OHE_FEATURES,CATEGORICAL_STE_FEATURES,TARGET


set(full_df.columns.values) - set(NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES)

In [None]:
MY_FEATURES = ['basement',
 'floor',
 'mezzanine',
 'num_floors',
 'osm_city_nearest_name',
 'per_square_meter_price',
 'price_type',
 'randNumCol',
 'street',
 'sum_build_001',
 'sum_build_001_diff',
 'sum_build_001_diff_regional',
 'sum_build_001_region',
 'sum_build_001_share',
 'sum_build_001_share_regional',
 'sum_build_005',
 'sum_build_005_diff',
 'sum_build_005_diff_regional',
 'sum_build_005_region',
 'sum_build_005_share',
 'sum_build_005_share_regional',
 'sum_build_0075',
 'sum_build_0075_diff',
 'sum_build_0075_diff_regional',
 'sum_build_0075_region',
 'sum_build_0075_share',
 'sum_build_0075_share_regional',
 'sum_build_01',
 'sum_build_01_diff',
 'sum_build_01_diff_regional',
 'sum_build_01_region',
 'sum_build_01_share',
 'sum_build_01_share_regional',
 'sum_other_001',
 'sum_other_001_diff',
 'sum_other_001_diff_regional',
 'sum_other_001_region',
 'sum_other_001_share',
 'sum_other_001_share_regional',
 'sum_other_005',
 'sum_other_005_diff',
 'sum_other_005_diff_regional',
 'sum_other_005_region',
 'sum_other_005_share',
 'sum_other_005_share_regional',
 'sum_other_0075',
 'sum_other_0075_diff',
 'sum_other_0075_diff_regional',
 'sum_other_0075_region',
 'sum_other_0075_share',
 'sum_other_0075_share_regional',
 'sum_other_01',
 'sum_other_01_diff',
 'sum_other_01_diff_regional',
 'sum_other_01_region',
 'sum_other_01_share',
 'sum_other_01_share_regional',
 'tech' ]

In [None]:
full_df['street']

In [None]:
full_df.info()

In [3]:
# predict final


from raif_hack.model import BenchmarkModel
from raif_hack.settings import MODEL_PARAMS, LOGGING_CONFIG, NUM_FEATURES, CATEGORICAL_OHE_FEATURES,CATEGORICAL_STE_FEATURES,TARGET

from raif_hack.utils import PriceTypeEnum
from raif_hack.metrics import metrics_stat
from raif_hack.features import prepare_categorical, get_time_features, change_target_inflation, get_territory_features
from raif_hack.features import get_random_feature, preproc_floors, fill_na, number_encode_features

MY_FEATURES = ['basement',
 'floor',
 'mezzanine',
 'num_floors',
 'osm_city_nearest_name',
 'per_square_meter_price',
 'price_type',
 'randNumCol',
 'street',
 'sum_build_001',
 'sum_build_001_diff',
 'sum_build_001_diff_regional',
 'sum_build_001_region',
 'sum_build_001_share',
 'sum_build_001_share_regional',
 'sum_build_005',
 'sum_build_005_diff',
 'sum_build_005_diff_regional',
 'sum_build_005_region',
 'sum_build_005_share',
 'sum_build_005_share_regional',
 'sum_build_0075',
 'sum_build_0075_diff',
 'sum_build_0075_diff_regional',
 'sum_build_0075_region',
 'sum_build_0075_share',
 'sum_build_0075_share_regional',
 'sum_build_01',
 'sum_build_01_diff',
 'sum_build_01_diff_regional',
 'sum_build_01_region',
 'sum_build_01_share',
 'sum_build_01_share_regional',
 'sum_other_001',
 'sum_other_001_diff',
 'sum_other_001_diff_regional',
 'sum_other_001_region',
 'sum_other_001_share',
 'sum_other_001_share_regional',
 'sum_other_005',
 'sum_other_005_diff',
 'sum_other_005_diff_regional',
 'sum_other_005_region',
 'sum_other_005_share',
 'sum_other_005_share_regional',
 'sum_other_0075',
 'sum_other_0075_diff',
 'sum_other_0075_diff_regional',
 'sum_other_0075_region',
 'sum_other_0075_share',
 'sum_other_0075_share_regional',
 'sum_other_01',
 'sum_other_01_diff',
 'sum_other_01_diff_regional',
 'sum_other_01_region',
 'sum_other_01_share',
 'sum_other_01_share_regional',
 'tech' ]

In [4]:
train_df = pd.read_csv(mdir/'data'/'train.csv')
test_df = pd.read_csv(mdir/'data'/'test.csv')

train_df['is_train'] = 1
test_df['is_train'] = 0

full_df = pd.concat([train_df, test_df])


full_df = get_time_features(full_df)

full_df = get_territory_features(full_df)

# попробовать на честность
full_df = change_target_inflation(full_df)

full_df = preproc_floors(full_df)
full_df = fill_na(full_df)

full_df = get_random_feature(full_df)

full_df, _ = number_encode_features(full_df)

  interactivity=interactivity, compiler=compiler, result=result)
  df_new['weekofyear'] = df_new['date'].dt.weekofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['floor'].isna()]['num_floor'] = np.around(df[df['floor'].isna()]['total_square'].to_numpy() / mean_square_per_floor)


In [111]:
train_df = full_df[full_df['is_train'] == 1]
test_df = full_df[full_df['is_train'] == 0]

In [112]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

#import pandas_profiling as pp # подсмотрел в kernels, интересно

import tensorflow as tf
from catboost import CatBoostRegressor, Pool

You should consider upgrading via the '/Users/denis.semenov/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/denis.semenov/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [113]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1
def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9



In [114]:
true_features = ['lat',
 'lng',
 'osm_amenity_points_in_0.001',
 'osm_amenity_points_in_0.005',
 'osm_amenity_points_in_0.0075',
 'osm_building_points_in_0.005',
 'osm_building_points_in_0.0075',
 'osm_building_points_in_0.01',
 'osm_catering_points_in_0.001',
 'osm_catering_points_in_0.005',
 'osm_catering_points_in_0.0075',
 'osm_catering_points_in_0.01',
 'osm_city_closest_dist',
 'osm_city_nearest_population',
 'osm_crossing_closest_dist',
 'osm_crossing_points_in_0.001',
 'osm_crossing_points_in_0.005',
 'osm_crossing_points_in_0.0075',
 'osm_crossing_points_in_0.01',
 'osm_culture_points_in_0.005',
 'osm_culture_points_in_0.0075',
 'osm_culture_points_in_0.01',
 'osm_finance_points_in_0.001',
 'osm_finance_points_in_0.005',
 'osm_finance_points_in_0.0075',
 'osm_finance_points_in_0.01',
 'osm_healthcare_points_in_0.005',
 'osm_healthcare_points_in_0.0075',
 'osm_healthcare_points_in_0.01',
 'osm_historic_points_in_0.005',
 'osm_historic_points_in_0.0075',
 'osm_historic_points_in_0.01',
 'osm_hotels_points_in_0.0075',
 'osm_hotels_points_in_0.01',
 'osm_leisure_points_in_0.005',
 'osm_leisure_points_in_0.0075',
 'osm_leisure_points_in_0.01',
 'osm_offices_points_in_0.001',
 'osm_offices_points_in_0.005',
 'osm_offices_points_in_0.0075',
 'osm_offices_points_in_0.01',
 'osm_shops_points_in_0.001',
 'osm_shops_points_in_0.005',
 'osm_shops_points_in_0.0075',
 'osm_shops_points_in_0.01',
 'osm_subway_closest_dist',
 'osm_train_stop_closest_dist',
 'osm_train_stop_points_in_0.005',
 'osm_transport_stop_closest_dist',
 'osm_transport_stop_points_in_0.005',
 'osm_transport_stop_points_in_0.0075',
 'osm_transport_stop_points_in_0.01',
 'reform_count_of_houses_1000',
 'reform_count_of_houses_500',
 'reform_house_population_1000',
 'reform_house_population_500',
 'reform_mean_floor_count_1000',
 'reform_mean_floor_count_500',
 'reform_mean_year_building_1000',
 'reform_mean_year_building_500',
 'region',
 'total_square',
 'street',
 'realty_type',
 'day',
 'dayofyear',
 'sum_other_001',
 'sum_other_005',
 'sum_other_001_diff',
 'sum_other_005_diff',
 'sum_other_0075_diff',
 'sum_other_001_share',
 'sum_other_005_share',
 'sum_other_001_region',
 'sum_other_005_region',
 'sum_other_0075_region',
 'sum_other_01_region',
 'sum_other_001_diff_regional',
 'sum_other_005_diff_regional',
 'sum_other_0075_diff_regional',
 'sum_other_01_diff_regional',
 'sum_other_001_share_regional',
 'sum_other_005_share_regional',
 'sum_other_0075_share_regional',
 'sum_other_01_share_regional',
 'sum_build_005',
 'sum_build_005_share',
 'sum_build_005_region',
 'sum_build_0075_region',
 'sum_build_01_region',
 'sum_build_005_diff_regional',
 'sum_build_01_diff_regional',
 'sum_build_005_share_regional',
 'sum_build_01_share_regional']

In [115]:
from sklearn.model_selection import train_test_split, GridSearchCV
X = train_df.drop(['per_square_meter_price','date', 'id', 'is_train'], axis=1)[true_features]
Y = train_df[['per_square_meter_price']]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)



In [93]:
from catboost import CatBoostRegressor, Pool


In [116]:
model = CatBoostRegressor(iterations=30000,
                          loss_function = 'RMSE',
                          task_type="CPU",
                          #depth= # 4 - 81480.24422
                         )
                          #devices='0:1',
                          #learning_rate= 0.3,
                          #depth = 14,
                          #l2_leaf_reg = 1
                        

model.fit(X = np.array(X_train),
            y = np.array(y_train),
          eval_set = (np.array(X_test), np.array(y_test)),
            #silent = False,
          verbose = 500,
            early_stopping_rounds=20)

Learning rate set to 0.015741
0:	learn: 174284.9046325	test: 166820.3673255	best: 166820.3673255 (0)	total: 50.8ms	remaining: 25m 23s
500:	learn: 102626.4608474	test: 99672.3377777	best: 99672.3377777 (500)	total: 12.1s	remaining: 11m 53s
1000:	learn: 96239.1393263	test: 94267.1417608	best: 94267.1417608 (1000)	total: 24.6s	remaining: 11m 53s
1500:	learn: 92106.0964590	test: 90823.0708439	best: 90823.0708439 (1500)	total: 37.1s	remaining: 11m 44s
2000:	learn: 89165.7586871	test: 88535.1560950	best: 88535.1560950 (2000)	total: 49.7s	remaining: 11m 36s
2500:	learn: 86943.6238094	test: 86827.2235620	best: 86827.2235620 (2500)	total: 1m 2s	remaining: 11m 29s
3000:	learn: 85152.9291612	test: 85534.4486533	best: 85534.4486533 (3000)	total: 1m 14s	remaining: 11m 13s
3500:	learn: 83604.2740330	test: 84449.8519189	best: 84449.8519189 (3500)	total: 1m 27s	remaining: 10m 59s
4000:	learn: 82228.6582524	test: 83509.3581122	best: 83509.3581122 (4000)	total: 1m 39s	remaining: 10m 48s
4500:	learn: 810

<catboost.core.CatBoostRegressor at 0x7f7efc78ff90>

In [117]:
'ok'

'ok'

In [118]:
mtest = test_df.drop(['per_square_meter_price','date'], axis=1)
mtest

Unnamed: 0,city,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,realty_type,price_type,is_train,year,month,day,dayofweek_num,quarter,dayofyear,weekofyear,sum_other_001,sum_other_005,sum_other_0075,sum_other_01,sum_other_001_diff,sum_other_005_diff,sum_other_0075_diff,sum_other_01_diff,sum_other_001_share,sum_other_005_share,sum_other_0075_share,sum_other_01_share,sum_other_001_region,sum_other_005_region,sum_other_0075_region,sum_other_01_region,sum_other_001_diff_regional,sum_other_005_diff_regional,sum_other_0075_diff_regional,sum_other_01_diff_regional,sum_other_001_share_regional,sum_other_005_share_regional,sum_other_0075_share_regional,sum_other_01_share_regional,sum_build_001,sum_build_005,sum_build_0075,sum_build_01,sum_build_001_diff,sum_build_005_diff,sum_build_0075_diff,sum_build_01_diff,sum_build_001_share,sum_build_005_share,sum_build_0075_share,sum_build_01_share,sum_build_001_region,sum_build_005_region,sum_build_0075_region,sum_build_01_region,sum_build_001_diff_regional,sum_build_005_diff_regional,sum_build_0075_diff_regional,sum_build_01_diff_regional,sum_build_001_share_regional,sum_build_005_share_regional,sum_build_0075_share_regional,sum_build_01_share_regional,floor,num_floors,basement,mezzanine,tech,randNumCol
5466,1599,COL_294951,59.042841,57.650394,2,17,32,48,0,0,0,0,0,0,0,2,62.945290,14,139209.0,0.599163,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,2,2,3,0,0,0,0,1,1,0,0,1,2,2,15,26,39,288.253235,1.356784,0,0,0,0.082988,3,10,14,84,20,970.0,304.0,3.640000,4.526316,1973.786667,1970.842105,31,388.274980,5404,110,1,0,2020,9,6,6,3,250,36,4,34,62,94,-1.031931,-45.619862,-97.82836,-166.927923,0.794923,0.427029,0.387916,0.360253,3.708160,56.467342,114.458115,183.046499,0.291840,-22.467342,-52.458115,-89.046499,1.078702,0.602118,0.541683,0.513531,0,0,1,2,-0.424156,-7.131324,-13.538063,-22.100129,0.000000,0.000000,0.068785,0.082987,1,4.903022,9.780713,15.566492,-1,-4.903022,-8.780713,-13.566492,0.0,0.000000,0.102242,0.128481,1.0,3,1,1,0,2
5467,3063,COL_301220,58.002740,56.243041,0,47,125,269,0,0,0,0,0,11,28,64,1.430043,109,1055397.0,0.079855,2,25,60,124,0,0,7,16,0,4,6,18,4,6,10,1,3,4,2,4,8,4,6,7,1,3,6,12,0,25,66,143,272.075084,2.091310,0,0,0,0.080955,5,15,28,153,48,2653.0,722.0,6.376712,6.666667,1969.560000,1971.340426,31,6683.779613,4475,10,1,0,2020,9,13,6,3,257,37,0,92,241,513,-5.031931,12.380138,81.17164,252.072077,0.000000,1.155491,1.507868,1.966060,3.708160,56.467342,114.458115,183.046499,-3.708160,35.532658,126.541885,329.953501,0.000000,1.629260,2.105574,2.802567,1,9,16,38,0.575844,1.868676,1.461937,13.899871,2.357621,1.262038,1.100559,1.576755,1,4.903022,9.780713,15.566492,0,4.096978,6.219287,22.433508,1.0,1.835603,1.635873,2.441141,-1.0,1,0,0,0,5
5468,3063,COL_302139,58.006028,56.250719,2,89,235,389,0,0,0,0,1,13,62,92,0.994769,109,1055397.0,0.181636,0,45,93,172,0,10,18,25,0,5,14,29,2,6,15,3,6,11,1,2,5,3,9,18,0,5,14,26,0,49,120,193,271.810010,1.601859,0,0,0,0.141321,10,23,38,172,40,2720.0,839.0,5.656627,6.657895,1962.517442,1967.350000,31,487.534388,3032,100,1,0,2020,9,13,6,3,257,37,3,169,456,743,-2.031931,89.380138,296.17164,482.072077,0.596193,2.122586,2.853061,2.847530,3.708160,56.467342,114.458115,183.046499,-0.708160,112.532658,341.541885,559.953501,0.809027,2.992880,3.983990,4.059078,0,11,30,60,-0.424156,3.868676,15.461937,35.899871,0.000000,1.542491,2.063549,2.489613,1,4.903022,9.780713,15.566492,-1,6.096978,20.219287,44.433508,0.0,2.243515,3.067261,3.854433,1.0,3,1,1,0,4
5469,3063,COL_307043,58.004530,56.257829,2,54,213,370,0,0,0,0,0,11,38,61,1.250558,109,1055397.0,0.013873,6,41,110,193,0,0,16,22,2,5,9,20,3,6,14,5,7,10,0,0,2,4,10,15,0,1,13,21,0,25,124,225,271.351339,1.742710,0,0,0,0.097325,11,22,50,189,36,3681.0,728.0,6.230769,6.393939,1963.385027,1960.944444,31,98.848219,20556,100,1,0,2020,9,13,6,3,257,37,2,102,414,717,-3.031931,22.380138,254.17164,456.072077,0.397462,1.281087,2.590279,2.747885,3.708160,56.467342,114.458115,183.046499,-1.708160,45.532658,299.541885,533.953501,0.539351,1.806354,3.617044,3.917037,2,6,22,43,1.575844,-1.131324,7.461937,18.899871,4.715242,0.841358,1.513269,1.784223,1,4.903022,9.780713,15.566492,1,1.096978,12.219287,27.433508,2.0,1.223735,2.249325,2.762344,-1.0,1,0,0,0,5
5470,4310,COL_314997,56.768550,54.161235,5,35,53,74,0,0,1,1,1,8,9,10,33.177079,29,98063.0,0.066569,2,14,20,26,0,0,1,1,0,2,3,4,1,6,8,2,4,4,0,1,1,0,0,1,0,1,3,4,4,22,31,47,323.961149,42.256920,0,0,0,0.080118,10,17,23,57,20,1129.0,411.0,5.607843,5.842105,1974.862745,1974.000000,31,501.930571,5868,10,1,0,2020,9,20,6,3,264,38,10,68,104,145,4.968069,-11.619862,-55.82836,-115.927923,1.987309,0.854058,0.650698,0.555709,3.708160,56.467342,114.458115,183.046499,6.291840,11.532658,-10.458115,-38.046499,2.696755,1.204236,0.908629,0.792148,0,3,8,10,-0.424156,-4.131324,-6.538063,-14.100129,0.000000,0.420679,0.550280,0.414936,1,4.903022,9.780713,15.566492,-1,-1.903022,-1.780713,-5.566492,0.0,0.611868,0.817936,0.642406,-1.0,1,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282727,3091,COL_444612,61.759068,34.454816,0,6,9,13,0,0,0,0,0,0,0,0,4.715492,110,280170.0,0.205710,0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,5,7,11,297.018619,92.107995,0,0,0,0.278992,2,4,6,62,17,1191.0,335.0,5.576271,5.000000,1977.186441,1973.133333,13,1317.315599,28303,10,1,0,2020,12,20,6,4,355,51,0,11,17,25,-5.031931,-68.619862,-142.82836,-235.927923,0.000000,0.138156,0.106364,0.095812,3.480683,46.022462,90.404313,145.349506,-3.480683,-35.022462,-73.404313,-120.349506,0.000000,0.239014,0.188044,0.171999,0,0,0,0,-0.424156,-7.131324,-14.538063,-24.100129,0.000000,0.000000,0.000000,0.000000,1,3.520216,6.714286,11.072776,-1,-3.520216,-6.714286,-11.072776,0.0,0.000000,0.000000,0.000000,-1.0,1,0,0,0,4
282728,3091,COL_445336,61.793530,34.365339,0,69,121,197,0,0,0,0,0,15,28,41,1.500167,110,280170.0,0.029002,1,20,36,52,0,0,2,7,0,6,13,16,1,4,7,0,3,8,0,1,1,6,6,12,0,9,18,24,0,42,64,103,296.210916,95.001603,0,0,0,0.305529,4,7,13,96,18,1284.0,232.0,4.021277,3.611111,1958.760417,1956.000000,13,364.594913,13311,10,1,0,2020,12,20,6,4,355,51,0,133,228,375,-5.031931,53.380138,68.17164,114.072077,0.000000,1.670437,1.426530,1.437178,3.480683,46.022462,90.404313,145.349506,-3.480683,86.977538,137.595687,229.650494,0.000000,2.889893,2.522004,2.579988,0,15,32,41,-0.424156,7.868676,17.461937,16.899871,0.000000,2.103396,2.201118,1.701236,1,3.520216,6.714286,11.072776,-1,11.479784,25.285714,29.927224,0.0,4.261103,4.765957,3.702775,-1.0,1,0,0,0,4
282729,3091,COL_446244,61.764217,34.442087,0,12,16,24,0,0,0,0,0,0,0,0,3.832614,110,280170.0,0.558334,0,0,8,11,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,2,3,0,0,0,0,0,9,12,19,296.900974,92.523700,0,0,0,0.243623,2,9,12,82,16,1357.0,200.0,5.296296,6.125000,1985.353659,2005.812500,13,167.282805,14822,10,1,0,2020,12,27,6,4,362,52,0,22,30,46,-5.031931,-57.619862,-129.82836,-214.927923,0.000000,0.276313,0.187701,0.176294,3.480683,46.022462,90.404313,145.349506,-3.480683,-24.022462,-60.404313,-99.349506,0.000000,0.478027,0.331843,0.316479,0,1,1,1,-0.424156,-6.131324,-13.538063,-23.100129,0.000000,0.140226,0.068785,0.041494,1,3.520216,6.714286,11.072776,-1,-2.520216,-5.714286,-10.072776,0.0,0.284074,0.148936,0.090312,-1.0,1,0,0,0,1
282730,3091,COL_446909,61.786427,34.394347,1,17,33,47,0,0,0,0,0,4,5,6,0.408451,110,280170.0,0.276507,0,8,16,24,0,3,4,7,1,1,1,2,0,0,0,0,2,6,1,2,2,0,0,0,0,0,0,0,0,7,16,21,296.774935,94.477102,0,0,0,0.221222,3,5,12,116,26,854.0,220.0,2.963964,3.076923,1958.422414,1954.538462,13,576.784699,47759,100,1,0,2020,12,27,6,4,362,52,1,31,60,87,-4.031931,-48.619862,-99.82836,-173.927923,0.198731,0.389350,0.375403,0.333425,3.480683,46.022462,90.404313,145.349506,-2.480683,-15.022462,-30.404313,-58.349506,0.287300,0.673584,0.663685,0.598557,1,2,3,4,0.575844,-5.131324,-11.538063,-20.100129,2.357621,0.280453,0.206355,0.165974,1,3.520216,6.714286,11.072776,0,-1.520216,-3.714286,-7.072776,1.0,0.568147,0.446809,0.361246,-1.0,1,0,0,0,2


In [119]:
res = model.predict(X_test)

In [120]:
res

array([ 50112.89937092,  48795.22124439,  37289.58796483, ...,
        41990.24395737,  51998.70519966, 190160.07303294])

In [121]:
def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    return np.median(np.abs(y_pred-y_true)/y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

EPS = 1e-8
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1,2,3,4,5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([0.9,1.8,2.7,3.6,4.5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1.1,2.2,3.3,4.4,5.5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1.15,2.3,3.45,4.6,5.75])) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([1.3,2.6,3.9,5.2,6.5]))-1) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([0.7,1.4,2.1,2.8,3.5]))-1*NEGATIVE_WEIGHT) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([10,20,30,40,50]))-9) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([0,0,0,0,0]))-9*NEGATIVE_WEIGHT) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([1,2.2,3.3,5,50])) - 85/45) <= EPS

In [122]:
"""{'mape': 1.5331790967384054,
 'mdape': 0.28967585280398345,
 'rmse': 74071.48571000711,
 'r2': 0.8060378887712242,
 'raif_metric': 3.3111947615740123}"""
metrics_stat(y_test.values.ravel(), res)

{'mape': 1.5455936266468224,
 'mdape': 0.2972146573890706,
 'rmse': 75104.62577313371,
 'r2': 0.8005894349226637,
 'raif_metric': 3.380622457796192}

In [123]:
y_test.values.ravel()

array([  4823.11153619,  43833.33333333,  47735.4527163 , ...,
         6225.        ,  50113.8317    , 120676.9749139 ])

In [151]:
res = model.predict(np.array(test_df.drop(['per_square_meter_price','date', 'id', 'is_train'], axis=1)[true_features]))

In [152]:
train_df[train_df['price_type'] == 1].per_square_meter_price.mean()


70191.04366107372

In [153]:
res.mean()

74667.87681725559

In [154]:
res

array([27578.37208272, 33508.12875499, 77322.90784576, ...,
       34242.93409903, 23488.69237361, 32326.71971518])

In [155]:
res

array([27578.37208272, 33508.12875499, 77322.90784576, ...,
       34242.93409903, 23488.69237361, 32326.71971518])

In [156]:
to_push = test_df[['id']]
to_push['per_square_meter_price'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [157]:
test_sub = pd.read_csv(mdir/'data'/'test_submission.csv')


In [158]:
test_sub = test_sub[['id']].merge(to_push, how='left', on='id')
test_sub

Unnamed: 0,id,per_square_meter_price
0,COL_289284,30366.462557
1,COL_289305,67039.863843
2,COL_289318,62238.852212
3,COL_289354,93954.482543
4,COL_289399,104305.580192
...,...,...
2969,COL_455089,34852.588531
2970,COL_455212,43329.084754
2971,COL_455261,58359.722896
2972,COL_455381,72922.259036


In [159]:
test_sub.to_csv('res.csv', index=False)

In [37]:
model.feature_importances_, model.feature_names_, test_df.drop(['per_square_meter_price','date', 'id', 'is_train'], axis=1).columns


(array([1.22578854e-01, 1.39821286e+00, 2.34909278e+00, 4.89511187e-01,
        8.88308357e-01, 3.44494227e-01, 1.79151949e-01, 1.21768462e-01,
        3.23725036e-01, 4.36186559e-01, 8.14127564e-01, 6.58645141e-01,
        1.83919979e+00, 6.09952218e-01, 1.06045066e+00, 1.23523376e+00,
        1.79863465e-01, 5.05373457e+00, 9.73237876e-01, 4.17289150e-01,
        6.23666602e-01, 6.69944689e-01, 8.33445935e-01, 1.03089111e-01,
        7.83848839e-01, 4.58682420e-01, 1.73727615e+00, 5.02231790e-01,
        8.89051729e-01, 1.22404808e+00, 7.72307018e-01, 8.26797731e-01,
        7.80309403e-01, 6.62040879e-01, 6.56223884e-01, 6.47348391e-01,
        8.13709359e-01, 1.28577217e-01, 2.10180349e-01, 3.20119078e-01,
        3.80751845e-01, 7.89977523e-01, 9.06301746e-01, 3.84826365e-01,
        5.83116637e-01, 3.84671574e-01, 3.35223675e-01, 9.26301224e-01,
        1.37534756e+00, 5.96907442e-01, 3.14261793e-01, 6.14878793e+00,
        9.42257824e-01, 3.40520843e-01, 1.32456272e-01, 1.894148

In [54]:
imp = []
feat = []
for x,y in zip(test_df.drop(['per_square_meter_price','date', 'id', 'is_train'], axis=1).columns, model.feature_importances_):
      if y > 0.2101:
            imp.append((x,y))
            feat.append(x)

In [55]:
imp.sort(key=lambda x:x[1])

In [57]:
feat

['lat',
 'lng',
 'osm_amenity_points_in_0.001',
 'osm_amenity_points_in_0.005',
 'osm_amenity_points_in_0.0075',
 'osm_building_points_in_0.005',
 'osm_building_points_in_0.0075',
 'osm_building_points_in_0.01',
 'osm_catering_points_in_0.001',
 'osm_catering_points_in_0.005',
 'osm_catering_points_in_0.0075',
 'osm_catering_points_in_0.01',
 'osm_city_closest_dist',
 'osm_city_nearest_population',
 'osm_crossing_closest_dist',
 'osm_crossing_points_in_0.001',
 'osm_crossing_points_in_0.005',
 'osm_crossing_points_in_0.0075',
 'osm_crossing_points_in_0.01',
 'osm_culture_points_in_0.005',
 'osm_culture_points_in_0.0075',
 'osm_culture_points_in_0.01',
 'osm_finance_points_in_0.001',
 'osm_finance_points_in_0.005',
 'osm_finance_points_in_0.0075',
 'osm_finance_points_in_0.01',
 'osm_healthcare_points_in_0.005',
 'osm_healthcare_points_in_0.0075',
 'osm_healthcare_points_in_0.01',
 'osm_historic_points_in_0.005',
 'osm_historic_points_in_0.0075',
 'osm_historic_points_in_0.01',
 'osm_ho

In [None]:
to_test = pd.read_csv(mdir/'data'/'test.csv')
to_test

In [None]:
test_df

In [None]:
test_df.reset_index(drop=True, inplace=True)

In [None]:
test_df['id'] = to_test['id']

In [None]:
to_push = test_df[['id']]

In [None]:
test_df[['id']]

In [None]:
to_push

In [None]:
to_push['per_square_meter_price'] = res

In [None]:
to_push.to_csv('res.csv', index=False)