In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from catboost import Pool

import featuretools as ft
from featuretools import selection

from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, _fit_transform_one, _transform_one, make_union
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, mutual_info_classif


from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from category_encoders import BackwardDifferenceEncoder, HelmertEncoder, BinaryEncoder
from category_encoders import CountEncoder
from category_encoders import LeaveOneOutEncoder, TargetEncoder, JamesSteinEncoder, MEstimateEncoder, WOEEncoder, CatBoostEncoder
from category_encoders.wrapper import NestedCVWrapper

from catboost import CatBoostRegressor

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')

np.random.seed(42)
import os
import typing
from autogluon.tabular import TabularDataset, TabularPredictor
import re
import joblib

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [17]:
# Load the combined dataset; it is split into train/test by target availability afterwards.
data = pd.read_csv(filepath_or_buffer='/app/full_data.csv')

In [18]:
# Rows with a known price form the training set; rows whose price is missing are the
# ones to predict. Explicit copies are taken because both frames get columns overwritten
# later on, and writing into a boolean-indexed slice of `data` is chained assignment
# (pandas cannot guarantee whether the write lands on the slice or on a temporary).
train = data[data['per_square_meter_price'].notna()].copy()
test = data[data['per_square_meter_price'].isna()].copy()

In [19]:
# Model the natural log of the price. `assign` rebinds `train` to a new frame instead of
# writing through attribute access into a frame that was produced by boolean indexing
# (that pattern raises SettingWithCopyWarning, which the blanket warnings filter hides).
# NOTE: this cell is not idempotent - running it twice applies the log twice.
train = train.assign(per_square_meter_price=np.log(train['per_square_meter_price']))

In [21]:
# Name of the column being predicted.
target = 'per_square_meter_price'

# Features (or feature sets) to which smoothed target encoding is applied.
ste_cat_features = [
    'region', 'city', 'realty_type',
    'month', 'year', 'day',
    'street', 'floor', 'osm_city_nearest_name',
]

# Features to which one-hot encoding is applied (currently none).
ohe_cat_features = []

# Numeric features; the order below is the column order handed to the scaler.
num_features = [
    'lat', 'lng',
    'osm_amenity_points_in_0.001', 'osm_amenity_points_in_0.005',
    'osm_amenity_points_in_0.0075', 'osm_amenity_points_in_0.01',
    'osm_building_points_in_0.001', 'osm_building_points_in_0.005',
    'osm_building_points_in_0.0075', 'osm_building_points_in_0.01',
    'osm_catering_points_in_0.001', 'osm_catering_points_in_0.005',
    'osm_catering_points_in_0.0075', 'osm_catering_points_in_0.01',
    'osm_city_closest_dist',
    'osm_city_nearest_population',
    'osm_crossing_closest_dist',
    'osm_crossing_points_in_0.001', 'osm_crossing_points_in_0.005',
    'osm_crossing_points_in_0.0075', 'osm_crossing_points_in_0.01',
    'osm_culture_points_in_0.001', 'osm_culture_points_in_0.005',
    'osm_culture_points_in_0.0075', 'osm_culture_points_in_0.01',
    'osm_finance_points_in_0.001', 'osm_finance_points_in_0.005',
    'osm_finance_points_in_0.0075', 'osm_finance_points_in_0.01',
    'osm_healthcare_points_in_0.005', 'osm_healthcare_points_in_0.0075',
    'osm_healthcare_points_in_0.01',
    'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
    'osm_historic_points_in_0.01',
    'osm_hotels_points_in_0.005', 'osm_hotels_points_in_0.0075',
    'osm_hotels_points_in_0.01',
    'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
    'osm_leisure_points_in_0.01',
    'osm_offices_points_in_0.001', 'osm_offices_points_in_0.005',
    'osm_offices_points_in_0.0075', 'osm_offices_points_in_0.01',
    'osm_shops_points_in_0.001', 'osm_shops_points_in_0.005',
    'osm_shops_points_in_0.0075', 'osm_shops_points_in_0.01',
    'osm_subway_closest_dist',
    'osm_train_stop_closest_dist',
    'osm_train_stop_points_in_0.005', 'osm_train_stop_points_in_0.0075',
    'osm_train_stop_points_in_0.01',
    'osm_transport_stop_closest_dist',
    'osm_transport_stop_points_in_0.005', 'osm_transport_stop_points_in_0.0075',
    'osm_transport_stop_points_in_0.01',
    'reform_count_of_houses_1000', 'reform_count_of_houses_500',
    'reform_house_population_1000', 'reform_house_population_500',
    'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
    'reform_mean_year_building_1000', 'reform_mean_year_building_500',
    'total_square',
    'nn_100m_price', 'nn_300m_price', 'nn_1000m_price',
    'mmvb_lag1', 'mmvb_lag2',
    'number_of_supply1', 'number_of_supply2',
    'price_dynamic1', 'price_dynamic2',
    'mean_sqm_price1', 'mean_sqm_price2',
    'exp_time',
    'ipc_all_month', 'ipc_all_year',
    'ipc_goods_month', 'ipc_goods_year',
    'ipc_build_month', 'ipc_build_year',
    'miacr', 'ipc_base', 'ipc_chain', 'interest_rate',
    'inc_per_capita', 'invest_residue', 'debts',
]

# Columns of `train` that contain at least one missing value at this point in the notebook.
has_missing = train.isnull().any()
na_features = train.columns[has_missing]

In [22]:
# Free-text floor labels that are mapped to a fixed number by exact match.
_FLOOR_EXACT_LABELS = {'3 этаж': 3, '4 этаж': 4, '5 этаж': 5}


def _normalize_floor_label(raw):
    """Convert one raw ``floor`` entry to a number, or NaN when it cannot be interpreted."""
    if pd.isnull(raw):
        return np.nan
    if isinstance(raw, str):
        # Any label containing this substring is mapped to -1.
        if 'одва' in raw:
            return -1
        if raw in _FLOOR_EXACT_LABELS:
            return _FLOOR_EXACT_LABELS[raw]
    try:
        return np.float64(raw)
    except (TypeError, ValueError):
        # Labels that are neither numeric nor one of the known phrases become missing.
        return np.nan


def floor(df2):
    """Return a copy of ``df2`` whose ``floor`` column holds normalized numeric values.

    Each entry is converted as follows:
      * values parseable as a float become that float;
      * strings containing ``'одва'`` become ``-1``;
      * the exact strings ``'3 этаж'``, ``'4 этаж'``, ``'5 этаж'`` become 3, 4, 5;
      * everything else, including values that were already missing, becomes NaN.

    The column is returned with ``object`` dtype, so string category labels can be
    written into it afterwards. Unlike the previous implementation, a ``floor``
    column that contains no strings at all (all-numeric or all-missing) is handled
    instead of failing on the ``.str`` accessor.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame with a ``floor`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df2`` with the normalized ``floor`` column.
    """
    df = df2.copy()
    # One pass over the column; the source values are never overwritten while being
    # matched, so a replacement can never be mistaken for a raw label.
    df['floor'] = df['floor'].map(_normalize_floor_label).astype(object)
    return df

In [23]:
# Replace the raw `floor` labels of the training frame with the values produced by `floor()`.
train = floor(train)

In [24]:
# Collapse every floor above 30 into the single category 'mnogo' (presumably "many").
# The previous form, `train[train.floor>30].floor = ...`, assigned to the temporary copy
# returned by boolean indexing, so `train` itself was never modified; `.loc` writes in place.
# `pd.to_numeric(..., errors='coerce')` keeps the comparison valid when the column already
# contains the 'mnogo' label (e.g. when this cell is re-run).
train.loc[pd.to_numeric(train['floor'], errors='coerce') > 30, 'floor'] = 'mnogo'

In [25]:
# Preprocessing steps; each is fitted on the training frame and then reused on the test frame.
# NOTE(review): TargetEncoder is created without `cols`; category_encoders documents that in
# this case only string-typed columns are encoded - confirm that any numeric columns listed in
# `ste_cat_features` are meant to pass through unencoded.
encoder = TargetEncoder()
# Fixed random_state so the iterative imputation is reproducible.
imputer = IterativeImputer(random_state=42)
scaler = StandardScaler()

In [26]:
# Fit each preprocessing step on the training rows and overwrite the columns in place.

# 1) Target-encode the categorical columns against the (already log-transformed) price.
train[ste_cat_features] = encoder.fit_transform(train[ste_cat_features],
                                                train['per_square_meter_price'])

# 2) Impute the columns that had missing values, 3) standardize the numeric columns.
# The transformer output is assigned directly, i.e. by row position. Wrapping it in a new
# DataFrame first (as was done before) gave it a fresh 0..n-1 index, and pandas then aligned
# the assignment on index labels - which silently misplaces rows or reintroduces NaN
# whenever `train.index` is not exactly 0..n-1. The test-set cell already assigns by
# position via `.values`; this makes the training cell consistent with it.
train[na_features] = imputer.fit_transform(train[na_features])
train[num_features] = scaler.fit_transform(train[num_features])

In [27]:
# Remove columns that are not used as model inputs.
# `columns=` replaces the positional `axis` argument (`drop(labels, 1)`), which pandas 2.0
# no longer accepts.
train = train.drop(columns=['Unnamed: 0', 'id', 'date', 'data_type'])

In [31]:
# Remove columns that are not used as model inputs.
# `columns=` replaces the positional `axis` argument (`drop(labels, 1)`), which pandas 2.0
# no longer accepts.
test = test.drop(columns=['Unnamed: 0', 'id', 'date', 'data_type'])

In [33]:
# Replace the raw `floor` labels of the test frame with the values produced by `floor()`.
test = floor(test)

In [34]:
# Collapse every floor above 30 into the single category 'mnogo' (presumably "many").
# The previous form, `test[test.floor>30].floor = ...`, assigned to the temporary copy
# returned by boolean indexing, so `test` itself was never modified; `.loc` writes in place.
# `pd.to_numeric(..., errors='coerce')` keeps the comparison valid when the column already
# contains the 'mnogo' label (e.g. when this cell is re-run).
test.loc[pd.to_numeric(test['floor'], errors='coerce') > 30, 'floor'] = 'mnogo'

In [35]:
# Apply the transformers fitted on the training frame to the test frame (transform only, no refit).
test_encoded = encoder.transform(test[ste_cat_features])
test[ste_cat_features] = test_encoded

# The imputer/scaler results are written back by row position.
test[na_features] = imputer.transform(test[na_features])
test[num_features] = scaler.transform(test[num_features])

In [41]:
# joblib.dump(test,'test_data1337.jbl', compress=9)

['test_data1337.jbl']

In [42]:
# joblib.dump(train,'train_data1337.jbl', compress=9)

['train_data1337.jbl']