In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from catboost import Pool

import featuretools as ft
from featuretools import selection

from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, _fit_transform_one, _transform_one, make_union
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, mutual_info_classif


from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from category_encoders import BackwardDifferenceEncoder, HelmertEncoder, BinaryEncoder
from category_encoders import CountEncoder
from category_encoders import LeaveOneOutEncoder, TargetEncoder, JamesSteinEncoder, MEstimateEncoder, WOEEncoder, CatBoostEncoder
from category_encoders.wrapper import NestedCVWrapper

from catboost import CatBoostRegressor

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')

np.random.seed(42)
import os
import typing
from autogluon.tabular import TabularDataset, TabularPredictor
import re
import joblib

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


The next non-bugfix release of Featuretools will not support Python 3.6



# Теперь добавим генерацию фичей

In [2]:
# Load the full dataset (rows with and without a known target) from disk.
data = pd.read_csv(filepath_or_buffer='/app/full_data.csv')

In [3]:
# Rows with a known target form the training part; rows whose target is
# missing form the test part.
target_missing = data.per_square_meter_price.isna()
train = data[~target_missing]
test = data[target_missing]

In [4]:
from baseline.raif_hack.data_transformers import SmoothedTargetEncoding

In [5]:
# Name of the target column in the raw data.
target = 'per_square_meter_price'
# feature(s) to which smoothed target encoding is applied
# (later cells pass this list to the target encoder)
ste_cat_features = ['region', 'city', 'realty_type', 'month', 'year', 'day',
                    'street','floor', 'osm_city_nearest_name']

# features to which one-hot encoding is applied (currently none)
ohe_cat_features = []

# numeric features (later cells pass this list to the scaler)
num_features = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
      'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500',
        'total_square','nn_100m_price', 'nn_300m_price', 'nn_1000m_price', 
        'mmvb_lag1', 'mmvb_lag2', 'number_of_supply1', 'number_of_supply2', 
        'price_dynamic1', 'price_dynamic2', 'mean_sqm_price1', 
        'mean_sqm_price2', 'exp_time', 'ipc_all_month', 'ipc_all_year', 
        'ipc_goods_month', 'ipc_goods_year', 'ipc_build_month', 
        'ipc_build_year', 'miacr', 'ipc_base', 'ipc_chain', 'interest_rate',
        'inc_per_capita', 'invest_residue', 'debts']

# Names of the train columns that contain at least one missing value.
na_features = train[train.columns[train.isnull().any()]].columns

In [6]:
# Textual floor labels that are mapped to a fixed storey number.
_FLOOR_LABELS = {'3 этаж': 3, '4 этаж': 4, '5 этаж': 5}


def _normalize_floor_value(raw):
    """Convert one raw ``floor`` value to a number, or NaN if it cannot be read."""
    if isinstance(raw, str):
        # Any label containing 'одва' is encoded as -1.
        if 'одва' in raw:
            return -1
        if raw in _FLOOR_LABELS:
            return _FLOOR_LABELS[raw]
    try:
        return np.float64(raw)
    except (TypeError, ValueError):
        # Unparseable labels are treated as missing.
        return np.nan


def floor(df2):
    """Return a copy of ``df2`` whose ``floor`` column is normalised to numbers.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame with a ``floor`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df2`` where every non-missing ``floor`` value is replaced by:
        its float value when it parses as a number; ``-1`` when it is a string
        containing ``'одва'``; ``3``/``4``/``5`` for the labels ``'3 этаж'``,
        ``'4 этаж'``, ``'5 этаж'``; NaN otherwise. Missing values stay missing.
    """
    df = df2.copy()

    # A numeric column holds no text labels, so there is nothing to normalise
    # (the string accessor would raise on such a column).
    if pd.api.types.is_numeric_dtype(df['floor']):
        return df

    # Work on an object column so numbers can be written next to leftover values.
    df['floor'] = df['floor'].astype(object)

    known = df['floor'].notnull()
    if not known.any():
        return df

    raw_values = df.loc[known, 'floor']
    # Convert each distinct value once, then apply the lookup to every row in a
    # single pass instead of re-scanning the column per unique value.
    lookup = {raw: _normalize_floor_value(raw) for raw in raw_values.unique()}
    df.loc[known, 'floor'] = [lookup[raw] for raw in raw_values]
    return df

In [7]:
# Normalise the raw `floor` column in both parts of the data.
train, test = floor(train), floor(test)

In [8]:
# Collapse every floor above 30 into the single category 'mnogo'.
# The previous form, `frame[mask].floor = ...`, assigned to a temporary copy
# produced by the boolean filter and therefore never changed train/test;
# `.loc` writes into the frame itself.
for frame in (train, test):
    # Compare on a numeric view so leftover text values (including 'mnogo'
    # from a previous run of this cell) count as "not above 30" instead of
    # raising on a str/int comparison.
    floor_number = pd.to_numeric(frame['floor'], errors='coerce')
    # Make sure the column can hold a string next to the numeric floors.
    frame['floor'] = frame['floor'].astype(object)
    frame.loc[floor_number > 30, 'floor'] = 'mnogo'

In [9]:
# Merge application data
# Flag the origin of every row so the two parts can be separated again after
# feature generation.
train['Test'] = False
test['Test'] = True
test['TARGET'] = np.nan
train = train.rename(columns={'per_square_meter_price': 'TARGET'})
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. Train rows come first, then test rows;
# reset_index() exposes the new 0..n-1 row number as an 'index' column.
app = pd.concat([train, test], ignore_index=True, sort=False).reset_index()

In [10]:
# Create an entity set with id 'raif'; the merged dataframe is attached to it
# in the following cells.
es = ft.EntitySet(id='raif')

In [11]:
# Add dataframe to entityset: register `app` as entity 'raif', using its
# 'index' column as the entity index.
# NOTE(review): the next cell calls entity_from_dataframe again with the same
# entity_id plus explicit variable types; presumably that call supersedes this
# one, making this cell redundant — confirm.
es = es.entity_from_dataframe(
    entity_id='raif',
    dataframe=app,
    index='index'
)

In [12]:
# Columns that must be treated as categorical rather than left to type inference.
categorical_columns = [
    'city',
    'osm_city_nearest_name',
    'region',
    'realty_type',
    'month',
    'year',
    'day',
    'street',
    'floor',
]

# Manually defined datatypes for the app dataframe: the categorical columns
# above plus the datetime column.
variable_types = {
    column: ft.variable_types.Categorical for column in categorical_columns
}
variable_types['date'] = ft.variable_types.Datetime

# Register the dataframe in the entity set with the manual datatypes.
es = es.entity_from_dataframe(
    entity_id='raif',
    dataframe=app,
    index='index',
    variable_types=variable_types,
)

In [13]:
# Primitives handed to deep feature synthesis.
agg_primitives = ['count', 'mean', 'num_unique', 'entropy', 'n_most_common']
trans_primitives = ['cum_sum', 'is_weekend', 'percentile']

# All options of the DFS run collected in one place.
dfs_options = dict(
    entityset=es,
    target_entity='raif',
    trans_primitives=trans_primitives,
    agg_primitives=agg_primitives,
    max_features=1000,
    chunk_size=5000,
    verbose=True,
    max_depth=2,
    n_jobs=-1,
)

# dfs_feat is the generated feature matrix, dfs_defs the list of feature definitions.
dfs_feat, dfs_defs = ft.dfs(**dfs_options)

Built 301 features
Fewer chunks (57), than workers (64) consider reducing the chunk size
EntitySet scattered to 64 workers in 26 seconds
Elapsed: 00:15 | Progress: 100%|██████████


In [14]:
# Run featuretools' low-information filter over the generated feature matrix.
# NOTE(review): the result is bound to `filtered`, but the following cells keep
# reading `dfs_feat`, so this filtering does not appear to affect the saved
# data — confirm whether `filtered` was meant to be used downstream.
filtered = selection.remove_low_information_features(dfs_feat)

In [15]:
# Split data back into test + train
# The boolean mask is taken from `app` and applied positionally, so this
# assumes dfs_feat kept the row order of `app` — TODO confirm.
# astype(bool) guarantees that `~` is a logical negation even if the flag
# column ended up with object dtype.
is_test = app['Test'].values.astype(bool)
train = dfs_feat.loc[~is_test, :].copy()
test = dfs_feat.loc[is_test, :].copy()

# The target column is not carried in the test part.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
test = test.drop(columns='TARGET')

In [16]:
# Columns that are dropped from both parts before modelling
# (the 'Test' flag was only needed to split the merged frame).
service_columns = ['Unnamed: 0', 'id', 'data_type', 'Test']

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
train = train.drop(columns=service_columns)
test = test.drop(columns=service_columns)

In [17]:
# Target encoder for the categorical columns and standard scaler for the
# numeric ones; both are fitted in the cells below.
# (A median SimpleImputer was tried here and is disabled:
#  imputer = SimpleImputer(strategy='median', missing_values=np.nan))
encoder, scaler = TargetEncoder(), StandardScaler()

In [18]:
# Names of the train columns that still contain at least one missing value.
has_missing = train.isnull().any()
na_features = train[train.columns[has_missing]].columns

In [19]:
# Fit the target encoder on the training rows and replace the categorical
# columns with their encoded values.
train[ste_cat_features] = encoder.fit_transform(train[ste_cat_features], train.TARGET)
# Fill remaining gaps with the per-column medians of train.
train = train.fillna(train.median())
# Fit the scaler and write the scaled values back positionally.
# Wrapping the array in a new DataFrame gave it a fresh 0..n-1 index, and
# DataFrame-to-column assignment aligns on index labels: that only worked as
# long as train's own labels happened to be exactly 0..n-1. Assigning the raw
# array has no such dependency.
train[num_features] = scaler.fit_transform(train[num_features])

In [20]:
# Apply the encoder fitted on train to the categorical columns of test.
test[ste_cat_features] = encoder.transform(test[ste_cat_features])
# Apply the scaler fitted on train and write the values back positionally.
# The previous code wrapped the array in a new DataFrame with a 0..n-1 index;
# column assignment aligns on index labels, and test keeps the labels of the
# rows it was selected from in dfs_feat, so the labels did not line up and
# scaled values were replaced by NaN or attached to the wrong rows.
test[num_features] = scaler.transform(test[num_features])

In [21]:
# Keep only the training rows whose price_type equals 1.
train = train.loc[train['price_type'] == 1]

In [22]:
# price_type was only needed for the row filter above; remove it from both parts.
# `columns=` replaces the positional axis argument (rejected by pandas 2.0),
# and rebinding avoids an in-place drop on the filtered slice of train.
train = train.drop(columns=['price_type'])
test = test.drop(columns=['price_type'])

In [23]:
# Remove every generated feature that was computed from the target.
# Only 'PERCENTILE(TARGET)' used to be dropped, but the DFS run also applies
# the cum_sum primitive, so other TARGET-derived columns (e.g. a cumulative
# sum of TARGET) can remain and leak the target into the training features.
# The TARGET column itself is kept.
train_target_derived = [
    column for column in train.columns
    if column != 'TARGET' and 'TARGET' in str(column)
]
test_target_derived = [
    column for column in test.columns
    if column != 'TARGET' and 'TARGET' in str(column)
]
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
train = train.drop(columns=train_target_derived)
test = test.drop(columns=test_target_derived)

In [24]:
# Fill the remaining gaps with train medians, then drop every column that
# still contains a missing value. The test part is filled with the medians of
# the (already processed) train part.
train = train.fillna(value=train.median()).dropna(axis='columns')
test = test.fillna(value=train.median()).dropna(axis='columns')

In [27]:
# Persist the prepared training frame with maximum compression.
joblib.dump(value=train, filename='train_data_huge3.jbl', compress=9)

['train_data_huge3.jbl']

In [28]:
# Persist the prepared test frame with maximum compression.
joblib.dump(value=test, filename='test_data_huge3.jbl', compress=9)

['test_data_huge3.jbl']