In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
# NOTE: `dirname` and `filename` stay bound after the loop finishes; the data-loading
# cell further down builds its CSV path from whatever values they hold last.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer, QuantileTransformer
from scipy.stats import uniform, geom, randint
from lightgbm import LGBMRegressor
from model_utils import dummify_persistent

## Getting the data

In [None]:
# Load the dataset, then shuffle both the row order and the column order.
# NOTE(review): the path reuses `dirname`/`filename` left over from the directory-listing
# loop in the first cell, i.e. the last values os.walk produced -- confirm this is the
# intended CSV.
# Fixed seeds keep both shuffles identical across runs, so everything downstream that
# depends on row or column order is reproducible.
df = (
    pd.read_csv(os.path.join(dirname, filename))
    .sample(frac=1, random_state=42)
    .sample(frac=1, axis='columns', random_state=42)
)

In [None]:
# Split off the target: `pop` removes the 'price' column from `df` in place and returns it,
# so re-running this cell without reloading `df` raises KeyError.
prices = df.pop('price')

df.shape

In [None]:
# Columns that are converted to pandas' 'category' dtype.
CATEGORICAL_FIELDS = ['closest_subway', 'AO', 'h3', 'hm']
for field in CATEGORICAL_FIELDS:
    df[field] = df[field].astype('category')
# Print the per-column dtype / non-null summary to confirm the conversion.
df.info()

## Setting up the new feature generator

In [None]:
def bearing_degree(lat1, lng1, center_lat=55.75222, center_lng=37.61556):
    '''
    Calculate the initial great-circle bearing from a point to a reference point.

    Despite the function's name, the result is expressed in radians, because it
    is the raw output of ``np.arctan2``.

    Parameters
    ----------
    lat1, lng1 : float or array-like
        Latitude and longitude of the starting point(s), in degrees.
    center_lat, center_lng : float, optional
        Latitude and longitude of the reference point, in degrees.
        The defaults are the coordinates of the Moscow center.

    Returns
    -------
    float or array-like
        Bearing in radians within [-pi, pi]: 0 means the reference point lies
        due north of the start, pi/2 due east.
    '''
    # The longitude difference is taken in degrees first and converted once.
    diff_lng = np.radians(center_lng - lng1)
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(center_lat)

    y = np.sin(diff_lng) * np.cos(lat2_rad)
    x = np.cos(lat1_rad) * np.sin(lat2_rad) - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(diff_lng)
    return np.arctan2(y, x)
 

def gen_new_features(df_original):
    '''
    Return a copy of ``df_original`` extended with engineered columns.

    The input frame is not modified. The new columns are appended in this
    order: rescaled features, ratio (interaction) features, and the bearing
    computed from the 'lat' / 'lon' columns.
    '''
    footage = df_original['footage']
    dist_to_center = df_original['subway_dist_to_center']
    max_floor = df_original['max_floor']

    return df_original.assign(
        # rescaling
        footage_sq=footage ** 2,
        subway_dist_to_center_sq=dist_to_center ** 2,
        max_floor_log=np.log(max_floor),
        # interaction features
        footage_per_room=footage / df_original['rooms'],
        footage_per_dist_to_center=footage / dist_to_center,
        footage_per_max_floor=footage / max_floor,
        dist_per_max_floor=dist_to_center / max_floor,
        footage_per_repair=footage / df_original['repair'],
        # geographical features
        bearing=bearing_degree(df_original['lat'], df_original['lon']),
    )


# Wrap the feature generator in a FunctionTransformer so it can be used as a pipeline step.
new_features_generator = FunctionTransformer(gen_new_features)
# Smoke test: shape of the full frame after the engineered columns are added.
new_features_generator.transform(df).shape

## Removing the outliers

In [None]:
from sklearn.ensemble import IsolationForest

# One-hot encode the categorical columns so the forest receives a numeric matrix.
# `columns=` has to be passed by keyword: the second positional parameter of
# pd.get_dummies is `prefix`, so the bare list was being used as prefixes rather
# than as the selection of columns to encode.
# random_state makes the set of rows flagged as outliers repeatable between runs.
outlier_markers = IsolationForest(contamination=0.1, random_state=42).fit_predict(
    pd.get_dummies(df, columns=CATEGORICAL_FIELDS)
)

# fit_predict labels inliers with 1 and outliers with -1; keep the inliers only
# and select the matching targets by index label.
df_filtered = df.loc[outlier_markers == 1]
prices_filtered = prices.loc[df_filtered.index]
df_filtered.shape

## Preparing and running the pipeline

In [None]:
# Stacked ensemble of three base learners; no final_estimator is passed, so
# StackingRegressor falls back to its default one.
regr = StackingRegressor([
    ('lgbm', LGBMRegressor(n_jobs=-1)),
    ('rf', RandomForestRegressor(n_jobs=-1, n_estimators=50, max_leaf_nodes=1000)),
    ('ridge', Ridge())
], n_jobs=-1)

# Encoding step built on the project helper `dummify_persistent`, applied to the listed columns.
# NOTE(review): judging by its name and the warm-up call below, the helper presumably remembers
# the categories it has seen so that every CV fold yields the same columns -- confirm in model_utils.
dm = FunctionTransformer(dummify_persistent, kw_args={'columns': [
    'closest_subway', 'AO', 'h3', 'hm', 'sg', 'repair', 'max_floor'
]})
dm.fit_transform(df)  # needed to establish the possible category space without folding

# Full pipeline: engineered features -> dummification -> stacked regressor that is
# fitted on a quantile-transformed target.
pipe = Pipeline([
    ('nf', new_features_generator),
    ('dummifier', dm),
    ('ttr', TransformedTargetRegressor(
        regressor=regr,
        transformer=QuantileTransformer(n_quantiles=500),

    ))
    ],
    verbose=True
)

In [None]:
# Search space for the randomized search. Entries may be lists (sampled uniformly)
# or scipy.stats distributions (sampled through their `rvs` method).
param_grid = {
    # geometric distribution with p=0.02: positive integers with mean 1/p = 50
    'ttr__regressor__lgbm__n_estimators': geom(p = 0.02),
#    'ttr__regressor__rf__n_estimators': geom(p = 0.02),
#    'ttr__regressor__ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'ttr__regressor__ridge__alpha': [300, 500, 1000, 3000],
}

# random_state fixes which 20 candidates are drawn, so the search can be repeated.
cv = RandomizedSearchCV(pipe, param_grid, n_jobs=-1, verbose=10, n_iter=20, random_state=42)
# Fit on the outlier-filtered rows only; the return value (the search object itself) is discarded.
_ = cv.fit(df_filtered, prices_filtered)

## Estimating the results

R^2 score

In [None]:
# Scored on the full data set: it contains the rows `cv` was fitted on (df_filtered)
# plus the rows dropped as outliers, so this is not a held-out estimate.
cv.score(df, prices)

Best estimator parameters

In [None]:
# Parameter setting selected by the randomized search.
cv.best_params_

Feature importance

In [None]:
# Fitted regressor that sits inside the TransformedTargetRegressor (last pipeline step).
final_regressor = cv.best_estimator_[-1].regressor_

# A StackingRegressor has no `feature_importances_` of its own. Use the model's own
# importances when it exposes them, otherwise fall back to the random-forest base
# learner ('rf') of the stack.
if hasattr(final_regressor, 'feature_importances_'):
    importance_model = final_regressor
else:
    importance_model = getattr(final_regressor, 'named_estimators_', {}).get('rf')

if importance_model is None:
    # Best effort: report the situation instead of silently swallowing every error.
    fi_dict = {}
    print('Feature importances are not available for', type(final_regressor).__name__)
else:
    importances = importance_model.feature_importances_
    # Prefer the names the model was actually fitted with (columns after dummification);
    # the pre-dummification columns would not line up with the importances.
    feature_names = getattr(importance_model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(len(importances))]
    # Importances expressed as percentages.
    fi_dict = {name: round(value * 100, 4) for name, value in zip(feature_names, importances)}

# Last expression of the cell, so the top 50 features are actually displayed.
sorted(fi_dict.items(), key=lambda item: item[1], reverse=True)[:50]

Model size

In [None]:
import pickle
import sys

# Serialize the whole fitted search object and print the size of the result in bytes.
# sys.getsizeof measures the `bytes` object itself, i.e. the pickle payload plus the
# small fixed overhead of the Python object.
p = pickle.dumps(cv)
print(sys.getsizeof(p))

Residuals analysis

In [None]:
# Per-row frame for the residual analysis, built over the full (unfiltered) data set.
df_plot = pd.DataFrame({
    'price': prices,
    'prediction': cv.predict(df),
    'outlier': outlier_markers
})

# The markers are -1 for rows flagged as outliers; turn them into a boolean flag.
df_plot['outlier'] = df_plot['outlier'] == -1

# Positive residual = the model predicts more than the actual price.
df_plot['residual'] = df_plot['prediction'] - df_plot['price']

from seaborn import relplot
# Scatter a random 10% of the rows, one panel per outlier flag.
relplot(x='price', y='residual', col='outlier', hue='outlier',
          data=df_plot.sample(frac=0.1))


Residuals skewness

In [None]:
from sklearn.linear_model import LinearRegression
# Regress the residuals on the actual price; the slope shows whether the error
# changes systematically with price (a slope of 0 would mean no linear trend).
lr = LinearRegression()
lr.fit(df_plot['price'].values.reshape(-1, 1), df_plot['residual'])
lr.coef_[0]