In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by pandas/seaborn calls below.
warnings.filterwarnings('ignore')
# Default figure size for seaborn/matplotlib plots created without an explicit figsize.
sns.set(rc={'figure.figsize':(20,15)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def plot_correlation_matrix(corr_mtrx):
    """Draw a feature correlation matrix as a colour-coded image.

    Parameters
    ----------
    corr_mtrx : 2-D array-like exposing ``.shape``
        Matrix of pairwise correlations (e.g. the result of ``np.corrcoef``).

    Returns
    -------
    None
        The figure is created on the current matplotlib backend.
    """
    n_rows, n_cols = corr_mtrx.shape[0], corr_mtrx.shape[1]
    fig = plt.figure(figsize=(10, 10))
    plt.matshow(corr_mtrx, fignum=fig.number)
    # One tick per column on x and one per row on y (the original used the
    # column count for both axes, which is only right for square matrices).
    plt.xticks(range(n_cols), fontsize=9, rotation=45)
    plt.yticks(range(n_rows), fontsize=9)
    plt.colorbar()
    plt.title('Correlation Matrix', fontsize=10)

**Чтение данных**

Read train data

In [None]:
# Load the training split of the competition data and preview its first rows.
train = pd.read_csv('/kaggle/input/technoatom-technopark-property-prices-spring21/Train.csv')
train.head()

Read test data

In [None]:
# Load the test split of the competition data and preview its first rows.
test = pd.read_csv('/kaggle/input/technoatom-technopark-property-prices-spring21/Test.csv')
test.head()

Сохраним столбцы id обучающей (train) и тестовой (test) выборок

In [None]:
# Keep the identifier columns aside; test_id is written into the submission file at the end.
train_id = train['id']
test_id = test['id']

# Raw date values of the test rows, reused later for the per-period price adjustment.
test_date = test['date']

# Remove the id column from both frames (in place) so it is not used as a feature.
train.drop("id", axis = 1, inplace = True)
test.drop("id", axis = 1, inplace = True)

**Первичная обработка данных**

Оценим набор данных на выбросы

In [None]:
# Scatter of price against area to spot outliers visually.
fig, ax = plt.subplots()
ax.scatter(train['area'], train['price'])
ax.set_ylabel('цена', fontsize=15)
ax.set_xlabel('площадь', fontsize=15)
plt.show()

Удалим значения из правого нижнего угла данного графика

In [None]:
# Outliers: rows with area above 75 but price below 2,000,000 (bottom-right of the scatter plot).
outlier_mask = (train['area'] > 75) & (train['price'] < 2000000)
train = train.drop(train.loc[outlier_mask].index)

Проверим асимметрию (skew) распределения целевой переменной

In [None]:
# Inspect the raw price distribution: histogram with a fitted normal curve,
# followed by a probability plot on a separate figure.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it (warnings are silenced in this notebook).
color = sns.color_palette()
sns.set_style('darkgrid')

sns.distplot(train['price'], fit=stats.norm)

# Labels for the distribution plot
plt.ylabel('частота', fontsize=15)
plt.title('распределение цены', fontsize=15)

# Probability plot of the price, drawn on a new figure
fig = plt.figure()
res = stats.probplot(train['price'], plot=plt)
plt.show()

Видим заметную асимметрию (skew) распределения. Применим логарифмическое преобразование (log1p) к целевой переменной

In [None]:
# Replace the price by log(1 + price); predictions are mapped back with np.expm1 further down.
train['price'] = np.log1p(train['price'])

In [None]:
# Remove three narrow groups of rows, each defined by an area range and a price band.
# The price bounds are on the log1p scale because the target was already transformed above.
train = train.drop(train[(train['area']>25) & (train['area']<100) & (train['price']>13.814) & (train['price'] < 13.817)].index)
train = train.drop(train[(train['area']>90) & (train['area']<100) & (train['price']>15.31) & (train['price'] < 15.33)].index)
train = train.drop(train[(train['area']>120) & (train['area']<130) & (train['price']>15.425) & (train['price'] < 15.450)].index)

In [None]:
# Same diagnostics as before, now on the log-transformed price:
# histogram with a fitted normal curve, then a probability plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm availability.
color = sns.color_palette()
sns.set_style('darkgrid')

sns.distplot(train['price'], fit=stats.norm)

# Labels for the distribution plot
plt.ylabel('частота')
plt.title('распределение цены')

# Probability plot of the log price, drawn on a new figure
fig = plt.figure()
res = stats.probplot(train['price'], plot=plt)
plt.show()

In [None]:
# Scatter of (log-transformed) price against area after the outlier removal.
fig, ax = plt.subplots()
ax.scatter(train['area'], train['price'])
ax.set_ylabel('цена', fontsize=15)
ax.set_xlabel('площадь', fontsize=15)
plt.show()

Выделим отдельно целевую переменную

In [None]:
# Detach the (log-scale) target from the feature frame: pop returns the column
# and removes it from `train` in place.
y = train.pop('price')

Запомним размер обучающей выборки, так как следующим шагом объединим train и test

In [None]:
# Number of training rows; used later to split the merged frame back into train/test.
size_train = len(train)
print(size_train)
print(len(test))

In [None]:
# Stack train on top of test so that every preprocessing step is applied to both.
# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat is the
# supported equivalent. It builds a new frame, so `train` itself is left untouched,
# and ignore_index=True renumbers the rows 0..n-1 (train first, then test).
all_data = pd.concat([train, test], ignore_index=True)
all_data.head()

**Работа с пропущенными значениями**

In [None]:
all_data.info()

In [None]:
# Percentage of missing values per column.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
# Keep only columns that actually have gaps, sorted from most to least missing (top 30).
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
# NOTE(review): this expression is not the last one in the cell, so Jupyter will not render it.
missing_data.head(20)

# Bar chart of the missing-value percentages
f, ax = plt.subplots(figsize=(15, 12))  
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('признаки', fontsize=15)
plt.ylabel('процент пропущенных значений', fontsize=15);

In [None]:
# Fill gaps with values drawn uniformly at random from a fixed candidate list per column.
# The generator is seeded so that Restart & Run All reproduces the same imputation
# (the original used the unseeded global NumPy RNG, giving different data on every run).
rng = np.random.default_rng(33)
random_fill_values = {
    'g_lift': [0.0, 1.0],
    'build_tech': [0.0, 1.0, 2.0],
    'metro_dist': [0.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0],
}
for column, candidates in random_fill_values.items():
    # fillna aligns on the index, so the filler is built on all_data's own index
    # instead of relying on it being a default 0..n-1 range.
    filler = pd.Series(rng.choice(candidates, size=len(all_data)), index=all_data.index)
    all_data[column] = all_data[column].fillna(filler)

Поработаем над признаками

Удалим неинформативные признаки, у которых более 99,9% значений совпадают.

In [None]:
# Drop the keyword indicator columns kw3..kw13 in place; kw1 and kw2 are kept.
all_data.drop(['kw3', 'kw4', 'kw5', 'kw6', 'kw7', 'kw8', 'kw9', 'kw10', 'kw11', 'kw12', 'kw13'], axis='columns', inplace=True)

Разобьем дату на год и месяц

In [None]:
# Derive numeric year and month features from the date column, then remove the column itself.
parsed_dates = pd.to_datetime(all_data['date'])
all_data['year'] = parsed_dates.dt.year
all_data['month'] = parsed_dates.dt.month
del all_data['date']

In [None]:
# Correlation coefficients between columns (rowvar=False treats each column as a variable).
plot_correlation_matrix(np.corrcoef(all_data, rowvar=False))

In [None]:
all_data.info()

In [None]:
# Convert the categorical identifiers to strings so they are handled as categories
# (one-hot encoded by get_dummies, or passed to CatBoost as cat_features) rather than numbers.
categorical_columns = ['street_id', 'g_lift', 'kw1', 'kw2']
for column in categorical_columns:
    all_data[column] = all_data[column].apply(str)

Обработаем числовые признаки

In [None]:
# Sample skewness of the two continuous features (missing values ignored), largest first.
skewed_feats = all_data[['area', 'metro_dist']].apply(lambda x: stats.skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
# Wrap the result in a DataFrame for display.
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)

In [None]:
from scipy.special import boxcox1p  # Box-Cox transform of (1 + x)

# Keep only features whose absolute skew exceeds 0.68.
# Indexing a DataFrame with a boolean DataFrame (`skewness[abs(skewness) > 0.68]`)
# merely replaces the non-matching cells with NaN and keeps every row, so the
# threshold had no effect and all features were transformed. Filtering on the
# 'Skew' column drops the rows as intended.
skewness = skewness[skewness['Skew'].abs() > 0.68]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15  # Box-Cox lambda
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

Разделим данные

In [None]:
# Snapshot taken before one-hot encoding: this copy keeps the string-typed categorical
# columns and is the input of the CatBoost model that uses cat_features.
all_data_tree = all_data.copy()

Закодируем категориальные признаки с помощью one-hot кодирования (pd.get_dummies)

In [None]:
# One-hot encode the string-typed columns; numeric columns pass through unchanged.
all_data = pd.get_dummies(all_data)

Возвращаемся к разделению на train и test

In [None]:
# The merged frame holds the training rows first, so a positional slice at size_train
# separates the one-hot encoded train and test parts again.
train = all_data[:size_train]
test = all_data[size_train:]

In [None]:
# Same positional split for the non-encoded copy that keeps raw categorical columns.
train_tree = all_data_tree[:size_train]
test_tree = all_data_tree[size_train:]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows. Both feature variants are split with identical
# settings (same target, test_size and random_state).
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=33)
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(train_tree, y, test_size=0.3, random_state=33)

**Построение моделей**

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, Ridge
from sklearn.linear_model import LassoCV,RidgeCV,ElasticNetCV

from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization

from scipy.stats import randint as sp_randint

Cross-validation

In [None]:
def mae_cv(model):
    """Cross-validated MAE of ``model`` on the one-hot encoded training split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.

    Returns
    -------
    numpy.ndarray
        One (positive) mean absolute error per fold, 11 folds in total.
    """
    # Pass the splitter object itself. Calling `.get_n_splits(...)` on it returns
    # the plain integer 11, and `cv=11` makes scikit-learn build its own default
    # KFold, silently discarding shuffle=True and random_state=33.
    kf = KFold(n_splits=11, shuffle=True, random_state=33)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=kf)
    # The scorer yields negated errors (greater is better); flip the sign back.
    return -neg_mae_scores

In [None]:
def mae_cv_tree(model):
    """Cross-validated MAE of ``model`` on the training split with raw categorical columns.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.

    Returns
    -------
    numpy.ndarray
        One (positive) mean absolute error per fold, 11 folds in total.
    """
    # Pass the splitter object itself. Calling `.get_n_splits(...)` on it returns
    # the plain integer 11, and `cv=11` makes scikit-learn build its own default
    # KFold, silently discarding shuffle=True and random_state=33.
    kf = KFold(n_splits=11, shuffle=True, random_state=33)
    neg_mae_scores = cross_val_score(model, X_train_tree, y_train_tree, scoring='neg_mean_absolute_error', cv=kf)
    # The scorer yields negated errors (greater is better); flip the sign back.
    return -neg_mae_scores

In [None]:
# Shared 11-fold shuffled splitter with a fixed seed.
# NOTE(review): no later cell visible here reads this module-level `kfold` — confirm it is still needed.
kfold= KFold(n_splits=11,random_state=42,shuffle=True)

Построим base-learners

**lasso**

In [None]:
# Base learner: Lasso on standardized features with a very small L1 penalty (alpha=1e-08).
lasso = make_pipeline(StandardScaler(), Lasso(alpha=1e-08))

In [None]:
#lasso.get_params().keys()

In [None]:
#alphas_lasso = [0.00000001, 0.0000001, 0.000001] #Best value of alpha parmaters for lasso
#param_grid = [{'lasso__alpha': alphas_lasso}]
#gs = GridSearchCV(estimator=lasso,
#                  param_grid=param_grid,
#                  scoring='neg_mean_absolute_error',
#                  cv=5,
#                  n_jobs=-1,
#                  verbose=4)
#gs = gs.fit(X_train, y_train)
#print(gs.best_score_)#перекрестно-проверочная верность
#print(gs.best_params_)#наилучшие параметры

**ElasticNet**

In [None]:
# Base learner: ElasticNet on standardized features; l1_ratio=0.001 makes the penalty almost purely L2.
ENet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.0001, l1_ratio=0.001, max_iter=100000, precompute=True))

In [None]:
#ENet.get_params().keys()

In [None]:
#alpha_elnet= [0.00001, 0.0001]
#l1ratio_elnet = [0.00001, 0.001]
#param_grid = [{'elasticnet__alpha': alpha_elnet,
#               'elasticnet__l1_ratio': l1ratio_elnet}]
#gs = GridSearchCV(estimator=ENet,
#                  param_grid=param_grid,
#                  scoring='neg_mean_absolute_error',
#                  cv=5,
#                  n_jobs=-1,
#                  verbose=4)
#gs = gs.fit(X_train, y_train)
#print(gs.best_score_)#перекрестно-проверочная верность
#print(gs.best_params_)#наилучшие параметры

**Ridge**

In [None]:
# Base learner: Ridge on standardized features, fitted with the 'sag' solver.
# NOTE(review): no random_state is set for the solver — results may vary slightly between runs.
ridge = make_pipeline(StandardScaler(), Ridge(alpha=0.01, solver='sag'))

In [None]:
ridge.get_params().keys()

In [None]:
#alphas_ridge = [0.01, 0.02, 0.03] #Best value of alpha parmaters for Ridge regression
#param_grid = [{'ridge__alpha': alphas_ridge}]
#gs = GridSearchCV(estimator=ridge,
#                  param_grid=param_grid,
#                  scoring='neg_mean_absolute_error',
#                  cv=5,
#                  n_jobs=-1,
#                  verbose=4)
#gs = gs.fit(X_train, y_train)
#print(gs.best_score_)#перекрестно-проверочная верность
#print(gs.best_params_)#наилучшие параметры

**Bayes**

In [None]:
# Base learner: Bayesian ridge regression on robust-scaled features, with hand-picked
# hyperparameters for the priors over alpha and lambda.
Bayes = make_pipeline(RobustScaler(), BayesianRidge(alpha_1=10000, alpha_2=0.000001, lambda_1=0.000001, lambda_2=10000))

In [None]:
#Bayes.get_params().keys()

In [None]:
#bayesianridge__alpha_1 = [0.01, 0.1]
#bayesianridge__alpha_2 = [0.00001, 0.000001]
#bayesianridge__lambda_1 = [0.00001, 0.000001]
#bayesianridge__lambda_2 = [0.01, 0.1]
#param_grid = [{'bayesianridge__alpha_1': bayesianridge__alpha_1,
#               'bayesianridge__alpha_2': bayesianridge__alpha_2,
#               'bayesianridge__lambda_1': bayesianridge__lambda_1,
#               'bayesianridge__lambda_2': bayesianridge__lambda_2}]
#gs = GridSearchCV(estimator=Bayes,
#                  param_grid=param_grid,
#                  scoring='neg_mean_absolute_error',
#                  cv=5,
#                  n_jobs=-1,
#                  verbose=4)
#gs = gs.fit(X_train, y_train)
#print(gs.best_score_)#перекрестно-проверочная верность
#print(gs.best_params_)#наилучшие параметры

**CatBoost**

In [None]:
# CatBoost regressor optimising MAE directly; used both as a standalone model on the
# one-hot features and as the meta-model of the stack below.
# NOTE(review): no random seed is passed — confirm whether reproducible fits are required.
cat = CatBoostRegressor(loss_function='MAE', iterations=1000, learning_rate=0.36, depth=6, l2_leaf_reg=76,  bagging_temperature=0, thread_count=-1)

In [None]:
#grid = {'iterations': [1000, 1100, 1200]}

#grid_search_result = cat.grid_search(grid,
#                                    X=X_train,
#                                    y=y_train,
#                                    plot=True)
#grid_search_result

Стекинг моделей

In [None]:
from tqdm import tqdm

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor trained on out-of-fold predictions.

    Every base model is cloned and fitted once per fold; its predictions on the
    held-out part of each fold become one meta-feature column. The meta-model is
    then fitted on these columns (optionally next to the original features).

    Parameters
    ----------
    base_models : sequence of estimators
        First-level regressors; they are cloned, never fitted in place.
    meta_model : estimator
        Second-level regressor; cloned before fitting.
    n_folds : int, default 7
        Number of shuffled K-fold splits (fixed random_state=42).
    use_features_in_secondary : bool, default False
        If True the meta-model sees the original features followed by the
        base-model predictions, otherwise only the predictions.

    Notes
    -----
    TransformerMixin is inherited but this class defines no ``transform`` method.
    """

    def __init__(self, base_models, meta_model, n_folds=7, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by integer position from a pandas object or any array-like."""
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def fit(self, X, y):
        """Fit the per-fold base models and the meta-model.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Training features.
        y : Series or 1-D array-like
            Training target.

        Returns
        -------
        self
        """
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # One column of out-of-fold predictions per base model.
        out_of_fold_predictions = np.zeros((len(X), len(self.base_models)))
        for i, model in tqdm(enumerate(self.base_models), total=len(self.base_models)):
            for train_index, holdout_index in tqdm(kfold.split(X, y), total=self.n_folds):
                instance = clone(model)
                # Keep every fold's fitted clone; predict() aggregates over them.
                self.base_models_[i].append(instance)
                # Positional selection works for both pandas and NumPy inputs
                # (the original `.iloc` calls required pandas objects).
                instance.fit(self._take_rows(X, train_index), self._take_rows(y, train_index))
                y_pred = instance.predict(self._take_rows(X, holdout_index))
                out_of_fold_predictions[holdout_index, i] = y_pred

        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the fitted stack.

        For each base model the predictions of its per-fold clones are combined
        with the median, giving one meta-feature column per base model; the
        meta-model turns these columns into the final prediction.
        """
        meta_features = np.column_stack([
            np.median(np.column_stack([fold_model.predict(X) for fold_model in fold_models]), axis=1)
            for fold_models in self.base_models_
        ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        return self.meta_model_.predict(meta_features)

In [None]:
def mae(y, y_pred):
    """Return the mean absolute error between the targets ``y`` and the predictions ``y_pred``."""
    error = mean_absolute_error(y, y_pred)
    return error

CatBoost

In [None]:
# Fit the CatBoost model on the full one-hot encoded training set.
cat.fit(train, y)
# In-sample predictions, on the same log1p scale as the target.
cat_train_pred = cat.predict(train)
# Test predictions mapped back to the price scale (inverse of the log1p applied to the target).
cat_pred = np.expm1(cat.predict(test))
# Training-set MAE on the log scale; this is not a held-out estimate.
print(mae(y, cat_train_pred))

**CatBoost_categorical**

In [None]:
# Second CatBoost model that receives the string-typed columns as categorical
# features instead of their one-hot encoding.
cat_with_categorical = CatBoostRegressor(iterations=1100, loss_function='MAE', depth=7, learning_rate=0.15, l2_leaf_reg=24,
                                         cat_features=['street_id', 'g_lift', 'kw1', 'kw2'],
                                         bagging_temperature=0, thread_count=-1)

In [None]:
# Fit on the full training set that keeps raw categorical columns.
cat_with_categorical.fit(train_tree, y)
# In-sample predictions, on the same log1p scale as the target.
cat_train_pred_c = cat_with_categorical.predict(train_tree)
# Test predictions mapped back to the price scale.
cat_pred_c = np.expm1(cat_with_categorical.predict(test_tree))
# Training-set MAE on the log scale; this is not a held-out estimate.
print(mae(y, cat_train_pred_c))

In [None]:
#grid = {'iterations': [1000, 1200, 1400]}

#grid_search_result = cat_with_categorical.grid_search(grid,
#                                    X=X_train_tree,
#                                    y=y_train_tree)
#grid_search_result

**Stacking**

In [None]:
# Stack of the four linear base learners with CatBoost as meta-model; the meta-model
# also receives the original features (use_features_in_secondary=True).
stack_monster_6 = StackingAveragedModels(base_models=(lasso, ENet, Bayes, ridge),
                                meta_model=cat,
                                use_features_in_secondary=True)

In [None]:
# Fit the stack on the full one-hot encoded training set.
stack_monster_6.fit(train, y)
# In-sample predictions, on the same log1p scale as the target.
stack_monster_6_train_pred = stack_monster_6.predict(train)
# Test predictions mapped back to the price scale.
stack_monster_6_pred = np.expm1(stack_monster_6.predict(test))

In [None]:
# Training-set MAE (log scale) of the stack. Its meta-model is the CatBoost regressor
# `cat`, not a lasso, so the printed label is corrected accordingly.
print('Stack (CatBoost meta-model)', mae(y, stack_monster_6_train_pred))

In [None]:
# Grid search over blend weights (steps of 0.01, summing to 1) for the three models,
# minimising the MAE of the blended training predictions on the log scale.
# NOTE(review): the weights are tuned on in-sample predictions, which favours the
# model that fits the training data most closely — consider out-of-fold predictions.
min_ensemble = float('inf')
best_cat = 0
best_cat_c = 0
# Kept only for compatibility with any other cells; not used by this search.
best_stack = 0
best_stack_2 = 0
best_stack_3 = 0
best_stack_4 = 0
best_stack_5 = 0
best_stack_6 = 0

# Enumerate the weights as integer percentages and derive the third weight from
# the other two. The original tested `a + b + c == 1` on floats produced by
# np.arange, which silently skipped valid triples whose rounded sum was not
# exactly 1.0, and it iterated over ~1e6 combinations to find ~5e3 candidates.
for cat_pct in range(101):
    for cat_c_pct in range(101 - cat_pct):
        stack_6_pct = 100 - cat_pct - cat_c_pct
        cat_ = cat_pct / 100
        cat_c_ = cat_c_pct / 100
        stack_6_ = stack_6_pct / 100
        cur_ensemble = mae(y, cat_train_pred*cat_ + cat_train_pred_c*cat_c_ + stack_monster_6_train_pred*stack_6_)
        if cur_ensemble < min_ensemble:
            min_ensemble = cur_ensemble
            best_cat = cat_
            best_cat_c = cat_c_
            best_stack_6 = stack_6_

min_ensemble

Ensemble prediction

In [None]:
# Weighted blend of the three test predictions (already on the price scale), using
# the weights that were selected on the log-scale training predictions.
ensemble = cat_pred*best_cat + cat_pred_c*best_cat_c + stack_monster_6_pred*best_stack_6

In [None]:
# Helper frame pairing every blended test prediction with the raw date value of its row.
# Despite its name, the 'year' column holds the full date string taken from the test
# file (it is compared against values such as '2012-3' below).
magic = pd.DataFrame()
magic['price'] = ensemble
# Positional assignment: predictions and test_date are both in test-file row order.
magic['year'] = test_date.values
# The original cell also copied test['year'] / test['month'] into unused variables and
# cast those columns to str on a slice of all_data; that dead code was removed because
# it had no effect on the submission and mutated the feature frame after prediction.

In [None]:
#2012-3
# Multiplicative correction applied to the predicted price, keyed by the raw date
# string of the row. Rows whose date is not listed keep their price (factor 1).
price_factor_by_period = {
    '2012-3': 1.01,
    '2012-4': 1.0133,
    '2012-5': 1.0166,
    '2012-6': 1.0199,
    '2012-7': 1.0232,
    '2012-8': 1.0265,
    '2012-9': 1.0298,
    '2012-10': 1.0331,
    '2012-11': 1.0364,
    '2012-12': 1.0397,
    '2013-1': 1.043,
    '2013-2': 1.0463,
    '2013-3': 1.0496,
    '2013-4': 1.0529,
    '2013-5': 1.0562,
    '2013-6': 1.0595,
    '2013-7': 1.0628,
    '2013-8': 1.0661,
    '2013-9': 1.0694,
    '2013-10': 1.0727,
    '2013-11': 1.076,
    '2013-12': 1.08,
}
# Look up each row's factor; dates missing from the table map to NaN and fall back to 1.0.
period_factor = magic['year'].map(price_factor_by_period).fillna(1.0)
magic['price'] = magic['price'] * period_factor
price = magic['price'].values.copy()

In [None]:
# From here on `ensemble` holds the period-adjusted prices instead of the raw blend.
ensemble = price

In [None]:
ensemble

Submission

In [None]:
# Assemble the submission (test ids with the final predicted prices) and write it as CSV.
submission = pd.DataFrame({'id': test_id, 'price': ensemble})
submission.to_csv('Ensembling.csv', index=False)