In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in file_names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# **Data import**

In [28]:
# Both competition tables live in the same Kaggle input folder.
INPUT_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train_data = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_data = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Separate the regression target (SalePrice) from the predictor columns.
y_all = train_data['SalePrice']
X_all = train_data.drop(columns='SalePrice')

# **Column description**

In [18]:
# Numeric predictors: int64/float64 columns, leaving out the row identifier and
# GarageYrBlt (which is added to the categorical list just below).
numeric_cols = (
    X_all
    .drop(columns=['Id', 'GarageYrBlt'])
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
# Categorical predictors: every object-typed column plus GarageYrBlt.
categorical_cols = [*X_all.select_dtypes(include='object').columns, 'GarageYrBlt']


# Training columns where more than half of the rows are missing / any row is missing.
half_of_rows = len(X_all.index) / 2
many_null_cols = [col for col in X_all.columns if X_all[col].isnull().sum() > half_of_rows]
any_null_cols = [col for col in X_all.columns if X_all[col].isnull().any()]

# How missing values are treated, per column group (applied in the recovery cell):
#   null_to_not       -> filled with the label 'not'
#   null_to_most_freq -> filled with the most frequent value
#   null_to_const     -> filled with a constant
null_to_not = [
    'MasVnrType', 'GarageType', 'GarageFinish', 'MiscFeature', 'Alley',
    'FireplaceQu', 'Fence', 'BsmtQual', 'PoolQC',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageQual',
    'GarageCond', 'GarageYrBlt',
]
null_to_most_freq = ['Electrical']
null_to_const = ['LotFrontage', 'MasVnrArea']

# Columns with gaps in the test table that have no gaps in the training table,
# split by dtype into "most frequent" (object) and "constant" (float64) groups.
any_null_cols_test = [col for col in test_data.columns if test_data[col].isnull().any()]
dif = [col for col in any_null_cols_test if col not in any_null_cols]
test_only_gaps = test_data[dif]
null_to_most_freq_test = test_only_gaps.select_dtypes(include='object').columns.to_list()
null_to_const_test = test_only_gaps.select_dtypes(include='float64').columns.to_list()

# Categorical columns that get an ordinal encoding; the rest are one-hot encoded.
cols_to_order = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'PoolQC',
    'FireplaceQu', 'GarageQual', 'GarageCond', 'Fence',
]

cols_to_onehot = [col for col in categorical_cols if col not in cols_to_order]

# **Data notes**

In [19]:
# Columns holding fewer than two distinct values (a missing value counts as a value).
less_then_two_unique_values_cols = [
    col for col in X_all.columns if X_all[col].nunique(dropna=False) < 2
]
#  len(less_then_two_unique_values_cols) => 0

# YearRemodAdd scores better when treated as numeric.
cat_cols_to_numeric = ['YearRemodAdd']

useless_cols = ['Id']

# Row labels of observations that look suspicious.
# LotFrontage above 200 => [934, 1298]
suspicious_obs = X_all.loc[X_all['LotFrontage'] > 200].index.to_list()
# Electrical is missing in a single training row (none in test) => 1379
suspicious_obs.append(X_all.loc[X_all['Electrical'].isnull()].index[0])


# Groups of columns that describe the same part of the house.
interdep_cols = [
    ['Condition1', 'Condition2'],
    ['Exterior1st', 'Exterior2nd'],
    [
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'BsmtFullBath', 'BsmtHalfBath',
    ],
    ['Fireplaces', 'FireplaceQu'],
    [
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
        'GarageQual', 'GarageCond',
    ],
    ['PoolArea', 'PoolQC'],
]

# Year columns: numeric, but potentially containing the 'not' label.
doubtful_cols = ['GarageYrBlt', 'YrSold', 'YearBuilt', 'YearRemodAdd']


# One-hot candidates with more than ten distinct values.
more_then_ten_to_onehot = [
    col for col in cols_to_onehot if X_all[col].nunique(dropna=False) > 10
]
# len(more_then_ten_to_onehot) => 4

# **Missing value recovery**

In [20]:
from sklearn.impute import SimpleImputer

# Work on copies so the raw frames (X_all, test_data) stay untouched and this
# cell can be re-run safely.
X = X_all.copy()
X_test = test_data.copy()

# In these columns a missing value is replaced with the explicit label 'not'.
X[null_to_not] = X[null_to_not].fillna('not')
X_test[null_to_not] = X_test[null_to_not].fillna('not')

# Missing numeric values are replaced with 0, including the float columns that
# have gaps only in the test table.
X[null_to_const] = X[null_to_const].fillna(0)
X_test[null_to_const] = X_test[null_to_const].fillna(0)
X_test[null_to_const_test] = X_test[null_to_const_test].fillna(0)

# Most-frequent imputation. The statistics are always learned from the training
# table and then applied to both tables, so the test table never drives its own
# fill values. The imputer output (a plain array) is assigned directly:
# wrapping it in a new DataFrame would make the assignment depend on both
# frames having a default 0..n-1 index.
imp = SimpleImputer(strategy='most_frequent')
X[null_to_most_freq] = imp.fit_transform(X[null_to_most_freq])
X_test[null_to_most_freq] = imp.transform(X_test[null_to_most_freq])

# Object columns with gaps only in the test table: by construction they are
# complete in the training table, so the training mode is well defined.
if null_to_most_freq_test:  # SimpleImputer cannot be fitted on zero columns
    imp_test_only = SimpleImputer(strategy='most_frequent')
    imp_test_only.fit(X[null_to_most_freq_test])
    X_test[null_to_most_freq_test] = imp_test_only.transform(X_test[null_to_most_freq_test])

# **Categorical transformation**

In [21]:
from sklearn.preprocessing import OrdinalEncoder

# Label orderings handed to the encoder: a label's position in its list becomes
# its encoded value. 'not' is the label written in place of missing values.
quality_scale = ['not', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
exposure_scale = ['not', 'No', 'Mn', 'Av', 'Gd']
finish_scale = ['not', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
fence_scale = ['not', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# One scale per entry of cols_to_order, in the same order:
# four quality columns, basement exposure, two basement finish types,
# six more quality columns, and the fence column.
categories_to_order = (
    [list(quality_scale) for _ in range(4)]
    + [list(exposure_scale), list(finish_scale), list(finish_scale)]
    + [list(quality_scale) for _ in range(6)]
    + [list(fence_scale)]
)

# Fit the ordinal encoder on the training table and reuse it for the test table.
ord_enc = OrdinalEncoder(categories=categories_to_order)
ord_enc.fit(X[cols_to_order])
X[cols_to_order] = ord_enc.transform(X[cols_to_order])
X_test[cols_to_order] = ord_enc.transform(X_test[cols_to_order])

# One-hot encode the remaining categorical columns (each table separately).
X_onehot = pd.get_dummies(X[cols_to_onehot])
X_test_onehot = pd.get_dummies(X_test[cols_to_onehot])

# Swap the raw categorical columns for their one-hot expansions.
X = pd.concat([X.drop(columns=cols_to_onehot), X_onehot], axis=1)
X_test = pd.concat([X_test.drop(columns=cols_to_onehot), X_test_onehot], axis=1)

# **Data optimization**

In [22]:
cols_to_drop = ['Id', 'MiscFeature', 'PoolQC', 'Alley']


def _without_features(frame, feature_names):
    """Return a copy of `frame` without the named features.

    A column is removed when its name equals one of `feature_names` or starts
    with ``<name>_``, which is how ``pd.get_dummies`` names the one-hot columns
    derived from a feature. Matching on the exact name / prefix (instead of
    "name appears anywhere in the column label") avoids dropping unrelated
    columns, and every matching column is dropped exactly once.

    Parameters
    ----------
    frame : pd.DataFrame
        Table to filter; it is not modified.
    feature_names : list of str
        Original feature names to remove.

    Returns
    -------
    pd.DataFrame
        New frame without the matching columns.
    """
    prefixes = tuple(f'{name}_' for name in feature_names)
    doomed = [
        col for col in frame.columns
        if col in feature_names or str(col).startswith(prefixes)
    ]
    return frame.drop(columns=doomed)


X = _without_features(X, cols_to_drop)
X_test = _without_features(X_test, cols_to_drop)

# **Model optimization**

In [23]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error

# Hyper-parameters of the regressor used for the validation and
# feature-importance cells below.
pre_model_params = {
    'n_estimators': 460,
    'learning_rate': 0.05,
    'random_state': 42,
    'n_jobs': -1,
    'reg_alpha': 1.3,
    'reg_lambda': 8,
    'max_depth': 4,
}
pre_model = XGBRegressor(**pre_model_params)

In [24]:
# import matplotlib.pyplot as plt
# # X[numeric_cols].shape
# X_num = X[numeric_cols]
# X_test_num = X_test[numeric_cols]
# for i in range(len(numeric_cols)):
#     X_num[numeric_cols[i]] = X_num[numeric_cols[i]].drop(X_num[numeric_cols[i]][X_num[numeric_cols[i]] == 0].index)
#     X_test_num[numeric_cols[i]] = X_test_num[numeric_cols[i]].drop(X_test_num[numeric_cols[i]][X_test_num[numeric_cols[i]] == 0].index)
#     plt.figure(figsize=(22, 4))
#     plt.title(numeric_cols[i])
#     X_num.iloc[:, i].hist(bins=100, )
#     X_test_num.iloc[:, i].hist(bins=100, histtype='step')

In [25]:
# Row labels excluded because of unusual categorical values.
# `drop_from_cat` is the short list actually used for training;
# `drop_from_cat_all` extends it with further candidates.
drop_from_cat = [
    1386, 1024, 1152, 1187, 185, 198, 267, 304, 635, 883, 1031, 1440, 944,
    1230, 1003, 29, 548, 595, 398, 185, 268, 760, 810, 1386, 1349,
]
drop_from_cat_all = drop_from_cat + [
    1416, 106, 991, 438, 1234, 848, 184, 1137, 653, 841, 809, 583, 1292,
    185, 1235, 735, 848, 653, 1397, 1244, 657, 146, 8, 1400, 1343,
]
# 1386 tennis court
# 1024 stone finish
# 1152 stone finish
# 1187 imitation stucco (finish)
# 185     2.5Fin two and a half storeys
# 198     2.5Fin
# 267     2.5Fin
# 304     2.5Fin
# 635     2.5Fin
# 883     2.5Fin
# 1031    2.5Fin
# 1440    2.5Fin
# 944   no water

# Row labels excluded because of unusual numeric values.
drop_from_num_all = [1298, 934, 249, 313, 335, 451, 706]
drop_from_num = []
# 1298 the largest one
# 934 the largest yard area (same as for 1298)
# 249     159000  large lot
# 313     215245
# 335     164660
# 451      70761
# 706     115149

# Training data with every candidate row removed.
rows_excluded_all = drop_from_cat_all + drop_from_num_all
X_new = X.drop(index=rows_excluded_all)
y_new = y_all.drop(index=rows_excluded_all)

# Training data with only the short lists removed.
rows_excluded = drop_from_cat + drop_from_num
X_sub = X.drop(index=rows_excluded)
y_sub = y_all.drop(index=rows_excluded)

# pre_model.fit(X, y_all)
# pred_all = pre_model.predict(X_new)
# mse_all = mean_squared_log_error(y_new, pred_all)
# print(mse_all)

# pre_model.fit(X_sub, y_sub)
# pred_new = pre_model.predict(X_new)
# mse_new = mean_squared_log_error(y_new, pred_new)
# print(mse_new)

# print(f'{mse_all - mse_new:.{19}f}')

In [26]:
from sklearn.model_selection import cross_val_score, train_test_split

# Hold-out split; only the commented-out "Split score" experiment below uses it.
X_train, X_valid, y_train, y_valid = train_test_split(X_sub, y_sub, random_state=42)

# In-sample score: fit and evaluate on the same rows (square root of the MSLE).
pre_model.fit(X_sub, y_sub)
pred_self = pre_model.predict(X_sub)
mse_self = mean_squared_log_error(y_sub, pred_self)
self_score = mse_self ** 0.5
print(f'Self score: {self_score}')

# pre_model.fit(X_train, y_train)
# pred_split = pre_model.predict(X_valid)
# mse_split = mean_squared_log_error(y_valid, pred_split)
# print(f'Split score: {mse_split}')


# 5-fold cross-validation. The scorer returns negated MSLE per fold, so the
# sign of the mean is flipped before taking the square root.
scores = cross_val_score(
    pre_model,
    X_sub,
    y_sub,
    cv=5,
    scoring='neg_mean_squared_log_error',
)
cv_score = (-scores.mean()) ** 0.5
print(f'CV score: {cv_score}')

Self score: 0.06710169915630847
CV score: 0.12615332854982214


In [12]:
# from sklearn.model_selection import GridSearchCV

# params = {
# #     'n_estimators': range(450, 1550, 50),
# #     'learning_rate': [0.005, 0.01, 0.05, 0.09, 0.1, 0.5, 0.9], 
# #     'reg_alpha': [a / 10 for a in range(12, 15)] + [0], 
# #     'reg_lambda': [a / 10 for a in range(75, 80)] + [0]
#     'max_depth': range(2, 6)
    
# #     'n_estimators': range(440, 470, 5),
# #     'learning_rate': [0.049, 0.05, 0.051], 
# #     'reg_alpha': [0, 1, 2, 3], 
# #     'reg_lambda': [0, 1, 7, 8, 9]
    
# }

# grid = GridSearchCV(
#     pre_model, 
#     params, 
#     n_jobs=-1, 
#     scoring='neg_mean_squared_log_error', 
#     cv=5,
#     verbose=2
# )

# grid.fit(X_sub, y_sub)

# print(grid.best_params_)

In [13]:
# from sklearn.model_selection import cross_val_score

# scores = cross_val_score(pre_model, X, y_all, cv=5, scoring='neg_mean_squared_log_error')
# scores.mean() * -1

In [14]:
import matplotlib.pyplot as plt

# Refit on the full training table and read the per-feature importances.
pre_model.fit(X, y_all)

feat_imp = pre_model.feature_importances_
feat_names = pre_model.feature_names_in_

# Single-column frame (column label 0) indexed by feature name, most important first.
df_imp = pd.DataFrame(feat_imp, index=feat_names)
df_imp = df_imp.sort_values(by=0, ascending=False)

# plt.figure(figsize=(15, 8))
# plt.xticks(rotation=45)
# plt.bar(df_imp.index, df_imp[0])

In [15]:
# Features whose importance is exactly zero in the fitted model.
zero_importance = df_imp[0].eq(0)
not_used_features = df_imp.loc[zero_importance].index

# Show the twenty most important features as the cell output.
df_imp.sort_values(by=0, ascending=False).iloc[:20]

Unnamed: 0,0
OverallQual,0.246468
GarageCars,0.111065
KitchenQual,0.048222
ExterQual,0.042798
BsmtQual,0.040921
GarageType_Attchd,0.037763
LandContour_Bnk,0.035048
GrLivArea,0.033727
MSZoning_RM,0.0224
FireplaceQu,0.021159


# **Experimental block**

In [None]:
# pre_model.fit(X, y_all)

# folds_list = list()
# for s in range(5, 10):
#     part = s / 10
#     X_t, X_v, y_t, y_v = train_test_split(X, y_all, random_state=42, train_size = part)
#     pre_model.fit(X_t, y_t)
#     pred = pre_model.predict(X_v)
#     score = mean_squared_log_error(y_v, pred)
#     folds_list.append([part, score])
    
    
# import matplotlib.pyplot as plt

# folds = [i[0] for i in folds_list]
# scores = [i[1] for i in folds_list]

# plt.plot(folds, scores)

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 4))
# plt.hist(X_all.MSSubClass, bins=len(X_all.index), histtype='step')

# **Final prediction**

In [None]:
# Give the test table exactly the training columns, in the training order.
# Training columns that do not exist in the test table come from the separate
# one-hot encoding of the two tables (a category level that never occurs in the
# test rows), so they are filled with 0 rather than left as NaN. Columns that
# exist only in the test table are discarded by the left join.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Same hyper-parameters as `pre_model`, including the seed, so the submission
# model matches the configuration that was validated above.
model = XGBRegressor(
    n_estimators=460,
    learning_rate=0.05,
    random_state=42,
    reg_alpha=1.3,
    reg_lambda=8,
    max_depth=4
)

# The model is trained on X_sub (X with the selected rows removed), so the
# prediction table must carry the same feature columns in the same order.
assert list(X_test.columns) == list(X_sub.columns), 'train/test feature columns differ'

# model.fit(X, y_all)
model.fit(X_sub, y_sub)

predictions = model.predict(X_test)

In [None]:
# Submission table: the test row identifiers next to the predicted prices,
# written to the working directory without the index column.
output = test_data[['Id']].assign(SalePrice=predictions)
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table as this cell's output.
output