# Load libraries and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory holding the competition CSVs; set to '.' when running outside Kaggle.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

### Correlation, first look

In [None]:
# Pearson correlation of every numeric column with SalePrice, strongest first.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on them in pandas >= 2.0.
corr_matrix = (
    train.select_dtypes(include = 'number')
         .corr()[['SalePrice']]
         .sort_values(by = ['SalePrice'], ascending = False)
         .drop(['SalePrice'])
)
# Styler.set_precision was removed in pandas 2.0; an explicit format string works on every version.
corr_matrix.style.background_gradient(cmap = 'coolwarm').format('{:.2f}')

# EDA, missing values and data tuning

#### I will make all the transformation of data by hand in order to better understand and improve control over it. 

### Missing values

In [None]:
# Per-column count of missing values (largest first) next to the column dtype.
na_counts = train.isna().sum().sort_values(ascending = False)
missing = pd.concat([na_counts, train.dtypes], axis = 1, keys = ['Total', 'Type'])
# Show only the columns that actually contain missing values.
missing.loc[missing['Total'] > 0]

I will not drop any columns with missing values for now, first look at the data and try to fill them some way.

### Identifying and removing outliers

I will identify outliers before any data transformation, because at this stage there are fewer numeric columns to inspect. However, I will drop the respective rows only after the transformation: train and test data can contain different sets of categories, so dropping rows before data normalization can cause errors.

In [None]:
train.describe().transpose()

In [None]:
sns.boxplot(data = train['LotFrontage'], orient = 'h')

In [None]:
train['LotFrontage'].sort_values(ascending = False)

##### This sorting shows that rows <span style="color:red">934 and 1298</span> contain outliers. I can't remove them now - I tried already, and it causes an error when normalizing the **test** data. So I only note these rows and will see if I can deal with them later, after the normalization step.

In [None]:
sns.boxplot(data = train['LotArea'], orient = 'h')

In [None]:
train['LotArea'].sort_values(ascending = False)

##### <span style="color:red">313, 335, 249, 706</span>

In [None]:
sns.boxplot(data = train['MasVnrArea'], orient = 'h')

In [None]:
train['MasVnrArea'].sort_values(ascending = False)

##### <span style="color:red">297, 1169</span>

In [None]:
sns.boxplot(data = train['BsmtFinSF1'], orient = 'h')

In [None]:
train['BsmtFinSF1'].sort_values(ascending = False)

##### <span style="color:red">1298</span>

In [None]:
sns.boxplot(data = train['BsmtFinSF2'], orient = 'h')

In [None]:
train['BsmtFinSF2'].sort_values(ascending = False)

##### <span style="color:red">322</span>

In [None]:
sns.boxplot(data = train['BsmtUnfSF'], orient = 'h')

In [None]:
sns.boxplot(data = train['TotalBsmtSF'], orient = 'h')

In [None]:
train['TotalBsmtSF'].sort_values(ascending = False)

##### <span style="color:red">1298</span> again

In [None]:
sns.boxplot(data = train['1stFlrSF'], orient = 'h')

In [None]:
train['1stFlrSF'].sort_values(ascending = False)

##### <span style="color:red">1298</span>

In [None]:
sns.boxplot(data = train['2ndFlrSF'], orient = 'h')

In [None]:
sns.boxplot(data = train['LowQualFinSF'], orient = 'h')

In [None]:
sns.boxplot(data = train['GrLivArea'], orient = 'h')

In [None]:
train['GrLivArea'].sort_values(ascending = False)

##### <span style="color:red">1298, 523, 1182, 691</span>

In [None]:
sns.boxplot(data = train['BsmtFullBath'], orient = 'h')

In [None]:
train['BsmtFullBath'].sort_values(ascending = False)

##### <span style="color:red">738</span>
##### the only lot with 3 basement full baths

In [None]:
sns.boxplot(data = train['BsmtHalfBath'], orient = 'h')

In [None]:
sns.boxplot(data = train['FullBath'], orient = 'h')

In [None]:
sns.boxplot(data = train['HalfBath'], orient = 'h')

In [None]:
sns.boxplot(data = train['BedroomAbvGr'], orient = 'h')

In [None]:
train['BedroomAbvGr'].sort_values(ascending = False)

##### <span style="color:red">635</span>

In [None]:
sns.boxplot(data = train['KitchenAbvGr'], orient = 'h')

In [None]:
sns.boxplot(data = train['TotRmsAbvGrd'], orient = 'h')

In [None]:
sns.boxplot(data = train['Fireplaces'], orient = 'h')

In [None]:
train['Fireplaces'].sort_values(ascending = False)

In [None]:
sns.boxplot(data = train['GarageArea'], orient = 'h')

In [None]:
sns.boxplot(data = train['WoodDeckSF'], orient = 'h')

In [None]:
train['WoodDeckSF'].sort_values(ascending = False)

In [None]:
sns.boxplot(data = train['OpenPorchSF'], orient = 'h')

In [None]:
train['OpenPorchSF'].sort_values(ascending = False)

##### <span style="color:red">1328, 495, 583</span>

In [None]:
sns.boxplot(data = train['EnclosedPorch'], orient = 'h')

In [None]:
train['EnclosedPorch'].sort_values(ascending = False)

##### <span style="color:red">197</span>

In [None]:
sns.boxplot(data = train['3SsnPorch'], orient = 'h')

In [None]:
train['3SsnPorch'].sort_values(ascending = False)

In [None]:
sns.boxplot(data = train['ScreenPorch'], orient = 'h')

In [None]:
train['ScreenPorch'].sort_values(ascending = False)

#### So, my outliers are:
934, 1298, 313, 335, 249, 706, 297, 1169, 322, 523, 1182, 691, 738, 635, 1328, 495, 583, 197

### MSSubClass: Identifies the type of dwelling involved in the sale.

In [None]:
train['MSSubClass'].value_counts()

In [None]:
corr_matrix.loc[['MSSubClass']]

Correlation with price is negative. Most of the values are in categories 20 (1-STORY 1946 & NEWER ALL STYLES), 50 (1-1/2 STORY FINISHED ALL AGES) and 60 (2-STORY 1946 & NEWER). This column seems useless - we have year in another. So, I will delete it.

In [None]:
# Drop by keyword: the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns = 'MSSubClass', errors = 'ignore')

### MSZoning: Identifies the general zoning classification of the sale.

In [None]:
train['MSZoning'].value_counts()

No data about correlation - it can be estimated only for numeric values. So, let's make transformation. This column is about density. The less - the better, I think.

##### This is the first of a large group of columns in my data processing where I apply these steps:
* group data by categories
* estimate the mean price for each category
* normalize mean prices
* set these normalized values as new names of categories
* change their type to float - this step allows to estimate correlation

##### Code goes here:

In [None]:
# Target-encode MSZoning: each category becomes its mean SalePrice, min-max scaled.
msz = train.groupby('MSZoning', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
msz['normalized'] = ((msz['SalePrice'] - msz['SalePrice'].min()) / (msz['SalePrice'].max() - msz['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['MSZoning'] = train['MSZoning'].map(msz.set_index('MSZoning')['normalized']).astype(float)

And now?

In [None]:
# Correlation of the freshly encoded MSZoning with SalePrice.
# Only the two numeric columns involved are passed to corr(); calling it on the whole frame
# raises on the remaining string columns in pandas >= 2.0.
train[['MSZoning', 'SalePrice']].corr().loc[['MSZoning'], ['SalePrice']]

Ok, I will do it only once, just to show the changes.

### LotFrontage: Linear feet of street connected to property

Got some missing values here. I will use mean to fill them.

In [None]:
train['LotFrontage'].skew()

In [None]:
train['LotFrontage'].hist()

In [None]:
# Fill missing LotFrontage with the column mean.
# Assign the result back: fillna(inplace=True) on a column selection does not update `train` under Copy-on-Write.
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())

### LotArea: Lot size in square feet

In [None]:
train['LotArea'].value_counts()

Nothing to do here.

### Street: Type of road access to property

In [None]:
train['Street'].value_counts()

This one is extremely skewed (99.6% and 0.4%), and there is no way to improve it. I will drop this column, because it can hardly have any impact on the price.

In [None]:
# Drop by keyword: the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns = 'Street', errors = 'ignore')

### Alley: Type of alley access to property

A lot of missing values (1369 out of 1460). But I will transform this one other way. The presence of alley will be encoded as 1, the absence as 0.

In [None]:
# Encode Alley as a presence flag: 1 for any alley type (Grvl / Pave), 0 when missing.
# Exact-value replacement is used instead of regex substitution, and the result is assigned
# back rather than filled in place through a column selection (which is not reliable under Copy-on-Write).
train['Alley'] = train['Alley'].fillna('0').replace({'Grvl': '1', 'Pave': '1'}).astype(int)

### LotShape: General shape of property

In [None]:
# Target-encode LotShape: each category becomes its mean SalePrice, min-max scaled.
lshp = train.groupby('LotShape', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
lshp['normalized'] = ((lshp['SalePrice'] - lshp['SalePrice'].min()) / (lshp['SalePrice'].max() - lshp['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['LotShape'] = train['LotShape'].map(lshp.set_index('LotShape')['normalized']).astype(float)

### LandContour: Flatness of the property

In [None]:
# Target-encode LandContour: each category becomes its mean SalePrice, min-max scaled.
lctr = train.groupby('LandContour', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
lctr['normalized'] = ((lctr['SalePrice'] - lctr['SalePrice'].min()) / (lctr['SalePrice'].max() - lctr['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['LandContour'] = train['LandContour'].map(lctr.set_index('LandContour')['normalized']).astype(float)

### Utilities: Type of utilities available

In [None]:
train['Utilities'].value_counts()

Almost all rows share the same value, so this column cannot carry any useful signal for the price. Drop it.

In [None]:
# Drop by keyword: the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns = 'Utilities', errors = 'ignore')

### LotConfig: Lot configuration

In [None]:
# Target-encode LotConfig: each category becomes its mean SalePrice, min-max scaled.
lcfg = train.groupby('LotConfig', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
lcfg['normalized'] = ((lcfg['SalePrice'] - lcfg['SalePrice'].min()) / (lcfg['SalePrice'].max() - lcfg['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['LotConfig'] = train['LotConfig'].map(lcfg.set_index('LotConfig')['normalized']).astype(float)

### LandSlope: Slope of property

In [None]:
# Target-encode LandSlope: each category becomes its mean SalePrice, min-max scaled.
lslp = train.groupby('LandSlope', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
lslp['normalized'] = ((lslp['SalePrice'] - lslp['SalePrice'].min()) / (lslp['SalePrice'].max() - lslp['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['LandSlope'] = train['LandSlope'].map(lslp.set_index('LandSlope')['normalized']).astype(float)

### Neighborhood: Physical locations within Ames city limits

In [None]:
# Target-encode Neighborhood: each category becomes its mean SalePrice, min-max scaled.
ngbh = train.groupby('Neighborhood', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
ngbh['normalized'] = ((ngbh['SalePrice'] - ngbh['SalePrice'].min()) / (ngbh['SalePrice'].max() - ngbh['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Neighborhood'] = train['Neighborhood'].map(ngbh.set_index('Neighborhood')['normalized']).astype(float)

### Condition1: Proximity to various conditions. Condition2: Proximity to various conditions (if more than one is present)

Just like the previous section (and the next one, **Condition2**), this one reflects the property's location. Since 99% of **Condition2** values are **Norm**, I will drop that column.

In [None]:
# Target-encode Condition1: each category becomes its mean SalePrice, min-max scaled.
cond1 = train.groupby('Condition1', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
cond1['normalized'] = ((cond1['SalePrice'] - cond1['SalePrice'].min()) / (cond1['SalePrice'].max() - cond1['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Condition1'] = train['Condition1'].map(cond1.set_index('Condition1')['normalized']).astype(float)
# Drop Condition2 by keyword (positional `axis` was removed in pandas 2.0); errors='ignore' keeps the cell re-runnable.
train = train.drop(columns = 'Condition2', errors = 'ignore')

### BldgType: Type of dwelling

In [None]:
# Target-encode BldgType: each category becomes its mean SalePrice, min-max scaled.
bldg = train.groupby('BldgType', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bldg['normalized'] = ((bldg['SalePrice'] - bldg['SalePrice'].min()) / (bldg['SalePrice'].max() - bldg['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BldgType'] = train['BldgType'].map(bldg.set_index('BldgType')['normalized']).astype(float)

### HouseStyle: Style of dwelling

In [None]:
# Target-encode HouseStyle: each category becomes its mean SalePrice, min-max scaled.
hstl = train.groupby('HouseStyle', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
hstl['normalized'] = ((hstl['SalePrice'] - hstl['SalePrice'].min()) / (hstl['SalePrice'].max() - hstl['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['HouseStyle'] = train['HouseStyle'].map(hstl.set_index('HouseStyle')['normalized']).astype(float)

### OverallQual: Rates the overall material and finish of the house

Let's just check if the impact of quality on the price is appropriate

In [None]:
train.groupby('OverallQual', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)

OK. The higher the quality, the higher the price.

### OverallCond: Rates the overall condition of the house

In [None]:
train.groupby('OverallCond', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)

This time the scale is negatively correlated with price. I think there are some houses in more prestigious area, but in poorer condition - they still cost more.

### YearBuilt: Original construction date

The oldest property in Ames that is present in this dataset was built in 1872. The city itself [was founded in 1864](https://en.wikipedia.org/wiki/Ames,_Iowa#History). Well, this lot can be of special interest for buyers. Let's look at it.

In [None]:
# Select the lot(s) built in 1872 and show every column without truncation.
oldest_lot = train.loc[train['YearBuilt'] == 1872]
with pd.option_context('display.max_columns', None):
    display(oldest_lot)

And the impact of year on sale price

In [None]:
# Mean SalePrice per construction year, ordered chronologically.
train_year_sp = (
    train.groupby('YearBuilt', as_index = False)['SalePrice']
         .mean()
         .sort_values(by = 'YearBuilt')
)

# Points are plotted at positions 0..n-1; the tick labels carry the actual years.
fig, ax = plt.subplots(figsize = (19, 5))
train_year_sp[['SalePrice']].plot.line(ax = ax,
                                       title = 'Impact of construction year on sale price',
                                       legend = True,
                                       fontsize = 12)
ax.set_xlabel('YearBuilt', fontsize = 12)
ax.set_ylabel('SalePrice', fontsize = 12)
ax.set_xticks(range(len(train_year_sp)))
ax.set_xticklabels(train_year_sp['YearBuilt'], rotation = 90)
plt.show()

### YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

This one looks strange. 12.2% of all renovations happened in 1950. Maybe this is the first year when such activities were recorded?

In [None]:
# Mean SalePrice per remodel year, ordered chronologically.
train_year_rem_sp = (
    train.groupby('YearRemodAdd', as_index = False)['SalePrice']
         .mean()
         .sort_values(by = 'YearRemodAdd')
)

# Points are plotted at positions 0..n-1; the tick labels carry the actual years.
fig, ax = plt.subplots(figsize = (19, 5))
train_year_rem_sp[['SalePrice']].plot.line(ax = ax,
                                           title = 'Impact of renovation year on sale price',
                                           legend = True,
                                           fontsize = 12)
ax.set_xlabel('YearRemodAdd', fontsize = 12)
ax.set_ylabel('SalePrice', fontsize = 12)
ax.set_xticks(range(len(train_year_rem_sp)))
ax.set_xticklabels(train_year_rem_sp['YearRemodAdd'], rotation = 90)
plt.show()

Both construction and renovation years have several outliers. I will work with them in the next version of my notebook. This time just ignore them.

### RoofStyle: Type of roof

In [None]:
# Target-encode RoofStyle: each category becomes its mean SalePrice, min-max scaled.
rfst = train.groupby('RoofStyle', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
rfst['normalized'] = ((rfst['SalePrice'] - rfst['SalePrice'].min()) / (rfst['SalePrice'].max() - rfst['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['RoofStyle'] = train['RoofStyle'].map(rfst.set_index('RoofStyle')['normalized']).astype(float)

### RoofMatl: Roof material

In [None]:
# Target-encode RoofMatl: each category becomes its mean SalePrice, min-max scaled.
rfmt = train.groupby('RoofMatl', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
rfmt['normalized'] = ((rfmt['SalePrice'] - rfmt['SalePrice'].min()) / (rfmt['SalePrice'].max() - rfmt['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['RoofMatl'] = train['RoofMatl'].map(rfmt.set_index('RoofMatl')['normalized']).astype(float)

### Exterior1st: Exterior covering on house

In [None]:
# Target-encode Exterior1st: each category becomes its mean SalePrice, min-max scaled.
ext1 = train.groupby('Exterior1st', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
ext1['normalized'] = ((ext1['SalePrice'] - ext1['SalePrice'].min()) / (ext1['SalePrice'].max() - ext1['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Exterior1st'] = train['Exterior1st'].map(ext1.set_index('Exterior1st')['normalized']).astype(float)

### Exterior2nd: Exterior covering on house (if more than one material)

In [None]:
# Target-encode Exterior2nd: each category becomes its mean SalePrice, min-max scaled.
ext2 = train.groupby('Exterior2nd', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
ext2['normalized'] = ((ext2['SalePrice'] - ext2['SalePrice'].min()) / (ext2['SalePrice'].max() - ext2['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Exterior2nd'] = train['Exterior2nd'].map(ext2.set_index('Exterior2nd')['normalized']).astype(float)

* Noticed, that **Exterior1st** and **Exterior2nd** have a difference - Cement Board is called **CemntBd** and **CmentBd** respectively.
* One other thing - has the order of 1st and 2nd any influence? What if we change their order, will the price change be significant?

### MasVnrType: Masonry veneer type

In [None]:
# Treat missing MasVnrType as its own 'None' category.
# Assigned back rather than filled in place through a column selection (not reliable under Copy-on-Write).
train['MasVnrType'] = train['MasVnrType'].fillna('None')
# Target-encode MasVnrType: each category becomes its mean SalePrice, min-max scaled.
mvtp = train.groupby('MasVnrType', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
mvtp['normalized'] = ((mvtp['SalePrice'] - mvtp['SalePrice'].min()) / (mvtp['SalePrice'].max() - mvtp['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['MasVnrType'] = train['MasVnrType'].map(mvtp.set_index('MasVnrType')['normalized']).astype(float)

Due to small amount of missing (8 rows) I will fill them with normalized value and hope it would work well in my future model.

### MasVnrArea: Masonry veneer area in square feet

Here's a tricky way to choose value for NaNs.

In [None]:
# Impute missing MasVnrArea with the mean area of houses whose SalePrice lies within
# +/-1000 of the mean SalePrice of the rows where MasVnrArea is missing.
missing_area = train['MasVnrArea'].isna()
ref_price = train.loc[missing_area, 'SalePrice'].mean()
value = train.loc[train['SalePrice'].between(ref_price - 1000, ref_price + 1000), 'MasVnrArea'].mean()
# If the price window contains no usable rows, fall back to the overall column mean
# so the missing values are not left as NaN.
if pd.isna(value):
    value = train['MasVnrArea'].mean()
# Assigned back rather than filled in place through a column selection (not reliable under Copy-on-Write).
train['MasVnrArea'] = train['MasVnrArea'].fillna(value)

### ExterQual: Evaluates the quality of the material on the exterior 

In [None]:
# Target-encode ExterQual: each category becomes its mean SalePrice, min-max scaled.
exql = train.groupby('ExterQual', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
exql['normalized'] = ((exql['SalePrice'] - exql['SalePrice'].min()) / (exql['SalePrice'].max() - exql['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['ExterQual'] = train['ExterQual'].map(exql.set_index('ExterQual')['normalized']).astype(float)

### ExterCond: Evaluates the present condition of the material on the exterior

In [None]:
# Target-encode ExterCond: each category becomes its mean SalePrice, min-max scaled.
excn = train.groupby('ExterCond', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
excn['normalized'] = ((excn['SalePrice'] - excn['SalePrice'].min()) / (excn['SalePrice'].max() - excn['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['ExterCond'] = train['ExterCond'].map(excn.set_index('ExterCond')['normalized']).astype(float)

### Foundation: Type of foundation

In [None]:
# Target-encode Foundation: each category becomes its mean SalePrice, min-max scaled.
fndt = train.groupby('Foundation', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
fndt['normalized'] = ((fndt['SalePrice'] - fndt['SalePrice'].min()) / (fndt['SalePrice'].max() - fndt['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Foundation'] = train['Foundation'].map(fndt.set_index('Foundation')['normalized']).astype(float)

### Basement column's group

In [None]:
# All basement-related columns, categorical and numeric.
bsmt_cols = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF',
]
# Basement columns for the rows where BsmtExposure is missing.
train.loc[train['BsmtExposure'].isna(), bsmt_cols]

So these 38 missing values are from **No Basement** group.

In [None]:
# Fill missing basement values: 'None' for categorical columns, 0 for numeric ones,
# so a numeric column is never turned into object dtype by a string filler.
for col in bsmt_cols:
    fill_value = 0 if pd.api.types.is_numeric_dtype(train[col]) else 'None'
    # Assigned back rather than filled in place through a column selection (not reliable under Copy-on-Write).
    train[col] = train[col].fillna(fill_value)

### BsmtQual: Evaluates the height of the basement

In [None]:
# Target-encode BsmtQual: each category becomes its mean SalePrice, min-max scaled.
bsqu = train.groupby('BsmtQual', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bsqu['normalized'] = ((bsqu['SalePrice'] - bsqu['SalePrice'].min()) / (bsqu['SalePrice'].max() - bsqu['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BsmtQual'] = train['BsmtQual'].map(bsqu.set_index('BsmtQual')['normalized']).astype(float)

### BsmtCond: Evaluates the general condition of the basement

In [None]:
# Target-encode BsmtCond: each category becomes its mean SalePrice, min-max scaled.
bscn = train.groupby('BsmtCond', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bscn['normalized'] = ((bscn['SalePrice'] - bscn['SalePrice'].min()) / (bscn['SalePrice'].max() - bscn['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BsmtCond'] = train['BsmtCond'].map(bscn.set_index('BsmtCond')['normalized']).astype(float)

### BsmtExposure: Refers to walkout or garden level walls

In [None]:
# Target-encode BsmtExposure: each category becomes its mean SalePrice, min-max scaled.
bsxp = train.groupby('BsmtExposure', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bsxp['normalized'] = ((bsxp['SalePrice'] - bsxp['SalePrice'].min()) / (bsxp['SalePrice'].max() - bsxp['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BsmtExposure'] = train['BsmtExposure'].map(bsxp.set_index('BsmtExposure')['normalized']).astype(float)

### BsmtFinType1: Rating of basement finished area

In [None]:
# Target-encode BsmtFinType1: each category becomes its mean SalePrice, min-max scaled.
bsf1 = train.groupby('BsmtFinType1', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bsf1['normalized'] = ((bsf1['SalePrice'] - bsf1['SalePrice'].min()) / (bsf1['SalePrice'].max() - bsf1['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BsmtFinType1'] = train['BsmtFinType1'].map(bsf1.set_index('BsmtFinType1')['normalized']).astype(float)

### BsmtFinType2: Rating of basement finished area (if multiple types)

In [None]:
# Target-encode BsmtFinType2: each category becomes its mean SalePrice, min-max scaled.
bsf2 = train.groupby('BsmtFinType2', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
bsf2['normalized'] = ((bsf2['SalePrice'] - bsf2['SalePrice'].min()) / (bsf2['SalePrice'].max() - bsf2['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['BsmtFinType2'] = train['BsmtFinType2'].map(bsf2.set_index('BsmtFinType2')['normalized']).astype(float)

### Other basement columns
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
These are numeric, nothing to replace

### Heating: Type of heating

In [None]:
# Target-encode Heating: each category becomes its mean SalePrice, min-max scaled.
heat = train.groupby('Heating', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
heat['normalized'] = ((heat['SalePrice'] - heat['SalePrice'].min()) / (heat['SalePrice'].max() - heat['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Heating'] = train['Heating'].map(heat.set_index('Heating')['normalized']).astype(float)

### HeatingQC: Heating quality and condition

In [None]:
# Target-encode HeatingQC: each category becomes its mean SalePrice, min-max scaled.
htqc = train.groupby('HeatingQC', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
htqc['normalized'] = ((htqc['SalePrice'] - htqc['SalePrice'].min()) / (htqc['SalePrice'].max() - htqc['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['HeatingQC'] = train['HeatingQC'].map(htqc.set_index('HeatingQC')['normalized']).astype(float)

### CentralAir: Central air conditioning
This has 2 values - Y and N. Change to 1 and 0 respectively.

In [None]:
# Encode CentralAir as 1 (Y) / 0 (N).
# Exact-value replacement is used: regex=True would substitute 'Y'/'N' inside any longer string too.
train['CentralAir'] = train['CentralAir'].replace({'Y': '1', 'N': '0'}).astype(int)

### Electrical: Electrical system
There's one missing value in this column

In [None]:
# Treat the missing Electrical value as its own 'None' category.
# Assigned back rather than filled in place through a column selection (not reliable under Copy-on-Write).
train['Electrical'] = train['Electrical'].fillna('None')
# Target-encode Electrical: each category becomes its mean SalePrice, min-max scaled.
elec = train.groupby('Electrical', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
elec['normalized'] = ((elec['SalePrice'] - elec['SalePrice'].min()) / (elec['SalePrice'].max() - elec['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['Electrical'] = train['Electrical'].map(elec.set_index('Electrical')['normalized']).astype(float)

### Next columns are numeric

* 1stFlrSF: First Floor square feet 
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)
* KitchenAbvGr: Kitchens above grade
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

**KitchenAbvGr** and **BedroomAbvGr** - these two are given as **Kitchen** and **Bedroom** in **description.txt**

In [None]:
train[['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].describe()

No missing values, no corrupted data. I will not touch them.

### KitchenQual: Kitchen quality

In [None]:
# Target-encode KitchenQual: each category becomes its mean SalePrice, min-max scaled.
kchq = train.groupby('KitchenQual', as_index = False)['SalePrice'].mean().sort_values(by = 'SalePrice', ascending = False)
# Round the normalized values themselves (the original .round(3) bound only to the max-min denominator).
kchq['normalized'] = ((kchq['SalePrice'] - kchq['SalePrice'].min()) / (kchq['SalePrice'].max() - kchq['SalePrice'].min())).round(3)
# Vectorized category -> value lookup instead of a filtered .loc/.item() call per row.
train['KitchenQual'] = train['KitchenQual'].map(kchq.set_index('KitchenQual')['normalized']).astype(float)

### Functional: Home functionality (Assume typical unless deductions are warranted)

In [None]:
# Target-encode Functional: every category is replaced by the min-max normalized
# mean SalePrice of its group. `func` is kept because the test set is encoded
# with it later.
func = (
    train.groupby('Functional', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
func['normalized'] = (
    (func['SalePrice'] - func['SalePrice'].min())
    / (func['SalePrice'].max() - func['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `func`, so a vectorized map
# replaces the per-row table scan.
train['Functional'] = train['Functional'].map(func.set_index('Functional')['normalized']).astype(float)

### Fireplaces: Number of fireplaces

In [None]:
# How many properties have each number of fireplaces.
train['Fireplaces'].value_counts()

### FireplaceQu: Fireplace quality
This one has 690 missing values, exactly the number of properties with 0 fireplaces, so a missing value simply means "no fireplace". Change them to None.

In [None]:
# Target-encode FireplaceQu: every category (including the 'None' label used for
# missing values) is replaced by the min-max normalized mean SalePrice of its
# group. `frpq` is kept because the test set is encoded with it later.
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained assignment that newer pandas versions do not propagate to the frame.
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
frpq = (
    train.groupby('FireplaceQu', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
frpq['normalized'] = (
    (frpq['SalePrice'] - frpq['SalePrice'].min())
    / (frpq['SalePrice'].max() - frpq['SalePrice'].min())
).round(3)
# Every train category has a row in `frpq`, so a vectorized map replaces the
# per-row table scan.
train['FireplaceQu'] = train['FireplaceQu'].map(frpq.set_index('FireplaceQu')['normalized']).astype(float)

### Here comes the Garage group

In [None]:
# All columns whose name starts with 'Garage'; reused by later cells for train and test.
garage_cols = [name for name in train.columns if name.startswith('Garage')]
# Total garage area over the rows that have no GarageType.
train.loc[train['GarageType'].isna(), 'GarageArea'].sum()

Ok. No garage - no problem, fill with None, change some to numeric.

In [None]:
# Missing Garage* values are treated as "no garage": numeric columns get 0 and
# categorical columns get the label 'None'.
garage_numeric_cols = ['GarageYrBlt', 'GarageCars', 'GarageArea']
for col in garage_cols:
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which newer pandas versions do not propagate to the frame.
    # Filling numeric columns with 0 directly keeps them numeric rather than
    # passing through a mixed str/number column.
    train[col] = train[col].fillna(0 if col in garage_numeric_cols else 'None')

### GarageType: Garage location

In [None]:
# Target-encode GarageType: every category is replaced by the min-max normalized
# mean SalePrice of its group. `grtp` is kept because the test set is encoded
# with it later.
grtp = (
    train.groupby('GarageType', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
grtp['normalized'] = (
    (grtp['SalePrice'] - grtp['SalePrice'].min())
    / (grtp['SalePrice'].max() - grtp['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `grtp`, so a vectorized map
# replaces the per-row table scan.
train['GarageType'] = train['GarageType'].map(grtp.set_index('GarageType')['normalized']).astype(float)

### GarageYrBlt: Year garage was built
Let's plot

In [None]:
# Mean SalePrice per garage construction year, in ascending year order.
train_year_grg_blt = (
    train.groupby('GarageYrBlt', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'GarageYrBlt')
)
ax = train_year_grg_blt[['SalePrice']].plot.line(
    title = 'Impact of garage construction year on sale price',
    figsize = (19,5),
    legend = True,
    fontsize = 12,
)
ax.set_xlabel('GarageYrBlt', fontsize=12)
ax.set_ylabel('SalePrice', fontsize=12)
# The line is drawn against the frame's index, so put one tick per row and
# label it with that row's year, rotated to stay readable.
ax.set_xticks(range(len(train_year_grg_blt)))
ax.set_xticklabels(train_year_grg_blt['GarageYrBlt'], rotation = 90)
plt.show()

The same peaks as on construction year's plot.

### GarageFinish

In [None]:
# Target-encode GarageFinish: every category is replaced by the min-max
# normalized mean SalePrice of its group. `grfn` is kept because the test set is
# encoded with it later.
grfn = (
    train.groupby('GarageFinish', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
grfn['normalized'] = (
    (grfn['SalePrice'] - grfn['SalePrice'].min())
    / (grfn['SalePrice'].max() - grfn['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `grfn`, so a vectorized map
# replaces the per-row table scan.
train['GarageFinish'] = train['GarageFinish'].map(grfn.set_index('GarageFinish')['normalized']).astype(float)

### GarageQual: Garage quality

In [None]:
# Target-encode GarageQual: every category is replaced by the min-max normalized
# mean SalePrice of its group. `grqu` is kept because the test set is encoded
# with it later.
grqu = (
    train.groupby('GarageQual', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
grqu['normalized'] = (
    (grqu['SalePrice'] - grqu['SalePrice'].min())
    / (grqu['SalePrice'].max() - grqu['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `grqu`, so a vectorized map
# replaces the per-row table scan.
train['GarageQual'] = train['GarageQual'].map(grqu.set_index('GarageQual')['normalized']).astype(float)

### GarageCond: Garage condition

In [None]:
# Target-encode GarageCond: every category is replaced by the min-max normalized
# mean SalePrice of its group. `grcn` is kept because the test set is encoded
# with it later.
grcn = (
    train.groupby('GarageCond', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
grcn['normalized'] = (
    (grcn['SalePrice'] - grcn['SalePrice'].min())
    / (grcn['SalePrice'].max() - grcn['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `grcn`, so a vectorized map
# replaces the per-row table scan.
train['GarageCond'] = train['GarageCond'].map(grcn.set_index('GarageCond')['normalized']).astype(float)

### PavedDrive: Paved driveway

In [None]:
# Target-encode PavedDrive: every category is replaced by the min-max normalized
# mean SalePrice of its group. `pvdr` is kept because the test set is encoded
# with it later.
pvdr = (
    train.groupby('PavedDrive', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
pvdr['normalized'] = (
    (pvdr['SalePrice'] - pvdr['SalePrice'].min())
    / (pvdr['SalePrice'].max() - pvdr['SalePrice'].min())
).round(3)
# Every non-missing train category has a row in `pvdr`, so a vectorized map
# replaces the per-row table scan.
train['PavedDrive'] = train['PavedDrive'].map(pvdr.set_index('PavedDrive')['normalized']).astype(float)

### Next columns are numeric

* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet

In [None]:
# Summary statistics for the deck, porch and pool area columns listed above.
train[['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']].describe()

### PoolQC: Pool quality

In [None]:
# Number of rows per pool-quality label (missing values are not counted by value_counts).
train['PoolQC'].value_counts()

In [None]:
# Number of rows per distinct pool area, to compare against the PoolQC counts above.
train['PoolArea'].value_counts()

These two are consistent with each other. Fill the missing values with None.

In [None]:
# Target-encode PoolQC: every category (including the 'None' label used for
# missing values) is replaced by the min-max normalized mean SalePrice of its
# group. `plqc` is kept because the test set is encoded with it later.
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained assignment that newer pandas versions do not propagate to the frame.
train['PoolQC'] = train['PoolQC'].fillna('None')
plqc = (
    train.groupby('PoolQC', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
plqc['normalized'] = (
    (plqc['SalePrice'] - plqc['SalePrice'].min())
    / (plqc['SalePrice'].max() - plqc['SalePrice'].min())
).round(3)
# Every train category has a row in `plqc`, so a vectorized map replaces the
# per-row table scan.
train['PoolQC'] = train['PoolQC'].map(plqc.set_index('PoolQC')['normalized']).astype(float)

### Fence: Fence quality

In [None]:
# Target-encode Fence: every category (including the 'None' label used for
# missing values) is replaced by the min-max normalized mean SalePrice of its
# group. `fenc` is kept because the test set is encoded with it later.
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained assignment that newer pandas versions do not propagate to the frame.
train['Fence'] = train['Fence'].fillna('None')
fenc = (
    train.groupby('Fence', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
fenc['normalized'] = (
    (fenc['SalePrice'] - fenc['SalePrice'].min())
    / (fenc['SalePrice'].max() - fenc['SalePrice'].min())
).round(3)
# Every train category has a row in `fenc`, so a vectorized map replaces the
# per-row table scan.
train['Fence'] = train['Fence'].map(fenc.set_index('Fence')['normalized']).astype(float)

### MiscFeature: Miscellaneous feature not covered in other categories

In [None]:
# Target-encode MiscFeature: every category (including the 'None' label used for
# missing values) is replaced by the min-max normalized mean SalePrice of its
# group. `misc` is kept because the test set is encoded with it later.
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained assignment that newer pandas versions do not propagate to the frame.
train['MiscFeature'] = train['MiscFeature'].fillna('None')
misc = (
    train.groupby('MiscFeature', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'SalePrice', ascending = False)
)
# Round the whole quotient to 3 decimals; without the outer parentheses
# `.round(3)` binds to the (max - min) denominator only.
misc['normalized'] = (
    (misc['SalePrice'] - misc['SalePrice'].min())
    / (misc['SalePrice'].max() - misc['SalePrice'].min())
).round(3)
# Every train category has a row in `misc`, so a vectorized map replaces the
# per-row table scan.
train['MiscFeature'] = train['MiscFeature'].map(misc.set_index('MiscFeature')['normalized']).astype(float)

### MiscVal: $Value of miscellaneous feature

In [None]:
# Mean SalePrice per distinct MiscVal, in ascending MiscVal order.
train_misc_val = (
    train.groupby('MiscVal', as_index = False)['SalePrice']
    .mean()
    .sort_values(by = 'MiscVal')
)
ax = train_misc_val[['SalePrice']].plot.line(
    title = 'Impact of value of miscellaneous feature on sale price',
    figsize = (19,5),
    legend = True,
    fontsize = 12,
)
ax.set_xlabel('MiscVal', fontsize=12)
ax.set_ylabel('SalePrice', fontsize=12)
# The line is drawn against the frame's index, so put one tick per row and
# label it with that row's MiscVal, rotated to stay readable.
ax.set_xticks(range(len(train_misc_val)))
ax.set_xticklabels(train_misc_val['MiscVal'], rotation = 90)
plt.show()

No correlation at all.

### And the last four

* MoSold: Month Sold (MM)
* YrSold: Year Sold (YYYY)
* SaleType: Type of sale
* SaleCondition: Condition of sale

I'm not sure that this data is of any importance, so drop it for now.

In [None]:
# Drop the sale-date and sale-type columns. `columns=` replaces the positional
# axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
train = train.drop(columns = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition'])

### Modify Sale Price

#### My previous public score with this model was 0.15280, I want to see if it changes somehow.

In [None]:
# Train on the price net of the miscellaneous-feature value. The subtraction is
# done directly, so no temporary column has to be created and dropped again.
# Running this cell twice subtracts MiscVal twice.
# NOTE(review): the submission cell writes the model output as-is; confirm
# whether test['MiscVal'] should be added back to the predictions.
train['SalePrice'] = train['SalePrice'] - train['MiscVal']

### Correlation of transformed data

In [None]:
# Correlation of every numeric column with the adjusted SalePrice, strongest first.
# Numeric/bool columns are selected explicitly: older pandas skipped other dtypes
# silently, while pandas 2 raises on them.
corr_matrix = (
    train.select_dtypes(include = ['number', 'bool'])
    .corr()[['SalePrice']]
    .sort_values(by = ['SalePrice'], ascending = False)  # Do not drop SalePrice this time!
)
# Show only the stronger positive correlations, with two decimals.
# `format('{:.2f}')` is used because Styler.set_precision was removed in pandas 2.0.
corr_matrix[corr_matrix['SalePrice'] > 0.4].style.background_gradient(cmap = 'coolwarm').format('{:.2f}')

### Now I want to see what happened to the outliers after the transformation: are they still in the same rows?

In [None]:
# Horizontal box plot of LotFrontage after the transformations, to re-check the outliers.
sns.boxplot(data = train['LotFrontage'], orient = 'h')

In [None]:
# LotFrontage in descending order; the row labels of the largest values identify the outliers.
train['LotFrontage'].sort_values(ascending = False)

In [None]:
# Horizontal box plot of LotArea after the transformations, to re-check the outliers.
sns.boxplot(data = train['LotArea'], orient = 'h')

In [None]:
# LotArea in descending order; the row labels of the largest values identify the outliers.
train['LotArea'].sort_values(ascending = False)

Yes, still the same.

### The same transformations with test

#### First let's drop the columns and fill all missing values as in train

In [None]:
# Drop the columns from test in one call. `columns=` replaces the positional
# axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
test = test.drop(columns = [
    'MSSubClass', 'Street', 'Utilities', 'Condition2',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
])

# Every fill assigns the result back: fillna(inplace=True) on a column selection
# is a chained assignment that newer pandas versions do not propagate to the frame.
# LotFrontage uses the mean of the train column.
test['LotFrontage'] = test['LotFrontage'].fillna(train['LotFrontage'].mean())
test['Alley'] = test['Alley'].fillna(0)
test['MasVnrType'] = test['MasVnrType'].fillna('None')
# NOTE(review): `value` is not defined in this cell. It must already exist when
# this cell runs, and a later cell rebinds `value` as a loop variable - confirm
# that a fresh Restart & Run All fills MasVnrArea with the intended number.
test['MasVnrArea'] = test['MasVnrArea'].fillna(value)
for col in bsmt_cols:
    test[col] = test[col].fillna('None')
for col in garage_cols:
    test[col] = test[col].fillna('None')
test['Electrical'] = test['Electrical'].fillna('None')
test['FireplaceQu'] = test['FireplaceQu'].fillna('None')
test['PoolQC'] = test['PoolQC'].fillna('None')
test['Fence'] = test['Fence'].fillna('None')
test['MiscFeature'] = test['MiscFeature'].fillna('None')

Are there any missing data left?

In [None]:
# Per-column count of missing values in test (largest first) next to the column dtype.
missing_test = pd.concat(
    [test.isna().sum().sort_values(ascending = False), test.dtypes],
    axis = 1,
    keys = ['Total', 'Type'],
)
# Show only the columns that still contain missing values.
missing_test.loc[missing_test['Total'] > 0]

Just a few values, they make little impact on results. Change them all to the most popular values in respective columns.

In [None]:
# Fill the few remaining gaps in test with a fixed value per column.
# The two bath counts are filled with the number 0.0 rather than the string
# '0.0', so those columns stay numeric.
remaining_fill = {
    'MSZoning': 'RL',
    'BsmtFullBath': 0.0,
    'Functional': 'Typ',
    'BsmtHalfBath': 0.0,
    'KitchenQual': 'TA',
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
}
# A dict passed to DataFrame.fillna fills only the listed columns; the result is
# assigned back instead of using fillna(inplace=True) on column selections.
test = test.fillna(value = remaining_fill)

In [None]:
# Encode test with the category -> score tables that were fitted on train.
# Each pair is (column, lookup table); every table holds the category column
# itself and a 'normalized' score column.
train_encoders = [
    ('MSZoning', msz),
    ('LotShape', lshp),
    ('LandContour', lctr),
    ('LotConfig', lcfg),
    ('LandSlope', lslp),
    ('Neighborhood', ngbh),
    ('Condition1', cond1),
    ('BldgType', bldg),
    ('HouseStyle', hstl),
    ('RoofStyle', rfst),
    ('RoofMatl', rfmt),
    ('Exterior1st', ext1),
    ('Exterior2nd', ext2),
    ('MasVnrType', mvtp),
    ('ExterQual', exql),
    ('ExterCond', excn),
    ('Foundation', fndt),
    ('BsmtQual', bsqu),
    ('BsmtCond', bscn),
    ('BsmtExposure', bsxp),
    ('BsmtFinType1', bsf1),
    ('BsmtFinType2', bsf2),
    ('Heating', heat),
    ('HeatingQC', htqc),
    ('Electrical', elec),
    ('KitchenQual', kchq),
    ('Functional', func),
    ('FireplaceQu', frpq),
    ('GarageType', grtp),
    ('GarageFinish', grfn),
    ('GarageQual', grqu),
    ('GarageCond', grcn),
    ('PavedDrive', pvdr),
    ('PoolQC', plqc),
    ('Fence', fenc),
    ('MiscFeature', misc),
]
for col, table in train_encoders:
    # A test value without a row in the train table cannot be encoded; stop with
    # a message that names the column and the offending values.
    known = test[col].isin(table[col])
    if not known.all():
        raise ValueError('{}: no train encoding for test values {}'.format(
            col, test.loc[~known, col].unique().tolist()))
    test[col] = test[col].map(table.set_index(col)['normalized']).astype(float)

# Alley becomes a has-alley flag: missing values were filled with 0 earlier and
# both surface types map to 1.
test['Alley'] = (
    test['Alley']
    .replace('Grvl', '1', regex = True)
    .replace('Pave', '1', regex = True)
    .astype(int)
)
# CentralAir: Y -> 1, N -> 0.
test['CentralAir'] = (
    test['CentralAir']
    .replace('Y', '1', regex = True)
    .replace('N', '0', regex = True)
    .astype(int)
)
# Numeric garage columns: the 'None' placeholder for "no garage" becomes 0.
for garage_num_col in ['GarageYrBlt', 'GarageCars', 'GarageArea']:
    test[garage_num_col] = test[garage_num_col].replace('None', 0, regex = True)

# Any 'None' placeholder still left anywhere in test becomes 0.
test = test.replace('None', 0, regex = True)

### Here the outliers will be removed

934, 1298, 313, 335, 249, 706, 297, 1169, 322, 523, 1182, 691, 738, 635, 1328, 495, 583, 197

In [None]:
# Row labels of the outliers collected during the EDA; removed only now, after
# all lookup tables have been built from the full train set.
outlier_rows = [
    934, 1298, 313, 335, 249, 706, 297, 1169, 322,
    523, 1182, 691, 738, 635, 1328, 495, 583, 197,
]
train = train.drop(index = outlier_rows)

# Models. CatBoost + Optuna.

https://github.com/optuna/optuna-examples/blob/main/catboost/catboost_simple.py

In [None]:
import optuna
import catboost as cb

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
# Target and feature matrix, then a fixed 75/25 train/validation split.
Y = train['SalePrice']
X = train.drop(columns = 'SalePrice')
(X_train, X_validation,
 y_train, y_validation) = train_test_split(X, Y, train_size = 0.75, random_state = 42)
# The prepared test frame is used for prediction as-is.
X_test = test

In [None]:
def objective(trial):
    """Optuna objective: train CatBoost with sampled hyperparameters and score it.

    The global ``train`` frame is split 75/25 with a fixed random state, a
    CatBoostRegressor is fitted on the larger part with early stopping against
    the smaller part, and the mean absolute error on that validation part is
    returned for the study to minimize.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample colsample_bylevel, learning_rate, l2_leaf_reg and
        depth ('objective' is a single-choice categorical, always 'RMSE').

    Returns
    -------
    float
        Mean absolute error of the predictions on the validation split.
    """
    features = train.drop(columns = ['SalePrice'])
    target = train['SalePrice']
    X_train, X_validation, y_train, y_validation = train_test_split(features, target, train_size = 0.75, random_state = 42)

    param = {
        'objective': trial.suggest_categorical('objective', ['RMSE']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 3),
        'depth': trial.suggest_int('depth', 3, 5),
        #'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        #'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        # Same seed as the final model, so each trial is scored under the
        # configuration that is refit afterwards. It is not a suggested
        # parameter and therefore does not appear in trial.params.
        'random_seed': 42,
        'used_ram_limit': '15gb',
    }

    #if param['bootstrap_type'] == 'Bayesian':
    #    param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    #elif param['bootstrap_type'] == 'Bernoulli':
    #    param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    gbm = cb.CatBoostRegressor(**param)

    gbm.fit(X_train, y_train, eval_set = [(X_validation, y_validation)], verbose = 0, early_stopping_rounds = 100)

    # Score the raw predictions. Rounding them to integers first only makes
    # sense for class labels and would add noise to a regression error.
    preds = gbm.predict(X_validation)
    validation_mae = mean_absolute_error(y_validation, preds)
    return validation_mae


if __name__ == '__main__':
    # Minimize the objective; the search is limited by wall-clock time (8 hours)
    # rather than by a number of trials.
    study = optuna.create_study(direction = 'minimize')
    study.optimize(
        objective,
        #n_trials = 250,
        timeout = 8 * 3600,
    )

    print(f'Number of finished trials: {len(study.trials)}')

    # Report the best trial: its objective value and the sampled parameters.
    print('Best trial:')
    trial = study.best_trial

    print(f'  Value: {trial.value}')

    print('  Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

In [None]:
# Optimization history of the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Slice plot of the study, one panel per sampled hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
#optuna.visualization.plot_param_importances(study)

In [None]:
# Empirical distribution function of the objective values reached by the trials.
optuna.visualization.plot_edf(study)

In [None]:
# Final regressor: the hyperparameters of the best trial plus a fixed seed and
# silent logging.
best_params = study.best_trial.params
model = cb.CatBoostRegressor(random_seed = 42, logging_level = 'Silent', **best_params)

In [None]:
# Disabled experiment: the whole K-fold loop below is wrapped in a string literal,
# so none of it runs. It would average test predictions over 10 folds.
# The per-fold line is labelled 'RMSE' but prints mean_absolute_error.
# NOTE(review): StratifiedKFold is meant for class labels; for a price target a
# plain KFold is presumably what was intended - confirm before re-enabling.
'''
test_preds = None
cols = list(train.columns)
cols.remove('SalePrice')

n_splits = 10
kf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 42)
for fold, (tr_index , val_index) in enumerate(kf.split(X.values, Y.values)):

    print("-" * 50)
    print(f"Fold {fold + 1}")
    
    x_train, x_val = X.values[tr_index], X.values[val_index]
    y_train, y_val = Y.values[tr_index], Y.values[val_index]
        
    eval_set = [(x_val, y_val)]
    
    model = cb.CatBoostRegressor(random_seed = 42,
                             logging_level = 'Silent',
                             **study.best_trial.params)
    model.fit(x_train, y_train,
              eval_set = eval_set, verbose = 0)
    
    train_preds = model.predict(x_train)    
    val_preds = model.predict(x_val)
    
    print('RMSE => {}'.format(mean_absolute_error(y_val, val_preds)))
    
    if test_preds is None:
        test_preds = model.predict(test[cols].values)
    else:
        test_preds += model.predict(test[cols].values)

print("-" * 50)
test_preds /= n_splits
'''

#### KFold made my score worse.

In [None]:
# Fit the final model on the training split, with the validation split as evaluation set.
validation_set = (X_validation, y_validation)
model.fit(X_train, y_train, eval_set = validation_set)

# Submission

In [None]:
# Predict on the prepared test frame with the fitted final model.
prediction = model.predict(X_test)

In [None]:
#submission = pd.DataFrame(test_preds)
# Submission frame: the predictions named 'SalePrice', followed by the test Id
# (joined on the row index).
submission = pd.DataFrame(prediction, columns = ['SalePrice'])
submission['Id'] = test['Id']

In [None]:
# Write the submission file without the row index.
submission.to_csv('submission.csv', index = False)

# Save trials

In [None]:
# Export every trial of the study as a table and save it without the row index.
trials = study.trials_dataframe()
trials.to_csv('trials.csv', index = False)