***Hi, everybody.***

**This is my first notebook submission.**

**I would like to know your opinion on this notebook. How can I improve it, and are there any bugs I should fix?**

**P.S. I am not a native English speaker. If you see an error or something is not clear, write in the comments, it will help me a lot.**

**Thank you in advance.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import uniform
%matplotlib inline

In [None]:
from xgboost import XGBRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from mlxtend.preprocessing import minmax_scaling

In [None]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this presumably also hides deprecation warnings raised by later cells — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
def show_uniq(col):
    """Print the distinct values of column `col` in the global `train` and `test` frames.

    Parameters
    ----------
    col : column label present in both `train` and `test`.
    """
    for template, frame in (('Train: {}', train), ('Test: {}', test)):
        print(template.format(frame[col].unique()))

In [None]:
def show_uniqs(cols):
    """Run `show_uniq` for every column label in `cols`.

    Each column is reported as: its name, the output of `show_uniq`,
    then a separator line.
    """
    separator = '======================================='
    for name in cols:
        print(name)
        show_uniq(name)
        print(separator)

# 1. Downloading and exploring data

## 1.1 Train

In [None]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
cat_original_columns = train.select_dtypes(exclude=np.number).columns # categorical columns
num_original_columns = train.select_dtypes(include=np.number).columns.drop('Id') # numerical columns

In [None]:
train.head()

In [None]:
train.shape

In [None]:
# labels of the train columns that contain at least one missing value
has_gap_train = train.isnull().any().to_numpy()
empty_columns_train = train.columns[has_gap_train]

In [None]:
# number of gaps in all columns
count_empty_columns_train = train[empty_columns_train].isnull().sum(axis = 0)
count_empty_columns_train

In [None]:
plt.figure(figsize=(15,7))
ploting = sns.barplot(x = count_empty_columns_train.index,y = count_empty_columns_train.values)
plt.xticks(rotation=30)

Many features have missing values. Let's look at those where more than 40% of the values are missing.

In [None]:
(train.shape[0]/100)*40

In [None]:
# Print every column whose number of missing values exceeds 40% of the
# training rows. The threshold is derived from the data instead of being
# hard-coded, so it stays correct if the number of rows changes.
missing_threshold = (train.shape[0] / 100) * 40
for name_col, c in count_empty_columns_train.items():
    if c > missing_threshold:
        print(name_col)

+ Alley - Missing values mean the absence of an alley
+ FireplaceQu - Missing values mean the absence of a fireplace
+ PoolQC - Missing values mean the absence of a pool
+ Fence - Missing values mean the absence of a fence
+ MiscFeature - Missing values mean the absence of any other miscellaneous feature.

## 1.2 Test

In [None]:
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
test.shape

In [None]:
# labels of the test columns that contain at least one missing value
has_gap_test = test.isnull().any().to_numpy()
empty_columns_test = test.columns[has_gap_test]

In [None]:
# number of gaps in all columns
count_empty_columns_test = test[empty_columns_test].isnull().sum(axis = 0)
count_empty_columns_test

In [None]:
plt.figure(figsize=(15,7))
ploting = sns.barplot(x = count_empty_columns_test.index,y = count_empty_columns_test.values)
plt.xticks(rotation=30)

Let's look at the columns that have missing values in the test set but not in the train set.

In [None]:
# columns with gaps in test that have no gaps in train
# (membership is tested against the column labels, i.e. the index of the counts)
train_gap_columns = set(count_empty_columns_train.index)
for col in count_empty_columns_test.index:
    if col not in train_gap_columns:
        print(col)

This will be important when transforming data

# 2. Processing of missing values in groups

## 2.1 Garage

In [None]:
# collect every train column whose name contains "Garage"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"Garage"+r"([-a-zA-Z]+)?")
garage_colums = np.array([name for name in train.columns if regexp.search(name)])
garage_colums

In [None]:
train[garage_colums].info()

In [None]:
test[garage_colums].info()

In [None]:
garage_num_cols = train[garage_colums].select_dtypes(include=np.number).columns # numerical garage columns
garage_cat_cols = train[garage_colums].select_dtypes(exclude=np.number).columns # categorical garage columns

Working with numerical missing data. I fill the gaps with 0, even in the date (year) column, because I am going to use a tree-based model (random forest).

In [None]:
# missing numeric garage values become 0 in both frames
for frame in (train, test):
    for col in garage_num_cols:
        frame[col] = frame[col].fillna(0)

Working with categorical missing data

In [None]:
# Shows if there is another notation for nan in the data
show_uniqs(garage_cat_cols)

In [None]:
for col in garage_cat_cols:
    # missing categorical garage values become the explicit 'NA' category
    for frame in (train, test):
        frame[col] = frame[col].fillna('NA')
    # report the column when test contains a class that train does not
    unseen_in_train = set(test[col].unique()) - set(train[col].unique())
    if unseen_in_train:
        print(col)

## 2.2 Basement

In [None]:
# collect every train column whose name contains "Bsmt"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"Bsmt"+r"([-a-zA-Z]+)?")
basement_colums = np.array([name for name in train.columns if regexp.search(name)])
basement_colums

In [None]:
train[basement_colums].info()

In [None]:
test[basement_colums].info()

In [None]:
basement_num_cols = train[basement_colums].select_dtypes(include=np.number).columns
basement_cat_cols = train[basement_colums].select_dtypes(exclude=np.number).columns

Working with numerical missing data

In [None]:
# missing numeric basement values become 0 in both frames
for frame in (train, test):
    for col in basement_num_cols:
        frame[col] = frame[col].fillna(0)

Working with categorical missing data

In [None]:
show_uniqs(basement_cat_cols)

In [None]:
for col in basement_cat_cols:
    # missing categorical basement values become the explicit 'NA' category
    for frame in (train, test):
        frame[col] = frame[col].fillna('NA')
    # report the column when test contains a class that train does not
    unseen_in_train = set(test[col].unique()) - set(train[col].unique())
    if unseen_in_train:
        print(col)

## 2.3 Masonry veneer 

In [None]:
# collect every train column whose name contains "MasVnr"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"MasVnr"+r"([-a-zA-Z]+)?")
masvnr_colums = np.array([name for name in train.columns if regexp.search(name)])
masvnr_colums

In [None]:
train[masvnr_colums].info()

In [None]:
test[masvnr_colums].info()

In [None]:
masvnr_num_cols = train[masvnr_colums].select_dtypes(include=np.number).columns
masvnr_cat_cols = train[masvnr_colums].select_dtypes(exclude=np.number).columns

In [None]:
# True when MasVnrArea and MasVnrType are missing on exactly the same rows.
# The two boolean masks share the train index, so the comparison is always
# well-defined; comparing the two lists of null row labels instead raises
# when the columns have a different number of gaps.
bool((train['MasVnrArea'].isnull() == train['MasVnrType'].isnull()).all())

In [None]:
train.loc[train['MasVnrArea'] == 0][masvnr_colums].head()

Working with numerical missing data

In [None]:
# a missing veneer area is treated as 0 in both frames
for frame in (train, test):
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

Working with categorical missing data

In [None]:
show_uniqs(masvnr_cat_cols)

In [None]:
# a missing veneer type becomes the 'None' category in both frames
for frame in (train, test):
    frame['MasVnrType'] = frame['MasVnrType'].fillna('None')

In [None]:
show_uniqs(masvnr_cat_cols)

## 2.4 Pool

In [None]:
# collect every train column whose name contains "Pool"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"Pool"+r"([-a-zA-Z]+)?")
pool_colums = np.array([name for name in train.columns if regexp.search(name)])
pool_colums

In [None]:
show_uniqs(pool_colums)

In [None]:
train.loc[train['PoolQC'].isnull()][pool_colums].head()

In [None]:
# a missing pool quality becomes the explicit 'NA' category in both frames
for frame in (train, test):
    frame['PoolQC'] = frame['PoolQC'].fillna('NA')

## 2.5 Alley

In [None]:
show_uniq('Alley')

In [None]:
# a missing alley value becomes the explicit 'NA' category in both frames
for frame in (train, test):
    frame['Alley'] = frame['Alley'].fillna('NA')

## 2.6 Fireplace

In [None]:
# collect every train column whose name contains "Fireplace"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"Fireplace"+r"([-a-zA-Z]+)?")
fireplace_colums = np.array([name for name in train.columns if regexp.search(name)])
fireplace_colums

In [None]:
show_uniqs(fireplace_colums)

In [None]:
# a missing fireplace quality becomes the explicit 'NA' category in both frames
for frame in (train, test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('NA')

## 2.7 Fence

In [None]:
show_uniq('Fence')

In [None]:
# a missing fence value becomes the explicit 'NA' category in both frames
for frame in (train, test):
    frame['Fence'] = frame['Fence'].fillna('NA')

## 2.8 MiscFeature

In [None]:
# collect every train column whose name contains "Misc"
regexp = re.compile(r"([-a-zA-Z]+)?"+r"Misc"+r"([-a-zA-Z]+)?")
misc_colums = np.array([name for name in train.columns if regexp.search(name)])
misc_colums

In [None]:
show_uniq('MiscFeature')

In [None]:
# a missing miscellaneous feature becomes the explicit 'NA' category in both frames
for frame in (train, test):
    frame['MiscFeature'] = frame['MiscFeature'].fillna('NA')

## 2.9 Electrical

In [None]:
show_uniq('Electrical')

There is a missing value in Electrical, but only one, so we will delete this row.

In [None]:
# remove the training rows whose Electrical value is missing
electrical_missing = train['Electrical'].isnull()
train = train.drop(index=train.index[electrical_missing])

## 2.10 MSZoning
Should correlate with the location

In [None]:
# For every neighbourhood of the test set that has missing MSZoning values,
# fill them with the most frequent zoning observed in that same neighbourhood.
for _, zoning in test.groupby('Neighborhood')['MSZoning']:
    if zoning.isnull().any():
        test.loc[zoning.index, 'MSZoning'] = zoning.fillna(zoning.mode()[0])

## 2.11 Other except LotFrontage

In [None]:
# Fill the remaining categorical gaps of the test set.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment and does not update `test` when
# pandas Copy-on-Write is active.
test['Exterior2nd'] = test['Exterior2nd'].fillna('None')
# the other columns receive their most frequent test value
for col in ['Exterior1st', 'SaleType', 'KitchenQual', 'Functional', 'Utilities']:
    test[col] = test[col].fillna(test[col].mode()[0])

## 2.12 LotFrontage

In [None]:
# Replace missing LotFrontage with the median of the same data set.
# Assigned back explicitly: fillna(inplace=True) on a column selection does
# not update the frame when pandas Copy-on-Write is active.
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].median())
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].median())

## 2.13 Drop SalePrice and Id from data

In [None]:
# keep the target aside, then remove target and identifier from the features
y_train = train['SalePrice']
train = train.drop(columns=['SalePrice', 'Id'])

In [None]:
# keep the identifiers for the submission file, then remove them from the features
test_id = test['Id']
test = test.drop(columns=['Id'])

# 3. Encoding

Some of the properties can be processed manually, for example:

**ExterQual**

+ Ex = 5
+ Gd = 4

Ex > Gd and 5 > 4. The meaning will remain.

But we can't do that to others, for example:

**MasVnrType**:

+ BrkFace = 5
+ Stone = 4

Therefore, it is better to do it by hand, rather than using one hot encoding from sklearn

Let's create a new dataset that contains both the training and the test data. It's easier to convert the data this way.

In [None]:
# Stack train on top of test (row labels are kept as they are).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([train, test])

Encoding

In [None]:
# Ordinal scales shared by several columns; a larger number means "better".
quality_scale = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
quality_scale_na = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
bsmt_fin_scale = {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}

# column -> mapping from category label to ordinal code
ordinal_maps = {
    'GarageCond': quality_scale_na,
    'GarageQual': quality_scale_na,
    'BsmtCond': quality_scale_na,
    'BsmtExposure': {'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4},
    'BsmtFinType1': bsmt_fin_scale,
    'BsmtFinType2': bsmt_fin_scale,
    'BsmtQual': quality_scale_na,
    'PoolQC': {'NA':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'Alley': {'NA':0, 'Grvl':1, 'Pave':2},
    'FireplaceQu': quality_scale_na,
    'ExterCond': quality_scale,
    'ExterQual': quality_scale,
    'KitchenQual': quality_scale,
    'LandSlope': {'Sev':1, 'Mod':2, 'Gtl':3},
    'PavedDrive': {'N':1, 'P':2, 'Y':3},
    'Functional': {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8},
    'HeatingQC': quality_scale,
    'Street': {'Grvl':1, 'Pave':2},
    'Utilities': {'ELO':1, 'NoSeWa':2, 'NoSewr':3, 'AllPub':4},
}
# labels that are absent from a mapping become NaN
for ordinal_col, scale in ordinal_maps.items():
    df[ordinal_col] = df[ordinal_col].map(scale)

# MoSold is removed from the features.
# NOTE(review): the original remark says an "Age" feature already carries the
# month information, but no such feature is built in the visible cells — confirm.
df = df.drop('MoSold', axis=1)

# MSSubClass is categorical according to the data description, so its numeric
# codes are turned into the string labels 'class1' ... 'class16'.
subclass_codes = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
df['MSSubClass'] = df['MSSubClass'].map(
    {code: 'class{}'.format(rank) for rank, code in enumerate(subclass_codes, start=1)})

# 4. Anomaly

In [None]:
cat_columns = df.select_dtypes(exclude=np.number).columns 
num_columns = df.select_dtypes(include=np.number).columns 

In [None]:
# visualize the distribution of each numerical feature (one facet per feature)
temp = pd.melt(df, value_vars=num_columns)
# `height` is the current name of the facet-size argument; the former `size`
# keyword is no longer accepted by recent seaborn releases.
grid = sns.FacetGrid(temp, col="variable", col_wrap=5, height=3.0,
                     aspect=1.0, sharex=False, sharey=False)
# NOTE(review): sns.distplot is deprecated in recent seaborn; kept so the plot stays the same.
grid.map(sns.distplot, "value")
plt.show()

In [None]:
# box-plot of each numerical feature (one facet per feature)
temp = pd.melt(df, value_vars=num_columns)
# `height` is the current name of the facet-size argument; the former `size`
# keyword is no longer accepted by recent seaborn releases.
grid = sns.FacetGrid(temp, col="variable", col_wrap=5, height=3.0,
                     aspect=1.0, sharex=False, sharey=False)
grid.map(sns.boxplot, "value")
plt.show()

In [None]:
# renumber the rows 0..n-1 and discard the old (duplicated) row labels
df = df.reset_index(drop=True)

In [None]:
drop_id = df[df['LotArea'] > 100000].index

In [None]:
# Rows may only be removed from the training part. After the index reset the
# training rows occupy positions 0 .. len(train)-1, so the bound is taken
# from the data instead of a hard-coded row count.
drop_id = drop_id[drop_id < len(train)]

In [None]:
# Replace selected values with the column mean.
# .loc writes straight into df; the chained form df[col][rows] = value may
# assign to a temporary copy and leave df unchanged.
masvnr_mean = df['MasVnrArea'].mean()
df.loc[df['MasVnrArea'] > 1500, 'MasVnrArea'] = masvnr_mean

utilities_mean = df['Utilities'].mean()
# the mean is fractional, so the column must be able to hold floats
df['Utilities'] = df['Utilities'].astype(float)
df.loc[df['Utilities'] == 2, 'Utilities'] = utilities_mean

In [None]:
df = df.drop(drop_id)

In [None]:
# Renumber the rows after the drop; the previous row labels are kept in a new 'index' column.
df = df.reset_index()

In [None]:
# drop_id holds row positions of the re-indexed df, while y_train still
# carries the original train labels (with a gap left by the removed
# Electrical row). Renumber y_train first so labels and positions agree.
y_train = y_train.reset_index(drop=True).drop(drop_id)

# 5. Encoding categorical


Create dummy variables and delete the last column generated from each categorical feature

In [None]:
# For every categorical feature, name the dummy column that belongs to the
# last value returned by unique(); that column is dropped after encoding.
dummy_drop = ['{}_{}'.format(name, df[name].unique()[-1]) for name in cat_columns]

# one-hot encode the categorical features, then drop one dummy per feature
df = pd.get_dummies(df, columns=cat_columns).drop(dummy_drop, axis=1)

In [None]:
# The test rows are the last len(test) rows of df (train was stacked first and
# only training rows were removed), so the split point is derived from the
# data rather than from a hard-coded row count.
n_train = len(df) - len(test)
features = df.drop(['index'], axis=1)
# explicit copies, so the scaled values are written into independent frames
X_train = features.iloc[:n_train].copy()
X_test = features.iloc[n_train:].copy()

# standardize the numerical columns; the scaler is fitted on train only
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

X_train.shape, X_test.shape

Submissions are evaluated on the Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sale price. Therefore, we convert the target to its logarithm.

In [None]:
y_train_log = np.log(y_train)

# 6. Choose the most important features

In [None]:
# fit a default XGBoost regressor on the raw (non-log) target to rank the features
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

# one row per feature, most important first
imp_feature = pd.DataFrame(
    {'Importance': xgb.feature_importances_}, index=X_train.columns
).sort_values(by='Importance', ascending=False)

print(imp_feature)

In [None]:
# ans[i] = RMSE on the training set of a kernel ridge model fitted on the
# i most important features.
# NOTE(review): the score is computed on the same rows the model was fitted
# on, so it presumably favours larger feature sets — consider cross-validation.
ans = {}
# these parameters are taken from previous experiments
ridge_params = dict(alpha = 0.5263157894736842, coef0 = 3.5, degree = 2, kernel ='polynomial')
# try every prefix of the ranking; the bound follows the real feature count
for i in range(1, len(imp_feature) + 1):
    imp_col = imp_feature.iloc[:i].index
    ridge = KernelRidge(**ridge_params).fit(X_train[imp_col], y_train_log)
    ans[i] = np.sqrt(mean_squared_error(y_train_log, ridge.predict(X_train[imp_col])))

In [None]:
# Key of `ans` (number of top features) with the lowest RMSE; on ties the
# smallest key wins. Every entry is examined, including the last one.
ind_min = min(ans, key=ans.get)
minimum = ans[ind_min]

In [None]:
# ans[k] was measured with the first k features of the ranking, so the best
# feature set is exactly the first ind_min rows.
imp_col = imp_feature.iloc[:ind_min].index

# 7. Model

## 7.1 Kernel Ridge

In [None]:
def _negative_rmse(y_true, y_pred):
    """Return minus the root-mean-squared error between `y_true` and `y_pred`.

    The sign is flipped so that a larger value means a better fit.
    """
    return -1.0*np.sqrt(mean_squared_error(y_true,y_pred))

# Scorer object for model selection. The metric keeps its own name instead of
# being overwritten by the scorer: the function stays reachable, and it can
# still be looked up by name (which pickling by reference relies on).
neg_rmse = make_scorer(_negative_rmse)

The parameters of the model in the next code cell were found with the grid search shown below. For some reason Kaggle runs that grid search with a lot of errors, so it is shown here as text. You can try to run this code at home or go to [github](https://github.com/GubanovDenis/kaggle-house-prices/blob/main/house_prices_kernel_ridge.ipynb).


ridge = KernelRidge()

parameters = {'alpha': np.linspace(0, 1,20), 'kernel': ['polynomial','sigmoid','chi2','laplacian'], 
              'degree': [2], 'coef0':np.linspace(0, 3.5,21)}

grid_cv = GridSearchCV(estimator = ridge,
                                   param_grid = parameters,
                                   cv = 3,
                                   scoring = neg_rmse,
                                   n_jobs = -1)

grid_cv = grid_cv.fit(X_train[imp_col], y_train_log)

print("Parameters of the best_estimator:")

print(grid_cv.best_params_)

print("Mean cross-validated RMSE of the best_estimator: {}".format(-grid_cv.best_score_))

model = grid_cv.best_estimator_

print("RMSE of the whole training set: {}".format(np.sqrt(mean_squared_error(y_train_log,model.predict(X_train[imp_col])))))

In [None]:
# final kernel ridge model with fixed hyperparameters, fitted on the selected features
model = KernelRidge(kernel = 'polynomial', degree = 2, coef0 = 3.5, alpha = 0.6842105263157894)
model.fit(X_train[imp_col], y_train_log)

# error on the rows the model was fitted on (log scale)
train_pred = model.predict(X_train[imp_col])
train_rmse = np.sqrt(mean_squared_error(y_train_log, train_pred))
print("RMSE of the whole training set: {}".format(train_rmse))

In [None]:
# inverse conversion to logarithm
y_pred = np.exp(model.predict(X_test[imp_col]))

In [None]:
def save_ans(ans, pasanger_id, name_alg):
    """Write predictions to `./<name_alg>.csv` with the columns Id and SalePrice.

    Parameters
    ----------
    ans : predicted values, stored in the SalePrice column.
    pasanger_id : identifiers, stored in the Id column.
    name_alg : file name without extension.

    Prints the shape of the table and the path of the saved file.
    """
    filename = r'./{}.csv'.format(name_alg)
    result = pd.DataFrame({'Id':pasanger_id,'SalePrice':ans})
    print(result.shape)
    result.to_csv(filename,index=False)
    print('Saved file: ' + filename)

In [None]:
save_ans(y_pred, test_id,'submission')