In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory and print the full path of every file found in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# House Prices Data Science Project

## Intro

This project addresses the factors that can influence house price negotiations. We need to use this data to predict the price of a house as accurately as possible.
In this notebook we will conduct an EDA, prepare the dataset features for modelling, and test several models that predict house prices.

In [None]:
# import python libraries

# data analysis
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn utilities
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# prediction
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Exploratory Data Analysis

## Acquire data

In [None]:
# Load the training and test sets from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Both frames in one list, so a loop can apply the same operation to each of them.
train_test_data = [train_data, test_data]
print('Training data shape: ', train_data.shape)
print('Test data shape: ', test_data.shape)

## Describing data

Now we need to research our data, look at data features and their types.

In [None]:
train_data.head()

In [None]:
train_data.info()

In [None]:
test_data.head()

In [None]:
test_data.info()

Let us describe numerical and object data separately.

In [None]:
train_data.describe()

In [None]:
train_data.describe(include=['O'])

Summary:
* Several features have a lot of empty values. We will probably drop them later.
* We have too many features (80) to use all of them in our predictions, so we have to reduce the number of features.
* Several features (e.g. LandContour) are filled almost entirely with the same value. They could be transformed into binary features.

## Analyze features

First, we drop the Id feature to reduce the number of features to analyze.

In [None]:
# Remember the test Ids for the submission file before the column is removed.
# The guard keeps this cell re-runnable once Id has already been dropped.
if 'Id' in test_data.columns:
    id_test = test_data['Id'].tolist()

for data in train_test_data:
    # errors='ignore' turns a second run of this cell into a no-op instead of a KeyError
    data.drop(columns=['Id'], errors='ignore', inplace=True)
print(train_data.shape, test_data.shape)

### Numerical data

#### feature extracting

In [None]:
# Everything that is not stored as an object (string) column counts as numerical.
train_data_num = train_data.select_dtypes(exclude='object')
test_data_num = test_data.select_dtypes(exclude='object')
train_data_num.head()

Let's take a look at features values distribution.

In [None]:
train_data_num.hist(figsize=(25, 30), bins=30);

We can see that the dataset contains a few low-variance features. We can drop features whose variance is below 0.05 (for a binary feature this corresponds to roughly 95% identical values) because they will have only a minor impact on our predictions.

In [None]:
# Variance is measured on the predictors only. The target is excluded by name
# (not by position) so the result does not depend on where SalePrice sits.
num_predictors = train_data_num.drop(columns=['SalePrice'])

selector = VarianceThreshold(threshold=0.05)
selector.fit(num_predictors)

# Boolean mask: True for predictors whose variance is above the threshold
sup = selector.get_support()

print('Number of retained features: ', sup.sum())

print('Number low-variance features: ', (~sup).sum())

low_var_fet = num_predictors.columns[~sup].values

print('Low-variance features: ', low_var_fet)

print('Before: ', train_data_num.shape, test_data_num.shape)
# Re-assign instead of dropping in place so the original frames are not mutated.
train_data_num = train_data_num.drop(columns=low_var_fet)
test_data_num = test_data_num.drop(columns=low_var_fet)
print('After: ', train_data_num.shape, test_data_num.shape)

We need to reduce the number of features in our model. Let's take a look at the correlation table to select the features most correlated with SalePrice.

In [None]:
pd.options.display.float_format = "{:,.2f}".format

# Pearson correlation between every pair of numerical columns
corr_mat = train_data_num.corr(method='pearson')

# zero out very weak correlations (|r| < 0.3)
corr_mat[corr_mat.abs() < 0.3] = 0

# hide the upper triangle (diagonal included) for better visibility
upper_triangle = np.triu(np.ones_like(corr_mat, dtype=bool))
plt.figure(figsize=(20, 20))
sns.heatmap(
    corr_mat,
    mask=upper_triangle,
    vmax=1.0,
    vmin=-1.0,
    square=True,
    annot=True,
    annot_kws={"size": 9, "color": "black"},
    linewidths=0.1,
    cmap='rocket',
);

Now we extract SalePrice correlations:

In [None]:
# SalePrice column of the correlation matrix, without SalePrice's own entry
sale_price_corr = corr_mat['SalePrice']
corr_features = sale_price_corr.drop(labels=['SalePrice'])
corr_features.sort_values(ascending=False)

There are various strength of correlation:
* < 0.3 - very weak correlation (on table it is replaced by 0)
* \> 0.3 & < 0.5 - weak correlation
* \> 0.5 & < 0.7 - moderate correlation
* \> 0.7 - strong correlation

It also works for negative coefficients.

Now we can investigate the extracted features more in depth.

Let's consider the features belonging to each degree of correlation separately.

In [None]:
# strong correlation features (|r| >= 0.7)

is_strong = corr_features.abs() >= 0.7
strong_corr_fet_names = corr_features[is_strong].sort_values(ascending=False).index.tolist()
print('Strongly correlated features: ', strong_corr_fet_names)

strong_fet = train_data_num[strong_corr_fet_names + ['SalePrice']]

fig, axes = plt.subplots(1, 2, figsize=(15, 10))

# one regression plot per feature; zip stops at the shorter of the two sequences
for feature, panel in zip(strong_corr_fet_names, axes):
    sns.regplot(x=feature, y='SalePrice', data=strong_fet, ax=panel, line_kws={'color': 'red'})

In [None]:
# moderate correlation features (0.5 <= |r| < 0.7)

abs_corr = corr_features.abs()
is_moderate = (abs_corr >= 0.5) & (abs_corr < 0.7)
moderate_fet_names = corr_features[is_moderate].sort_values(ascending=False).index.tolist()
print('Moderate correlation features: ', moderate_fet_names)

moderate_fet = train_data_num[moderate_fet_names + ["SalePrice"]]

fig, axes = plt.subplots(3, 3, figsize=(30, 30))

# one regression plot per feature; zip stops at the shorter of the two sequences
for feature, panel in zip(moderate_fet_names, fig.axes):
    sns.regplot(x=feature, y='SalePrice', data=moderate_fet, ax=panel, line_kws={'color': 'red'})

In [None]:
# weakly correlated features (0.3 <= |r| < 0.5)

abs_corr = corr_features.abs()
is_weak = (abs_corr >= 0.3) & (abs_corr < 0.5)
weak_fet_names = corr_features[is_weak].sort_values(ascending=False).index.tolist()
print('Weakly correlated features: ', weak_fet_names)

weak_fet = train_data_num[weak_fet_names + ["SalePrice"]]

fig, axes = plt.subplots(3, 3, figsize=(30, 30))

# one regression plot per feature; zip stops at the shorter of the two sequences
for feature, panel in zip(weak_fet_names, fig.axes):
    sns.regplot(x=feature, y='SalePrice', data=weak_fet, ax=panel, line_kws={'color': 'red'})

Let's merge all features and see how data looks like now.

In [None]:
# All retained features, strongest correlation group first, with the target last
numerical_features = [*strong_corr_fet_names, *moderate_fet_names, *weak_fet_names, 'SalePrice']

train_data_num = train_data_num[numerical_features]
# the test selection leaves out the last entry (SalePrice)
test_data_num = test_data_num[numerical_features[:-1]]
train_data_num.head()

We've extracted the most important features for predictions but 18 features is still too many. We can drop features which have strong correlation with each other.

In [None]:
from itertools import combinations

# find pairs of features whose correlation with each other is at least 0.7
cols = train_data_num.columns.tolist()[:-1]
pairs = [
    (first, second)
    for first, second in combinations(cols, 2)
    if corr_mat.loc[first, second] >= 0.7
]

pairs

In [None]:
# Now we can drop one feature (the second one) from each pair.
# A feature can be the second member of several pairs, so de-duplicate first:
# dropping the same column twice would raise a KeyError.
redundant_cols = list(dict.fromkeys(col for _, col in pairs))

train_data_num = train_data_num.drop(columns=redundant_cols)
test_data_num = test_data_num.drop(columns=redundant_cols)

train_data_num.head()

#### filling empty values

We need to fill the empty values in the data before using it in our model. Let's take a look at the data info:

In [None]:
train_data_num.info()

In [None]:
test_data_num.info()

In [None]:
# percentage of missing values per column, kept only for columns that have any
train_nan_share = train_data_num.isnull().sum() * 100 / len(train_data_num)
train_nan_cols = train_nan_share.index[train_nan_share > 0].tolist()
train_nan = pd.DataFrame({
    'Column': train_nan_cols,
    'NaN_percent': train_nan_share[train_nan_cols].tolist(),
})
sns.barplot(data=train_nan, x='Column', y='NaN_percent');

In [None]:
# percentage of missing values per column, kept only for columns that have any
test_nan_share = test_data_num.isnull().sum() * 100 / len(test_data_num)
test_nan_cols = test_nan_share.index[test_nan_share > 0].tolist()
test_nan = pd.DataFrame({
    'Column': test_nan_cols,
    'NaN_percent': test_nan_share[test_nan_cols].tolist(),
})
sns.barplot(data=test_nan, x='Column', y='NaN_percent');

It seems like we have 2 features with empty values in training data and 6 features in test data. The number of empty values in the LotFrontage column is significantly greater than in other columns (about 16-17%). Filling empty values should not greatly affect the original distribution. We try to replace empty values by median. 

In [None]:
# Fill every missing value with the median of its column.
imp = SimpleImputer(strategy='median')
imp_train_data = pd.DataFrame(
    imp.fit_transform(train_data_num),
    columns=train_data_num.columns,
    index=train_data_num.index,
)

# One row of panels per column that had missing values (left: before, right: after).
# squeeze=False keeps `ax` two-dimensional whatever the number of rows.
fig, ax = plt.subplots(max(len(train_nan_cols), 1), 2, figsize=(15, 15), squeeze=False)

for i, col in enumerate(train_nan_cols):

    # before filling NaN
    bfr = sns.histplot(data=train_data_num, x=col, ax=ax[i, 0], stat='density', bins=30)
    sns.kdeplot(data=train_data_num, x=col, ax=ax[i, 0], color='red')
    bfr.set_xlim(left=0)
    bfr.set_ylabel('Before', fontsize=12)

    # after filling NaN: the density curve must come from the imputed frame,
    # otherwise the "After" panel shows the same curve as the "Before" panel
    aftr = sns.histplot(data=imp_train_data, x=col, ax=ax[i, 1], stat='density', bins=30)
    sns.kdeplot(data=imp_train_data, x=col, ax=ax[i, 1], color='red')
    aftr.set_xlim(left=0)
    aftr.set_ylabel('After', fontsize=12)


The shape of LotFrontage distribution is affected by filling empty values so we can drop it. Other features not presented on plots have a very low percent of empty values (< 2%) so imputing it doesn't greatly affect the original distribution.

In [None]:
# drop LotFrontage
imp_train_data = imp_train_data.drop(columns=['LotFrontage'])
test_data_num = test_data_num.drop(columns=['LotFrontage'])

# fill NaN in test data
# The medians are learned from the training data and then applied to the test
# data, so both sets are filled with the same values.
imp = SimpleImputer(strategy='median')
imp.fit(imp_train_data[test_data_num.columns])
imp_test_data = pd.DataFrame(
    imp.transform(test_data_num),
    columns=test_data_num.columns,
    index=test_data_num.index,
)

train_data_num = imp_train_data
test_data_num = imp_test_data

In [None]:
train_data_num.info()

In [None]:
test_data_num.info()

#### outliers

Now we need to get rid of outliers.

In [None]:
fig, axes = plt.subplots(5, 3, figsize=(20, 20))
# every column except the last one
train_num_cols = train_data_num.columns.tolist()[:-1]

# one boxplot per feature; zip stops at the shorter of the two sequences
for feature, panel in zip(train_num_cols, fig.axes):
    sns.boxplot(data=train_data_num, y=feature, ax=panel)

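To get rid of outliers we will replace them with quartile values: values with a Z-score above 3 are set to the third quartile and values with a Z-score below -3 are set to the first quartile. To find the outliers we will use the Z-score from the scipy library.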

In [None]:
from scipy import stats


def _cap_outliers(frame, col, z_limit=3):
    """Replace the outliers of ``frame[col]`` in place.

    Values whose Z-score is above ``z_limit`` are set to the column's third
    quartile; values whose Z-score is below ``-z_limit`` are set to its first
    quartile. Z-scores and quartiles are both computed before any value is
    replaced.
    """
    z_scores = np.asarray(stats.zscore(frame[col]))
    q1, q3 = frame[col].quantile([0.25, 0.75])

    # Select the rows AND the column: a bare `frame[mask] = value` overwrites
    # every column of the matching rows (including the target), not just `col`.
    frame.loc[z_scores > z_limit, col] = q3
    frame.loc[z_scores < -z_limit, col] = q1


for col in train_num_cols:
    _cap_outliers(train_data_num, col)
    _cap_outliers(test_data_num, col)


Finally, let's take a look at numerical data:

In [None]:
train_data_num.head()

### Categorical data

#### feature extracting

In [None]:
categorical_features = train_data.select_dtypes(include=['object']).columns.tolist()

# Explicit copies: these frames are modified in later cells, which must neither
# touch train_data/test_data nor trigger SettingWithCopyWarning.

# training data
train_data_cat = train_data[categorical_features + ['SalePrice']].copy()

# test data
test_data_cat = test_data[categorical_features].copy()

train_data_cat.shape, test_data_cat.shape

Now we are going to look at count plots of each categorical feature to determine the dominating categories of each feature.

In [None]:
# Every column except the last one (SalePrice) gets its own panel.
cat_cols = train_data_cat.columns[:-1]
# Ceiling division: enough two-panel rows for all features. round() uses
# banker's rounding and can come up one row short.
n_rows = (len(cat_cols) + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(20, 40))

for feature, panel in zip(cat_cols, fig.axes):
    sns.countplot(data=train_data_cat, x=feature, ax=panel, palette='pastel')
    # Rotate the labels after plotting; relabelling the still-empty axes first
    # would pin the default tick labels instead of the category names.
    panel.tick_params(axis='x', labelrotation=90)
fig.tight_layout()

Here we can notice the obvious domination of one category in some features. Such features make a negligible contribution to predictions, so we can drop them.

In [None]:
# Categorical features dropped from both data sets
features_to_drop = [
    'Street', 'LandContour', 'Utilities', 'LandSlope', 'Condition2',
    'RoofMatl', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive',
]

train_data_cat = train_data_cat.drop(columns=features_to_drop)
test_data_cat = test_data_cat.drop(columns=features_to_drop)

train_data_cat.shape, test_data_cat.shape

We are left now with 28 categorical features in both data sets.

Next thing we will do is looking at variation of the target variable with respect to each categorical feature.

In [None]:
# Every column except the last one (SalePrice) gets its own panel.
cat_cols = train_data_cat.columns[:-1]
# Ceiling division: enough two-panel rows for all features. round() uses
# banker's rounding and can come up one row short.
n_rows = (len(cat_cols) + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(20, 30))

for feature, panel in zip(cat_cols, fig.axes):
    sns.boxplot(data=train_data_cat, x=feature, y='SalePrice', ax=panel, palette='Spectral_r')
    # Rotate the labels after plotting; relabelling the still-empty axes first
    # would pin the default tick labels instead of the category names.
    panel.tick_params(axis='x', labelrotation=45)

fig.tight_layout()

It seems a few features have similar distributions of SalePrice:
* Exterior1st and Exterior2nd
* ExterQual and MasVnrType
* BsmtQual and BsmtExposure

Based on the similarity of their distributions, we can say that these features are highly correlated with each other. Therefore, we can drop one feature from each pair.

In [None]:
# one feature from each pair with similar SalePrice distributions
similar_cat_features = ['Exterior2nd', 'MasVnrType', 'BsmtExposure']
train_data_cat = train_data_cat.drop(columns=similar_cat_features)
test_data_cat = test_data_cat.drop(columns=similar_cat_features)

train_data_cat.shape, test_data_cat.shape

#### filling empty values

Firstly, let's take a look how many empty values each feature has.

In [None]:
# percentage of missing values per categorical column
cat_nan_share = train_data_cat.isnull().sum() * 100 / len(train_data_cat)
cols_null_train = train_data_cat.columns[cat_nan_share > 0]

nan_counts_train = pd.DataFrame({
    'Column': cols_null_train,
    'NaN_percent': cat_nan_share[cols_null_train].to_numpy(),
}).sort_values('NaN_percent', ascending=False, ignore_index=True)

sns.barplot(data=nan_counts_train, y='Column', x='NaN_percent');

In the training data we can see that five features have more than 45% missing values. Filling their empty values with the feature's mode would significantly modify the distribution, so we will drop them.

In [None]:
# categorical features dropped from both data sets
sparse_cat_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train_data_cat = train_data_cat.drop(columns=sparse_cat_features)
test_data_cat = test_data_cat.drop(columns=sparse_cat_features)

We will also fill empty values by feature's mode in other features.

In [None]:
for col in ['GarageType', 'GarageFinish', 'BsmtQual', 'BsmtFinType1']:
    # Assign the result back to the column. Calling fillna(inplace=True) on a
    # column selection works on an intermediate object and is not guaranteed
    # to update the frame (chained assignment).
    train_data_cat[col] = train_data_cat[col].fillna(train_data_cat[col].mode()[0])
    test_data_cat[col] = test_data_cat[col].fillna(test_data_cat[col].mode()[0])
    

In [None]:
train_data_cat.isnull().sum()

In [None]:
test_data_cat.isnull().sum()

It seems we still have a few empty values in the test data. We will get rid of them with the same method.

In [None]:
cols_null_test = test_data_cat.columns[test_data_cat.isnull().any()]

for col in cols_null_test:
    # Assign the result back to the column. Calling fillna(inplace=True) on a
    # column selection works on an intermediate object and is not guaranteed
    # to update the frame (chained assignment).
    test_data_cat[col] = test_data_cat[col].fillna(test_data_cat[col].mode()[0])

test_data_cat.isnull().sum()

In [None]:
train_data_cat.shape, test_data_cat.shape

Finally, we have 20 categorical features in our data. However, before we move on we need to transform the categories into binary (dummy) columns.

#### transform categories

In [None]:
# SalePrice is removed before encoding
train_data_cat = train_data_cat.drop(columns=['SalePrice'])

# one indicator column per category
train_cat_dummies = pd.get_dummies(train_data_cat)
train_cat_dummies.head()

In [None]:
test_cat_dummies = pd.get_dummies(test_data_cat)
test_cat_dummies.head()

Training data contains more columns than the test data. Training and test data need to have an equal columns count. Let's find differences in data.

In [None]:
# dummy columns that exist in the training data but not in the test data
test_dummy_cols = set(test_cat_dummies.columns)
dif = [col for col in train_cat_dummies.columns if col not in test_dummy_cols]
print('Found differences in: ', dif)

Now we can drop it.

In [None]:
# remove the training-only dummy columns
train_cat_dummies = train_cat_dummies.drop(columns=dif)

train_cat_dummies.shape, test_cat_dummies.shape

We have prepared the categorical features for our model. Of course, 137 features is too many, so later we will select the features that are most important for predictions. Now we can merge the numerical and categorical data.

In [None]:
# place the numerical columns and the dummy columns side by side
train_parts = [train_data_num, train_cat_dummies]
test_parts = [test_data_num, test_cat_dummies]

train_data_new = pd.concat(train_parts, axis='columns')
test_data_new = pd.concat(test_parts, axis='columns')

train_data_new.shape, test_data_new.shape

# Feature engineering

Now we can create new features that can help us improve our predictions.

In [None]:
# Age of house from the year of construction
# Both sets are measured against the same reference year (the newest house in
# the training data), so a given Age means the same thing in train and test.
reference_year = train_data_new['YearBuilt'].max()
train_data_new['Age'] = reference_year - train_data_new['YearBuilt']
# clip keeps Age non-negative if the test data contains a newer house
test_data_new['Age'] = (reference_year - test_data_new['YearBuilt']).clip(lower=0)


In [None]:
# Years between YearBuilt and YearRemodAdd; negative differences are set to 0
for frame in (train_data_new, test_data_new):
    frame['Renovate'] = (frame['YearRemodAdd'] - frame['YearBuilt']).clip(lower=0)

# Drop YearBuilt and YearRemodAdd
year_cols = ['YearBuilt', 'YearRemodAdd']
train_data_new = train_data_new.drop(columns=year_cols)
test_data_new = test_data_new.drop(columns=year_cols)

In [None]:
# Interaction feature: product of OverallQual and GrLivArea
for frame in (train_data_new, test_data_new):
    frame['Qual_Area'] = frame['OverallQual'] * frame['GrLivArea']

Now we look at the continuous features. We try to mitigate the strong skew of some variables by using a log transformation. This makes prediction easier for our model because the transformed distributions are closer to normal.

In [None]:
# int/float columns of the training data, with the target left out
numeric_part = train_data_new.select_dtypes(include=['int', 'float'])
cont_features = numeric_part.columns.drop('SalePrice').tolist()

cont_data = train_data_new[cont_features]
cont_data.head()

To extract the skewed features, we take out features with skew > 0.5.

In [None]:
# absolute skewness of every continuous feature, largest first
abs_skew = cont_data.skew().abs()
skew_data = pd.DataFrame({'Column': cont_features, 'Skew': abs_skew})
skew_data = skew_data.sort_values(by='Skew', ascending=False)

skew_data

In [None]:
# names of the features whose absolute skew exceeds 0.5
is_skewed = skew_data['Skew'] > 0.5
skew_features = skew_data.loc[is_skewed, 'Column'].tolist()
skew_features

In [None]:
# Log-transform the skewed features in both data sets.
# 1 is added first to avoid the logarithm of 0.
for frame in (train_data_new, test_data_new):
    for col in skew_features:
        frame[col] = np.log(frame[col] + 1)
    

Looking at distribution from the beginning, we can see that SalePrice feature is skewed too so we use log transformation for it.  

In [None]:
# Replace SalePrice by its natural logarithm: pop removes the original column
# and the log-transformed values are stored as SalePriceLog.
train_data_new['SalePriceLog'] = np.log(train_data_new.pop('SalePrice'))

Let's take a look at numerical data distribution now.

In [None]:
train_data_new_num = train_data_new.select_dtypes(include=['int', 'float'])
train_data_new_num.hist(figsize=(20, 20), bins=30);

# Modeling

## Preparing data

Before fitting and evaluating models we need to transform and split our data.

In [None]:
# target: the log-transformed sale price; features: every other column
y = train_data_new['SalePriceLog']
X = train_data_new.drop(columns=['SalePriceLog'])

print('X shape: ', X.shape)
print('y shape: ', y.shape)

In [None]:
# Standardize data
# The scaler is fitted on the training features and then applied to both the
# training and the test features. Fitting alone leaves the data unscaled.
scaler = StandardScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
# The test columns are selected in the same order the scaler was fitted on.
test_data_new = pd.DataFrame(
    scaler.transform(test_data_new[X.columns]),
    columns=X.columns,
    index=test_data_new.index,
)

Now we should select the most important features from all those we have now. We will use backward feature elimination for this.

In [None]:
import statsmodels.api as sm

def backward_elimination(X, y, threshold=0.05):
    """Select features by backward elimination on OLS p-values.

    Starting from all columns of ``X``, an OLS model (with intercept) is fitted
    repeatedly; after each fit the feature with the largest p-value is removed,
    until no remaining feature has a p-value above ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target values.
    threshold : float, default 0.05
        Largest p-value a feature may have and still be kept.

    Returns
    -------
    list
        Names of the retained features, in their original column order.
    """
    features = X.columns.tolist()

    while features:
        model = sm.OLS(y, sm.add_constant(X[features])).fit()
        # Remove the intercept by name rather than by position: add_constant
        # adds no 'const' column when the data already contains a constant
        # column, and skipping the first entry would then hide the p-value of
        # a real feature.
        pvalues = model.pvalues.drop('const', errors='ignore')
        worst_pval = pvalues.max()
        # Also stops when the maximum is NaN (the comparison is then False).
        if not worst_pval > threshold:
            break
        features.remove(pvalues.idxmax())

    return features


In [None]:
selected_features = backward_elimination(X, y)
selected_features

In [None]:
# keep only the selected features in both the training and the test features
X = X[selected_features]
test_data_new = test_data_new[selected_features]

Now we can divide the data into training and validation sets.

In [None]:
# Hold out 15% of the rows for validation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)
print('Train size:', X_train.shape, y_train.shape)
print('Validation size:', X_val.shape, y_val.shape)

## Predict and Solve

Now we are ready to use models for predicting houses prices. We will estimate quality of our predictions with 2 metrics: RMSE (main metric, checking in Kaggle submission) and $R^2$ score. Next we will use the following models:
* Linear Regression
* Ridge Regression (L2 penalty)
* Lasso Regression (L1 penalty)
* SVR
* Decision Tree
* Random Forest
* XGBoost
* Gradient Boosting
* CatBoost

In [None]:
# Creating RMSE

def rmse_score(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Evaluation helper and the lists that collect its results

# One entry per evaluated model, appended in call order
r2_list = []
rmse_list = []

def get_metrics(model):
    """Evaluate a fitted model and record its validation metrics.

    Computes the model's ``score`` and the RMSE on ``X_val``/``y_val``, appends
    them to ``r2_list`` and ``rmse_list``, and prints them together with the
    5-fold cross-validation scores obtained on ``X_train``/``y_train``.
    """
    r2 = model.score(X_val, y_val)
    val_pred = model.predict(X_val)
    rmse = rmse_score(y_val, val_pred)

    r2_list.append(r2)
    rmse_list.append(rmse)

    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print('Cross validation score:', cv_scores)
    print('R2 score:', r2)
    print('RMSE:', rmse)

### Linear Regression

In [None]:
# fit on the training split, then report the validation metrics
linreg = LinearRegression().fit(X_train, y_train)

get_metrics(linreg)

### Ridge Regression

In [None]:
# fit on the training split, then report the validation metrics
ridge_reg = Ridge(alpha=0.001).fit(X_train, y_train)

get_metrics(ridge_reg)

### Lasso Regression

In [None]:
# fit on the training split, then report the validation metrics
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

get_metrics(lasso_reg)

### SVR

In [None]:
# fit on the training split, then report the validation metrics
svr = SVR().fit(X_train, y_train)

get_metrics(svr)

### Decision Tree

Before using the decision tree model we should select the most effective depth. Let's see how the RMSE depends on the tree's depth.

In [None]:
# validation RMSE of a decision tree for each candidate depth
depths = list(range(3, 30, 3))
scores = [
    rmse_score(
        y_val,
        DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train).predict(X_val),
    )
    for depth in depths
]

dt_scores = pd.DataFrame({'Depth': depths, 'Score': scores})
sns.lineplot(data=dt_scores, x='Depth', y='Score');

In [None]:
# idxmin gives the label of the first row with the lowest RMSE, so the lookup
# yields exactly one value even when several depths tie for the minimum.
tree_depth = int(dt_scores.loc[dt_scores['Score'].idxmin(), 'Depth'])
tree_depth

In [None]:
# decision tree with the selected depth
dt = DecisionTreeRegressor(max_depth=tree_depth).fit(X_train, y_train)

get_metrics(dt)

### Random Forest

Now we will see how the RMSE depends on n_estimators.

In [None]:
# validation RMSE of a random forest for each candidate number of trees
n_est = list(range(50, 300, 25))
rf_rmse = [
    rmse_score(
        y_val,
        RandomForestRegressor(n_estimators=n_trees).fit(X_train, y_train).predict(X_val),
    )
    for n_trees in n_est
]

rf_scores = pd.DataFrame({'N_estimators': n_est, 'Score': rf_rmse})
sns.lineplot(data=rf_scores, x='N_estimators', y='Score');

In [None]:
# idxmin gives the label of the first row with the lowest RMSE, so the lookup
# yields exactly one value even when several settings tie for the minimum.
n_estimators = int(rf_scores.loc[rf_scores['Score'].idxmin(), 'N_estimators'])
n_estimators

In [None]:
# random forest with the selected number of trees
rf = RandomForestRegressor(n_estimators=n_estimators).fit(X_train, y_train)

get_metrics(rf)

### XGBoost

In [None]:
# uses the n_estimators value chosen in the random forest section
xgb = XGBRegressor(n_estimators=n_estimators).fit(X_train, y_train)

get_metrics(xgb)

### Gradient Boosting

In [None]:
# fit on the training split, then report the validation metrics
gbr = GradientBoostingRegressor().fit(X_train, y_train)

get_metrics(gbr)

### CatBoost

In [None]:
# verbose=0 is passed to fit
cbr = CatBoostRegressor().fit(X_train, y_train, verbose=0)

# validation metrics, computed directly rather than through get_metrics
cbr_pred = cbr.predict(X_val)
cbr_r2, cbr_rmse = r2_score(y_val, cbr_pred), rmse_score(y_val, cbr_pred)

r2_list.append(cbr_r2)
rmse_list.append(cbr_rmse)

print('R2 score:', cbr_r2)
print('RMSE score:', cbr_rmse)

Now let's look at the evaluation summary:

* Linear Regression
* Ridge Regression (L2 penalty)
* Lasso Regression (L1 penalty)
* SVR
* Decision Tree
* Random Forest
* XGBoost
* Gradient Boosting
* CatBoost

In [None]:
# model labels, listed in the order the models were evaluated above
model_list = ['linreg', 'ridge', 'lasso', 'svr', 'dt', 'rf', 'xgb', 'gbr', 'cbr']

summary = pd.DataFrame(dict(Model=model_list, R2=r2_list, RMSE=rmse_list))
summary.sort_values(by='RMSE')

From the table above we can see that CatBoost Regressor is the best model for our predictions. Now we can make test data predictions.

# Prediction

In [None]:
# The target was log-transformed for training, so exponentiate the
# predictions to bring them back to the price scale.
log_pred = cbr.predict(test_data_new)
y_pred = np.exp(log_pred)

submission = pd.DataFrame(dict(Id=id_test, SalePrice=y_pred))

submission.head()

In [None]:
submission.to_csv("submission.csv", index=False)