# House Prices Prediction

<h3>Hello friend, today I'm going to take my shot at the house price prediction problem, so here we go :)</h3>

![](https://media.remax-dev.booj.io/91319a69-7a4b-3a88-83f0-e1a5be6c4d33/06_MiracleHomes.jpg)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.model_selection import train_test_split

# Ignore all warnings
# NOTE(review): no category is given, so every warning is silenced for the rest
# of the session, including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Data exploration
Let's load and examine our data

![](https://images.immediate.co.uk/production/volatile/sites/7/2018/04/BBC-WH9-final-red-2-3239864.jpg?quality=90&resize=620,413)

In [None]:
# Load train test dataframes
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
# Number of rows used for training: the first 80% of the rows; the rest is the validation part
train_size = int(len(train)*0.8)
# Sort by time and drop target and id columns
# (the index is reset after sorting, so labels 0..len-1 follow the sale date order)
train = train.sort_values(['YrSold', 'MoSold'], axis=0).reset_index(drop=True)
# .loc slices by label and includes the end label, hence train_size-1 for the last training row
train_X = train.drop(['SalePrice', 'Id'], axis=1).loc[:train_size-1, :]
valid_X = train.drop(['SalePrice', 'Id'], axis=1).loc[train_size:, :]
# The target is split at the same row boundary as the features
train_y = train['SalePrice'][:train_size]
valid_y = train['SalePrice'][train_size:]
# The test features keep 'Id' as their index; it is used for the submission file at the end
test_X = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
# Same ordering by sale year and month as the training data
test_X = test_X.sort_values(['YrSold', 'MoSold'], axis=0)

In [None]:
# Preview the first rows of the training features
train_X.head()

In [None]:
# Summary of the training features (used below to spot columns with missing values)
train_X.info()

## Na Values


![](https://image.freepik.com/free-photo/fill-missing-parts-fragment-white-jigsaw-concept-puzzle-succeed_33807-777.jpg)

As we can see, many columns have NA values. My way of dealing with that is as follows: replace all NA values in categorical columns with 'None' or a similar value, and all NA values in numerical features with 0.

In [None]:
class CustomImputer:
    """Fill missing values in categorical (object-dtype) columns.

    For every object column that contained NA when ``fit`` was called, NA is
    replaced by a placeholder category.  If the column already uses one of
    'None', 'No' or 'Othr', the first such value occurring in the column is
    reused; otherwise the string 'None' is used.  Non-object columns are left
    untouched.
    """

    def __init__(self):
        pass

    def fit(self, data):
        """Remember which object-dtype columns of ``data`` contain NA.

        Returns ``self`` so that calls can be chained.
        """
        cat_cols = data.columns[(data.dtypes == 'object').to_numpy()]
        has_na = data[cat_cols].isna().any().to_numpy(dtype=bool)
        self.impute_cols = cat_cols[has_na]
        return self

    def transform(self, data):
        """Return a copy of ``data`` with NA filled in the fitted columns.

        The input frame is not modified.
        """
        # Work on a copy so the caller's frame (possibly a slice of another
        # frame) is never changed as a side effect
        data = data.copy()
        for column in self.impute_cols:
            placeholder_mask = data[column].isin(['None', 'No', 'Othr'])
            if placeholder_mask.any():
                # Reuse the placeholder the column already contains
                replace_value = data.loc[placeholder_mask, column].iloc[0]
            else:
                replace_value = 'None'
            data[column] = data[column].fillna(replace_value)

        return data

    def fit_transform(self, data):
        """Fit on ``data`` and return its imputed copy."""
        return self.fit(data).transform(data)

    def get_params(self, deep=True):
        """Return the constructor parameters (there are none)."""
        return {}

In [None]:
# Columns of the training features that contain at least one NA
na_cols = train_X.columns[(train_X.isna().sum() > 0).values]
print(f'Columns with NA: {na_cols}')
# Float columns among them; the object columns are handled by CustomImputer
num_na_cols = na_cols[(train_X.loc[:, na_cols].dtypes == 'float64').values]
# Apply the same two imputation steps to every split, so that train, test and
# validation features are treated consistently
for frame in (train_X, test_X, valid_X):
    # Replace NA values in categorical features with None and similar values
    frame.loc[:, na_cols] = CustomImputer().fit_transform(frame.loc[:, na_cols])
    # Replace NA values in continuous features with 0
    frame.loc[:, num_na_cols] = frame.loc[:, num_na_cols].fillna(0)

In [None]:
# Sanity check: columns of train_X that still contain NA after the imputation above
train_X.columns[train_X.isna().sum() > 0]

# Feature Generation

![](https://pmp-practitioners.com/wp-content/uploads/2019/02/Brainstorming.jpg)

Let's try to make some features. Feature engineering is a very important step in improving our model.

In [None]:
def get_sold_last_mnth(df):
    """Generate the "houses sold last month" feature.

    For every row, counts the rows of ``df`` whose ('YrSold', 'MoSold') is the
    calendar month immediately before that row's sale month.  Only sales
    present in ``df`` itself are counted, so the earliest month of a frame
    always gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer-like 'YrSold' and 'MoSold' columns.  Rows may be
        in any order.

    Returns
    -------
    pandas.Series
        Float counts aligned with ``df.index``.
    """
    # Linear month number: "previous month" becomes "minus one", which also
    # works across year boundaries and does not depend on month lengths
    month_no = df['YrSold'].astype('int64') * 12 + (df['MoSold'].astype('int64') - 1)
    sales_per_month = month_no.value_counts()
    # Months without any sale are absent from the counts and therefore map to NaN -> 0
    sold_lst_mnth = (month_no - 1).map(sales_per_month).fillna(0).astype('float64')
    return sold_lst_mnth

def get_fireplaces_per_room(df):
    """Return the 'Fireplaces' column divided element-wise by 'TotRmsAbvGrd'."""
    return df['Fireplaces'].div(df['TotRmsAbvGrd'])

def get_quality_per_room(df):
    """Return the 'OverallQual' column divided element-wise by 'TotRmsAbvGrd'."""
    return df['OverallQual'].div(df['TotRmsAbvGrd'])

In [None]:
# New column name -> function computing it from a frame.  DataFrame.assign
# calls each function with the frame and adds the columns in this order:
# sold houses last month, fireplaces per room, quality per room.
generated_features = {
    'Sold_Lst_Mnth': get_sold_last_mnth,
    'FireplacesPerRm': get_fireplaces_per_room,
    'QualPerRm': get_quality_per_room,
}
train_X = train_X.assign(**generated_features)
valid_X = valid_X.assign(**generated_features)
test_X = test_X.assign(**generated_features)

# Target leakage

![](https://www.verisk.com/siteassets/media/images/verisk_commercial_premium_leakage_analysis.jpg)

Let's look at the data and check whether there is target leakage. It seems that **YrSold** (the year the house was sold) and **MoSold** (the month the house was sold) will cause target leakage, as we will not have such values when predicting real-world house prices.

In [None]:
# Sale year and month are removed from every split
ta_leakage_cols = ['YrSold', 'MoSold']
train_X = train_X.drop(columns=ta_leakage_cols)
valid_X = valid_X.drop(columns=ta_leakage_cols)
test_X = test_X.drop(columns=ta_leakage_cols)

In [None]:
# Split the training columns by dtype: object columns are categorical,
# int64 / float64 columns are numerical
column_dtypes = train_X.dtypes
cat_features = train_X.columns[column_dtypes == 'object']
num_features = train_X.columns[(column_dtypes == 'float64') | (column_dtypes == 'int64')]

# Data Visualization

First, let's take a look at how our numerical features are distributed

In [None]:
# One histogram (15 bins) per numerical feature of the training data
train_X[num_features].hist(bins=15, figsize=(20, 20))

Countplots of categorical features

In [None]:
# Grid with 5 panels per row, one count plot per categorical feature
n_panel_cols = 5
fig, ax = plt.subplots(len(cat_features)//n_panel_cols + 1, n_panel_cols)
fig.set_size_inches(20, 30)
for idx, feature in enumerate(cat_features):
    # divmod turns the running index into the (row, column) position in the grid
    panel = ax[divmod(idx, n_panel_cols)]
    sns.countplot(data=train_X, x=feature, ax=panel)

plt.tight_layout()

Heatmap!!!)

In [None]:
# Correlation matrix of the numerical features together with the target
plt.figure(figsize=(30, 18))
num_with_target = pd.concat([train_X[num_features], train_y], axis=1)
sns.heatmap(num_with_target.corr(), annot=True)

Next, let's examine how our Sale Prices depend on the different categorical columns 

In [None]:
# Three violin plots per row: SalePrice distribution for each category level
n_cat = len(cat_features)
fig, ax = plt.subplots(n_cat//3+1, 3, figsize=(20, n_cat*2))
for idx, feature in enumerate(cat_features):
    # (row, column) position of this feature's panel
    panel = ax[divmod(idx, 3)]
    sns.violinplot(x=feature, y=train_y, data=train_X, ax=panel)
    panel.set_title(f'Violin plot of {feature} x SalePrice')
    panel.xaxis.set_tick_params(rotation=45)

plt.tight_layout()

And numerical columns

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; fall back to the old name on older versions
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
# Three scatter plots per row; the figure height follows the number of
# numerical features, which is what determines the number of rows
fig, ax = plt.subplots(len(num_features)//3+1, 3, figsize=(36, len(num_features)*3))
for idx, num_feature in enumerate(num_features):
    # (row, column) position of this feature's panel
    panel = ax[idx//3, idx%3]
    sns.scatterplot(data=train_X, x=num_feature, y=train_y, ax=panel)
    panel.set_title(f'Regplot of Sale Price x {num_feature}', size=24)
    panel.set_xlabel(num_feature, size=20)
    panel.set_ylabel('Sale Price', size=20)

plt.tight_layout()

#### As we can see, the highest influence on Sale Price comes from the quality features (Overall, Bath, Fireplace, etc.), conditions (Overall, Basement, etc.), Garage Area, 1stFlrSF (square feet on the first floor), Roof Materials, Garage Type and Neighbourhoods. 
#### So, if you want a home with the area like Disneyland, a roof that can protect you from the alien laser during the alien invasion, with best conditions and porcelain birds floating in an artificial pond, be next door to Keanu Reeves and have Big High-Quality Garage, where you and Keanu can put your cars and bikes after fascinating travel, you should pay a big amount of money for that( I hope such houses exist) 

# Data Transformation

![](https://paulitaylor.files.wordpress.com/2016/10/lessons-in-rapid-experiments-and-learning-from-failure.png?w=960)

For ordinal data I'll use OrdinalEncoder (as it is much more appropriate for tree-based models than OHE), CatBoostEncoder for nominal data, and StandardScaler for numerical data

In [None]:
from sklearn import pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from category_encoders import CatBoostEncoder, OrdinalEncoder

In [None]:
# Let's divide our categorical features into ordinal and nominal ones
ordinal_features = [
    'LandSlope',
    'Condition1',
    'Condition2',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
    'PoolQC'
]
# Every remaining categorical column is treated as nominal
nominal_features = cat_features.drop(ordinal_features)
# Next define the transformer of nominal features: CatBoost encoding followed by scaling
nominal_transformer = pipeline.Pipeline(steps=[
    ('cat_boost', CatBoostEncoder()),
    ('scaler', StandardScaler())
])
# Next, the feature transformations of the numerical and ordinal columns
# Numerical columns: remaining NaN -> 0, then standard scaling
num_transformer = pipeline.Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)),
    ('scaler', StandardScaler())
])
# Ordinal columns are encoded (no explicit category order is passed to the encoder)
# and placed before the numerical columns in the output
num_ord_transformer = ColumnTransformer(transformers=[
    ('ordinal_transformer', OrdinalEncoder(), ordinal_features),
    ('num_transformer', num_transformer, num_features)
])
# First transform our nominal features
# (the encoder is fitted with the target on the training part only; validation and test are only transformed)
train_X_nominal = nominal_transformer.fit_transform(train_X[nominal_features], train_y)
valid_X_nominal = nominal_transformer.transform(valid_X[nominal_features])
test_X_nominal = nominal_transformer.transform(test_X[nominal_features])
# Next transform ordinal and numerical features (again fitted on the training part only)
train_X_num_ord = num_ord_transformer.fit_transform(train_X)
valid_X_num_ord = num_ord_transformer.transform(valid_X)
test_X_num_ord = num_ord_transformer.transform(test_X)
# Finally we concatenate these arrays column-wise: nominal block first, then ordinal + numerical
train_X_transformed = np.concatenate((train_X_nominal, train_X_num_ord), axis=1)
valid_X_transformed = np.concatenate((valid_X_nominal, valid_X_num_ord), axis=1)
test_X_transformed = np.concatenate((test_X_nominal, test_X_num_ord), axis=1)

# Hyper Parameter Tuning

![](https://techcrunch.com/wp-content/uploads/2017/03/5daa8a29b65f5d8422aaeece44ed0a2d_original.jpg?w=1280&h=2300)

Ohh, it's time for tuning... It's nice, but it takes a very long time (and even longer if you have no strategy). I used the one presented by Aarshay Jain: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

I've already done some hyperparameter tuning for XGB, so here I only tune the learning rate

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Only the learning rate is searched; all other XGBoost settings are fixed
params = {'learning_rate': np.arange(0.01, 0.11, 0.01)}
optimizer = GridSearchCV(
    estimator=XGBRegressor(
        n_estimators=500,
        tree_method='gpu_hist',
        max_depth=4,
        min_child_weight=3,
        gamma=0,
        colsample_bytree=0.4,
        subsample=0.7,
        learning_rate=0.05,
    ),
    param_grid=params,
    # Time-ordered folds, matching the rows being sorted by sale date
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
)
optimizer.fit(train_X_transformed, train_y)
# Mean absolute error of the search's predictions on the held-out validation part
valid_score = mean_absolute_error(valid_y, optimizer.predict(valid_X_transformed))

In [None]:
# Learning rate selected by the grid search
optimizer.best_params_

In [None]:
# Full cross-validation results of the grid search as a table
pd.DataFrame(optimizer.cv_results_)

In [None]:
# valid_score is the mean absolute error on the validation part computed above
print(f'Mean validation score is {valid_score}')

# Training Model

![](https://familyproject.sfsu.edu/sites/default/files/Training%20Image_Medium.jpg)

In [None]:
# Hyperparameters of the final model
xgb_params = dict(
    colsample_bytree=0.4,
    n_estimators=1000,
    min_child_weight=3,
    max_depth=6,
    subsample=0.4,
    learning_rate=0.01,
    gamma=0,
    reg_lambda=0.02,
)
# Build the regressor, then fit it on the transformed training part
model = XGBRegressor(tree_method='gpu_hist', **xgb_params)
model.fit(train_X_transformed, train_y)

# Model evaluation

![](https://eige.europa.eu/sites/default/files/styles/eige_original_optimised/public/images/evaluation.jpg?itok=DPuDMaP8)

First we calculate MAE on train and validation datasets, next we'll plot learning curves

In [None]:
# Mean absolute error of the fitted model on the data it was trained on
preds_train = model.predict(train_X_transformed)
train_score = mean_absolute_error(train_y, preds_train)
print(f'Mean train score is {train_score}')

In [None]:
# Mean absolute error of the fitted model on the held-out validation part
preds_valid = model.predict(valid_X_transformed)
valid_score = mean_absolute_error(valid_y, preds_valid)
print(f'Mean validation score is {valid_score}')

Let's plot the learning curve to find out whether we have high bias or high variance (underfitting or overfitting)

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X_train, y_train, cv, train_sizes=np.linspace(0.1, 1, 10)):
    """Plot mean train and cross-validation scores against the training set size.

    Parameters
    ----------
    estimator : estimator passed on to ``learning_curve``.
    X_train, y_train : features and target used for the curve.
    cv : cross-validation splitter passed on to ``learning_curve``.
    train_sizes : training set sizes passed on to ``learning_curve``.

    The scores are whatever ``learning_curve`` returns for its default
    scoring; they are averaged over the cross-validation folds.
    """
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
    # later releases dropped the old names; fall back to the old name on older versions
    darkgrid_style = 'seaborn-v0_8-darkgrid'
    if darkgrid_style not in plt.style.available:
        darkgrid_style = 'seaborn-darkgrid'
    plt.style.use(darkgrid_style)
    train_sizes, train_scores, test_scores = learning_curve(estimator, X_train, y_train, cv=cv, n_jobs=-1, train_sizes=train_sizes)
    # One row per training size, one column per fold -> average over the folds
    train_mean_scores = np.mean(train_scores, axis=1)
    test_mean_scores = np.mean(test_scores, axis=1)
    plt.title('Learning curve')
    plt.plot(train_sizes, train_mean_scores, 'y', label='Train Learning curve')
    plt.plot(train_sizes, test_mean_scores, 'b', label='Test Learning curve')
    plt.legend()
    

In [None]:
# Learning curve of the final model on the training part, using 3 time-ordered folds
plot_learning_curve(model, train_X_transformed, train_y, TimeSeriesSplit(n_splits=3))

It seems that there is an overfitting problem, but with such a small dataset underfitting is more frightening than overfitting (I suppose so; if I'm not correct, please write it in the comments)

Lets look at feature importance

In [None]:
# Feature names in the column order of the transformed matrix:
# nominal block first, then ordinal, then numerical
features = [*nominal_features, *ordinal_features, *num_features]
plt.figure(figsize=(25, 20))
sns.barplot(y=features, x=model.feature_importances_)

Obviously, if we had a larger dataset or more features, I would do some feature elimination (RFE, for example), but here I'll keep all of them

# Making Predictions





![](https://i.imgflip.com/zcyxp.jpg)

Pour the champagne, ladies and gentlemen: after a long way we can make our predictions and submit our results. Sure, it's not the final version; for example, numerical feature imputation can be improved (we just replaced all NA values with 0), and we can also generate a lot more features than we did

In [None]:
# Predict on the test features and write the submission file
preds_test = model.predict(test_X_transformed)
# The house ids come from the index of test_X, so each prediction stays paired with its id
output = pd.DataFrame(dict(Id=test_X.index, SalePrice=preds_test))
output.to_csv('submission.csv', index=False)