In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Clean data

Consider attributes that are highly correlated with `SalePrice`:

In [None]:
# Ten features with the largest absolute Pearson correlation with SalePrice.
# Only numeric columns are correlated: pandas >= 2.0 raises on string columns
# instead of silently skipping them as older versions did.
df_train.select_dtypes(include='number').corr()['SalePrice'].abs().nlargest(10)

Use the scatter chart to show the linear relationship between `GrLivArea` and `SalePrice`:

In [None]:
# Scatter plot of living area against sale price, drawn through an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_xlabel('GrLivArea')
ax.set_ylabel('SalePrice')

From the chart we can see that these two attributes are linearly related, with two outliers whose `GrLivArea` is greater than 4500. We need to remove these two points:

In [None]:
# Keep only rows with GrLivArea below 4500. `.copy()` makes the filtered result an
# independent frame, so the column assignments made on df_train afterwards do not
# trigger pandas' SettingWithCopyWarning.
df_train = df_train[df_train["GrLivArea"] < 4500].copy()

Consider the distribution of `SalePrice`:

In [None]:
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the same histogram-plus-curve view.
sns.histplot(df_train['SalePrice'], kde=True, stat='density')

The chart above shows that the distribution of `SalePrice` is asymmetric, with a positive (right) skew, so we transform it to obtain a more symmetric distribution. Here we do this by taking the logarithm. *(In addition, the metric used in this problem is the **RMSE** between the logarithm of the predicted price and the logarithm of the actual price. Therefore, it is quite reasonable to model the logarithm of `SalePrice` directly, which makes the target more symmetric and should give better results.)*

In [None]:
# Replace SalePrice by log(1 + SalePrice); the exact inverse of np.log1p is np.expm1.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

Distribution of `SalePrice` after using logarithm:

In [None]:
from scipy.stats import norm

# sns.distplot(..., fit=norm) is deprecated in seaborn. Draw the histogram and KDE
# with histplot, then overlay the maximum-likelihood normal fit by hand.
ax = sns.histplot(df_train['SalePrice'], kde=True, stat='density')
mu, sigma = norm.fit(df_train['SalePrice'])
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black', label='normal fit')
ax.legend();

We can see that the distribution of `SalePrice` is now much more symmetric (close to a normal distribution) and no longer skewed as it was before the transformation.

Consider attributes with large missing data:

In [None]:
# Print every column whose share of missing values exceeds 95%.
n = df_train.shape[0]
missing_pct_by_col = df_train.isnull().sum() * 100.0 / n
for col, missing_pct in missing_pct_by_col.items():
    if missing_pct > 95.0:
        print('{}: {:0.2f}%'.format(col, missing_pct))

We see that `PoolQC` and `MiscFeature` are missing in more than 95% of the rows, so we will drop these two attributes.

In [None]:
# Remove the two almost-empty columns from both tables.
mostly_missing = ['PoolQC', 'MiscFeature']
df_train = df_train.drop(columns=mostly_missing)
df_test = df_test.drop(columns=mostly_missing)

The `Electrical` attribute has only one missing value:

In [None]:
# Display the training rows whose Electrical value is missing.
electrical_missing = df_train['Electrical'].isna()
df_train[electrical_missing]

Since there is only one missing value, we will remove this data point instead of trying to fill the missing value.

In [None]:
# Keep only rows with a known Electrical value. `.copy()` detaches the result from
# the frame it was filtered from, so the column assignments that follow do not
# trigger pandas' SettingWithCopyWarning.
df_train = df_train[df_train['Electrical'].notna()].copy()

Next we need to give the attributes `MSSubClass`, `YrSold` and `MoSold` a more appropriate type than their current numeric type (int64). `MSSubClass` is a categorical code; `YrSold` and `MoSold` are stored as numbers, but it is more appropriate to treat them as categorical attributes *(for example, for `MoSold` it does not make sense to say that the value 2 (February) is greater than the value 1 (January))*. Therefore, we convert them to type str (instead of int64).

In [None]:
# Store these three code-like columns as strings in both tables.
cols = ["MSSubClass", "YrSold", 'MoSold']
for frame in (df_train, df_test):
    frame[cols] = frame[cols].astype(str)

Next we need to remove highly correlated attributes. As the `GarageArea` attribute is highly correlated with the `GarageCars` attribute, we will remove `GarageArea` and retain `GarageCars`, since `GarageCars` has a higher correlation with `SalePrice`.

In [None]:
# Pearson correlation between the two garage-size columns. Correlating the two
# Series directly avoids DataFrame.corr(), which raises in pandas >= 2.0 when the
# frame contains string columns.
df_train['GarageArea'].corr(df_train['GarageCars'])

In [None]:
# Pearson correlation of GarageArea with the target. Series.corr avoids
# DataFrame.corr(), which raises in pandas >= 2.0 on string columns.
df_train['GarageArea'].corr(df_train['SalePrice'])

In [None]:
# Pearson correlation of GarageCars with the target. Series.corr avoids
# DataFrame.corr(), which raises in pandas >= 2.0 on string columns.
df_train['GarageCars'].corr(df_train['SalePrice'])

Similarly, we find that `1stFlrSF` is highly correlated with `TotalBsmtSF`; `2ndFlrSF` and `TotRmsAbvGrd` are highly correlated with `GrLivArea`; and `GarageYrBlt` is highly correlated with `YearBuilt`:

In [None]:
cname = 'GarageYrBlt'
# Features whose absolute correlation with `cname` exceeds 0.65. The matrix is
# computed once, on numeric columns only (pandas >= 2.0 raises on string columns).
corr_with_cname = df_train.select_dtypes(include='number').corr()[cname]
corr_with_cname[corr_with_cname.abs() > 0.65]

In [None]:
cname = 'GrLivArea'
# Features whose absolute correlation with `cname` exceeds 0.65. The matrix is
# computed once, on numeric columns only (pandas >= 2.0 raises on string columns).
corr_with_cname = df_train.select_dtypes(include='number').corr()[cname]
corr_with_cname[corr_with_cname.abs() > 0.65]

In [None]:
cname = 'TotalBsmtSF'
# Features whose absolute correlation with `cname` exceeds 0.65. The matrix is
# computed once, on numeric columns only (pandas >= 2.0 raises on string columns).
corr_with_cname = df_train.select_dtypes(include='number').corr()[cname]
corr_with_cname[corr_with_cname.abs() > 0.65]

Therefore we will drop the attributes `GarageArea`, `1stFlrSF`, `2ndFlrSF`, `TotRmsAbvGrd` and `GarageYrBlt`:

In [None]:
# Remove the redundant, highly correlated columns from both tables.
col = ['GarageArea','1stFlrSF', '2ndFlrSF','TotRmsAbvGrd', 'GarageYrBlt']
df_train = df_train.drop(columns=col)
df_test = df_test.drop(columns=col)

Next we will declare the function `fill_missing_data()`: missing values in categorical attributes are filled with the value 'None', and missing values in numeric attributes are filled with the value 0.

In [None]:
def fill_missing_data(df):
    """Return a copy of `df` with every missing value filled in.

    Columns whose dtype is ``object`` get the string ``'None'``; every other
    column gets ``0``. The input frame is left untouched.
    """
    filled = df.copy()

    # Object-dtype columns are treated as the categorical attributes.
    categoricals = [name for name, kind in filled.dtypes.items() if kind == 'object']
    filled[categoricals] = filled[categoricals].fillna('None')

    # Everything else is treated as numeric.
    remaining = [name for name in filled.columns if name not in categoricals]
    for name in remaining:
        filled[name] = filled[name].fillna(0)

    return filled

In [None]:
# Fill the missing values of both tables with the same rule.
df_train, df_test = (fill_missing_data(frame) for frame in (df_train, df_test))

# Feature Engineering

Summary of attributes related to Porch:
* OpenPorchSF
* EnclosedPorch
* 3SsnPorch
* ScreenPorch

In [None]:
# TotalPorchSF is the sum of the four porch-area columns, added to both tables.
for frame in (df_train, df_test):
    frame['TotalPorchSF'] = (
        frame['OpenPorchSF'] + frame['EnclosedPorch'] + frame['3SsnPorch'] + frame['ScreenPorch']
    )

Total number of Bathrooms. There are 4 attributes pertaining to the bathroom:
* BsmtFullBath
* BsmtHalfBath
* FullBath
* HalfBath

In [None]:
# TotalBaths counts full baths as 1 and half baths as 0.5, basement included.
for frame in (df_train, df_test):
    full_baths = frame['BsmtFullBath'] + frame['FullBath']
    half_baths = frame['BsmtHalfBath'] + frame['HalfBath']
    frame['TotalBaths'] = full_baths + 0.5*half_baths

Total area:

In [None]:
# TotalAreaSF is basement area plus above-ground living area.
for frame in (df_train, df_test):
    frame['TotalAreaSF'] = frame['TotalBsmtSF'] + frame['GrLivArea']

Age of house from construction to sold:

In [None]:
# Age is the sale year minus the construction year; YrSold is cast back to an
# integer for the subtraction.
for frame in (df_train, df_test):
    year_sold = frame['YrSold'].astype('int64')
    frame['Age'] = year_sold - frame['YearBuilt']

Next we will declare the function `feature_engineering()` to convert categorical attributes into one-hot vectors, binary attributes into 0/1 form, and ordinal attributes into ordered numbers (a larger number means a better level than a smaller one). *(This is done by hand instead of using the `get_dummies` function.)*

In [None]:
def feature_engineering(df):
    """Encode the categorical, binary and ordinal columns of `df` as numbers.

    Every column of `df` is handled according to the lookup tables below:

    * binary columns become one 0/1 column named ``<col>_is_<first value>``;
    * categorical columns become one 0/1 column per listed value, named
      ``<col>_<value>`` (a value that is not listed yields all zeros);
    * ordinal columns become ``<col>_ordinal``, the position of the value in
      the listed order;
    * any other column is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded columns, in the order of the input columns.

    Raises
    ------
    ValueError
        If an ordinal column contains a value missing from its listed order.
    """
    feature = {
        'categorical':{
            'MSSubClass': ['20', '30', '40', '45', '50', '60', '70', '75', '80', '85', '90', '120', '150', '160', '180', '190'],
            'MSZoning': ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
            'Alley': ['Grvl', 'Pave', 'None'],
            'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
            'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
            'Neighborhood': ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
                            'Names', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
            'Condition1': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
            'Condition2': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
            'BldgType': ['1Fam', '2FmCon', 'Duplx', 'TwnhsE', 'TwnhsI'],
            'HouseStyle': ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
            'RoofStyle': ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'],
            'RoofMatl': ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'],
            'Exterior1st': ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco',
                           'VinylSd', 'Wd Sdng', 'WdShing'],
            'Exterior2nd': ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco',
                           'VinylSd', 'Wd Sdng', 'WdShing'],
            'MasVnrType': ['BrkCmn', 'BrkFace', 'CBlock', 'None', 'Stone'],
            'Foundation': ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'],
            'Heating': ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
            'Electrical': ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'],
            'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
            'GarageType': ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'None'],
            'GarageFinish': ['Fin', 'RFn', 'Unf', 'None'],
            'PavedDrive': ['Y', 'P', 'N'],
            'MiscFeature': ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'None'],
            'MoSold': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'],
            'YrSold': ['2006', '2007', '2008', '2009', '2010'],
            'SaleType': ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'],
            'SaleCondition': ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']
        },
        'binary': {
            'Street': ['Pave', 'Grvl'],
            'CentralAir': ['Y', 'N']
        },
        'ordinal': {
            'LotShape': ['None', 'IR3', 'IR2', 'IR1', 'Reg'],
            'Utilities': ['None', 'NoSeWa', 'NoSewr', 'AllPub'],
            'LandSlope': ['None', 'Sev', 'Mod', 'Gtl'],
            'ExterQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'ExterCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
            'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'HeatingQC': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'KitchenQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
            # 'TA' (was 'Ta'), matching the spelling of every other quality scale here.
            'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex']
        },
    }

    # Raw spellings that differ from the category labels above. Without this
    # mapping those rows match no label and are encoded as all zeros.
    # NOTE(review): the left-hand spellings are the ones used in the Kaggle
    # House Prices CSV files as far as I know -- confirm with
    # df[col].unique() on the raw data. Mapping onto the existing labels keeps
    # the output column names unchanged.
    aliases = {
        'MSZoning': {'C (all)': 'C'},
        'Neighborhood': {'NAmes': 'Names'},
        'BldgType': {'2fmCon': '2FmCon', 'Duplex': 'Duplx', 'Twnhs': 'TwnhsI'},
        'Exterior2nd': {'Brk Cmn': 'BrkComm', 'CmentBd': 'CemntBd', 'Wd Shng': 'WdShing'},
    }

    # Collect the encoded columns first and assemble the frame once at the end;
    # inserting well over a hundred columns one by one fragments the DataFrame.
    encoded = {}
    for cname in df.columns:
        column = df[cname]
        if cname in feature['binary']: # Convert the binary attributes to 0/1
            default_value = feature['binary'][cname][0]
            encoded[cname + "_is_" + default_value] = (column == default_value).astype('int64')
        elif cname in feature['categorical']: # Convert Categorical attributes into One-hot vector
            alias = aliases.get(cname)
            if alias:
                column = column.map(lambda value, alias=alias: alias.get(value, value))
            for val in feature['categorical'][cname]:
                encoded["{}_{}".format(cname, val)] = (column == val).astype('int64')
        elif cname in feature['ordinal']: # Convert the Ordinal attributes to a number
            rank = {level: position for position, level in enumerate(feature['ordinal'][cname])}
            positions = column.map(rank)
            unknown = positions.isna()
            if unknown.any():
                # Name the column and the offending values instead of letting a
                # bare "x is not in list" error escape.
                bad_values = sorted(set(column[unknown].astype(str)))
                raise ValueError(
                    "Unknown value(s) {} in ordinal column {!r}".format(bad_values, cname)
                )
            encoded[cname + "_ordinal"] = positions.astype('int64')
        else: # The remaining attributes are numeric so they remain the same
            encoded[cname] = column

    if not encoded:
        return pd.DataFrame(index=df.index)
    return pd.concat(encoded, axis=1)

Apply for `df_train` and `df_test`:

In [None]:
# Encode both tables with the same rules, then preview the first rows rather than
# rendering the whole (very wide) encoded frame.
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)
df_train.head()

Next we need to drop the columns that only contain the value 0 (these columns will not make much sense):

In [None]:
# Find the columns that are 0 in every training row and drop them from both tables
# in one step, instead of re-copying the frames once per column inside a loop.
# `errors='ignore'` keeps the cell from failing if a column is absent from df_test.
all_zero_cols = df_train.columns[(df_train == 0).all(axis=0)]
df_train = df_train.drop(columns=all_zero_cols)
df_test = df_test.drop(columns=all_zero_cols, errors='ignore')

Next, save the `Id` column of df_test and the `SalePrice` column of df_train, which are needed in the following sections, and then drop `Id` and `SalePrice` from the feature tables:

In [None]:
# Keep the test-set identifiers and the training target before these columns are
# removed from the feature tables.
ids = df_test['Id']
y = df_train['SalePrice']

In [None]:
# Leave only model inputs in the two feature tables.
df_train = df_train.drop(columns=['Id', 'SalePrice'])
df_test = df_test.drop(columns=['Id'])

# Scaler:

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation on the training features and
# apply the same transformation to the test features.
# NOTE(review): the scaler is fitted on all training rows, i.e. before the
# train/test/validation split of `train` -- confirm this is intended.
scaler = StandardScaler()
train = scaler.fit_transform(df_train)
test = scaler.transform(df_test)

# Split Train/Test/Validation

Split df_train into 3 parts: train (75%), test (12.5%), validation (12.5%)

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 25% of the rows, then cut that hold-out in half to obtain the
# test and validation parts.
X_train, X_holdout, y_train, y_holdout = train_test_split(train, y, test_size=0.25, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=123)

# Modeling

In [None]:
from sklearn.metrics import mean_squared_error

In this section we will train several regression models and then choose the best one.

## XGBRegressor

XGBRegressor initialization parameters (hyperparameters have been selected to achieve good results):

In [None]:
# Keyword arguments passed to the xgboost.XGBRegressor constructor.
# NOTE(review): no library defaults are quoted here; look them up in the XGBoost
# parameter documentation for the installed version if they matter.
param_init = {
    "max_depth": 5,
    "n_estimators": 3000,
    "learning_rate": 0.01,
    "subsample": 0.5,
    "colsample_bytree": 0.7,
    "min_child_weight": 1.5,
    "reg_alpha": 0.75,
    "reg_lambda": 0.4,
    "seed": 42,
}

In [None]:
import xgboost
# Build the regressor from the hyperparameters collected in param_init.
xgb_model = xgboost.XGBRegressor(**param_init)

In [None]:
# Keyword arguments forwarded to xgb_model.fit().
# NOTE(review): newer XGBoost releases appear to expect `eval_metric` and
# `early_stopping_rounds` on the estimator rather than in fit() -- confirm against
# the installed version.
param_fit = {
    "eval_metric": "rmse",
    "early_stopping_rounds": 500,
    "verbose": 200,
    "eval_set": [(X_val, y_val)]
}

In [None]:
# XGBoost 1.6 introduced `eval_metric` and `early_stopping_rounds` as estimator
# parameters and 2.0 removed them from fit(). When the installed estimator exposes
# them, set them there; otherwise (older releases) keep passing them to fit().
fit_kwargs = dict(param_fit)
estimator_params = xgb_model.get_params()
estimator_level = {
    key: fit_kwargs.pop(key)
    for key in ("eval_metric", "early_stopping_rounds")
    if key in fit_kwargs and key in estimator_params
}
xgb_model.set_params(**estimator_level)
xgb_model = xgb_model.fit(X_train, y_train, **fit_kwargs)

In [None]:
# Predictions of the fitted XGBoost model on the held-out test part.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# RMSE on the held-out test part. The square root is taken explicitly because the
# `squared=False` argument was deprecated and later removed from scikit-learn.
np.sqrt(mean_squared_error(y_test, y_pred_xgb))

We see that with `XGBRegressor` the RMSE on the test set is `0.11370`.

## DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Hyperparameters are gathered in a dict, like the XGBoost ones, then the tree is
# fitted on the training part.
tree_params = {"random_state": 0, "min_samples_split": 2, "max_depth": 6}
regressorTree = DecisionTreeRegressor(**tree_params)
regressorTree.fit(X_train, y_train)

In [None]:
y_pred_Tree = regressorTree.predict(X_test)
# RMSE on the held-out test part. The square root is taken explicitly because the
# `squared=False` argument was deprecated and later removed from scikit-learn.
np.sqrt(mean_squared_error(y_test, y_pred_Tree))

## GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# A fixed random_state makes the fitted model, and therefore the reported error,
# reproducible across runs, like the seeded models in the other sections.
regressorGB = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=10000,
    learning_rate=0.25,
    random_state=0
)
regressorGB.fit(X_train, y_train)

In [None]:
y_pred_GB = regressorGB.predict(X_test)
# RMSE on the held-out test part. The square root is taken explicitly because the
# `squared=False` argument was deprecated and later removed from scikit-learn.
np.sqrt(mean_squared_error(y_test, y_pred_GB))

## Lasso

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression fitted on the training part.
lasso_params = {"alpha": 0.0007}
regressorLasso = Lasso(**lasso_params)
regressorLasso.fit(X_train, y_train)

In [None]:
y_pred_Lasso = regressorLasso.predict(X_test)
# RMSE on the held-out test part. The square root is taken explicitly because the
# `squared=False` argument was deprecated and later removed from scikit-learn.
np.sqrt(mean_squared_error(y_test, y_pred_Lasso))

The results above show that `XGBRegressor` gives the best results. Therefore we use `XGBRegressor` to predict.

# Submission

Use `XGBRegressor` to predict:

In [None]:
SalePrice_pred = xgb_model.predict(test)
# The target was transformed with np.log1p (log(1 + price)) before training, so
# predictions are mapped back with its exact inverse, np.expm1. Plain np.exp
# would return price + 1.
SalePrice_pred = np.expm1(SalePrice_pred)

In [None]:
# Pair every test Id with its predicted price.
submission = dict(Id=ids, SalePrice=SalePrice_pred)

In [None]:
# Build the submission table and preview its first rows instead of rendering the
# whole frame.
df_submission = pd.DataFrame(submission)
df_submission.head()

In [None]:
# Write the submission file to the current working directory, without the index column.
df_submission.to_csv('submission.csv', index=False)