## House Price Regression with XGBoost
## Table of Contents
- Summary
- Import Packages
- Import Datasets
- Common Functions
- Exploratory Data Analysis & Data Preprocessing
    - Statistic infos
    - Missing Value Imputation
    - Convert Categorical Features to Numerical Features
    - Train Validation Split
    - Calculate Correlated Features
    - Feature Scaling
- Model Development and Evaluation

## Summary
In this notebook, I use XGBoost to build a house price predictor and apply hyperparameter search techniques to find the best-performing model.

## Import Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold

## Import Datasets

In [None]:
# Load the training split of the Kaggle house-prices competition data.
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

# Load the test split from the same competition directory.
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")


## Common Functions

**Evaluation Function**

In [None]:
def evaluate(model, x_val, y_val):
    """Score ``model`` on the given data and report regression metrics.

    Predictions come from ``model.predict(x_val)`` and are compared with
    ``y_val``. Each metric is printed on its own line and the full set is
    returned.

    Returns:
        dict with the keys ``r2``, ``mse``, ``mae``, ``msle``, ``mape``,
        ``rmse`` and ``rmlse``.
    """
    y_pred = model.predict(x_val)
    mse = metrics.mean_squared_error(y_val, y_pred)
    scores = {
        "r2": metrics.r2_score(y_val, y_pred),
        "mse": mse,
        "mae": metrics.mean_absolute_error(y_val, y_pred),
        "msle": metrics.mean_squared_log_error(y_val, y_pred),
        "mape": np.mean(tf.keras.metrics.mean_absolute_percentage_error(y_val, y_pred)),
        "rmse": np.sqrt(mse),  # derived from the MSE computed above
        "rmlse": rmlse(y_val, y_pred),
    }
    # Labels are listed in the same order as the dict entries above.
    labels = ("R2 Score:", "MSE:", "MAE:", "MSLE:", "MAPE", "RMSE:", "RMLSE")
    for label, value in zip(labels, scores.values()):
        print(label, value)
    return scores

**Root Mean Squared Logarithmic Error**

In [None]:
def rmlse(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Args:
        y_true: Ground-truth values (list, NumPy array or pandas Series).
        y_pred: Predicted values, with the same number of elements.

    Returns:
        The error as a NumPy float. The result is NaN when any value is
        below -1, because the logarithm is undefined there.
    """
    # Coerce to flat float arrays so plain lists and (n, 1)-shaped
    # predictions work, and so values are compared by position.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # log1p stays accurate for values close to zero, unlike log(x + 1).
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_diff)))

**Submission**

In [None]:
def submit(model, X, ids, file_path):
    """Write the model's predictions for ``X`` to a CSV file.

    The file has two columns, ``Id`` (taken from ``ids``) and ``SalePrice``
    (the flattened output of ``model.predict(X)``), and no index column.
    """
    predictions = model.predict(X).reshape(-1)
    frame = pd.DataFrame({"Id": ids, "SalePrice": predictions})
    frame.to_csv(file_path, index=False)

## Exploratory Data Analysis & Data Preprocessing

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (number of rows, number of columns) of the training data.
train.shape

**Statistic infos**

In [None]:
# Per-column dtype and non-null count overview.
train.info()

In [None]:
# Descriptive statistics of the training data.
train.describe()

**Correlation scores**

In [None]:
# Pairwise correlation between the numeric columns of the training data.
# The string (object) columns are excluded explicitly: pandas >= 2.0 no
# longer drops them silently and raises when corr() meets non-numeric data.
correlation_scores = train.select_dtypes(include="number").corr()
correlation_scores

**Factors that impact house price most**

In [None]:
# Rank the numeric features by the absolute value of their correlation with
# SalePrice (strongest first). Non-numeric columns are excluded up front
# because corr() raises on them in pandas >= 2.0.
train.select_dtypes(include="number").corr()["SalePrice"].sort_values(key=lambda s: s.abs(), ascending=False)

### Missing Value Imputation

I will use the following strategies to impute missing values:
- For numerical columns, I will replace missing values with the column mean.
- For categorical columns, I will replace missing values with an "Unknown" category.

In [None]:
# Fill missing values in place, column by column, for both datasets.
# NOTE: each frame is imputed with its own statistics, i.e. the test set uses
# the test-set mean rather than the training mean.
for data in [train, test]:
    null_counts = data.isnull().sum()
    null_columns = list(null_counts[null_counts > 0].index)
    for column in null_columns:
        if pd.api.types.is_numeric_dtype(data[column]):
            # Numerical column: replace missing entries with the column mean.
            data[column] = data[column].fillna(data[column].mean())
        else:
            # Categorical column: missing entries become their own category.
            data[column] = data[column].fillna("Unknown")

### Convert Categorical Features to Numerical Features

In [None]:
# Stack the training and test rows, then one-hot encode the combined frame so
# that every row ends up with the same set of indicator columns.
train_test = pd.concat([train, test])
train_test = pd.get_dummies(train_test)

In [None]:
# Preview the first rows of the encoded, combined frame.
train_test.head()

In [None]:
# Column-wise mean and standard deviation of the combined train+test frame.
# The target column is removed from both because it is never scaled.
mean_value = train_test.mean().drop("SalePrice")
std_value = train_test.std().drop("SalePrice")
print(mean_value)
print(std_value)

In [None]:
# Split the combined frame back into its training and test rows (the training
# rows come first). Explicit copies are taken so the column removals here and
# any later in-place edits act on independent frames instead of on slices of
# `train_test`, which would trigger SettingWithCopy behaviour.
train_features = train_test.iloc[: len(train)].copy()
test_features = train_test.iloc[len(train):].copy()
# The row identifier is not a feature.
_ = train_features.pop("Id")
# The combined frame carries a SalePrice column for the test rows as well.
_ = test_features.pop("SalePrice")
# Keep the test ids; they are needed to build the submission file.
test_ids = test_features.pop("Id")

### Calculate Correlated Features

In [None]:
# Pairwise correlation matrix of the encoded training features.
train_features.corr()

In [None]:
# Keep the columns whose absolute correlation with SalePrice reaches the
# threshold; the target itself is then removed from the resulting list.
thresold = 0.05
correlated_scores = train_features.corr()["SalePrice"]
correlated_scores = correlated_scores.loc[correlated_scores.abs().ge(thresold)]
correlated_columns = correlated_scores.index.tolist()
correlated_columns.remove("SalePrice")
print(correlated_columns)

In [None]:
# Detach the target column; pop() removes it from train_features in place.
y = train_features.pop("SalePrice")
# X is another name for the same DataFrame object as train_features (no copy).
X = train_features

### Feature Scaling

In [None]:
# Names of the columns of the raw training frame that have object dtype.
categorical_columns = {name for name, dtype in train.dtypes.items() if dtype == object}

In [None]:
# Standardise the features with the mean/std computed on the combined frame.
#   "none"                                         - leave the features as they are
#   "standard_scale"                               - scale every column
#   "standard_scale_exclude_categorcial_features"  - scale everything except the
#                                                    one-hot indicator columns
scale_strategies = ["none", "standard_scale", "standard_scale_exclude_categorcial_features"]
scale_strategy = scale_strategies[2]
if scale_strategy == scale_strategies[1]:
    # Restrict the statistics to each frame's own columns. `mean_value` and
    # `std_value` still contain "Id", and subtracting the unrestricted Series
    # would add an all-NaN "Id" column to the result.
    X = (X - mean_value[X.columns]) / std_value[X.columns]
    test_features = (test_features - mean_value[test_features.columns]) / std_value[test_features.columns]
elif scale_strategy == scale_strategies[2]:
    # get_dummies names indicator columns "<source column>_<category>". Matching
    # on that prefix also covers category labels that contain "_" themselves.
    dummy_prefixes = tuple(f"{name}_" for name in categorical_columns)
    numeric_columns = [column for column in X.columns if not column.startswith(dummy_prefixes)]
    for features in [X, test_features]:
        for column in numeric_columns:
            # Replace the whole column (rather than writing through .loc) so an
            # integer column can become float without an incompatible-dtype
            # assignment.
            features[column] = (features[column] - mean_value[column]) / std_value[column]

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Optionally narrow both feature frames down to the columns selected by the
# correlation filter.
use_correlated_columns = True
if use_correlated_columns:
    X, test_features = X[correlated_columns], test_features[correlated_columns]

## Model Development and Evaluation

In [None]:
import xgboost
import time
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
def train_with_xgboost(hyperparameters, X_train,  y_train, X_val = None, y_val = None):
    """Greedy one-parameter-at-a-time hyperparameter search for XGBRegressor.

    The parameters are visited in dictionary order. While one parameter is
    being tuned, every other parameter is held at its best value found so far
    (initially the first candidate in its list). One model is trained per
    candidate value and scored with ``evaluate``; the candidate with the
    lowest ``rmlse`` wins.

    Args:
        hyperparameters: Mapping of XGBRegressor keyword name to a list of
            candidate values.
        X_train: Training features.
        y_train: Training targets.
        X_val: Optional validation features.
        y_val: Optional validation targets.

    Returns:
        Tuple ``(best_model, best_score, best_parameters)``. ``best_model``
        and ``best_parameters`` are ``None`` when no candidate produced a
        finite-comparable score (for example when ``hyperparameters`` is
        empty).
    """
    # Score on the validation data when it is supplied and non-empty, and on
    # the training data otherwise. The None check matters: the defaults are
    # None, and len(None) would raise a TypeError.
    has_validation = (
        X_val is not None
        and y_val is not None
        and len(X_val) != 0
        and len(y_val) != 0
    )
    X_eval, y_eval = (X_val, y_val) if has_validation else (X_train, y_train)

    best_index = {key: 0 for key in hyperparameters}
    best_model = None
    best_parameters = None
    best_score = float("inf")
    for key, items in hyperparameters.items():
        print(f"Find best parameter for {key}")
        best_parameter = items[best_index[key]]
        for key_index, item in enumerate(items):
            # Current candidate for `key`, best-known value for everything else.
            params = {
                name: item if name == key else values[best_index[name]]
                for name, values in hyperparameters.items()
            }
            print(f"Training with {params}")
            model = xgboost.XGBRegressor(**params)
            model.fit(X_train, y_train, verbose=False)
            score = evaluate(model, X_eval, y_eval)["rmlse"]
            # A NaN score never compares as smaller, so it is skipped here.
            if score < best_score:
                best_score = score
                best_index[key] = key_index
                best_parameter = item
                best_model = model
                best_parameters = params
        print(f"Best Parameter for {key}: ", best_parameter)
    return best_model, best_score, best_parameters

In [None]:
def split_data(X, y, strategy, n_splits=5, random_state=None):
    """Yield ``(index, X_train, y_train, X_val, y_val)`` tuples for a strategy.

    Args:
        X: Feature frame; indexed with ``.iloc`` for the fold-based strategies.
        y: Target series aligned with ``X`` by position.
        strategy: One of
            ``"full"`` - a single tuple with all data for training and empty
            lists as validation data;
            ``"kfold"`` - one tuple per shuffled K-fold split;
            ``"train_validation_split"`` - only the first K-fold split.
            Any other value yields nothing.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed passed to ``KFold``; set it for reproducible folds.

    Yields:
        ``(index, X_train, y_train, X_val, y_val)``.
    """
    if strategy not in ("full", "kfold", "train_validation_split"):
        return
    if strategy == "full":
        yield (0, X, y, [], [])
        # Stop here: falling through would also yield the first K-fold split
        # under the same index 0.
        return
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for index, (train_indices, valid_indices) in enumerate(folds.split(X)):
        yield (
            index,
            X.iloc[train_indices],
            y.iloc[train_indices],
            X.iloc[valid_indices],
            y.iloc[valid_indices],
        )
        if strategy != "kfold":
            # "train_validation_split" only uses the first fold.
            break

In [None]:
# Candidate values for each tuned XGBRegressor argument.
parameters = {
    "max_depth": [*range(4, 10)],
    "learning_rate": [*np.linspace(0.03, 0.15, 13)],
    "booster": ["gbtree", "gblinear", "dart"],
}
# Run the search once per data split, write one submission file per split and
# collect the winning models.
models = []
for strategy in ("full", "kfold"):
    for index, X_train, y_train, X_val, y_val in split_data(X, y, strategy):
        begin = time.time()
        best_model, best_score, best_parameters = train_with_xgboost(
            parameters, X_train, y_train, X_val, y_val
        )
        print("Best RMLSE: ", best_score)
        print("Best Parameters: ", best_parameters)
        elapsed = time.time() - begin
        print("Elapsed time: ", elapsed)
        submit(best_model, test_features, test_ids, f"submission_{strategy}_{index}.csv")
        models += [best_model]

In [None]:
# Average the test-set predictions of all collected models element-wise and
# write the result as the final submission.
SalePrice = np.stack([model.predict(test_features) for model in models]).mean(axis=0)
submission = pd.DataFrame({"Id": test_ids, "SalePrice": SalePrice})
submission.to_csv("submission.csv", index=False)


## If you found my work useful, please give me an upvote, thanks.