# ***[House Prices] XGBRegressor***

<img src="https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/suburban-house-royalty-free-image-1584972559.jpg" width="500">

# Import train_data and test_data

### Since the dataset has many columns, configure pandas to display all of them

In [None]:
import pandas as pd
# Show every column when a DataFrame is displayed: None removes the column
# limit, so wide frames are never truncated with "...".
# (The former pd.get_option(...) call was dropped: its return value was
# discarded, so it had no effect.)
pd.set_option('display.max_columns', None)

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = "../input/house-prices-advanced-regression-techniques/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

# EDA on Numerical Columns

### First, let's look at the distribution of numerical columns.

In [None]:
# Keep only the int64/float64 columns of the training data for numeric EDA.
numeric_dtypes = ['int64', 'float64']
train_num_df = train_df.select_dtypes(include=numeric_dtypes)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of every numeric column except the row
# identifier and the target.
hist_features = [col for col in train_num_df.columns
                 if col not in ['Id', 'SalePrice']]

# plt.figure is used instead of plt.subplots: the latter also creates one
# full-figure Axes, which would remain underneath the grid that
# plt.subplot builds below.
fig = plt.figure(figsize=(12, 36))
for position, feature in enumerate(hist_features, start=1):
    plt.subplot(13, 3, position)
    sns.histplot(train_df[feature], kde=True)
# Adjust the layout once, after all panels exist, rather than on every iteration.
plt.tight_layout()

### Next, visualize the effect of each numerical column on Sale Price.

In [None]:
# Scatter plot of SalePrice against every numeric column except the row
# identifier and the target itself.
scatter_features = [col for col in train_num_df.columns
                    if col not in ['Id', 'SalePrice']]

# plt.figure is used instead of plt.subplots: the latter also creates one
# full-figure Axes, which would remain underneath the grid that
# plt.subplot builds below.
fig = plt.figure(figsize=(12, 36))
for position, feature in enumerate(scatter_features, start=1):
    plt.subplot(13, 3, position)
    sns.scatterplot(x=train_df[feature], y=train_df['SalePrice'])
# Adjust the layout once, after all panels exist, rather than on every iteration.
plt.tight_layout()

### MSSubClass, OverallQual, OverallCond, the bathroom-related features, TotRmsAbvGrd, MoSold, YrSold, GarageCars, and KitchenAbvGr take discrete values

In [None]:
# Bar plot of SalePrice for each value of the selected discrete-valued features.
discrete_features = ['MSSubClass', 'OverallQual', 'OverallCond', 'MoSold', 'YrSold',
                     'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
                     'TotRmsAbvGrd', 'Fireplaces']

# plt.figure is used instead of plt.subplots: the latter also creates one
# full-figure Axes, which would remain underneath the grid that
# plt.subplot builds below.
fig = plt.figure(figsize=(12, 15))
for position, feature in enumerate(discrete_features, start=1):
    plt.subplot(6, 3, position)
    sns.barplot(x=train_df[feature], y=train_df['SalePrice'])
# Adjust the layout once, after all panels exist, rather than on every iteration.
plt.tight_layout()

# EDA on Categorical Columns

In [None]:
# Object-dtype (categorical) columns of the training data.
train_cat_df = train_df.select_dtypes(include=['object'])

# One horizontal box plot per categorical column: SalePrice distribution per category.
# plt.figure is used instead of plt.subplots: the latter also creates one
# full-figure Axes, which would remain underneath the grid that
# plt.subplot builds below.
fig = plt.figure(figsize=(12, 60))
for position, feature in enumerate(train_cat_df.columns, start=1):
    plt.subplot(15, 3, position)
    sns.boxplot(x=train_df['SalePrice'], y=train_df[feature])
# Adjust the layout once, after all panels exist, rather than on every iteration.
plt.tight_layout()

# Visualize the correlation of each column

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = train_num_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=True, ax=heatmap_ax)
plt.show()

* 'GarageArea', 'GarageCars'
* 'GarageYrBlt', 'YearBuilt'
* 'TotRmsAbvGrd', 'GrLivArea'
* '1stFlrSF', 'TotalBsmtSF'

The two features in each of these pairs are highly correlated with each other, so one feature from each pair can be dropped.

# Data Preprocessing

Submissions are evaluated on the Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sale price. So, convert the SalePrice column to its logarithm before training.

In [None]:
import numpy as np
# Replace SalePrice with log(1 + SalePrice).
# NOTE: re-running this cell applies the transform a second time.
log_sale_price = np.log1p(train_df['SalePrice'])
train_df['SalePrice'] = log_sale_price

Delete unnecessary columns

In [None]:
# Keep the target aside before it is removed from the feature frame.
y_train = train_df['SalePrice']

# Columns removed from both frames: the row identifier plus one feature from
# each highly correlated pair.
dropped_features = ['Id', 'GarageArea', 'GarageYrBlt', 'TotRmsAbvGrd', '1stFlrSF']

train_df = train_df.drop(columns=dropped_features + ['SalePrice'])
test_df = test_df.drop(columns=dropped_features)

In [None]:
# Number of missing values in each training column.
train_df.isna().sum()

In [None]:
# Number of missing values in each test column.
test_df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

features_to_encode = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                     'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                     'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                     'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
                     'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
                     'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
                     'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Encode train and test with ONE encoder per feature. Fitting a separate
# encoder on the test set would map the same category to different integers
# in the two frames whenever their category sets differ, so the model would
# see inconsistent codes at prediction time.
label_encoders = {}  # feature name -> fitted LabelEncoder, kept for later inspection
for feature in features_to_encode:
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection, which acts on a temporary object and is not
    # guaranteed to update the frame.
    train_df[feature] = train_df[feature].fillna('miss')
    test_df[feature] = test_df[feature].fillna('miss')

    # Fit on the categories of both frames so that values occurring only in
    # the test set (including the 'miss' placeholder) can still be transformed.
    le = LabelEncoder()
    le.fit(pd.concat([train_df[feature], test_df[feature]], ignore_index=True))

    train_df[feature] = le.transform(train_df[feature])
    test_df[feature] = le.transform(test_df[feature])
    label_encoders[feature] = le

In [None]:
# Preview the first five rows of the encoded training features.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the encoded test features.
test_df.head(n=5)

# Define XGB model and check the validation score

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    The score is computed on ``train_df`` / ``y_train``; since ``y_train``
    holds the log1p-transformed sale price, the RMSE is measured in log space.

    Parameters
    ----------
    model : estimator
        Unfitted estimator accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` entries).
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it returns
    # only the integer number of folds, which would make cross_val_score build
    # its own unshuffled split and silently drop shuffle/random_state.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_df.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [None]:
from xgboost import XGBRegressor
# Hyper-parameters of the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=4,
    subsample=0.7,
    random_state=0,
)
model_xgb = XGBRegressor(**xgb_params)

In [None]:
# Cross-validate the model and report the mean and spread of the fold scores.
score = rmsle_cv(model=model_xgb)
print(f'\nmodel_xgb score : {score.mean()} ({score.std()})')

#  Train the model and create submission data

In [None]:
# Fit on the full training set, then predict the test set.
model_xgb.fit(train_df, y_train)
# The model was trained on log1p(SalePrice), so expm1 maps its predictions
# back to the original price scale.
log_price_pred = model_xgb.predict(test_df)
xgb_pred = np.expm1(log_price_pred)

In [None]:
# Start from the sample submission file and overwrite its SalePrice column
# with the model predictions.
sample_submission_path = '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission['SalePrice'] = xgb_pred
# Round the predictions and store them as integers.
rounded_price = submission['SalePrice'].round().astype(int)
submission['SalePrice'] = rounded_price
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final visual check of the predictions.
submission