In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Directory (with trailing slash) that the CSV file names are appended to below.
PATH = "/kaggle/input/house-prices-advanced-regression-techniques/"

In [None]:
# Read the training data, the test data and the sample submission from PATH.
train_df, test_df, sub_df = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Report (rows, columns) of both frames.
for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    print(f"{frame_name} shape ::  {frame.shape}")

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

House prices tend to be related to a house's age and location.<br>
Let's check the average 'YearBuilt' for each 'MSZoning' category.<br>

In [None]:
# Mean construction year within each zoning class.
grouped = train_df.groupby('MSZoning')['YearBuilt']
print(grouped.mean())

Clearly, the houses in category 'FV' were built recently. If 'MSZoning' contained null values, we could fill them by referring to the mean 'YearBuilt' of each 'MSZoning' category, but since there are no nulls here, let's move on to another task.

In [None]:
# Mean sale price within each zoning class.
grouped = train_df.groupby('MSZoning')['SalePrice']
print(grouped.mean())

We have 80 columns plus 1 label column, which is quite a lot.<br>
Which column is our target?

In [None]:
# Any training column that the test frame lacks is reported as the target.
test_columns = set(test_df.columns)
for col in train_df.columns:
    if col in test_columns:
        continue
    print(col, "is the target column")

In [None]:
# Distribution of the raw target.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot; migrate
# once the installed seaborn version is confirmed to provide them.
sns.distplot(train_df['SalePrice'])

Thus, our task is to predict 'SalePrice'. The distribution of 'SalePrice' looks a little skewed. We will make the distribution more symmetric with the 'np.log1p' function later.<br><br>
For now, what features do we have?

### Data Fields 
- **SalePrice:** the property's sale price in dollars<br>
- **MSSubClass:** the building class<br>
- **MSZoning:** the general zoning classification<br>
- **LotFrontage:** Linear feet of street connected to property<br>
- **LotArea:** Lot size in square feet<br>
- **Street:** Type of road access<br>
- **Alley:** Type of alley access<br>
- **LotShape:** General shape of property<br>
- **LandContour:** Flatness of the property<br>
- **Utilities:** Type of utilities available<br>
- **LotConfig:** Lot configuration<br>
- **LandSlope:** Slope of property<br>
- **Neighborhood:** Physical locations within Ames city limits<br>
- **Condition1:** Proximity to main road or railroad<br>
- **Condition2:** Proximity to main road or railroad (if a second is present)<br>
- **BldgType:** Type of dwelling<br>
- **HouseStyle:** Style of dwelling<br>
- **OverallQual:** Overall material and finish quality<br>
- **OverallCond:** Overall condition rating<br>
- **YearBuilt:** Original construction date<br>
- **YearRemodAdd:** Remodel date<br>
- **RoofStyle:** Type of roof<br>
- **RoofMatl:** Roof material<br>
- **Exterior1st:** Exterior covering on house<br>
- **Exterior2nd:** Exterior covering on house (if more than one material)<br>
- **MasVnrType:** Masonry veneer type<br>
- **MasVnrArea:** Masonry veneer area in square feet<br>
- **ExterQual:** Exterior material quality<br>
- **ExterCond:** Present condition of the material on the exterior<br>
- **Foundation:** Type of foundation<br>
- **BsmtQual:** Height of the basement<br>
- **BsmtCond:** General condition of the basement<br>
- **BsmtExposure:** Walkout or garden level basement walls<br>
- **BsmtFinType1:** Quality of basement finished area<br>
- **BsmtFinSF1:** Type 1 finished square feet<br>
- **BsmtFinType2:** Quality of second finished area (if present)<br>
- **BsmtFinSF2:** Type 2 finished square feet<br>
- **BsmtUnfSF:** Unfinished square feet of basement area<br>
- **TotalBsmtSF:** Total square feet of basement area<br>
- **Heating:** Type of heating<br>
- **HeatingQC:** Heating quality and condition<br>
- **CentralAir:** Central air conditioning<br>
- **Electrical:** Electrical system<br>
- **1stFlrSF:** First Floor square feet<br>
- **2ndFlrSF:** Second floor square feet<br>
- **LowQualFinSF:** Low quality finished square feet (all floors)<br>
- **GrLivArea:** Above grade (ground) living area square feet<br>
- **BsmtFullBath:** Basement full bathrooms<br>
- **BsmtHalfBath:** Basement half bathrooms<br>
- **FullBath:** Full bathrooms above grade<br>
- **HalfBath:** Half baths above grade<br>
- **BedroomAbvGr:** Number of bedrooms above basement level<br>
- **Kitchen:** Number of kitchens<br>
- **KitchenQual:** Kitchen quality<br>
- **TotRmsAbvGrd:** Total rooms above grade (does not include bathrooms)<br>
- **Functional:** Home functionality rating<br>
- **Fireplaces:** Number of fireplaces<br>
- **FireplaceQu:** Fireplace quality<br>
- **GarageType:** Garage location<br>
- **GarageYrBlt:** Year garage was built<br>
- **GarageFinish:** Interior finish of the garage<br>
- **GarageCars:** Size of garage in car capacity<br>
- **GarageArea:** Size of garage in square feet<br>
- **GarageQual:** Garage quality<br>
- **GarageCond:** Garage condition<br>
- **PavedDrive:** Paved driveway<br>
- **WoodDeckSF:** Wood deck area in square feet<br>
- **OpenPorchSF:** Open porch area in square feet<br>
- **EnclosedPorch:** Enclosed porch area in square feet<br>
- **3SsnPorch:** Three season porch area in square feet<br>
- **ScreenPorch:** Screen porch area in square feet<br>
- **PoolArea:** Pool area in square feet<br>
- **PoolQC:** Pool quality<br>
- **Fence:** Fence quality<br>
- **MiscFeature:** Miscellaneous feature not covered in other categories<br>
- **MiscVal:** Value of miscellaneous feature<br>
- **MoSold:** Month Sold<br>
- **YrSold:** Year Sold<br>
- **SaleType:** Type of sale<br>
- **SaleCondition:** Condition of sale<br><br><br>

Are these features crucial for predicting the SalePrice? I'm not sure.<br>
But more data is better than less. Let's preprocess them!<br>

### 1. Preprocessing
Let's check out the type of each column.<br>
Do the train and test data have different distributions? How many nulls are in each column? Can we fill the null values by referring to another column? Are there any duplicated rows?

In [None]:
# Distribution of OverallQual in the training frame, to compare with the test frame.
# NOTE(review): distplot is deprecated in seaborn; histplot/displot are the replacements.
sns.distplot(train_df['OverallQual'])

In [None]:
# Distribution of OverallQual in the test frame, to compare with the training frame.
# NOTE(review): distplot is deprecated in seaborn; histplot/displot are the replacements.
sns.distplot(test_df['OverallQual'])

In [None]:
# Count nulls once for the whole frame, then report only the columns that have any.
null_counts = train_df.isnull().sum()
for col, n_null in null_counts[null_counts > 0].items():
    print("[[",col,']] column has >>', n_null, '<< nulls\n')

Let's drop the features that have hundreds or thousands of nulls. 

In [None]:
# Columns removed from both frames because they contain hundreds to thousands of nulls.
null_dominant_cols = ["Alley","PoolQC","Fence","MiscFeature","FireplaceQu"]
# Assign the result instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=null_dominant_cols, errors="ignore")
test_df = test_df.drop(columns=null_dominant_cols, errors="ignore")

In [None]:
def preprocess(df):
    """Impute missing values and integer-encode the categorical columns of a house frame.

    The frame is modified in place and also returned.

    Steps, in order:
      1. Missing entries in the columns of ``none_fill_cols`` become the level ``'None'``.
      2. ``LotArea`` is replaced by ``log1p(LotArea)``.
      3. ``LotFrontage`` gaps take the column mean of this frame, ``MasVnrArea`` and the
         basement-area gaps take 0, ``GarageYrBlt`` gaps take the row's ``YearBuilt``.
      4. Every column in ``categorical_cols`` is replaced by integer codes ``0..k-1``
         assigned in sorted order of the levels present in this frame (a value that is
         still missing gets ``-1``).

    Numeric columns (``LotArea``, ``BsmtFinSF1``, ``BsmtFinSF2``, ``BsmtUnfSF``,
    ``TotalBsmtSF``, ``YearRemodAdd``) are deliberately NOT label-encoded: encoding them
    would throw away the log transform of ``LotArea`` and would break any later arithmetic
    on years or areas.

    NOTE: the codes depend on which levels occur in ``df``. Two frames encoded by
    separate calls only share a coding if they contain the same set of levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in this function.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    categorical_cols = ["LotConfig","LandSlope","Neighborhood","Condition1",
                        "Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl",
                        "Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond",
                        "Foundation","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
                        "BsmtFinType2","Heating","HeatingQC","CentralAir","Utilities",
                        "Electrical","KitchenQual","Functional","PavedDrive","SaleType",
                        "GarageType","GarageFinish","GarageQual","GarageCond",
                        "SaleCondition","MSZoning","LotShape","Street","LandContour"]

    # Categorical columns whose missing entries are turned into an explicit 'None' level.
    none_fill_cols = ["MasVnrType","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
                      "BsmtFinType2","Electrical","GarageType","GarageFinish","GarageQual",
                      "GarageCond","Exterior1st","Exterior2nd","Utilities","KitchenQual",
                      "Functional","SaleType","MSZoning"]

    # Assign the filled column back instead of calling fillna(inplace=True) on df[col]:
    # the chained in-place form does not update the frame under pandas copy-on-write.
    for col in none_fill_cols:
        df[col] = df[col].fillna('None')

    # The mean value of 'LotArea' is too large compared to other values.
    # Thus, with 'np.log1p', the absolute values of 'LotArea' will be harmonized with other features
    df['LotArea'] = np.log1p(df['LotArea'])

    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])
    # These columns used to be label-encoded, which hid their gaps behind a code.
    # Presumably a missing basement area means "no basement" -> 0; verify against the data.
    for col in ("BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"):
        df[col] = df[col].fillna(0)

    # Sorted integer codes per column (same 0..k-1 ordering a LabelEncoder fit on the
    # column would give), cast to int64 so the dtype does not depend on the level count.
    for col in categorical_cols:
        df[col] = df[col].astype("category").cat.codes.astype("int64")

    df['MasVnrArea'] = df['MasVnrArea'].astype(int)
    return df

In [None]:
# Run preprocess() once on train and test stacked together. It derives the integer codes
# from the levels present in the frame it receives, so encoding the two frames in
# separate calls can give the same category different codes in train and in test.
# NOTE: the LotFrontage fill value is consequently the mean over both frames, and
# train-only columns (the target) become float because the test rows hold NaN there.
n_train_rows = len(train_df)
train_only_cols = [col for col in train_df.columns if col not in test_df.columns]
combined_df = preprocess(pd.concat([train_df, test_df], ignore_index=True, sort=False))
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = (combined_df.iloc[n_train_rows:]
           .drop(columns=train_only_cols)
           .reset_index(drop=True))

"Alley","PoolQC","Fence" and "MiscFeature" have over a thousand null values, and  "LotFrontage","FireplaceQu" have hundreds of null values. Others columns have relatively small numbers of null values.<br><br>
The code below checks for duplicated rows in each dataframe, and we can see that there are no duplicated rows here.

In [None]:
print("former shape: ", train_df.shape, " test : ", test_df.shape)
# drop_duplicates() returns a new frame; without assigning it back nothing is removed.
train_df = train_df.drop_duplicates().reset_index(drop=True)
# Duplicated test rows are only counted, not dropped: the submission frame later
# receives exactly one prediction per row of test_df.
print("duplicated test rows: ", test_df.duplicated().sum())
print("latter shape: ", train_df.shape, " test : ", test_df.shape)

### 2. Feature Engineering

How do we check a valuable feature for helping price prediction?

In [None]:
# Rank the columns by their correlation with SalePrice, keep the 40 largest, and draw
# the correlation matrix of that subset as an annotated heatmap.
train_corrmatrix = train_df.corr()
top_by_price = train_corrmatrix.nlargest(40, "SalePrice")
cols = top_by_price['SalePrice'].index
cm = np.corrcoef(train_df[cols].values.T)

plt.figure(figsize=(50,50))
sns.set(font_scale=2)
heatmap_kwargs = dict(cbar=True, linewidths=2, vmax=.9, square=True, annot=True,
                      fmt=".2f", annot_kws={"size":17},
                      yticklabels=cols.values, xticklabels=cols.values)
sns.heatmap(cm, **heatmap_kwargs)
plt.show()

Since this is a SIMPLE EDA, we will remove the features that have an effect of under 0.02 on 'SalePrice'.<br><br> Note that this could hurt the performance of the model, because a heatmap ONLY checks the linear relationships between the features, i.e. it cannot capture non-linear relationships between them.<br><br>
Many valuable models are non-linear, so the heatmap alone cannot tell us which features would be effective in those models.

In [None]:
# `cols` comes from the correlation cell. YrSold is needed by the age features computed
# later, so make sure it is present exactly once: an unconditional extend() appended it
# again on every re-run of this cell, producing a duplicated column.
cols = list(cols)
if "YrSold" not in cols:
    cols.append("YrSold")
# The test frame has no target column.
test_cols = [col for col in cols if col != "SalePrice"]

print('Choosed features : ', cols)

In [None]:
# Take explicit copies: later cells add and overwrite columns on these frames, and
# writing into a selection of another frame is what triggers pandas'
# SettingWithCopyWarning (silenced by the global warnings filter in this notebook).
train_df = train_df[cols].copy()
test_df = test_df[test_cols].copy()

In [None]:
def add_features(df):
    """Append five engineered columns to ``df`` in place and return the same frame.

    Added columns, in this order:
      - ``house_age1``: ``YrSold - YearBuilt``
      - ``house_age2``: ``YrSold - YearRemodAdd``
      - ``garage_age``: ``YrSold - GarageYrBlt``
      - ``total_area``: ``log1p(GrLivArea + TotalBsmtSF + 1stFlrSF + 2ndFlrSF)``
      - ``num_rooms``: ``TotRmsAbvGrd + BedroomAbvGr + FullBath``
    """
    sale_year = df['YrSold']
    year_sources = (('house_age1', 'YearBuilt'),
                    ('house_age2', 'YearRemodAdd'),
                    ('garage_age', 'GarageYrBlt'))
    for new_col, year_col in year_sources:
        df[new_col] = sale_year - df[year_col]

    # Plain additions (not .sum(axis=1)) so that a NaN in any part stays NaN.
    area_sum = df['GrLivArea'] + df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['total_area'] = np.log1p(area_sum)

    room_count = df['TotRmsAbvGrd'] + df['BedroomAbvGr'] + df['FullBath']
    df['num_rooms'] = room_count
    return df

In [None]:
# Add the engineered columns to both frames so train and test keep the same feature set.
train_df = add_features(train_df)
test_df = add_features(test_df)

In [None]:
# Recode the sale year as a small integer: 2006 -> 0, ..., 2010 -> 4. Values that are
# not keys of the mapping are left as they are by replace().
yr_sold_codes = {2006: 0, 2007: 1, 2008: 2, 2009: 3, 2010: 4}
train_df['YrSold'] = train_df['YrSold'].replace(yr_sold_codes)
test_df['YrSold'] = test_df['YrSold'].replace(yr_sold_codes)

In [None]:
# Inspect the training frame after column selection, feature engineering and the
# YrSold recoding.
train_df.head()

In [None]:
# Distribution of the target before the log transform applied in the next code cell.
# NOTE(review): distplot is deprecated in seaborn; histplot/displot are the replacements.
sns.distplot(train_df['SalePrice'])

The above plot looks skewed. Let's log-transform the target so that its distribution is closer to a Gaussian.

In [None]:
# The models are trained on log1p(SalePrice); pop() removes the target column from the
# feature frame and hands it over in one step.
train_label = np.log1p(train_df.pop('SalePrice'))

In [None]:
# Confirm that SalePrice is no longer among the feature columns.
train_df.head()

### 3. Build Models

#### 3.1 Simple XGBoost

In [None]:
# Hyperparameters for the XGBoost regressor created in the next cell.
# NOTE(review): max_depth=40 combined with 5000 trees is a very large model and only the
# training error is measured below — confirm these values against a validation split.
xgb_param = {'learning_rate': 0.03,
             'max_depth': 40,
             'verbosity': 3,  # NOTE(review): 3 is XGBoost's debug level; expect very long logs
             'nthread': 5,
             'random_state': 0,
             'subsample': 0.7,
             'n_estimators': 5000,
             'colsample_bytree': 0.8}

In [None]:
# Every key of xgb_param is passed to XGBRegressor under the same name, so the dict is
# unpacked directly instead of being spelled out keyword by keyword.
model_xgb = xgb.XGBRegressor(**xgb_param)

In [None]:
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    The callers in this notebook pass log1p-transformed prices, which is what makes the
    result a root-mean-squared *log* error.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


# Fit on the whole training set, then predict both frames. Test predictions are mapped
# back from the log scale to prices with expm1.
model_xgb.fit(train_df, train_label)
xgb_train_pred = model_xgb.predict(train_df)
xgb_test_log_pred = model_xgb.predict(test_df)
xgb_test_pred = np.expm1(xgb_test_log_pred)

# Error on the same rows the model was fitted on (not a validation score).
print(rmsle(train_label, xgb_train_pred))

In [None]:
# Put the XGBoost predictions (already back on the price scale) into the sample
# submission frame and write it out without the index column.
sub_df['SalePrice'] = xgb_test_pred
sub_df.to_csv('xgb_submission.csv', index=False)

#### 3.2 Ensemble XGB + LGBM 

In [None]:
# Hyperparameters for the LightGBM regressor created in the next cell.
# NOTE(review): num_leaves and max_depth of 5000 are far larger than typical values —
# confirm they are intended and validate them on held-out data.
lgb_param = {'num_leaves': 5000,
             'objective':'huber',
             'max_depth': 5000,
             'learning_rate': 0.05,
             "bagging_fraction": 0.7,
             "bagging_seed": 11,
             "metric": 'rmse',
             "verbosity": -1,
             "random_state": 0}

In [None]:
# Unpack the whole dict so the model and lgb_param cannot drift apart: the keyword list
# that was spelled out here before forwarded 'bagging_seed' but left out
# 'bagging_fraction'.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect when
# bagging_freq > 0, and lgb_param does not set bagging_freq — confirm whether row
# subsampling is actually wanted.
model_lgb = lgb.LGBMRegressor(**lgb_param)

In [None]:
# Fit on the whole training set, then predict both frames. Test predictions are mapped
# back from the log scale to prices with expm1.
model_lgb.fit(train_df, train_label)
lgb_train_pred = model_lgb.predict(train_df)
lgb_test_log_pred = model_lgb.predict(test_df)
lgb_test_pred = np.expm1(lgb_test_log_pred)

# Error on the same rows the model was fitted on (not a validation score).
print(rmsle(train_label, lgb_train_pred))

In [None]:
# Overwrite the SalePrice column of the submission frame with the LightGBM predictions
# (already back on the price scale) and write a second submission file.
sub_df['SalePrice'] = lgb_test_pred
sub_df.to_csv('lgb_submission.csv', index=False)

#### 3.3 Stacking (will be updated)

### 4. Feature Importances

In [None]:
# Call plot_importance through the `xgb` module alias imported at the top of the
# notebook. Importing the bare name in this cell collides with lightgbm's function of the
# same name, so which one runs would depend on the order the cells were executed in.
fig, ax = plt.subplots(figsize=(50,50))
xgb.plot_importance(model_xgb, ax=ax)
plt.show()

In [None]:
# Call plot_importance through the `lgb` module alias imported at the top of the
# notebook. Importing the bare name in this cell would shadow xgboost's function of the
# same name for every cell run afterwards.
fig, ax = plt.subplots(figsize=(50,50))
lgb.plot_importance(model_lgb, ax=ax)
plt.show()