In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data preprocessing and feature engineering

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 1-1 Check data distribution

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [None]:
# Show the freshly loaded training frame as the cell output.
train_df

In [None]:
# Print the column summary of the training frame.
train_df.info()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Show the freshly loaded test frame as the cell output.
test_df

In [None]:
# Print the column summary of the test frame.
test_df.info()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Separate the target from the features; 'Id' is dropped from both feature frames.
y_train = train_df['SalePrice']
x_train = train_df.drop(columns=['Id', 'SalePrice'])
x_test = test_df.drop(columns='Id')

### 1-2 NULL, NaN value processing

In [None]:
# Count missing values per column and report only the columns that have any,
# largest count first.
isnull_series = train_df.isnull().sum()
columns_with_nulls = isnull_series[isnull_series > 0]

print('\n NULL column and number of them \n',
      columns_with_nulls.sort_values(ascending=False))

In [None]:
# Remove columns with high NULL values.
# They are dropped from both splits so that train and test keep the same
# feature set; errors='ignore' keeps the cell re-runnable once they are gone.
high_null_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train_df.drop(columns=high_null_cols, errors='ignore', inplace=True)
test_df.drop(columns=high_null_cols, errors='ignore', inplace=True)

# Replace the remaining numeric NULLs with the *training* column means.
# The means are computed on numeric columns only: DataFrame.mean() raises on
# the string columns in pandas >= 2.0. The same training means are applied to
# the test split (keys that are not columns of a frame are ignored by fillna).
numeric_means = train_df.select_dtypes(include='number').mean()
train_df.fillna(numeric_means, inplace=True)
test_df.fillna(numeric_means, inplace=True)


### 1-3 Check the correlation between features

In [None]:
# target: SalePrice (original scale)
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the equivalent density-normalised histogram with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(train_df['SalePrice'], kde=True, stat='density', ax=ax)
ax.set_title('Original Sale Price')
ax.set_ylabel('Density', fontsize=13)
ax.set_xlabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# Log-transform the target.
# The transform is computed from the raw prices kept in y_train (defined in the
# feature/target split cell) instead of from train_df['SalePrice'] itself, so
# re-running this cell cannot apply log1p twice. Assignment aligns on the index.
train_df["SalePrice"] = np.log1p(y_train)

# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# equivalent density histogram with a KDE curve.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(train_df['SalePrice'], kde=True, stat='density', ax=ax)
ax.set_title('Log-transformed Sale Price')
ax.set_ylabel('Density', fontsize=13)
ax.set_xlabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0.
corr_matrix = train_df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, vmax=0.9, annot=True, cmap="coolwarm", ax=ax)
plt.show()

In [None]:
# most correlated features
# Only numeric columns are correlated: DataFrame.corr() raises on string
# columns in pandas >= 2.0.
corr_matrix = train_df.select_dtypes(include='number').corr()

# Features whose absolute correlation with the target exceeds 0.5
# (this includes 'SalePrice' itself).
top_corr_features = corr_matrix.index[corr_matrix["SalePrice"].abs() > 0.5]
fig, ax = plt.subplots(figsize=(10, 10))

g = sns.heatmap(train_df[top_corr_features].corr(),
                annot=True,
                cmap="coolwarm",
                ax=ax)


In [None]:
# Check the correlation between each numeric feature and the target

print("features highly correlated with the target")

# Only numeric columns are correlated: DataFrame.corr() raises on string
# columns in pandas >= 2.0.
corr_matrix = train_df.select_dtypes(include='number').corr()

# Sort in descending order (strongest positive correlation first)
corr_matrix["SalePrice"].sort_values(axis=0, ascending=False)

In [None]:
# Only numeric columns are correlated: DataFrame.corr() raises on string
# columns in pandas >= 2.0.
corr_matrix = train_df.select_dtypes(include='number').corr()

# Sort in ascending order (most negative correlation first)
corr_matrix["SalePrice"].sort_values(axis=0, ascending=True)

In [None]:
# Correlation of every feature with the target, selected by label instead of
# assuming 'SalePrice' is the last row of corr_matrix.
saleprice_corr = corr_matrix["SalePrice"]

# Single-row frame kept under its original name for any later use.
corr_matrix_2 = saleprice_corr.to_frame().T

# Features whose absolute correlation with SalePrice is at most 0.15.
# Vectorised selection avoids the deprecated positional `series[0]` lookup
# on a label-indexed Series.
throw_away_col = saleprice_corr.index[saleprice_corr.abs() <= 0.15].tolist()
print(throw_away_col)

In [None]:
# Remove features lower than 0.15
# The same column list is dropped in place from both the training and test frames.
low_corr_cols = [
    'Id', 'MSSubClass', 'OverallCond', 'BsmtFinSF2', 'LowQualFinSF',
    'BsmtHalfBath', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

for frame in (train_df, test_df):
    frame.drop(columns=low_corr_cols, inplace=True)


In [None]:
# Preview the first five rows of the reduced training frame.
train_df.head()

In [None]:
# Preview the first five rows of the reduced test frame.
test_df.head()

### 1-4 Remove Outliers for 7 Features

#### The 7 features most strongly positively correlated with "SalePrice"

OverallQual      0.817185

GrLivArea        0.700927

GarageCars       0.680625

GarageArea       0.650888

TotalBsmtSF      0.612134

1stFlrSF         0.596981

FullBath         0.594771

In [None]:
# ① OverallQual
# Scatter of the feature against the target to look for outliers.
fig, ax = plt.subplots()
ax.scatter(train_df['OverallQual'], train_df['SalePrice'], alpha=0.3)
ax.set_xlabel('OverallQual', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# ② GrLivArea
# Scatter of the feature against the target to look for outliers.
fig, ax = plt.subplots()
ax.scatter(train_df['GrLivArea'], train_df['SalePrice'], alpha=0.3)
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# Remove outliers from 'GrLivArea'.
# train_df['SalePrice'] is already log1p-transformed here, so the previous
# `SalePrice > 2.65` test was true for any price above about $13 and, combined
# with `|`, dropped every training row.
# NOTE(review): 2.65 looks like a value read off a plot on which log1p had been
# applied twice; it is converted to the single-log1p scale here
# (expm1(2.65) ~= 13.15, a sale price of roughly $0.5M). Confirm the intended cap.
log_price_cap = np.expm1(2.65)
is_outlier = (train_df['GrLivArea'] > 3500) | (train_df['SalePrice'] > log_price_cap)
train_df = train_df.drop(index=train_df.index[is_outlier])

In [None]:
# ③ GarageCars
# Scatter of the feature against the target to look for outliers.

fig, ax = plt.subplots()
ax.scatter(train_df['GarageCars'], train_df['SalePrice'], alpha=0.3)

ax.set_xlabel('GarageCars', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

In [None]:
# ④ GarageArea
# Scatter of the feature against the target to look for outliers.

fig, ax = plt.subplots()
ax.scatter(train_df['GarageArea'], train_df['SalePrice'], alpha=0.3)
ax.set_xlabel('GarageArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# Remove outliers from 'GarageArea'.
# train_df['SalePrice'] is already log1p-transformed here, so the previous
# `SalePrice > 2.65` test was true for any price above about $13 and, combined
# with `|`, dropped every training row.
# NOTE(review): 2.65 looks like a value read off a plot on which log1p had been
# applied twice; it is converted to the single-log1p scale here
# (expm1(2.65) ~= 13.15, a sale price of roughly $0.5M). Confirm the intended cap.
log_price_cap = np.expm1(2.65)
is_outlier = (train_df['GarageArea'] > 1200) | (train_df['SalePrice'] > log_price_cap)
train_df = train_df.drop(index=train_df.index[is_outlier])

In [None]:
# ⑤ TotalBsmtSF
# Scatter of the feature against the target to look for outliers.
fig, ax = plt.subplots()
ax.scatter(train_df['TotalBsmtSF'], train_df['SalePrice'], alpha=0.3)
ax.set_xlabel('TotalBsmtSF', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)

plt.show()

In [None]:
# Remove outliers from 'TotalBsmtSF'.
# train_df['SalePrice'] is already log1p-transformed here, so the previous
# `SalePrice > 2.65` test was true for any price above about $13 and, combined
# with `|`, dropped every training row.
# NOTE(review): 2.65 looks like a value read off a plot on which log1p had been
# applied twice; it is converted to the single-log1p scale here
# (expm1(2.65) ~= 13.15, a sale price of roughly $0.5M). Confirm the intended cap.
log_price_cap = np.expm1(2.65)
is_outlier = (train_df['TotalBsmtSF'] > 2500) | (train_df['SalePrice'] > log_price_cap)
train_df = train_df.drop(index=train_df.index[is_outlier])

In [None]:
# ⑥ 1stFlrSF
# Scatter of the feature against the target to look for outliers.

fig, ax = plt.subplots()
ax.scatter(train_df['1stFlrSF'], train_df['SalePrice'], alpha=0.3)
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('1stFlrSF', fontsize=13)

plt.show()

In [None]:
# Remove outliers from '1stFlrSF'.
# train_df['SalePrice'] is already log1p-transformed here, so the previous
# `SalePrice > 2.65` test was true for any price above about $13 and, combined
# with `|`, dropped every training row.
# NOTE(review): 2.65 looks like a value read off a plot on which log1p had been
# applied twice; it is converted to the single-log1p scale here
# (expm1(2.65) ~= 13.15, a sale price of roughly $0.5M). Confirm the intended cap.
log_price_cap = np.expm1(2.65)
is_outlier = (train_df['1stFlrSF'] > 2500) | (train_df['SalePrice'] > log_price_cap)
train_df = train_df.drop(index=train_df.index[is_outlier])

In [None]:
# ⑦ FullBath
# Scatter of the feature against the target to look for outliers.
fig, ax = plt.subplots()
ax.scatter(train_df['FullBath'], train_df['SalePrice'], alpha=0.3)
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('FullBath', fontsize=13)

plt.show()

In [None]:
# Report the dimensions of the training frame and how many columns it has of
# each dtype.
frame_shape = train_df.shape
dtype_counts = train_df.dtypes.value_counts()

print('data set shape', frame_shape)

print('\nfeature type\n', dtype_counts)

### 1-5 Data scaling

In [None]:
from sklearn.model_selection import KFold

# Shared 12-fold splitter; rows are shuffled with a fixed seed so the folds
# are the same on every run.
kf = KFold(
    n_splits=12,
    shuffle=True,
    random_state=42,
)

In [None]:
# Define error metrics
def rmsle(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    The target is log1p-transformed earlier in the notebook, so the plain RMSE
    computed here on those values is the RMSLE of the original prices.

    Implemented with NumPy because ``mean_squared_error`` was never imported
    in this notebook, so the previous body raised ``NameError``.

    Parameters
    ----------
    y, y_pred : array-like
        True and predicted values with the same number of samples.
        One-dimensional inputs are treated as a single output column.

    Returns
    -------
    float
        Square root of the mean squared difference over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Treat 1-D input as a column so (n,) and (n, 1) compare equal instead of
    # silently broadcasting to an (n, n) difference matrix.
    if actual.ndim <= 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim <= 1:
        predicted = predicted.reshape(-1, 1)

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y and y_pred have inconsistent shapes: {actual.shape} vs {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("y and y_pred must contain at least one sample")

    return np.sqrt(np.mean((actual - predicted) ** 2))

def cv_rmse(model, train_df=train_df, train_labels=None):
    """Return the per-fold cross-validated RMSE of ``model``.

    Fixes two ``NameError``s in the previous version: ``cross_val_score`` was
    never imported and ``train_labels`` was never defined.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    train_df : DataFrame or array-like
        Training data. The default is the ``train_df`` object that existed
        when this function was defined (default arguments are bound once).
    train_labels : array-like, optional
        Target values. When omitted, the 'SalePrice' column of ``train_df`` is
        used as the target and removed from the features, so the target is not
        fed to the model as an input. If ``train_df`` has no such column, a
        module-level ``train_labels`` variable is used when one exists.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold of the module-level ``kf`` splitter.

    Raises
    ------
    ValueError
        If no target can be determined.

    NOTE(review): no encoding of non-numeric columns is visible before this
    point; the features presumably still need to be made numeric before most
    estimators can be fitted - confirm.
    """
    # Local import: the notebook never imports cross_val_score.
    from sklearn.model_selection import cross_val_score

    features = train_df
    if train_labels is None:
        if "SalePrice" in getattr(features, "columns", ()):
            train_labels = features["SalePrice"]
            features = features.drop(columns="SalePrice")
        elif "train_labels" in globals():
            # Backward compatibility with a notebook-level `train_labels`.
            train_labels = globals()["train_labels"]
        else:
            raise ValueError(
                "train_labels was not given and train_df has no 'SalePrice' column"
            )

    neg_mse = cross_val_score(model, features, train_labels,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

# Lasso regression applied after RobustScaler, chained into one pipeline.
lasso_estimator = Lasso(alpha=0.0005, random_state=1)
lasso = make_pipeline(RobustScaler(), lasso_estimator)


# 2. Regression model training, prediction, and testing

Six regression models are defined below; the best-performing one will be selected from among them.

Prediction and evaluation with these models have not been implemented yet.

This part will be completed in the future.

### 2-1 Light GBM

In [None]:
# Light Gradient Boosting Regressor

from lightgbm import LGBMRegressor

# Hyper-parameters for the LightGBM regressor, collected in one mapping and
# unpacked into the constructor.
lightgbm_params = dict(
    objective='regression',
    num_leaves=6,
    learning_rate=0.01,
    n_estimators=7000,
    max_bin=200,
    bagging_fraction=0.8,
    bagging_freq=4,
    bagging_seed=8,
    feature_fraction=0.2,
    feature_fraction_seed=8,
    min_sum_hessian_in_leaf=11,
    verbose=-1,
    random_state=42,
)
lightgbm = LGBMRegressor(**lightgbm_params)


### 2-2 XGBoost Regressor

In [None]:
# XGBoost Regressor

from xgboost import XGBRegressor
# XGBoost regressor configuration.
# Changes from the previous version:
#   * objective 'reg:linear' is the deprecated name of 'reg:squarederror'
#   * nthread is the deprecated spelling of n_jobs in the scikit-learn wrapper
#   * seed=27 was passed together with random_state=42; `seed` is the
#     deprecated alias of `random_state`, so the two conflicted. Only
#     random_state=42 is kept, matching the other models in this notebook.
#     NOTE(review): on versions where `seed` took precedence this changes the
#     effective seed from 27 to 42 - confirm that is acceptable.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       n_jobs=-1,
                       scale_pos_weight=1,
                       reg_alpha=0.00006,
                       random_state=42)

### 2-3 Ridge regressor

In [None]:
# Ridge Regressor

from sklearn.linear_model import Ridge, RidgeCV

# Candidate regularisation strengths tried by RidgeCV.
ridge_alphas = [
    1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2,
    0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100,
]

# RidgeCV picks alpha using the shared `kf` folds; inputs are robust-scaled first.
ridge_cv = RidgeCV(alphas=ridge_alphas, cv=kf)
ridge = make_pipeline(RobustScaler(), ridge_cv)


### 2-4 Support Vector Regressor 


In [None]:
# Support Vector Regressor

from sklearn.svm import SVR

# Support vector regressor applied after RobustScaler, chained into one pipeline.
svr_estimator = SVR(C=20, epsilon=0.008, gamma=0.0003)
svr = make_pipeline(RobustScaler(), svr_estimator)

### 2-5 Gradient Boosting Regressor

In [None]:
#Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters for scikit-learn's gradient boosting regressor, collected
# in one mapping and unpacked into the constructor.
gbr_params = dict(
    n_estimators=6000,
    learning_rate=0.01,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params)

### 2-6 Random Forest Regressor

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters for the random forest regressor, collected in one mapping
# and unpacked into the constructor.
rf_params = dict(
    n_estimators=1200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features=None,
    oob_score=True,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)

In [None]:
from mlxtend.regressor import StackingCVRegressor

# Stack the six models defined above, with xgboost as the meta-regressor.
# use_features_in_secondary=True passes the original features to the
# meta-regressor alongside the base models' predictions.
# random_state is fixed so the stacker's internal shuffled CV folds are
# reproducible, consistent with the seeds set on the other models.
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,
                                random_state=42)