# Module Load

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Data Load

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the competition's training and test CSV files from the Kaggle input mount.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Quick overview: frame shapes, then how many training columns are
# object-typed (treated as categorical) versus everything else.
print(train.shape, test.shape)
print('number of non-categorical feature', train.dtypes[train.dtypes != 'object'].count())
print('number of categorical feature', train.dtypes[train.dtypes == 'object'].count())

# Preview the first five training rows.
train.head(5)

categorical feature와 non-categorical feature를 구분해서 분석하자.

In [None]:
# Split the training column names by dtype: object columns are treated as
# categorical, every other column as numeric.
categorical_columns = train.dtypes[train.dtypes == 'object'].index
# Remove the 'Id' identifier by name instead of slicing off "the first column":
# a positional slice silently drops the wrong feature if the column order ever
# changes, whereas Index.drop raises a KeyError when 'Id' is absent.
non_categorical_columns = train.dtypes[train.dtypes != 'object'].index.drop('Id')

print(categorical_columns)
print(non_categorical_columns)

In [None]:
# missing data

def missing_data_percent(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    'Total' is the number of null entries and 'Percent' is the share of rows
    that are null, expressed as a fraction (null count divided by row count),
    not multiplied by 100. Both series are sorted in descending order before
    being combined.
    """
    null_mask = df.isnull()
    null_total = null_mask.sum().sort_values(ascending=False)
    null_share = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    return pd.concat({'Total': null_total, 'Percent': null_share}, axis=1)

In [None]:
# Stack train on top of test and drop the identifier and the target, leaving
# only feature columns for the missing-value analysis.
all_df = pd.concat((train, test), axis=0).drop(['Id', 'SalePrice'], axis=1)

# Display the combined feature frame.
all_df

In [None]:
# Show the first ten rows of the missing-value summary for the combined frame.
missing_data_percent(all_df).head(10)

In [None]:
# Columns in which 15% or more of the values are NaN are excluded from the analysis.

delete_col = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']

# Numeric Feature Selection - |Correlation| >= 0.3

In [None]:
# Restrict the training frame to the non-object columns selected earlier.
numeric_train = train[non_categorical_columns]

numeric_train.shape

In [None]:
# Pairwise correlation matrix of the numeric training columns.
cmatrix = numeric_train.corr()

# Keep the columns whose absolute correlation with SalePrice is at least 0.3.
# SalePrice itself passes this filter and is removed in a later cell.
high_corr_col = cmatrix['SalePrice'][abs(cmatrix['SalePrice']) >= 0.3].index

high_corr_col

In [None]:
# Annotated heatmap of the correlations among the high-correlation columns.
plt.figure(figsize = (16, 16))
sns.heatmap(numeric_train[high_corr_col].corr(), annot=True, cmap='mako')

In [None]:
# Final numeric feature names: the high-correlation columns minus the
# high-missing columns and the target itself.
# Stored as a sorted list rather than a set: recent pandas versions refuse a
# set as a column indexer, and set iteration order for strings differs between
# kernel sessions, which would shuffle the feature column order run to run.
choice_numeric = sorted(set(high_corr_col) - set(delete_col) - {'SalePrice'})

choice_numeric

# Categorical Feature Selection

In [None]:
# Final categorical feature names: every object-typed column except the
# high-missing ones.
# Stored as a sorted list rather than a set: recent pandas versions refuse a
# set as a column indexer, and set iteration order for strings differs between
# kernel sessions, which would shuffle the feature column order run to run.
choice_category = sorted(set(categorical_columns) - set(delete_col))

choice_category

# Target Variable - SalePrice

- 일단 생략

# EDA

- 일단 생략

# Missing Value

In [None]:
# Rebind all_df to the stacked train+test frame, this time keeping every
# column (the earlier all_df had 'Id' and 'SalePrice' dropped).
all_df = pd.concat((train, test), axis=0)
# Index of the stacked frame; it is not referenced again in the cells shown here.
all_df_index = all_df.index

print(all_df.shape)

In [None]:
# Slice the combined frame into the selected numeric and categorical columns.
# sorted() turns the selections into lists: pandas does not accept a set as a
# column indexer in recent versions, and sorting fixes the column order.
all_numeric_df = all_df[sorted(choice_numeric)]
all_category_df = all_df[sorted(choice_category)]


print(all_numeric_df.shape)
print(all_numeric_df.columns)

print(all_category_df.shape)
print(all_category_df.columns)

## 1) Numeric Feature

In [None]:
# Missing-value summary for the selected numeric columns.
missing_data_percent(all_numeric_df) 

In [None]:
# Distribution of GarageYrBlt. sns.distplot is deprecated in seaborn, so the
# equivalent histplot call is used: a density-normalised histogram with a KDE
# overlay. Missing values are dropped explicitly before plotting.
sns.histplot(all_numeric_df['GarageYrBlt'].dropna(), kde=True, stat='density')

# Mean garage build year (pandas skips NaN by default).
all_numeric_df['GarageYrBlt'].mean()

In [None]:
# Rows of the numeric frame where GarageYrBlt is missing.
all_numeric_df[all_numeric_df['GarageYrBlt'].isnull()]

GarageYrBlt는 범주형 변수임. 후에 범주별로 나눠야 한다.

일단 처음에는 0으로 채워 넣는다.

나중에 범주형 변수로 바꿔야 할 필요도 있음

In [None]:
def missing_numeric(df):
    """Return a copy of ``df`` with every missing value replaced by 0.

    The fill applies to all columns of ``df``. The previous version also
    declared a list of six column names that was never used, which wrongly
    suggested that only those columns were filled; it has been removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric features. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with NaN replaced by 0.
    """
    return df.fillna(0)

## 2) Categorical Feature

In [None]:
# Missing-value summary for the selected categorical columns.
missing_data_percent(all_category_df)

모든 missing value는 'NONE'으로 채운다.

In [None]:
def missing_category(df):
    """Return a copy of ``df`` with every missing value replaced by 'NONE'.

    The fill applies to all columns of ``df``. The previous version also
    declared a list of column names that was never used, which wrongly
    suggested that only those columns were filled; it has been removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of categorical features. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with NaN replaced by the string 'NONE'.
    """
    return df.fillna('NONE')

In [None]:
def one_hot_encoding(df, columns=None):
    """One-hot encode categorical columns of ``df`` with ``pd.get_dummies``.

    The previous version always selected a hard-coded list of 18 column names
    (the same list as the columns that had missing values), so every other
    categorical column passed in was silently dropped from the result, and a
    frame lacking any of those 18 names raised a KeyError. By default the
    function now encodes every column it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of categorical features, with missing values already filled.
    columns : iterable of str, optional
        Subset of columns to keep and encode. ``None`` (default) uses all
        columns of ``df``.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``<column>_<value>``.
    """
    selected = list(df.columns) if columns is None else list(columns)
    return pd.get_dummies(df[selected])

# Data PreProcessing


In [None]:
# Show the selected categorical and numeric feature names.
print(choice_category)
print(choice_numeric)

In [None]:
def preprocessing(df):
    """Build the model feature matrix for ``df``.

    Steps: select the chosen categorical and numeric columns (module-level
    ``choice_category`` / ``choice_numeric``), fill missing numeric values
    with 0 and missing categorical values with 'NONE', one-hot encode the
    categorical part, and return both parts side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing every selected column.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by the one-hot indicator columns, with the
        same rows and index as ``df``.
    """
    # sorted() yields lists: pandas does not accept a set as a column indexer
    # in recent versions, and sorting makes the column order deterministic.
    category = df[sorted(choice_category)]
    numeric = df[sorted(choice_numeric)]

    numeric = missing_numeric(numeric)
    category = one_hot_encoding(missing_category(category))

    # Both parts come from the same rows of df and share its index, so a
    # column-wise concat joins them row for row. An index-based merge would
    # multiply rows if the index contained duplicates.
    return pd.concat([numeric, category], axis=1)

In [None]:
# Build the model inputs. train and test are preprocessed independently, so
# their one-hot column sets can differ; the shapes printed below show this.
train_x = preprocessing(train)
train_y = train['SalePrice']

test_x = preprocessing(test)

print(train_x.shape, test_x.shape)

In [None]:
# Columns present in exactly one of the two frames (symmetric difference).
share_col = set(train_x.columns)^set(test_x.columns)

# Despite the names, these are the columns each frame is MISSING:
# share_train_columns = columns absent from train_x (present only in test_x),
# share_test_columns  = columns absent from test_x (present only in train_x).
share_train_columns = list(share_col - set(train_x.columns))
share_test_columns = list(share_col - set(test_x.columns))

In [None]:
# Add each missing indicator column as all zeros so that both frames end up
# with the same set of columns.
for col in share_train_columns:
    train_x[col] = 0

for col in share_test_columns:
    test_x[col] = 0
    
# Use train_x's column order for both frames so features line up by position.
sort_columns = train_x.columns    

train_x = train_x[sort_columns]
test_x = test_x[sort_columns]

print(train_x.shape, test_x.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation (default split size).
# random_state is fixed so that a kernel restart reproduces the same split and
# therefore the same train/validation metrics; 42 matches the seed used by the
# other estimators in this notebook.
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, shuffle=True,
                                                  random_state=42)

In [None]:
# Shapes of the training split, the validation split, and the test features.
print(X_train.shape, X_val.shape, test_x.shape)

In [None]:
# Display the training split.
X_train

# Modeling

- Lasso, Ridge, Elastic
- SVR
- GradientBoosting
- xgboost
- lightgbm
- StackingCVRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Shared 10-fold splitter (shuffled, fixed seed) passed as `cv` to the CV
# estimators below and used by cv_rmse.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Candidate hyperparameter grids searched by the CV estimators:
# alphas_alt -> RidgeCV, alphas2 -> LassoCV,
# e_alphas / e_l1ratio -> ElasticNetCV (alpha and l1_ratio respectively).
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

## Ridge

In [None]:
# Ridge regression on robust-scaled features; alpha is chosen by
# cross-validation over alphas_alt using the shared kfolds splitter.
ridge_model = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

## Lasso

In [None]:
# Lasso on robust-scaled features; alpha is chosen by cross-validation over
# alphas2. max_iter is passed as an int: the float literal 1e7 is rejected by
# scikit-learn versions that validate estimator parameter types.
lasso_model = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7), alphas=alphas2,
                                                    random_state=42, cv=kfolds))

## Elastic

In [None]:
# Elastic net on robust-scaled features; alpha and l1_ratio are chosen by
# cross-validation. max_iter is passed as an int: the float literal 1e7 is
# rejected by scikit-learn versions that validate estimator parameter types.
elastic_model = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                                                          cv=kfolds, l1_ratio=e_l1ratio))

## SVR

In [None]:
# Support-vector regression on robust-scaled features with fixed hyperparameters.
# NOTE(review): epsilon=0.008 is tiny relative to a raw SalePrice target —
# confirm these values were tuned for an untransformed target.
svr_model = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003, ))

## GradientBoostingRegressor

In [None]:
# Gradient boosting with Huber loss and a fixed seed; no feature scaling is
# applied in front of this estimator.
gbr_model = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                     max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
                                     loss='huber', random_state=42)

## light GBM Regressor

In [None]:
# LightGBM regressor configuration with fixed bagging / feature-sampling seeds.
# Unlike the other estimators it is bound to the bare name `lightgbm` (no
# `_model` suffix), and it is not fitted in any of the cells shown here.
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )

## XGboost

In [None]:
# XGBoost regressor with fixed hyperparameters.
# 'reg:linear' is the deprecated name of the squared-error objective, so the
# current name 'reg:squarederror' is used. Likewise n_jobs / random_state are
# the scikit-learn-style replacements for the deprecated nthread / seed.
xgboost_model = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                             max_depth=3, min_child_weight=0,
                             gamma=0, subsample=0.7,
                             colsample_bytree=0.7,
                             objective='reg:squarederror', n_jobs=-1,
                             scale_pos_weight=1, random_state=27,
                             reg_alpha=0.00006)

## Evaluate

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``.

    The previous body returned the plain RMSE, which only equals RMSLE when
    both arguments have already been log-transformed. The targets in this
    notebook are on the raw scale, so the log transform is applied here,
    making the function match its name.

    Parameters
    ----------
    y, y_pred : array-like of numbers
        True and predicted values on the raw (untransformed) scale. Values
        must be greater than -1; otherwise ``log1p`` yields NaN or -inf.

    Returns
    -------
    float
        sqrt(mean((log1p(y_pred) - log1p(y)) ** 2)).
    """
    log_true = np.log1p(np.asarray(y, dtype=float))
    log_pred = np.log1p(np.asarray(y_pred, dtype=float))
    return np.sqrt(np.mean((log_pred - log_true) ** 2))

def cv_rmse(model, X, y):
    """Cross-validated RMSE of ``model`` on ``(X, y)``.

    Scores the model with the module-level ``kfolds`` splitter using the
    negated mean squared error, then converts each fold's score to an RMSE.
    Returns an array with one RMSE per fold.
    """
    neg_mse_per_fold = cross_val_score(model, X, y, cv=kfolds,
                                       scoring="neg_mean_squared_error")
    return np.sqrt(-neg_mse_per_fold)

# Fitting

In [None]:
# Display the four split objects together as a tuple.
X_train, X_val, y_train, y_val

In [None]:
# score = cv_rmse(lasso_model, X_train, y_train)

In [None]:
# Fit each model on the training split only; X_val / y_val are kept aside for
# the evaluation cells that follow.
ridge_model.fit(X_train, y_train)
# The Lasso and ElasticNet fits are disabled here (note that they would have
# used the full train_x / train_y rather than the training split).
#lasso_model.fit(train_x, train_y)
#elastic_model.fit(train_x, train_y)
svr_model.fit(X_train, y_train)
gbr_model.fit(X_train, y_train)
xgboost_model.fit(X_train, y_train)

In [None]:
# Second Lasso pipeline with a smaller iteration cap.
# max_iter is an int because scikit-learn versions that validate parameter
# types reject the float literal 1e4.
lasso_model2 = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e4), alphas=alphas2,
                                                    random_state=42, cv=kfolds))
# Fit on the training split only, like every other model here. Fitting on the
# full train_x would include the validation rows and make the validation
# score below optimistic.
lasso_model2.fit(X_train, y_train)

# mean_squared_error is called directly because the MSE helper is defined in
# a later cell, so calling it here fails on a fresh Restart & Run All.
print(mean_squared_error(y_train, lasso_model2.predict(X_train)),
      mean_squared_error(y_val, lasso_model2.predict(X_val)))

In [None]:
def MSE(model, X, y):
    """Mean squared error of ``model``'s predictions on ``X`` against ``y``.

    ``model`` must already be fitted and expose ``predict``.
    """
    predictions = model.predict(X)
    return mean_squared_error(y, predictions)

In [None]:
# Ridge: MSE on the training split, then on the validation split.
print(MSE(ridge_model, X_train, y_train), MSE(ridge_model, X_val, y_val))

In [None]:
# SVR: MSE on the training split, then on the validation split.
print(MSE(svr_model, X_train, y_train), MSE(svr_model, X_val, y_val))

In [None]:
# Gradient boosting: MSE on the training split, then on the validation split.
print(MSE(gbr_model, X_train, y_train), MSE(gbr_model, X_val, y_val))

In [None]:
# XGBoost: MSE on the training split, then on the validation split.
print(MSE(xgboost_model, X_train, y_train), MSE(xgboost_model, X_val, y_val))

In [None]:
# Shape of the test feature matrix.
test_x.shape

In [None]:
# Predict on the test features with the fitted XGBoost model.
y_pred = xgboost_model.predict(test_x)

In [None]:
# Build the submission table directly from the test ids and the predictions.
# Pairing the values by position avoids relying on test['Id'] and a fresh
# Series happening to share the same 0..n-1 index (a column-wise concat would
# otherwise misalign rows and introduce NaN), and it removes the in-place
# rename of the unnamed prediction column.
submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': y_pred})

submission.head(5)

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('new_submission.csv', index=False)