### **Load Data**

In [None]:
import numpy as np
import pandas as pd

# Read the train and test splits of the house-prices dataset from the input directory.
# `train_df` contains the `SalePrice` target column that is split off further below.
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

## **Data preprocessing**

### **preprocessing null data**

In [None]:
def preprocessing_null(data_df):
    """Drop sparse columns and impute the remaining missing values.

    The input frame is left untouched; a new, cleaned frame is returned.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Feature frame containing the house-price columns referenced below
        (a missing column raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_df`` without 'Alley', 'PoolQC', 'Fence', 'MiscFeature'
        and 'Id', with the listed columns imputed.
    """
    # I drop features with many null values.
    # `drop` without `inplace` returns a new frame, so the caller's data
    # (e.g. the raw test set) keeps all of its columns.
    cleaned = data_df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id'])

    # I fill the null data of features with appropriate values.
    Bsmtlist = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
    Bsmtlist2 = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']
    Garagelist = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    Bathlist = ['BsmtFullBath', 'BsmtHalfBath']
    Extlist = ['Exterior1st', 'Exterior2nd']

    # column -> replacement value; statistics are taken from the frame itself.
    fill_values = {}
    fill_values.update(dict.fromkeys(Bsmtlist, 'TA'))
    fill_values['Electrical'] = 'SBrkr'
    fill_values['LotFrontage'] = data_df['LotFrontage'].mean()
    fill_values['FireplaceQu'] = 'NA'
    fill_values.update(dict.fromkeys(Garagelist, 'NA'))
    fill_values['GarageYrBlt'] = 2005
    fill_values['MasVnrType'] = 'None'
    fill_values['MasVnrArea'] = 0
    fill_values.update(dict.fromkeys(Bsmtlist2, 0))
    fill_values['TotalBsmtSF'] = 0
    fill_values['GarageArea'] = data_df['GarageArea'].median()
    fill_values['GarageCars'] = data_df['GarageCars'].median()
    fill_values.update(dict.fromkeys(Bathlist, 0))
    fill_values.update(dict.fromkeys(Extlist, 'VinylSd'))
    # 'TA' is a quality grade, not a zoning code; use the residential
    # low-density class 'RL' instead of inventing a new category.
    fill_values['MSZoning'] = 'RL'
    fill_values['Utilities'] = 'AllPub'
    fill_values['KitchenQual'] = 'TA'
    fill_values['Functional'] = 'Typ'
    fill_values['SaleType'] = 'WD'

    # Whole-column assignment lets each column take the dtype of its filled values.
    for column, value in fill_values.items():
        cleaned[column] = cleaned[column].fillna(value)

    return cleaned

Split the training data into feature data and target data.

In [None]:
# Predictors are every column except the sale price; the sale price is the target.
train_feature = train_df.drop(columns=['SalePrice'])
train_target = train_df['SalePrice']

In [None]:
train_feature = preprocessing_null(train_feature)
# Hand over a copy so that `test_df` keeps its raw columns (including 'Id')
# regardless of how preprocessing_null treats its argument.
test_feature = preprocessing_null(test_df.copy())

In [None]:
# Print the column dtypes and non-null counts to verify the imputation above.
train_feature.info()

## **Drop highly correlated features**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train_corr = train_df
plt.figure(figsize=(9, 9))
# Correlate numeric columns only: string columns have no Pearson correlation,
# and pandas >= 2.0 raises on them instead of silently skipping them.
corr = train_corr.select_dtypes(include='number').corr()
sns.heatmap(corr)

In [None]:
pd.set_option('display.max_columns', 500)
# Keep only coefficients above 0.7 (everything else is shown as NaN).
# Pairs above the threshold: 1stFlrSF-2ndFlrSF, GrLivArea-TotRmsAbvGrd,
# GarageYrBlt-YearBuilt and GarageArea-GarageCars.
corr.where(corr > 0.7)

In [None]:
def drop_corr_ftr(data_df):
    """Return a new frame without the four columns judged redundant by the correlation check."""
    redundant_columns = ['2ndFlrSF', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea']
    return data_df.drop(columns=redundant_columns)

In [None]:
# Remove the redundant columns from both splits, then confirm the column counts match.
train_feature, test_feature = (
    drop_corr_ftr(frame) for frame in (train_feature, test_feature)
)
print(train_feature.shape, test_feature.shape)

### **test availability of Label encoding**

In [None]:
def split_num_obj(data_df):
    """Split a frame by dtype and return ``(non_object_columns, object_columns)``."""
    object_part = data_df.select_dtypes(include=['object'])
    non_object_part = data_df.select_dtypes(exclude=['object'])
    return non_object_part, object_part

In [None]:
# Separate numeric and string-typed columns for both splits.
(train_feature_num, train_feature_obj), (test_feature_num, test_feature_obj) = (
    split_num_obj(frame) for frame in (train_feature, test_feature)
)

In [None]:
train_dummies = pd.get_dummies(train_feature_obj)
test_dummies = pd.get_dummies(test_feature_obj)

# One-hot column names encode "<feature>_<category>", so a column present on
# one side only means that category never occurs in the other split.
# Each list now holds what its name and its printed label say it holds.
not_in_train = [column for column in test_dummies.columns if column not in train_dummies.columns]
not_in_test = [column for column in train_dummies.columns if column not in test_dummies.columns]

print('##train_dummies_shape, test_dummies_shape##\n',train_dummies.shape, test_dummies.shape, '\n')
print('##not in train_dummies columns but in test_dummies columns##\n', not_in_train, '\n')
print('##not in test_dummies columns but in train_dummies columns##\n', not_in_test)

**Some category values appear in only one of the two datasets, so we build a combined dataset (train + test) to fit the label encoders on.**

In [None]:
# Names of the numeric training columns; indexed by position in the cells below.
df_num_col = train_feature_num.columns

## **Find categorical features**

In [None]:
# Size the grid from the data instead of assuming exactly 32 numeric columns.
n_plots = len(df_num_col)
n_rows = (n_plots + 1) // 2
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows), squeeze=False)

# Plot at most 200 randomly chosen houses per feature to keep the figure readable.
sample_size = min(200, len(train_feature))

for i, column in enumerate(df_num_col):
    row = i // 2
    col = i % 2
    # Draw the sample from the actual index rather than a hard-coded row count.
    rand_ind = np.random.permutation(train_feature.index)[:sample_size]
    # Select the same rows for x and y explicitly instead of relying on index alignment.
    sns.scatterplot(x=train_feature.loc[rand_ind, column], y=train_target.loc[rand_ind], ax=ax[row][col])

The columns below are categorical features.

In [None]:
# Positions (within df_num_col) of the numeric columns treated as categorical.
# NOTE(review): these are positional, so they silently point at different columns
# if the column order or the set of dropped columns changes - confirm after edits.
cat_positions = [0, 3, 4, 15, 16, 17, 18, 19, 20, 21, 22, 28, 29, 30, 31]
df_num_cat_col = df_num_col[cat_positions]

I split the numeric features into categorical and non-categorical ones, because only the non-categorical features are transformed to log scale.

In [None]:
def split_num_cat(data_df_num):
    """Split a numeric frame into ``(categorical_like, remaining)`` using ``df_num_cat_col``."""
    categorical_part = data_df_num.loc[:, df_num_cat_col]
    remaining_part = data_df_num.drop(columns=df_num_cat_col)
    return categorical_part, remaining_part

# Apply the split to both datasets and print the shapes as a sanity check.
(train_feature_num_cat, train_feature_num_non_cat), (test_feature_num_cat, test_feature_num_non_cat) = (
    split_num_cat(frame) for frame in (train_feature_num, test_feature_num)
)
print(train_feature_num_cat.shape, test_feature_num_cat.shape)
print(train_feature_num_non_cat.shape, test_feature_num_non_cat.shape)

## **Make all data for Label encoding**

In [None]:
# Stack train_df on top of test_df with a fresh 0..n-1 index and remove the target column.
all_data = pd.concat([train_df, test_df], ignore_index=True).drop(columns='SalePrice')
print(all_data.shape)

# Run the combined frame through the same cleaning steps as the individual splits.
all_data = drop_corr_ftr(preprocessing_null(all_data))
print(all_data.shape)

# Keep the string-typed columns; they are what the label encoders are fitted on.
all_data_num, all_data_obj = split_num_obj(all_data)
print(all_data_obj.shape)

## **log transform, Label encoding using all-data**

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

# Work on explicit copies so the encoded columns are written to frames we own.
train_feature_obj = train_feature_obj.copy()
test_feature_obj = test_feature_obj.copy()

for col in all_data_obj.columns:
    # Fit on train + test so every category seen in either split has a code.
    label.fit(all_data_obj[col])
    # Replace the whole column: `.loc[:, col] = ...` writes into the existing
    # object-dtype column on recent pandas, which leaves the integer codes
    # stored as objects and makes the models reject the frame.
    train_feature_obj[col] = label.transform(train_feature_obj[col])
    test_feature_obj[col] = label.transform(test_feature_obj[col])

In [None]:
# Move the target and the non-categorical numeric features to log(1 + x) scale.
# Running this cell twice applies the transform twice.
train_target = train_target.pipe(np.log1p)
train_feature_num_non_cat = train_feature_num_non_cat.pipe(np.log1p)
test_feature_num_non_cat = test_feature_num_non_cat.pipe(np.log1p)

## **combine data**

In [None]:
# Reassemble the three column groups side by side, in the same order for both splits.
train_parts = [train_feature_num_cat, train_feature_num_non_cat, train_feature_obj]
test_parts = [test_feature_num_cat, test_feature_num_non_cat, test_feature_obj]

train_feature_fin = pd.concat(train_parts, axis='columns')
test_feature_fin = pd.concat(test_parts, axis='columns')

### Remove outliers of important features: **GrLivArea**

In [None]:
cond1 = train_target>500000
cond2 = train_feature_num_non_cat['GrLivArea']>4000

In [None]:
train_feature_fin = train_feature_fin.drop(train_feature_num_non_cat[cond1|cond2].index, axis=0)

**Data preprocessing is now finished.**

## **Predict House Price**

#### Compare Ridge, Lasso, ElasticNet, LGBM

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search a model, report its best cross-validated RMSE and return the best estimator.
def get_best_estimator(model, params):
    """Tune ``model`` over ``params`` with 5-fold CV on the training data.

    Prints the model class name, the winning parameters and the RMSE derived
    from the best (negated) mean squared error, then returns the best estimator.
    """
    search = GridSearchCV(model, param_grid=params, scoring="neg_mean_squared_error", cv=5)
    search.fit(train_feature_fin, train_target)

    # The score is a negated MSE, so flip the sign before taking the root.
    best_rmse = np.sqrt(-search.best_score_)
    model_name = type(model).__name__
    print(f'{model_name}, param:{search.best_params_}, rmse:{np.round(best_rmse, 4)}')

    return search.best_estimator_

# Candidate regularisation strengths for each linear model.
ridge_params = {'alpha': [0.05, 0.1, 1, 5, 8, 10, 15]}
lasso_params = {'alpha': [0.001, 0.005, 0.008, 0.05, 0.1, 0.3, 0.5, 1, 5, 10]}
elastic_params = {'alpha': [0.05, 0.1, 0.5, 1, 3, 5, 8]}

# Untuned estimators; ElasticNet uses a fixed 70 % L1 share.
ridge_reg = Ridge()
lasso_reg = Lasso()
elastic_reg = ElasticNet(l1_ratio=0.7)

# Tune each model in turn (Lasso, Ridge, ElasticNet); each call prints its best RMSE.
lasso_be = get_best_estimator(lasso_reg, lasso_params)
ridge_be = get_best_estimator(ridge_reg, ridge_params)
elastic_be = get_best_estimator(elastic_reg, elastic_params)

In [None]:
# Search grid for the gradient-boosted model: tree depth x learning rate.
lgbm_params = dict(
    max_depth=[5, 10, 15, 20, 25, 30],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1],
)
lgbm_reg = LGBMRegressor(n_estimators=1000)

lgbm_be = get_best_estimator(lgbm_reg, lgbm_params)

## **change log scale to original scale**

In [None]:
# The model was trained on log1p(price), so invert with expm1 to get prices back.
log_preds = lgbm_be.predict(test_feature_fin)
preds = np.expm1(log_preds)

## **Submit answer**

In [None]:
# Re-read the raw test file to get the Id column for the submission.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# One row per test house: its Id and the predicted sale price.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': preds})
my_submission.to_csv('submission.csv', index=False)