In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from scipy.stats import skew
from sklearn.preprocessing import OneHotEncoder

sns.set()

pd.set_option('display.max_columns', None)

In [None]:
# Locations of the competition files.
pth_train = "../input/house-prices-advanced-regression-techniques/train.csv"
pth_test = "../input/house-prices-advanced-regression-techniques/test.csv"

raw_train = pd.read_csv(pth_train)
raw_test = pd.read_csv(pth_test)

# Stack the feature columns of both files. The target (last column of the training
# file) is removed BY NAME: cutting "the last column" off both frames would also
# strip a genuine feature from a test file that does not carry the target.
target_col = raw_train.columns[-1]
all_data = pd.concat(
    (raw_train.drop(columns=target_col),
     raw_test.drop(columns=target_col, errors='ignore')),
    axis=0,
).reset_index(drop=True)
print('all_data',all_data.shape)
print('raw_train',raw_train.shape)
print('raw_test',raw_test.shape)

## EXPLORATORY DATA ANALYSIS

I group the columns by data type and level of measurement.

In [None]:
# Column groups used throughout the notebook. A column that appears in none of
# these three lists is later treated as a numerical column.

# Nominal (unordered) categorical columns.
categorical_nominal_cols = ['MSSubClass','MSZoning','Street','Alley','LotShape',
                            'LandContour','Utilities','LotConfig','LandSlope',
                            'Neighborhood','Condition1','Condition2','BldgType',
                            'HouseStyle','RoofStyle','RoofMatl','Exterior1st',
                            'Exterior2nd','MasVnrType','Foundation', 'Heating',
                            'Electrical', 'Functional', 'GarageType', 'MiscFeature', 
                            'SaleType','SaleCondition']
# Columns treated as ordered levels. Besides the quality/condition grades this
# list also holds the year columns (YearBuilt, YearRemodAdd, GarageYrBlt).
categorical_ordinal_cols = ['OverallQual','OverallCond','YearBuilt','YearRemodAdd',
                            'ExterQual','ExterCond','BsmtQual','BsmtCond', 
                            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                            'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageYrBlt', 
                            'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                            'PoolQC', 'Fence']
# Two-state columns.
categorical_bool_cols = ['CentralAir']

I created an encoding dictionary, `categorical_ordinal2encode`, for the text-valued ordinal columns listed in `categorical_ordinal_cols`. The ordering of the levels follows the data description file that accompanies the dataset.

In [None]:
# Rank lookup for every text-valued ordinal column: each column maps its levels,
# listed from lowest to highest, to consecutive integers starting at 0.
# Every column gets its own independent dict.
categorical_ordinal2encode = {
    column: {level: rank for rank, level in enumerate(levels)}
    for column, levels in [
        ("ExterQual",    ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("ExterCond",    ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("BsmtQual",     ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("BsmtCond",     ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("BsmtExposure", ['NA', 'No', 'Mn', 'Av', 'Gd']),
        ("BsmtFinType1", ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']),
        ("BsmtFinType2", ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']),
        ("HeatingQC",    ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("KitchenQual",  ['Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("FireplaceQu",  ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("GarageFinish", ['NA', 'Unf', 'RFn', 'Fin']),
        ("GarageQual",   ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("GarageCond",   ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']),
        ("PavedDrive",   ['N', 'P', 'Y']),
        ("PoolQC",       ['NA', 'Fa', 'TA', 'Gd', 'Ex']),
        ("Fence",        ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']),
    ]
}

In [None]:
# Bar chart of level frequencies for every ordinal column, laid out on a grid.
total_col = 4
# Ceiling division: enough rows to give each column its own panel.
total_row = max(1, -(-len(categorical_ordinal_cols) // total_col))

# squeeze=False keeps `axs` two-dimensional even when one row is enough.
fig, axs = plt.subplots(total_row, total_col, figsize=(15,total_row * 4), squeeze=False)
for idx, title in enumerate(categorical_ordinal_cols):
    ax = axs[idx // total_col][idx % total_col]

    # Name both columns explicitly: the default names produced by
    # value_counts().reset_index() differ between pandas versions.
    vc = all_data[title].value_counts().rename_axis('index').reset_index(name='frequency')
    if title in categorical_ordinal2encode:
        # Text levels are shown by their rank so the bars appear in level order.
        vc['index'] = vc['index'].map(categorical_ordinal2encode[title])
    vc = vc.sort_values('index')
    sns.barplot(data=vc, x='index', y='frequency', color='violet', ax=ax)

    ax.set_ylabel('frequency')
    ax.set_xlabel('level')
    ax.set_title(title)

# Hide the panels of the last row that received no column.
for ax in axs.ravel()[len(categorical_ordinal_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## PREPROCESSING CATEGORICAL ORDINAL FEATURES / COLUMNS

I check each ordinal categorical feature/column and merge its unique values into a smaller set of levels, for example 0, 1, 2, 3, which represent low, average, high, and very high.

In [None]:
# Ordinal columns whose levels are merged into coarser groups.
ordinal_columns = [
    'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'PavedDrive', 'PoolQC', 'Fence',
]
ordinal_column_fillna = {}

# Per column: [original levels, replacement codes]; the two lists are paired by position.
_QUALITY_5 = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_QUALITY_NA = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
_ONE_TO_TEN = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
_BSMT_FINISH = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
ordinal_column_transforms = {
    'OverallQual':  [list(_ONE_TO_TEN), [0, 0, 0, 0, 1, 2, 3, 4, 4, 4]],
    'OverallCond':  [list(_ONE_TO_TEN), [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]],
    'ExterQual':    [list(_QUALITY_5), [0, 0, 0, 1, 1]],
    'ExterCond':    [list(_QUALITY_5), [0, 0, 0, 1, 1]],
    'BsmtQual':     [list(_QUALITY_NA), [0, 0, 0, 0, 2, 2]],
    'BsmtCond':     [list(_QUALITY_NA), [0, 0, 0, 0, 1, 1]],
    'BsmtExposure': [['NA', 'No', 'Mn', 'Av', 'Gd'], [0, 0, 1, 1, 1]],
    'BsmtFinType1': [list(_BSMT_FINISH), [0, 0, 0, 1, 1, 2, 2]],
    'BsmtFinType2': [list(_BSMT_FINISH), [0, 0, 0, 1, 1, 1, 1]],
    'HeatingQC':    [list(_QUALITY_NA), [0, 0, 0, 0, 1, 1]],
    'KitchenQual':  [list(_QUALITY_5), [0, 0, 0, 1, 1]],
    'FireplaceQu':  [list(_QUALITY_NA), [0, 0, 0, 0, 1, 1]],
    'GarageFinish': [['NA', 'Unf', 'RFn', 'Fin'], [0, 1, 2, 3]],
    'GarageQual':   [list(_QUALITY_NA), [0, 0, 1, 1, 2, 2]],
    'PavedDrive':   [['N', 'P', 'Y'], [0, 0, 1]],
    'PoolQC':       [['NA', 'Fa', 'TA', 'Gd', 'Ex'], [0, 1, 1, 2, 3]],
    'Fence':        [['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], [0, 0, 0, 1, 2]],
}

In [None]:
# Recode every column in `ordinal_columns` and plot its level frequencies
# before and after the recoding.
for title in ordinal_columns:
    # Frequencies of the original levels, captured before the column changes.
    # Both columns are named explicitly because the default names produced by
    # value_counts().reset_index() differ between pandas versions.
    vc = (all_data[title].value_counts().sort_index()
          .rename_axis('index').reset_index(name='frequency'))

    old_levels, new_levels = ordinal_column_transforms[title]
    # The result is assigned back to the frame: replace/fillna with inplace=True
    # on `all_data[title]` can act on a temporary and leave `all_data` unchanged.
    # Missing entries become level 0. to_numeric guarantees a numeric dtype, which
    # the later encoding step relies on (it only maps non-numeric ordinal columns);
    # it raises if a level outside `old_levels` is still present.
    all_data[title] = pd.to_numeric(
        all_data[title].replace(old_levels, new_levels).fillna(0)
    )

    vc_changed = all_data[title].value_counts().sort_index()

    fig, axs = plt.subplots(1,2,figsize=(8,3))

    if title in categorical_ordinal2encode:
        # Show text levels by rank so the bars appear in level order.
        vc['index'] = vc['index'].map(categorical_ordinal2encode[title])
        vc = vc.sort_values('index')
    sns.barplot(data=vc, x='index', y='frequency', color='violet', ax=axs[0])
    axs[0].set_title('BEFORE',fontsize=12)

    sns.barplot(x=vc_changed.index, y=vc_changed, color='violet', ax=axs[1])
    axs[1].set_title('AFTER',fontsize=12)

    # Same axis labels on both panels.
    for ax in axs:
        ax.set_ylabel('frequency')
        ax.set_xlabel('level')

    fig.suptitle(title+' (BEFORE - AFTER)',fontsize=15)

    plt.tight_layout()
    plt.show()

In [None]:
def updateDfWithBins(val, bins, upper=2700):
    """Return the rounded upper edge of the first bin interval containing ``val``.

    Consecutive entries of ``bins`` form closed intervals ``[bins[i], bins[i+1]]``;
    one extra interval ``[bins[-1], upper]`` is appended at the end.

    Parameters
    ----------
    val : number
        Value to locate.
    bins : array-like of numbers
        Lower edges of the intervals, in ascending order.
    upper : number, default 2700
        Upper edge of the final interval (previously hard-coded).

    Returns
    -------
    The rounded upper edge of the first matching interval, or ``0`` when no
    interval contains ``val`` (the intervals and the value are printed then).
    """
    edges = np.append(np.asarray(bins).ravel(), upper)
    for low, high in zip(edges[:-1], edges[1:]):
        if low <= val <= high:
            return round(high)

    # No interval matched: print the intervals and the value for inspection.
    print(np.column_stack((edges[:-1], edges[1:])))
    print(val)
    return 0

In [None]:
def updateCols(pd_data, categorical_cols):
    """Return the names in ``categorical_cols`` that are still columns of ``pd_data``.

    The original order is kept and ``categorical_cols`` itself is not modified;
    a new list is returned.
    """
    present = pd_data.columns.values
    return [name for name in categorical_cols if name in present]

I combine the training and test data into a single variable, then filter the columns that will be used: any column in which more than 50% of the values are empty is deleted.

In [None]:
# Numerical (quantitative) columns: every column of `all_data` that was not
# assigned to the bool, nominal or ordinal group.
numerical_quintatif_cols = [
    column
    for column in all_data.columns.values
    if not (column in categorical_bool_cols
            or column in categorical_nominal_cols
            or column in categorical_ordinal_cols)
]
len(numerical_quintatif_cols), numerical_quintatif_cols

In [None]:
# Histograms of the numerical columns on a grid.
# The grid is sized from, and filled with, the SAME slice. Previously the size
# came from the [1:-1] slice while the panels were filled from index 0, so the
# first column was plotted and the last two columns were not.
# NOTE(review): [1:-1] presumably skips a leading identifier column and the
# final numerical column - confirm that this is the intended selection.
plot_cols = numerical_quintatif_cols[1:-1]
len_data = len(plot_cols)
total_col = 4
# Ceiling division; at least one row so plt.subplots gets a valid grid.
total_row = max(1, -(-len_data // total_col))
print(total_row, total_col)

# squeeze=False keeps `axs` two-dimensional even for a single row.
fig, axs = plt.subplots(total_row, total_col, figsize=(15,(total_row * 3)), squeeze=False)
for idx, selected_col in enumerate(plot_cols):
    sns.histplot(data=all_data , x=selected_col, ax=axs[idx // total_col][idx % total_col])

# Hide the panels that received no column.
for ax in axs.ravel()[len_data:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Missing-value count and percentage per column, measured on the training rows only.
# A Series has a single axis, so rename() is called without `axis=1`
# (pandas versions that validate the argument reject it).
check_null_cols = (
    all_data.iloc[:raw_train.shape[0],:]
    .isna()
    .sum()
    .rename('Total')
    .reset_index()
)
check_null_cols['percent'] = round((check_null_cols.Total/raw_train.shape[0]) * 100,2)
display(check_null_cols.head())
# Names of the columns (held in the 'index' column) that are more than 50% empty.
drop_cols = check_null_cols.loc[check_null_cols.percent > 50, 'index']

In [None]:
# Remove the columns flagged in `drop_cols` and report the shape before and after.
print("Remove feature unnecessary")
print("Before",all_data.shape)

all_data = all_data.drop(labels=drop_cols, axis='columns').reset_index(drop=True)

print("After", all_data.shape)

In [None]:
print("Check empty cell")
# Missing-value count and percentage per column over all rows.
# A Series has a single axis, so rename() is called without `axis=1`
# (pandas versions that validate the argument reject it).
check_null_cols = all_data.isna().sum().rename('Total').reset_index()
check_null_cols['percent'] = round((check_null_cols.Total/all_data.shape[0]) * 100,2)
# Show only the columns that still contain empty cells.
display(check_null_cols.loc[check_null_cols.percent > 0])

In [None]:
# Re-sync the four column-group lists with the columns still present in `all_data`.
(numerical_quintatif_cols,
 categorical_ordinal_cols,
 categorical_bool_cols,
 categorical_nominal_cols) = [
    updateCols(all_data, group)
    for group in (numerical_quintatif_cols, categorical_ordinal_cols,
                  categorical_bool_cols, categorical_nominal_cols)
]

# Group sizes, their total, and the frame shape for a visual consistency check.
group_sizes = (len(numerical_quintatif_cols), len(categorical_ordinal_cols),
               len(categorical_bool_cols), len(categorical_nominal_cols))
print(*group_sizes)

print(sum(group_sizes))
print(all_data.shape)

I applied several transformations to deal with columns that contain empty data. The steps are:

1. For ordinal categorical columns, I fill the empty cells with the most frequent category of that column.
2. For nominal categorical columns, I fill the empty cells with an `unknown` label.
3. For all other columns, I fill the empty cells with the mean value of that column.

In [None]:
# Fill the empty cells of every column listed in `check_null_cols`:
#   ordinal / bool columns -> most frequent value
#   nominal columns        -> the label 'unknown'
#   all other columns      -> column mean
for column in check_null_cols['index']:
    if not all_data[column].isna().any():
        continue  # nothing to fill

    if column in categorical_ordinal_cols or column in categorical_bool_cols:
        fill_value = all_data[column].value_counts().idxmax()
    elif column in categorical_nominal_cols:
        fill_value = 'unknown'
    else:
        fill_value = all_data[column].mean()

    # Assign the filled column back: fillna(inplace=True) on `all_data[column]`
    # can act on a temporary and leave `all_data` unchanged.
    all_data[column] = all_data[column].fillna(fill_value)

In [None]:
print("Check empty cell")
# Missing-value count and percentage per column over all rows, after imputation.
# A Series has a single axis, so rename() is called without `axis=1`
# (pandas versions that validate the argument reject it).
check_null_cols = all_data.isna().sum().rename('Total').reset_index()
check_null_cols['percent'] = round((check_null_cols.Total/all_data.shape[0]) * 100,2)
# Show only the columns that still contain empty cells.
display(check_null_cols.loc[check_null_cols.percent > 0])

In [None]:
# Re-sync the four column-group lists with the columns still present in `all_data`.
(numerical_quintatif_cols,
 categorical_ordinal_cols,
 categorical_bool_cols,
 categorical_nominal_cols) = [
    updateCols(all_data, group)
    for group in (numerical_quintatif_cols, categorical_ordinal_cols,
                  categorical_bool_cols, categorical_nominal_cols)
]

I created a data dictionary in the `categorical_nominal2encode` variable. I will use this dictionary to convert the text values into numeric data.

In [None]:
# Bar chart of category frequencies for every nominal column, on a grid.
total_col = 4
total_row, remainder = divmod(len(categorical_nominal_cols), total_col)
if remainder > 0:
    total_row += 1
fig, axs = plt.subplots(total_row, total_col, figsize=(15, total_row * 3))
for idx, col_selected in enumerate(categorical_nominal_cols):
    i, j = divmod(idx, total_col)
    vc = all_data[col_selected].value_counts()
    sns.barplot(x=vc.index, y=vc, color='violet', ax = axs[i][j])
    # Category names are long; rotate them so they stay readable.
    axs[i][j].tick_params(labelrotation=90, axis='x')
plt.tight_layout()
plt.show()

In [None]:
print(all_data.MSZoning.value_counts(),'\n')
# Collapse MSZoning to a binary flag: 1 for 'RL', 0 for every other value.
# (The stray `all_data.copy()` whose result was discarded has been removed.)
is_rl = all_data.MSZoning == 'RL'
all_data.loc[~is_rl, 'MSZoning'] = 0
all_data.loc[is_rl, 'MSZoning'] = 1
all_data.MSZoning.value_counts()

In [None]:
print(all_data.LotShape.value_counts(),'\n')
# Collapse LotShape to a binary flag: 1 for 'Reg', 0 for every other value.
is_reg = all_data.LotShape == 'Reg'
all_data.loc[~is_reg, 'LotShape'] = 0
all_data.loc[is_reg, 'LotShape'] = 1
all_data.LotShape.value_counts()

In [None]:
print(all_data.LandContour.value_counts(),'\n')
# Collapse LandContour to a binary flag: 1 for 'Lvl', 0 for every other value.
is_level = all_data.LandContour == 'Lvl'
all_data.loc[~is_level, 'LandContour'] = 0
all_data.loc[is_level, 'LandContour'] = 1
all_data.LandContour.value_counts()

In [None]:
print(all_data.BldgType.value_counts(),'\n')
# Collapse BldgType to a binary flag: 1 for '1Fam', 0 for every other value.
is_one_family = all_data.BldgType == '1Fam'
all_data.loc[~is_one_family, 'BldgType'] = 0
all_data.loc[is_one_family, 'BldgType'] = 1
all_data.BldgType.value_counts()

In [None]:
print(all_data.SaleCondition.value_counts(),'\n')
# Collapse SaleCondition to a binary flag: 1 for 'Normal', 0 for every other value.
is_normal_sale = all_data.SaleCondition == 'Normal'
all_data.loc[~is_normal_sale, 'SaleCondition'] = 0
all_data.loc[is_normal_sale, 'SaleCondition'] = 1
all_data.SaleCondition.value_counts()

In [None]:
print(all_data.SaleType.value_counts(),'\n')
# Collapse SaleType to a binary flag: 1 for 'WD', 0 for every other value.
is_wd = all_data.SaleType == 'WD'
all_data.loc[~is_wd, 'SaleType'] = 0
all_data.loc[is_wd, 'SaleType'] = 1
all_data.SaleType.value_counts()

In [None]:
# For each nominal / bool column: map every distinct value to an integer code,
# numbered in the order the values first appear in the column.
categorical_nominal2encode = {
    column: {val: i for i, val in enumerate(all_data[column].unique())}
    for column in all_data.columns.values
    if column in categorical_nominal_cols or column in categorical_bool_cols
}
categorical_nominal2encode

In [None]:
print("I will change categorical ordinal and nominal feature to number")
for column in all_data.columns.values:
    if column in categorical_ordinal_cols:
        # Ordinal columns that are already numeric are left as they are;
        # the remaining ones are ranked with the ordinal lookup.
        if pd.api.types.is_numeric_dtype(all_data[column]):
            continue
        lookup = categorical_ordinal2encode[column]
    elif column in categorical_nominal_cols or column in categorical_bool_cols:
        lookup = categorical_nominal2encode[column]
    else:
        continue
    all_data[column] = all_data[column].map(lookup)
all_data.head()

## REMOVE MULTICOLLINEARITY

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Returns a DataFrame with one row per column of ``X``: the column name in
    ``variables`` and its variance inflation factor in ``VIF``.
    """
    design = X.values
    return pd.DataFrame({
        "variables": X.columns,
        "VIF": [variance_inflation_factor(design, position)
                for position in range(X.shape[1])],
    })

In [None]:
# VIF of every column. Rows whose VIF is infinite are discarded before the
# threshold is applied, so those columns are NOT selected for removal.
vif_result = calc_vif(all_data)
vif_result = vif_result.replace([np.inf, -np.inf], np.nan).dropna()
# Columns with a VIF above 5.
cols_selected  = vif_result[vif_result.VIF > 5]
display(cols_selected.head())

In [None]:
# Drop the columns selected by the VIF threshold, then re-sync the column groups.
print('before', all_data.shape)
all_data = all_data.drop(columns=list(cols_selected.variables))
print('after', all_data.shape)

(numerical_quintatif_cols,
 categorical_ordinal_cols,
 categorical_bool_cols,
 categorical_nominal_cols) = [
    updateCols(all_data, group)
    for group in (numerical_quintatif_cols, categorical_ordinal_cols,
                  categorical_bool_cols, categorical_nominal_cols)
]

In [None]:
# Split the combined frame back into its training and test rows.
# .copy() makes both parts independent frames, so adding or dropping columns
# later does not write into a slice of `all_data`.
data_train = all_data.iloc[:raw_train.shape[0],:].copy()
data_test = all_data.iloc[raw_train.shape[0]:,:].copy()

# Target: the last column of the raw training file.
data_train['price'] = raw_train.iloc[:,-1].copy()

print(data_train.shape, data_test.shape)
data_test.head()

## CHECK  PRICE DISTRIBUTION

In [None]:
# Distribution of the `price` column of the training rows.
sns.displot(data_train.price)

In [None]:
# Base-10 logarithm of the price, stored as the last column of the training frame.
data_train = data_train.assign(log_price=np.log10(data_train['price']))
sns.displot(data_train['log_price'])

In [None]:
# Scatter plot of every numerical column against log_price, on a grid.
total_features = len(numerical_quintatif_cols)
total_columns = 4
total_rows, remainder = divmod(total_features, total_columns)
if remainder > 0:
    total_rows += 1

fig, axs = plt.subplots(total_rows,total_columns, figsize=(15,total_rows * 4))
for idx, feature in enumerate(numerical_quintatif_cols):
    i, j = divmod(idx, total_columns)
    sns.scatterplot(x=data_train[feature], y=data_train.log_price, ax=axs[i][j])
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every numerical column with log_price.
corr = data_train[numerical_quintatif_cols + ['log_price']].corr()['log_price']
# Numerical columns whose correlation with log_price is negative are removed
# from both the training and the test frame.
drop_cols = corr.loc[corr < 0].index.values
data_train = data_train.drop(columns=drop_cols)
data_test = data_test.drop(columns=drop_cols)

# Re-sync the column groups against `data_train`, the frame the columns were
# dropped from; `all_data` still holds them, so checking it would change nothing.
numerical_quintatif_cols = updateCols(data_train, numerical_quintatif_cols)
categorical_ordinal_cols = updateCols(data_train, categorical_ordinal_cols)
categorical_bool_cols = updateCols(data_train, categorical_bool_cols)
categorical_nominal_cols = updateCols(data_train, categorical_nominal_cols)

In [None]:
print("I will remove some row outliers based of numerical columns")
print('Before Filter', data_train.shape)

# Interquartile-range fences on log_price: rows outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed from the training frame.
sp = data_train.log_price
Q1, Q3 = sp.quantile(q=0.25), sp.quantile(q=0.75)
IQR = Q3 - Q1
min_sp, max_sp = Q1 - (IQR * 1.5), Q3 + (IQR * 1.5)

inside_fences = (sp >= min_sp) & (sp <= max_sp)
data_train = data_train.loc[inside_fences]

print('After Filter',data_train.shape)

## PREPROCESSING DATA

The data preprocessing stage consists of several steps:
1. Combine the training data and the test data.
2. Convert nominal categorical data into a one-hot encoding, so that every category of a nominal column is treated as having the same standing.
3. Leave ordinal categorical data unchanged, because it has already been transformed into levels according to the data dictionary created earlier.
4. Scale the quantitative data using the StandardScaler from the sklearn library.

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Feature rows of train and test stacked together. The two target columns are
# removed by name rather than by position, so the result does not depend on
# them being the last two columns of `data_train`.
prep_all = pd.concat((data_train.drop(columns=['price', 'log_price']), data_test),axis=0)
prep_all.shape

In [None]:
# Split the feature columns into those that get one-hot encoded (nominal and
# bool) and all the rest, keeping the column order of `prep_all`.
is_one_hot = [
    column in categorical_nominal_cols or column in categorical_bool_cols
    for column in prep_all.columns.values
]
nominal_cols = [c for c, flag in zip(prep_all.columns.values, is_one_hot) if flag]
other_cols = [c for c, flag in zip(prep_all.columns.values, is_one_hot) if not flag]

print('nominal_cols', len(nominal_cols))
print('other_cols', len(other_cols))

In [None]:
onehot = OneHotEncoder()
scaler = StandardScaler()

# Fit the encoder on the nominal columns of train + test, then encode them
# into a dense array.
nominal_frame = prep_all[nominal_cols]
nominal_col_feature = onehot.fit(nominal_frame).transform(nominal_frame).toarray()
numerical_col = prep_all[other_cols].to_numpy()

nominal_col_feature.shape, numerical_col.shape

In [None]:
# Build the combined matrix (one-hot columns first, then the remaining columns)
# once, and fit the scaler on it. The extra concatenation whose result was
# computed and then discarded has been removed.
data2transform = np.concatenate((nominal_col_feature,numerical_col),axis=1)
scaler.fit(data2transform)

In [None]:
def getSelectedData(pd_data, nominal_cols, other_cols):
    """Turn ``pd_data`` into the scaled feature matrix.

    The ``nominal_cols`` columns are one-hot encoded with the module-level
    ``onehot`` encoder, the ``other_cols`` columns are appended to their right,
    and the whole matrix is passed through the module-level ``scaler``.
    Both objects must already be fitted.
    """
    encoded = onehot.transform(pd_data[nominal_cols]).toarray()
    stacked = np.concatenate((encoded, pd_data[other_cols]), axis=1)
    return scaler.transform(stacked)

## PREPARING DATA TRAINING AND TESTING 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Scaled feature matrices for the training and test rows.
x_train = getSelectedData(data_train, nominal_cols, other_cols)
x_test = getSelectedData(data_test, nominal_cols, other_cols)

# Target: the log10 price, selected by name rather than as "the last column",
# so it does not depend on the column order of `data_train`.
y_train = data_train['log_price']

print('Train\n',x_train.shape)
print(y_train.shape,'\n')
print('Test\n',x_test.shape,'\n')

In [None]:
# Hold out 30% of the training rows for validation; the fixed seed makes the
# split repeatable.
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=101,
)

## BUILD MODEL 

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
def checkValidation(y_valid, y_valid_pred):
    """Score predictions against the validation targets.

    Returns the tuple ``(r2, mae, mse)``. The values are scored exactly as
    passed in; no back-transformation is applied.
    """
    return (
        r2_score(y_valid, y_valid_pred),
        mean_absolute_error(y_valid, y_valid_pred),
        mean_squared_error(y_valid, y_valid_pred),
    )

In [None]:
# One [method, R2, MAE, MSE] row is appended here per evaluated model.
summary_prediction = []

At this stage I train several models and then choose the best-performing one.

### SVM

In [None]:
# Support-vector regressor with default settings, scored on the validation split.
svm = SVR().fit(x_train_split, y_train_split)
pred = svm.predict(x_val_split)
r2score, mae, mse = checkValidation(y_val_split, pred)
summary_prediction += [['svm3', r2score, mae, mse]]

### LGBMRegressor

In [None]:
# LightGBM regressor, scored on the validation split.
lgbmr = LGBMRegressor(n_estimators=5000, learning_rate=0.01)
lgbmr.fit(x_train_split, y_train_split)
pred = lgbmr.predict(x_val_split)
r2score, mae, mse = checkValidation(y_val_split, pred)
summary_prediction += [['LGBMRegressor', r2score, mae, mse]]

### XGBRegressor

In [None]:
# XGBoost regressor, scored on the validation split.
xgb_model = XGBRegressor(learning_rate=0.05, n_estimators=2000)
xgb_model.fit(x_train_split, y_train_split)
pred = xgb_model.predict(x_val_split)
r2score, mae, mse = checkValidation(y_val_split, pred)
summary_prediction += [['XGBRegressor', r2score, mae, mse]]

### CHECK RESULT PREDICT 

In [None]:
# Validation scores of all models, one row per method.
pd_summary_prediction = pd.DataFrame(
    data=summary_prediction,
    columns=['method','R2Score','MAE', 'MSE'],
).set_index('method')
pd_summary_prediction

# 0.869377	0.044217	0.003321

## SUBMISSION OF TEST DATA

In [None]:
# Refit the chosen configuration on ALL training rows (no validation hold-out).
selected_model = XGBRegressor(learning_rate=0.05, n_estimators=2000)
selected_model.fit(x_train, y_train)

In [None]:
# Predictions for the test rows, on the same scale as `y_train`.
predict_submission = selected_model.predict(x_test)
predict_submission

In [None]:
# Load the sample submission file and show its column names.
submission  = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission.columns.values

In [None]:
# Raise 10 to the predictions to undo the log10 applied to the price, then
# write the submission file.
# NOTE(review): assumes the rows of the sample submission are in the same order
# as the test rows used for prediction - confirm.
submission['SalePrice'] = (10 ** predict_submission)
submission.to_csv('./submission.csv', index=False)
submission.head()
# 0.18071