# **Importing libraries**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder,RobustScaler, PowerTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, LassoLars, Lasso, RidgeCV
from sklearn.compose import ColumnTransformer
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
import lightgbm as lgb

from scipy import stats
from scipy.stats import norm, skew, boxcox_normmax
from scipy.special import boxcox1p

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

import warnings
warnings.filterwarnings('ignore')

# Loading the Data

In [None]:
# Read the training set, the test set and the sample submission template,
# in that order, from the Kaggle input directory.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv",
    )
)

In [None]:
# Remove the Id column from both frames (in place) so it is not used as a feature.
for frame in (train, test):
    frame.drop(columns="Id", inplace=True)

# Outliers

In [None]:
# Living area against sale price, to look for outliers by eye.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()

In [None]:
# Rows to discard: (large living area AND low price) OR a price below 36000.
# `&` binds tighter than `|`, so the two conditions are named separately to
# make that grouping explicit.
large_but_cheap = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
very_low_price = train['SalePrice'] < 36000
outlier_rows = train[large_but_cheap | very_low_price].index
train = train.drop(outlier_rows)

# Same scatter again, now without the dropped rows.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()

# Target value transformation

In [None]:
def plot_dist(var):
    """Plot a sample's distribution against a fitted normal, then its Q-Q plot.

    Parameters
    ----------
    var : array-like
        One-dimensional numeric sample (for example a pandas Series).

    Two figures are produced: a density histogram with a KDE and the pdf of
    the normal distribution fitted by ``norm.fit``, and a normal probability
    (Q-Q) plot from ``scipy.stats.probplot``. Nothing is returned.
    """
    (mu, sigma) = norm.fit(var)

    # Density-scaled histogram + KDE; `histplot` replaces the deprecated
    # `distplot`, so the fitted normal pdf is drawn explicitly below.
    fig, ax = plt.subplots()
    sns.histplot(var, stat='density', kde=True, ax=ax)
    grid = np.linspace(np.min(var), np.max(var), 200)
    # Raw string: `\m` and `\s` are invalid escapes in a normal literal.
    # The label is attached to the fitted curve itself so the legend entry
    # points at the right line.
    ax.plot(grid, norm.pdf(grid, mu, sigma), color='black',
            label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
    ax.legend(loc='best')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution')

    #QQ-plot
    plt.figure()
    stats.probplot(var, plot=plt)
    plt.show()

plot_dist(train['SalePrice'])

**Non-normality disappears after log(1+x) transformation**

In [None]:
# Replace SalePrice with log(1 + SalePrice) and plot the transformed values.
log_sale_price = np.log1p(train["SalePrice"])
train["SalePrice"] = log_sale_price
plot_dist(log_sale_price)

# Imputing missing values in the independent variables

In [None]:
# Keep the split sizes and the target vector, then stack train and test into
# one frame (without the target) so later cells transform both together.
ntrain, ntest = len(train), len(test)
y_train = train["SalePrice"].values
all_data = (
    pd.concat([train, test])
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)
print("all_data size is : {}".format(all_data.shape))

In [None]:
# Percentage of missing entries per column; keep only columns with at least
# one missing value, largest first, capped at 30 columns.
null_pct = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = null_pct[null_pct != 0].sort_values(ascending=False).head(30)
missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

In [None]:
# Columns whose missing entries are replaced by the literal category 'None'.
none_fill_cols = (
    "FireplaceQu", "Fence", "Alley", "MiscFeature", "PoolQC", 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', "MasVnrType", 'MSSubClass',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
)
# Columns whose missing entries are replaced by 0.
zero_fill_cols = (
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
    "MasVnrArea",
)
# Columns whose missing entries are replaced by the column's most frequent value.
mode_fill_cols = (
    'MSZoning', 'Electrical', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'SaleType',
)

for columns, fill_value in ((none_fill_cols, 'None'), (zero_fill_cols, 0)):
    for name in columns:
        all_data[name] = all_data[name].fillna(fill_value)

# LotFrontage: fill with the median LotFrontage of the row's Neighborhood.
frontage_by_hood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = frontage_by_hood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

for name in mode_fill_cols:
    most_common = all_data[name].mode()[0]
    all_data[name] = all_data[name].fillna(most_common)

all_data["Functional"] = all_data["Functional"].fillna("Typ")

# Utilities is removed from the feature set entirely.
all_data = all_data.drop(columns=['Utilities'])

# Remaining null counts per column, largest first (displayed as the cell output).
all_data.isnull().sum().sort_values(ascending=False)

# Feature engineering

In [None]:
# 0/1 flags: does the house have any garage / basement area?
all_data['exists_garage'] = (all_data['GarageArea'] > 0).astype('int64')
all_data['exists_bsmt'] = (all_data['TotalBsmtSF'] > 0).astype('int64')

# Combined quality/condition score and bathroom count (half baths count 0.5).
all_data["OverallGrade"] = all_data["OverallQual"] * all_data["OverallCond"]
all_data['Total_Bath'] = (
    all_data['FullBath']
    + (0.5 * all_data['HalfBath'])
    + all_data['BsmtFullBath']
    + (0.5 * all_data['BsmtHalfBath'])
)

# Collapse OverallCond into three buckets: 1 = bad, 2 = average, 3 = good.
cond_buckets = {
    1: 1, 2: 1, 3: 1,
    4: 2, 5: 2, 6: 2,
    7: 3, 8: 3, 9: 3, 10: 3,
}
all_data["SimplOverallCond"] = all_data["OverallCond"].replace(cond_buckets)

# Total floor area: basement plus first and second floors.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Cast these columns to strings so they are picked up as object (categorical)
# features below. This must come after the arithmetic above, which uses
# several of them as numbers.
all_data['MSSubClass'] = all_data['MSSubClass'].map(str)
for name in ('OverallCond', 'MoSold', 'OverallQual', 'GarageYrBlt',
             'GarageCars', 'BedroomAbvGr', 'HalfBath'):
    all_data[name] = all_data[name].astype(str)

# Split the column names by dtype: object columns vs. everything else.
is_object = (all_data.dtypes == "object").to_numpy()
object_feats = all_data.columns[is_object].tolist()
numeric_feats = all_data.columns[~is_object].tolist()

# Eliminating Skewness of features

In [None]:
# Sample skewness of every non-object column (nulls ignored).
skewed_feats = all_data[numeric_feats].apply(lambda column: skew(column.dropna()))

# Columns whose absolute skewness exceeds 0.5 (either direction) get log(1 + x).
is_skewed = skewed_feats.abs() > 0.5
sk = skewed_feats[is_skewed].index.to_list()
logged = np.log1p(all_data[sk])
all_data[sk] = logged

# Training and Predicting

In [None]:
# Undo the earlier stacking: the first `ntrain` rows are the training set,
# the remainder is the test set.
train, test = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

In [None]:
# Object columns are one-hot encoded (categories unseen at fit time are
# ignored at transform time); all other columns go through RobustScaler.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), object_feats)
numeric_step = ('num', RobustScaler(), numeric_feats)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [None]:
# XGBoost regressor. `verbosity=0` and `n_jobs=-1` are the current spellings
# of the legacy `silent=1` and `nthread=-1` arguments, which recent XGBoost
# releases no longer recognise as estimator parameters.
xg = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                      learning_rate=0.05, max_depth=3,
                      min_child_weight=1.7817, n_estimators=2200,
                      reg_alpha=0.4640, reg_lambda=0.8571,
                      subsample=0.5213, verbosity=0,
                      random_state=7, n_jobs=-1)

# Pipeline: preprocess -> degree-2 polynomial expansion -> keep the features
# chosen by SelectFromModel using random-forest importances -> XGBoost.
clf_xg = Pipeline(steps=[
                    ('pre', preprocessor),
                    ('poly', PolynomialFeatures(2)),
                    ('selection', SelectFromModel(estimator=RandomForestRegressor(n_estimators=300, random_state=1))),
                    ('a', xg),
                    ])

clf_xg.fit(train, y_train)

In [None]:
# LightGBM hyper-parameters, collected in one mapping for readability.
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
lg = lgb.LGBMRegressor(**lgb_params)

# Preprocess, then fit LightGBM directly on the encoded/scaled features.
clf_lg = Pipeline(steps=[('pre', preprocessor), ('a', lg)])

clf_lg.fit(train, y_train)

In [None]:
# Base learners for the stack. Every entry needs a non-empty, unique name:
# the name is how the fitted estimator is reached through `named_estimators_`
# and how its parameters are addressed in `set_params` / grid-search keys.
estimators = [
    ('random_forest', RandomForestRegressor(n_estimators=300, random_state=1)),
    ('kernel_ridge', KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)),
    ('Boosting', GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                           max_features='sqrt', min_samples_leaf=15,
                                           min_samples_split=10, loss='huber', random_state=1)),
    ('elasticnet', ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)),
    ('lasso', Lasso(alpha=0.0005, random_state=1)),
]

# A random forest combines the base learners' predictions.
stack_reg = StackingRegressor(estimators=estimators,
                              final_estimator=RandomForestRegressor(n_estimators=500, random_state=1),
                              n_jobs=-1)

clf = Pipeline(steps=[
                    ('pre', preprocessor),
                    ('a', stack_reg),
                    ])
clf.fit(train, y_train)

In [None]:
#clf.get_params()

In [None]:
# Tune the number of trees in the stack's final random forest.
param = {
    'a__final_estimator__n_estimators': [200, 300, 400, 500, 600],
}
# This is a regression problem, so score with (negated) RMSE. 'accuracy' is
# a classification metric and cannot be computed on a continuous target,
# which leaves every candidate with a NaN score.
# y_train holds log1p(SalePrice), so the reported RMSE is on that log scale.
Stack = GridSearchCV(clf, param, scoring='neg_root_mean_squared_error', cv=10).fit(train, y_train)
print(Stack.best_estimator_)
print('best score:')
# Negative RMSE: values closer to 0 are better.
print(Stack.best_score_)

In [None]:
# Map each model's prediction back from the log1p scale with expm1, then
# blend: 50% tuned stack, 25% XGBoost pipeline, 25% LightGBM pipeline.
stack_price = np.expm1(Stack.best_estimator_.predict(test))
xgb_price = np.expm1(clf_xg.predict(test))
lgb_price = np.expm1(clf_lg.predict(test))
predictions = 0.5*stack_price + 0.25*xgb_price + 0.25*lgb_price

In [None]:
# Put the blended predictions into the submission template and write it out
# without the index column.
sub = sub.assign(SalePrice=predictions)
sub.to_csv('submission.csv', index=False)