<a href="https://colab.research.google.com/github/stsibin/ML-projects/blob/master/Final_Regression_Hous_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,FunctionTransformer,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge,ElasticNet,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Locations of the Kaggle "House Prices" CSVs on the mounted Google Drive.
train_csv_path = '/content/drive/My Drive/Colab Notebooks/Kaggle Datasets/House Prices: Advanced Regression Techniques/train.csv'
test_csv_path = '/content/drive/My Drive/Colab Notebooks/Kaggle Datasets/House Prices: Advanced Regression Techniques/test.csv'

# `data` is the labelled training frame; `sub_data` is the unlabelled frame to predict.
data = pd.read_csv(train_csv_path)
sub_data = pd.read_csv(test_csv_path)

# Keep the ids before any later cell drops the 'Id' column.
submission_id = sub_data['Id']
# Handy missing-value overview (disabled):
#data.isna().sum().sort_values(ascending = False).head(20)

In [None]:
# --- Clean the training frame and add engineered features -----------------
# Remove the identifier and the two columns that are not modelled.
data = data.drop(columns=['Id', 'Utilities', 'Street'])

# Missing 'Electrical' entries take the most frequent value.
most_common_electrical = data['Electrical'].mode()[0]
data['Electrical'] = data['Electrical'].fillna(most_common_electrical)

# Code-like / calendar columns are cast to object dtype.
data = data.astype({name: 'object' for name in
                    ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']})

# Columns whose missing values are filled with the string 'None' ...
cal_to_none = ['Alley','Fence','MiscFeature','PoolQC','GarageFinish','GarageQual','GarageCond','GarageType','FireplaceQu','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',]
# ... and columns whose missing values are filled with 0.
cat_to_zero = ['MasVnrArea','GarageYrBlt']
fill_values = {name: 'None' for name in cal_to_none}
fill_values.update({name: 0 for name in cat_to_zero})
for name, filler in fill_values.items():
    data[name] = data[name].fillna(filler)

# Drop rows that exceed any of these per-column limits (NaN never exceeds a limit).
outlier_limits = {
    'LotFrontage': 300,
    'MasVnrArea': 1200,
    'GrLivArea': 4600,
    'GarageArea': 1200,
    'SalePrice': 700000,
}
is_outlier = np.zeros(len(data), dtype=bool)
for name, limit in outlier_limits.items():
    is_outlier |= (data[name] > limit).values
data = data[~is_outlier].reset_index(drop=True)

# The target is modelled on the log1p scale.
data['SalePrice'] = np.log1p(data['SalePrice'])

# NOTE(review): both year columns are object dtype at this point, so this sum
# is object dtype as well — confirm that is intended.
data['YrBltAndRemod'] = data['YearBuilt'] + data['YearRemodAdd']
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
data['Total_sqr_footage'] = (data['BsmtFinSF1'] + data['BsmtFinSF2'] +
                             data['1stFlrSF'] + data['2ndFlrSF'])
data['Total_Bathrooms'] = (data['FullBath'] + (0.5 * data['HalfBath']) +
                           data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath']))
data['Total_porch_sf'] = (data['OpenPorchSF'] + data['3SsnPorch'] +
                          data['EnclosedPorch'] + data['ScreenPorch'] +
                          data['WoodDeckSF'])

# 0/1 indicators: 1 when the source column is strictly positive.
for flag_name, source_name in [('haspool', 'PoolArea'),
                               ('has2ndfloor', '2ndFlrSF'),
                               ('hasgarage', 'GarageArea'),
                               ('hasbsmt', 'TotalBsmtSF'),
                               ('hasfireplace', 'Fireplaces')]:
    data[flag_name] = (data[source_name] > 0).astype('int64')

In [None]:
def preprocessing(df):
    """Clean a House Prices frame and add the engineered feature columns.

    The steps are: drop 'Id', 'Utilities' and 'Street'; fill missing
    'Electrical' with its most frequent value; cast code-like / calendar
    columns to object dtype; fill selected columns with 'None' or 0; add the
    aggregate area / bathroom / porch columns and the 0/1 ``has*`` flags.

    'SalePrice' is neither required nor transformed here, so the same
    function serves both the labelled and the unlabelled CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw competition columns. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaning applied and the extra columns appended.
    """
    # `drop` returns a new frame, so every assignment below leaves the caller's
    # object untouched. `errors='ignore'` lets the function run again on an
    # already-processed frame instead of raising KeyError.
    df = df.drop(columns=['Id', 'Utilities', 'Street'], errors='ignore')
    #df['SalePrice'] = np.log1p(df['SalePrice'])

    # Most frequent value; skipped when the column has no observed value at all
    # (mode() is empty in that case and [0] would raise IndexError).
    electrical_mode = df['Electrical'].mode()
    if not electrical_mode.empty:
        df['Electrical'] = df['Electrical'].fillna(electrical_mode[0])

    for col in ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']:
        df[col] = df[col].astype('object')

    cal_to_none = ['Alley','Fence','MiscFeature','PoolQC','GarageFinish','GarageQual','GarageCond','GarageType','FireplaceQu','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',]
    cat_to_zero = ['MasVnrArea','GarageYrBlt']
    for col in cal_to_none:
        df[col] = df[col].fillna('None')
    for col in cat_to_zero:
        df[col] = df[col].fillna(0)

    # NOTE(review): both operands are object dtype here, so the sum is object
    # dtype too — confirm that is intended.
    df['YrBltAndRemod'] = df['YearBuilt'] + df['YearRemodAdd']
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

    df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                               df['1stFlrSF'] + df['2ndFlrSF'])

    df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                             df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

    df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                            df['EnclosedPorch'] + df['ScreenPorch'] +
                            df['WoodDeckSF'])

    # 0/1 indicators: 1 when the source column is strictly positive
    # (a missing value compares False and therefore maps to 0).
    flag_sources = [('haspool', 'PoolArea'),
                    ('has2ndfloor', '2ndFlrSF'),
                    ('hasgarage', 'GarageArea'),
                    ('hasbsmt', 'TotalBsmtSF'),
                    ('hasfireplace', 'Fireplaces')]
    for flag, source in flag_sources:
        df[flag] = (df[source] > 0).astype('int64')

    return df

In [None]:
# num_cols = data.dtypes[data.dtypes != 'object'].index
# corrmat = data[num_cols].corr()
# #print(corrmat.shape)
# top15_cols = corrmat.nlargest(15, 'SalePrice')['SalePrice'].index

In [None]:
# corrmat_top15 = data[top15_cols].corr()
# fig, ax = plt.subplots(figsize = (20,20))
# sns.heatmap(corrmat_top15, fmt='.2f', annot=True, annot_kws={'size': 10})

In [None]:
# fig, ax = plt.subplots(3,2, figsize = (15,15))
# sns.scatterplot(data['SalePrice'], data['GarageCars'],ax = ax[0][0])
# sns.scatterplot(data['SalePrice'], data['GarageArea'],ax =ax[0][1])
# sns.scatterplot(data['SalePrice'], data['TotalBsmtSF'],ax = ax[1][0])
# sns.scatterplot(data['SalePrice'], data['1stFlrSF'],ax = ax[1][1])
# sns.scatterplot(data['SalePrice'], data['TotRmsAbvGrd'],ax = ax[2][0])
# sns.scatterplot(data['SalePrice'], data['GrLivArea'],ax = ax[2][1])


In [None]:
# `data` was already cleaned by the inline preprocessing cell, so
# preprocessing(data) is intentionally not called again here.
features = data.drop(columns='SalePrice')
target = data['SalePrice']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42)

# Object-dtype columns are treated as categorical, everything else as numeric.
is_object_col = (X_train.dtypes == 'object').values
cat_cols = X_train.columns[is_object_col]
num_cols = X_train.columns[~is_object_col]

In [None]:
# Numeric columns that get a dedicated skew-correcting pipeline, grouped by
# the transformation that is applied to them.
Skew_pos_recip = ['MasVnrArea',]
Skew_adj_log0_01 = ['1stFlrSF']
Skew_adj_log0_1 = ['GrLivArea','Total_sqr_footage']
Skew_cbrt = ['TotRmsAbvGrd','Fireplaces','TotalSF']
Skew_sqrt = ['Total_Bathrooms']
to_correct = Skew_pos_recip+Skew_adj_log0_01+Skew_adj_log0_1+Skew_cbrt+Skew_sqrt

# Every remaining numeric column goes through the plain numeric pipeline.
# An order-preserving filter is used instead of a set difference: iterating a
# set of strings yields a hash-dependent order, which would make the feature
# column order (and anything sensitive to it) vary between kernel restarts.
_skewed = set(to_correct)
num_cols = [col for col in num_cols if col not in _skewed]

In [None]:
class CorrLog(BaseEstimator, TransformerMixin):
    """Mean-normalised, offset log transform.

    ``transform`` computes ``log(clip(X, lower, None) / mean + k)`` where
    ``mean`` is learned in ``fit``.

    Parameters
    ----------
    lower : float, default=0.001
        Values below this are raised to it before the division.
    k : float, default=0.2
        Offset added to the mean-normalised value before taking the log.

    Attributes
    ----------
    mean : ndarray of shape (n_columns,) or scalar, or None before ``fit``
        Per-column mean of the data seen in ``fit``.
    """
    def __init__(self, lower=0.001, k=0.2):
        self.lower = lower
        self.k = k
        self.mean = None
    def fit(self, X, y=None):
        """Learn the per-column mean of ``X``; ``y`` is ignored."""
        # axis=0 keeps one mean per column. A bare np.mean(X) pools every
        # column of an ndarray into a single scalar, so features with different
        # scales were normalised by a shared mean, and the result differed
        # depending on whether X arrived as a DataFrame or as an ndarray.
        self.mean = np.asarray(X, dtype=float).mean(axis=0)
        return self
    def transform(self, X):
        """Return ``log(clip(X, lower, None) / mean + k)`` (broadcast per column)."""
        return np.log(np.clip(X, self.lower, None) / self.mean + self.k)

class PosReciprocal(BaseEstimator, TransformerMixin):
    """Reciprocal of the input after clipping it from below.

    ``transform`` returns ``1 / clip(X, k, None)`` element-wise, so zero and
    negative inputs map to ``1 / k`` instead of dividing by zero.

    Parameters
    ----------
    k : float, default=0.01
        Lower bound applied to ``X`` before taking the reciprocal.
    """
    def __init__(self, k=0.01):
        self.k=k
    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self
    def transform(self, X):
        """Return the element-wise reciprocal of ``X`` clipped at ``k``."""
        # Apply the reciprocal exactly once. The previous `for col in X` loop
        # iterated over the ROWS of an ndarray and re-applied the clipped
        # reciprocal once per row, so the output depended on the parity of the
        # row count (and could differ between fit data and prediction data).
        # It also ignored `k` in favour of a hard-coded 0.01.
        return 1.0 / np.clip(X, self.k, None)


In [None]:
def _numeric_pipeline(*middle_steps):
    """Build mean-impute -> (optional middle steps) -> standard-scale pipeline."""
    return Pipeline([('imp', SimpleImputer(strategy='mean')),
                     *middle_steps,
                     ('scaler', StandardScaler())])


def _dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    Newer scikit-learn releases call the dense/sparse switch `sparse_output`;
    older ones call it `sparse`. Try the new name first and fall back so the
    cell runs on either.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# Plain numeric columns: impute, then scale.
num_pipe = _numeric_pipeline()
# Offset-log corrections with two different offsets.
num_log_pipe0_01 = _numeric_pipeline(('corrlog0_01', CorrLog(k=0.01)))
num_log_pipe0_1 = _numeric_pipeline(('corrlog0_1', CorrLog(k=0.1)))
# Root corrections.
num_cbrt = _numeric_pipeline(('cbrt', FunctionTransformer(np.cbrt, validate=False)))
num_sqrt = _numeric_pipeline(('sqrt', FunctionTransformer(np.sqrt, validate=False)))
# Clipped reciprocal correction.
num_pos_reciprocal = _numeric_pipeline(('pos_reciprocal', PosReciprocal()))
# Categorical columns: most-frequent imputation, then one-hot encoding.
cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', _dense_one_hot_encoder())
])

In [None]:
# Route each column group to its pipeline. Insertion order is kept, so the
# output blocks appear in exactly this order.
column_routes = {
    'num': (num_pipe, num_cols),
    'cat': (cat_pipe, cat_cols),
    'num_log_0_01': (num_log_pipe0_01, Skew_adj_log0_01),
    'num_log_0_1': (num_log_pipe0_1, Skew_adj_log0_1),
    'num_cbrt': (num_cbrt, Skew_cbrt),
    'num_sqrt': (num_sqrt, Skew_sqrt),
    'num_pos_reciprocal': (num_pos_reciprocal, Skew_pos_recip),
}
transformers = [(label, pipe, columns)
                for label, (pipe, columns) in column_routes.items()]
transformer = ColumnTransformer(transformers)

In [None]:
# Re-read the complete training CSV and run it through preprocessing() so the
# final models can be fitted on every labelled row.
# NOTE(review): unlike `data`, no outlier rows are removed here — confirm that
# fitting on the unfiltered rows is intended.
fulldata = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle Datasets/House Prices: Advanced Regression Techniques/train.csv')
fulldata = preprocessing(fulldata)
# preprocessing() leaves 'SalePrice' on the raw scale, but y_test is log1p
# transformed and the submission cells apply np.expm1 to the predictions, so
# the training target must be on the log1p scale as well.
y_fulldata = np.log1p(fulldata.pop('SalePrice').values)

In [None]:
# Ridge regression on top of the shared preprocessing transformer.
# `normalize=False` is no longer passed: it was the default and the argument
# has been removed from newer scikit-learn releases.
ridge = Ridge(alpha=30.90,
              fit_intercept=True,
              max_iter=1000,
              random_state=42,
              solver='auto',
              tol=0.001)
pipe_ridge = Pipeline([('tf', transformer),
                       ('regressor', ridge)])
pipe_ridge.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_ridge.predict(X_test)))

0.09805161224457193

In [None]:

# Elastic-net regression on top of the shared preprocessing transformer.
# `normalize=False` is no longer passed: it was the default and the argument
# has been removed from newer scikit-learn releases.
elasticnet = ElasticNet(alpha=0.0009,
                        copy_X=True,
                        fit_intercept=True,
                        l1_ratio=0.5,
                        max_iter=1000,
                        positive=False,
                        precompute=False,
                        random_state=None,
                        selection='cyclic',
                        tol=0.0001,
                        warm_start=False)
pipe_elasticnet = Pipeline([('tf', transformer),
                            ('regressor', elasticnet)])
pipe_elasticnet.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_elasticnet.predict(X_test)))

0.09721933057362546

In [None]:
# Lasso regression on top of the shared preprocessing transformer.
# `normalize=False` is no longer passed: it was the default and the argument
# has been removed from newer scikit-learn releases.
lasso = Lasso(alpha=0.0006,
              copy_X=True,
              fit_intercept=True,
              max_iter=1000,
              positive=False,
              precompute=False,
              random_state=None,
              selection='cyclic',
              tol=0.0001,
              warm_start=False)
pipe_lasso = Pipeline([('tf', transformer),
                       ('regressor', lasso)])
pipe_lasso.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_lasso.predict(X_test)))

0.09897504244582817

In [None]:
# Support-vector regression on top of the shared preprocessing transformer.
svr = SVR(C=4.7521,
          epsilon=0.00012,
          gamma=0.000637)
pipe_svr = Pipeline([('tf', transformer),
                     ('regressor', svr)])
pipe_svr.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_svr.predict(X_test)))

0.10139101997390464

In [None]:
# LightGBM regressor on top of the shared preprocessing transformer.
# NOTE(review): `subsample` and `bagging_fraction` look like aliases of the
# same LightGBM setting but carry different values here — confirm which one
# is meant to take effect.
lgbm = LGBMRegressor(max_depth=150,
                     feature_fraction=0.68118,
                     bagging_freq=3,
                     bagging_fraction=0.27943,
                     lambda_l2=0.047509,
                     learning_rate=0.009357,
                     num_leaves=231,
                     n_estimators=1172,
                     max_bin=271,
                     subsample=0.47571,
                     min_data_in_leaf=5,
                     min_sum_hessian_in_leaf=5)
pipe_lgbm = Pipeline([('tf', transformer),
                      ('regressor', lgbm)])
pipe_lgbm.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_lgbm.predict(X_test)))

0.039414673540003504

In [None]:
# XGBoost regressor on top of the shared preprocessing transformer.
# Compared with the original argument list:
#   * objective 'reg:linear' is the deprecated name of 'reg:squarederror';
#   * early_stopping_rounds is dropped because fit() below is given no
#     eval_set, so there is nothing to stop early on;
#   * silent / nthread / seed / missing were passed as None (i.e. left at
#     their defaults) and are deprecated or removed in newer XGBoost; the
#     seed is still fixed through random_state.
xgboost = XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       colsample_bylevel=1,
                       colsample_bynode=1,
                       colsample_bytree=0.5,
                       gamma=0.0006,
                       importance_type='gain',
                       learning_rate=0.01,
                       max_delta_step=0,
                       max_depth=3,
                       min_child_weight=11,
                       n_estimators=2500,
                       n_jobs=1,
                       objective='reg:squarederror',
                       random_state=42,
                       reg_alpha=0.0001,
                       reg_lambda=1,
                       scale_pos_weight=1,
                       subsample=0.5,
                       verbosity=1)
pipe_xgboost = Pipeline([('tf', transformer),
                         ('regressor', xgboost)])
pipe_xgboost.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, pipe_xgboost.predict(X_test)))



0.07529981250809115

In [None]:
# Stack the six fitted pipelines; an elastic net combines their
# cross-validated (cv=10) predictions.
estimators = [
    ('ridge', pipe_ridge),
    ('svr', pipe_svr),
    ('lasso', pipe_lasso),
    ('elasticnet', pipe_elasticnet),
    ('lgbm', pipe_lgbm),
    ('xgboost', pipe_xgboost),
]
stack = StackingRegressor(estimators,
                          final_estimator=elasticnet, cv=10, n_jobs=-1)
stack.fit(fulldata, y_fulldata)
# RMSE on the log1p scale. np.sqrt(MSE) is used instead of `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE(review): X_test is drawn from the same train.csv that fulldata was
# built from, so this is not an out-of-sample score.
np.sqrt(mean_squared_error(y_test, stack.predict(X_test)))

0.07497876492158458

# Submit


In [None]:
# Reload the unlabelled CSV, clean it with preprocessing(), and predict with
# the stacked model.
sub_data = preprocessing(
    pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle Datasets/House Prices: Advanced Regression Techniques/test.csv')
)
stacked_prediction = stack.predict(sub_data)
# Invert the log1p target scale with expm1.
final = np.expm1(stacked_prediction)
final

array([124574.17823241, 162761.972728  , 182090.37507387, ...,
       169713.77272661, 113444.79618908, 220607.93671308])

In [None]:
def blend_models_predict(X):
    """Return the fixed-weight blend of the fitted models' predictions for X.

    Weights: elastic net 0.1, ridge 0.2, XGBoost 0.2, LightGBM 0.2 and the
    stacked model 0.3. The lasso and SVR pipelines are not part of the blend.
    """
    weighted_models = [
        (0.1, pipe_elasticnet),
        (0.2, pipe_ridge),
        (0.2, pipe_xgboost),
        (0.2, pipe_lgbm),
        (0.3, stack),
    ]
    contributions = [weight * model.predict(X) for weight, model in weighted_models]
    # Accumulate left to right, starting from the first term.
    blended = contributions[0]
    for part in contributions[1:]:
        blended = blended + part
    return blended

# Blend in target space, then invert the log1p scale with expm1.
blended_prediction = blend_models_predict(sub_data)
final = np.expm1(blended_prediction)
final

array([123738.82533227, 162325.08061645, 180054.49112562, ...,
       166732.05341539, 112687.85551879, 221691.27661591])

In [None]:
# Two-column submission file: the saved ids next to the final predictions.
sub_to_upload = pd.DataFrame({'Id': submission_id, 'SalePrice': final})
sub_to_upload.to_csv('submission_top_15_11|05|2020_10|49.csv', index=False)