In [None]:
import numpy as np 
import pandas as pd 
# Raise pandas' display limits so wide/long frames are not truncated when shown.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from datetime import datetime
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the competition's train and test tables.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Data processing

In [None]:
# Remove the 'Id' column from both frames.
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [None]:
# axList = train["SalePrice"].hist(bins=25, figsize=(4,4))

In [None]:
# Replace SalePrice with log(1 + SalePrice) to reduce its skew towards large values.
# NOTE: the column is overwritten in place, so re-running this cell applies log1p a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])
# Target vector on the log1p scale, with a fresh 0..n-1 index.
y = train['SalePrice'].reset_index(drop=True)
# axList = train["SalePrice"].hist(bins=25, figsize=(4,4))

In [None]:
# Pearson correlation between every numeric column and the (log-transformed) SalePrice.
# Non-numeric columns are filtered out explicitly: since pandas 2.0 DataFrame.corr no
# longer drops them silently and raises when it meets string data.
corr_matrix = train.select_dtypes(include=[np.number]).corr(method='pearson')
# Correlations with the target, weakest/most negative first.
corr_matrix['SalePrice'].sort_values(kind="quicksort")

In [None]:
train["BsmtFinSF1"].hist(bins=30, figsize=(15,2))

In [None]:
# Box plots of the basement / ground-floor / living-area columns.
train.boxplot(
    column=['BsmtFinSF1', '1stFlrSF', 'TotalBsmtSF', 'GrLivArea'],
    figsize=(15, 7),
)

In [None]:
# Box plots of the unfinished-basement, second-floor, veneer and garage area columns.
train.boxplot(
    column=['BsmtUnfSF', '2ndFlrSF', 'MasVnrArea', 'GarageArea'],
    figsize=(15, 7),
)

In [None]:
# Show the training rows whose 2ndFlrSF value is above 1500.
train[train['2ndFlrSF'] > 1500]

In [None]:
# Box plots of the open-porch, wood-deck and lot-frontage columns.
train.boxplot(
    column=['OpenPorchSF', 'WoodDeckSF', 'LotFrontage'],
    figsize=(15, 7),
)

In [None]:
# Box plots of the small-integer count / rating columns.
count_columns = [
    'KitchenAbvGr', 'BsmtHalfBath', 'BedroomAbvGr', 'BsmtFullBath', 'HalfBath',
    'Fireplaces', 'TotRmsAbvGrd', 'FullBath', 'GarageCars', 'OverallQual',
]
train.boxplot(column=count_columns, figsize=(15, 7))

In [None]:
# A cell only renders its last expression, so the BsmtFullBath filter used to be
# evaluated and thrown away. Show both groups of unusual rows in a single frame:
# more than two basement full baths, or an overall quality below 2.
train[(train['BsmtFullBath'] > 2) | (train['OverallQual'] < 2)]

In [None]:
# Box plots of the enclosed, three-season and screen porch columns.
train.boxplot(
    column=['EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
    figsize=(15, 5),
)

In [None]:

# Box plots of the remodel and construction years (YrSold / MoSold are left out).
train.boxplot(
    column=['YearRemodAdd', 'YearBuilt'],
    figsize=(15, 5),
)

In [None]:
# Show the training rows with a YearBuilt before 1900.
train[train['YearBuilt'] < 1900]

In [None]:
# Stack the training predictors (target removed) on top of the test predictors so
# that every cleaning step below is applied to both at once.
train_features = train.drop(columns=['SalePrice'])
test_features = test
features = pd.concat([train_features, test_features], ignore_index=True)

In [None]:
# Exploring NaN
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Returns a DataFrame indexed by column name with two columns,
    'Missing Values' (count of nulls) and '% of Total Values' (share of rows,
    in percent). Only columns with a non-zero share are kept; rows are sorted
    by share in descending order and values are rounded to one decimal.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * df.isnull().sum() / len(df)

    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that actually have something missing.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]

    return summary.sort_values('% of Total Values', ascending=False).round(1)
missing_values_table(features)

In [None]:
# Discard the columns that were more than 48% NaN when first inspected.
mostly_missing = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
features = features.drop(columns=mostly_missing)

In [None]:
# These columns hold category codes rather than quantities. Left as numbers, a model
# would treat them as ordered magnitudes, so store them as strings instead.
for code_column in ('MSSubClass', 'YrSold', 'MoSold'):
    features[code_column] = features[code_column].astype(str)

In [None]:
# Frequency of each level (NaN included) for the columns imputed in the next cell.
for column_name in ('Functional', 'Electrical', 'KitchenQual'):
    print(features[column_name].value_counts(dropna=False))

In [None]:
## Fill each of these columns with a fixed, hand-picked level.
fixed_defaults = {
    'Functional': 'Typ',
    'Electrical': "SBrkr",
    'KitchenQual': "TA",
}
for column_name, default_level in fixed_defaults.items():
    features[column_name] = features[column_name].fillna(default_level)

In [None]:
# Frequency of each level (NaN included) for the columns that get mode-imputed next.
for column_name in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    print(features[column_name].value_counts(dropna=False))

In [None]:
## Fill each of these columns with its own mode, i.e. its most frequent value.
for column_name in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    most_frequent = features[column_name].mode()[0]
    features[column_name] = features[column_name].fillna(most_frequent)

In [None]:
### A missing GarageYrBlt most probably means there is no garage, so the numeric
### garage columns get 0 and the categorical ones get the literal level 'None'.
for numeric_column in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    features[numeric_column] = features[numeric_column].fillna(0)

garage_categories = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
### The basement categoricals are treated the same way.
basement_categories = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for category_column in garage_categories + basement_categories:
    features[category_column] = features[category_column].fillna('None')

In [None]:
"""
# Since these column are actually a category , using a numerical number will lead the model to assume
# that it is numerical , so we convert to string .
features['MSSubClass'] = features['MSSubClass'].apply(str)
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

## Filling these columns With most suitable value for these columns 
features['Functional'] = features['Functional'].fillna('Typ') 
features['Electrical'] = features['Electrical'].fillna("SBrkr") 
features['KitchenQual'] = features['KitchenQual'].fillna("TA") 
features["PoolQC"] = features["PoolQC"].fillna("None")

## Filling these with MODE, i.e. , the most frequent value in these columns .
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0]) 
features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

### Missing data in GarageYrBit most probably means missing Garage, so replace NaN with zero . 
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    features[col] = features[col].fillna(0)
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    features[col] = features[col].fillna('None')

### Same with basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('None')
"""

In [None]:
missing_values_table(features)

In [None]:
# Fill missing MSZoning with the most frequent zoning found within the same
# MSSubClass, on the idea that similar sub-classes share similar zoning.
features['MSZoning'] = (
    features
    .groupby('MSSubClass')['MSZoning']
    .transform(lambda group: group.fillna(group.mode()[0]))
)

# Every remaining gap in an object-typed column becomes the literal level 'None'.
objects = [name for name in features.columns if features[name].dtype == object]
features.update(features[objects].fillna('None'))

In [None]:
missing_values_table(features)

In [None]:
features["LotFrontage"].hist(bins=30, figsize=(15,3))

In [None]:
# LotFrontage gaps are filled with the median LotFrontage of the same Neighborhood.
features['LotFrontage'] = (
    features
    .groupby('Neighborhood')['LotFrontage']
    .transform(lambda group: group.fillna(group.median()))
)

# Every other missing value in a numeric column is filled with 0.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = [name for name in features.columns if features[name].dtype in numeric_dtypes]
features.update(features[numerics].fillna(0))

# Re-scan the numeric columns and measure the skewness of each one.
numerics2 = [name for name in features.columns if features[name].dtype in numeric_dtypes]
skew_features = (
    features[numerics2]
    .apply(lambda column: skew(column))
    .sort_values(ascending=False)
)

# Columns with skewness above 0.5 get a Box-Cox transform of (1 + x); the lambda
# is estimated per column by boxcox_normmax on the shifted values.
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for skewed_column in skew_index:
    fitted_lambda = boxcox_normmax(features[skewed_column] + 1)
    features[skewed_column] = boxcox1p(features[skewed_column], fitted_lambda)

In [None]:
missing_values_table(features)

In [None]:
features

# Feature Engineering

In [None]:
# Frequency of each level (NaN included) for the two columns dropped in the next cell.
for column_name in ('Utilities', 'Street'):
    print(features[column_name].value_counts(dropna=False))

In [None]:
# Remove two more features judged not very useful: 'Utilities' and 'Street'.
features = features.drop(columns=['Utilities', 'Street'])


In [None]:
# Frequency of each value (NaN included) in the two masonry-veneer columns.
for column_name in ('MasVnrType', 'MasVnrArea'):
    print(features[column_name].value_counts(dropna=False))

In [None]:
# - droped because ... : 'MasVnrType'
# features = features.drop(['MasVnrType'], axis=1)

In [None]:
# New features built as sums of existing columns.
features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (
    features['BsmtFinSF1'] + features['BsmtFinSF2']
    + features['1stFlrSF'] + features['2ndFlrSF']
)

# Half baths count for 0.5 each.
features['Total_Bathrooms'] = (
    features['FullBath'] + (0.5 * features['HalfBath'])
    + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath'])
)

features['Total_porch_sf'] = (
    features['OpenPorchSF'] + features['3SsnPorch']
    + features['EnclosedPorch'] + features['ScreenPorch']
    + features['WoodDeckSF']
)

# Presence flags: 1 when the source column is greater than 0, otherwise 0
# (e.g. PoolArea == 0 gives haspool == 0).
presence_flags = [
    ('haspool', 'PoolArea'),
    ('has2ndfloor', '2ndFlrSF'),
    ('hasgarage', 'GarageArea'),
    ('hasbsmt', 'TotalBsmtSF'),
    ('hasfireplace', 'Fireplaces'),
]
for flag_name, source_column in presence_flags:
    features[flag_name] = (features[source_column] > 0).astype('int64')

features.shape

In [None]:
features

In [None]:
########################################################
#  get_dummies converts categorical data to numerical, #
#  as models don't work with text data                 #
########################################################

# One-hot encode the combined train+test frame in one go, so both parts end up
# with the same dummy columns, then reset the index to a fresh 0..n-1 range.
final_features = pd.get_dummies(features).reset_index(drop=True)
# (rows, columns) after encoding.
final_features.shape

In [None]:
final_features

In [None]:
# All preprocessing is done, so split the combined frame back into its train and
# test parts. The training rows come first; len(y) marks where they end.
n_train_rows = len(y)
X = final_features.iloc[:n_train_rows]
X_sub = final_features.iloc[n_train_rows:]
X.shape, y.shape, X_sub.shape

In [None]:
# Show the rows at the positions that the next cell removes as outliers.
final_features.iloc[[30,88,462, 631, 1322]]

In [None]:
# Remove outlier rows from the training data; they can be seen by plotting the data.
# The list holds row positions, which are mapped to index labels before dropping.
outliers = [30, 88, 462, 631, 1322]

# Alternative list obtained from a RandomForest:
# outliers = [30, 39, 58, 88, 108, 307, 375, 462, 520, 523, 533, 631, 635, 636, 705, 769, 778, 828, 954, 1179, 1218, 1219, 1298, 1322, 1337]

X = X.drop(index=X.index[outliers])
y = y.drop(index=y.index[outliers])

In [None]:
# Collect the columns in which a single value fills more than 99.94% of the
# training rows. value_counts() sorts by frequency, so .iloc[0] is the count of
# the most common value.
overfit = [
    column_name
    for column_name in X.columns
    if X[column_name].value_counts().iloc[0] / len(X) * 100 > 99.94
]

# Drop those columns from both the training and the submission matrices.
X = X.drop(columns=overfit)
X_sub = X_sub.drop(columns=overfit)
overfit

In [None]:
X.shape, y.shape, X_sub.shape

In [None]:
X_sub

# Final Step

In [None]:
# K-Folds cross-validator: provides train/test indices to split the data into
# train/test sets. It is shared by cv_rmse and by the *CV estimators below.


# Error functions are defined right after it for convenient reuse.

# 10 shuffled folds; the fixed random_state keeps the split identical between runs.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied here; the result is a log-scale error only when the
    inputs are already on a log scale.
    """
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)

def cv_rmse(model, X=X):
    """Return the per-fold RMSE of ``model`` cross-validated on ``X`` and the global ``y``.

    The folds come from the module-level ``kfolds`` splitter. Note that the
    default for ``X`` is bound once, when this function is defined.
    """
    negative_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds)
    return np.sqrt(-negative_mse)

# Candidate regularisation strengths searched by RidgeCV.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5,16,17,20,25,30,35,40,50,55,60,65,70,80,90]
# Candidate regularisation strengths searched by LassoCV.
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009]

# Candidate alphas and L1 ratios searched by ElasticNetCV.
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.002, 0.003, 0.004, 0.005]
e_l1ratio = [0.08, 0.1, 0.3, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95, 0.99, 1]

# svr_c = [11,13,15,17,19,21,23,25]
# svr_epsilon = [0.007,0.008,0.009]
# svr_gamma = [0.0003,0.0004,0.0005,0.0006]

# Linear models and the SVR are wrapped in a pipeline with RobustScaler; the tree
# ensembles (gbr, rfr) are fed the features unscaled.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
# max_iter has to be an integer: the float literal 1e7 is rejected by the parameter
# validation of current scikit-learn releases when the estimator is fitted.
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)

rfr = RandomForestRegressor(n_estimators=100, random_state=42)

# LightGBM regressor: 5000 small trees (4 leaves each) at a low learning rate,
# with row bagging and feature sub-sampling, both seeded with 7.
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

# XGBoost regressor: 3460 depth-3 trees at a low learning rate, with 70% row and
# column sub-sampling. 'reg:squarederror' is the current name of the squared-error
# objective; the former spelling 'reg:linear' is a deprecated alias of it.
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)




# Stacking



In [None]:
# Stacking - combine multiple regression models via a meta-regressor.

# In the standard stacking procedure, the first-level regressors are fit to the same
# training set that is used to prepare the inputs for the second-level regressor,
# which may lead to overfitting. The StackingCVRegressor, however, uses the concept
# of out-of-fold predictions: the dataset is split into k folds, and in k successive
# rounds k-1 folds are used to fit the first-level regressors, which then predict the
# remaining fold. Those held-out predictions are stacked and become the training
# input of the second-level regressor.

# In simple words, stacking avoids fitting on the same data twice,
# and is effective in reducing overfitting.

# random_state seeds the shuffled out-of-fold split so that repeated runs give
# the same stacked model.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, svr, elasticnet, gbr, xgboost, lightgbm, rfr),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,
                                random_state=42)

# Compare the base models by cross-validated RMSE (mean and standard deviation
# over the folds), printing a timestamp after each one.
# X is passed explicitly for every model: relying on cv_rmse's default would use
# whatever X was bound when cv_rmse was defined, which can go stale in a notebook.
models_to_score = [
    ("Random Forest", rfr),
    ("RIDGE", ridge),
    ("LASSO", lasso),
    ("elastic net", elasticnet),
    ("SVR", svr),
    ("lightgbm", lightgbm),
    ("gbr", gbr),
    ("xgboost", xgboost),
]

for model_label, candidate in models_to_score:
    score = cv_rmse(candidate, X)
    print("{}: {:.4f} ({:.4f})\n".format(model_label, score.mean(), score.std()), datetime.now(), )

In [None]:
# Fit every model on the full training set (X, y). A label is printed before each
# fit so progress is visible. Each *_full_data name holds whatever fit() returns.

# The stacked model is fitted on plain NumPy arrays rather than on the DataFrame/Series.
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

print('Random Forest')
random_forest_model_full_data = rfr.fit(X, y)

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)

print('Lasso')
lasso_model_full_data = lasso.fit(X, y)

print('Ridge')
ridge_model_full_data = ridge.fit(X, y)

print('Svr')
svr_model_full_data = svr.fit(X, y)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)

print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)

In [None]:
# Sanity check: mean of the stacked model's predictions on the training rows.
# The value is on the log1p scale, because y was log1p-transformed.
stack_gen_model.predict(np.array(X)).mean()

# Blending Models / 'Ensembling'



In [None]:
# Notice that we are using a few percent from different models to get our final answer, 
# all decimals add up to 1

def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions for ``X``.

    The weights add up to 1: 0.08 for each of the elastic net, lasso, ridge,
    SVR and random forest models, 0.1 for each of the gradient boosting,
    XGBoost and LightGBM models, and 0.30 for the stacked model. The stacked
    model is given ``X`` as a NumPy array.
    """
    weighted_models = [
        (0.08, elastic_model_full_data, X),
        (0.08, lasso_model_full_data, X),
        (0.08, ridge_model_full_data, X),
        (0.08, svr_model_full_data, X),
        (0.08, random_forest_model_full_data, X),
        (0.1, gbr_model_full_data, X),
        (0.1, xgb_model_full_data, X),
        (0.1, lgb_model_full_data, X),
        (0.30, stack_gen_model, np.array(X)),
    ]
    contributions = (weight * model.predict(inputs) for weight, model, inputs in weighted_models)

    # Accumulate left to right, starting from the first contribution.
    blended = next(contributions)
    for contribution in contributions:
        blended = blended + contribution
    return blended

# Error of the blend on the same rows (X, y) the models were fitted on, so this is
# an in-sample figure rather than a validation score.
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

In [None]:
print('Predict submission')
# Start from the sample submission file so the layout of the output is kept.
submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
# expm1 undoes the log1p applied to SalePrice before training. The column at
# position 1 is overwritten - presumably that is 'SalePrice'; confirm against the file.
submission.iloc[:,1] = (np.expm1(blend_models_predict(X_sub)))

## Submission

In [None]:
# Cut-offs for the extreme ends of the predicted prices (both taken before any adjustment).
q1 = submission['SalePrice'].quantile(0.0042)
q2 = submission['SalePrice'].quantile(0.99)

# Predictions that are not above q1 are scaled by 0.77; afterwards, predictions
# that are not below q2 are scaled by 1.1. Everything in between is kept as is.
prices = submission['SalePrice']
prices = prices.where(prices > q1, prices * 0.77)
prices = prices.where(prices < q2, prices * 1.1)
submission['SalePrice'] = prices

submission.to_csv("submission.csv", index=False)

In [None]:
submission.head()

In [None]:
"""
sample_submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,20),linewidth=2)

ax1 = fig.add_subplot(211)
ax1.plot(submission[['SalePrice']])

ax2 = fig.add_subplot(211)
ax2.plot(sample_submission[['SalePrice']])

plt.legend(["my","sample"], fontsize=15)
           
plt.xlabel('ID', fontsize=15);
plt.show()
"""