In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hello Kagglers!
## This is a walkthrough of applying several methods to minimize the prediction error as much as we can, mainly through feature engineering



In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#polynomial
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
df_train

## EDA & Visualizing the data

In [None]:
# Absolute correlation of every numeric feature with SalePrice, strongest first
# (OverallQual is expected near the top).
# - Only numeric columns are correlated: pandas >= 2.0 raises on object columns
#   instead of silently dropping them.
# - abs() is applied before sorting so strong negative correlations are ranked
#   next to strong positive ones.
# - The target is removed by label rather than by position.
corr_y = df_train.select_dtypes(include='number').corr()
corr_y['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

In [None]:
# Numeric (int64 / float64) columns of the training frame, leaving out the Id key.
num_columns = [
    col
    for col in df_train.select_dtypes(include=['int64', 'float64']).columns
    if col != 'Id'
]

In [None]:
# Absolute pairwise correlations between the numeric columns.
corr = df_train[num_columns].corr().abs()

# Mask the upper triangle (diagonal included): the matrix is symmetric, so only
# the lower half needs to be drawn.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(18,8))

# plot heatmap
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1)
plt.show()

## Plotting different features against Prices

### Continuous features

In [None]:
# Scatter SalePrice against the numeric columns, five columns per figure.
df_col = df_train[num_columns]
for start in range(0, len(num_columns), 5):
    sns.pairplot(
        data=df_col,
        x_vars=df_col.columns[start:start + 5],
        y_vars=['SalePrice'],
    )

### Categorical features

In [None]:
# Columns of the training frame stored with the object dtype.
cat_columns = [name for name, dtype in df_train.dtypes.items() if dtype == 'object']

### There appear to be multiple features with a linear relation to the sale price, and other features that are useless

## We start building a baseline model for our linear regression problem using continuous features
### Since we have a relatively small dataset, the model is cross-validated with K-folds = 4


In [None]:
# Baseline design matrix: numeric columns only, missing values replaced by the
# column mean. The frame is built in one expression instead of calling
# fillna(inplace=True) on a frame derived from df_train, which triggers
# SettingWithCopyWarning.
numeric_train = df_train[num_columns]
train = numeric_train.fillna(numeric_train.mean())

# Last column is the target; everything before it is standardised as features.
x_train = preprocessing.scale(train.iloc[:, :-1])
# Kept 2-D (one-column frame): the coefficient cell below indexes lin_reg.coef_[0].
y_train = np.log1p(train.iloc[:, -1:])

# Distribution of the log-transformed target.
# histplot(kde=True) replaces the deprecated sns.distplot.
sns.histplot(y_train.iloc[:, 0], kde=True);

In [None]:
# Fit once on all rows (the fitted coefficients are inspected in a later cell),
# then score the same estimator with 4-fold cross-validation.
lin_reg = LinearRegression().fit(x_train, y_train)
error = cross_val_score(
    lin_reg,
    x_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=4,
)
(error, error.mean())

### Exploring feature importance using coefficients

In [None]:
# Pair each baseline feature with its fitted weight and show the ten largest
# by absolute value.
model = lin_reg
fet = list(num_columns[:-1])  # every numeric column except the last one (the target)
# The target passed to fit() was 2-D, so the weights are read from row 0 of coef_.
val = [model.coef_[0][idx] for idx in range(len(fet))]

coef_table = pd.DataFrame({'features':fet, 'coff':val})
coef_table.sort_values(by='coff', key=abs, ascending=False).head(10)


#### Notice there are very large coefficients, which indicate both importance and the need for regularization

### There seems to be a pattern in predicting observation groups, so we will use shuffled k-folds

## Now we will work on decreasing the error from our baseline model and include all features
#### We start by detecting nan values

In [None]:
# removing outliers didnt help increase model accuracy
# def drop_outliers(df, field_name):
#     iqr = 10 * 1.5 * (np.percentile(df[field_name], 75) - np.percentile(df[field_name], 25))
#     df.drop(df[df[field_name] > (iqr + np.percentile(df[field_name], 75))].index, inplace=True)
#     df.drop(df[df[field_name] < (np.percentile(df[field_name], 25) - iqr)].index, inplace=True)

In [None]:
# print(df_train.shape)
# drop_outliers(df_train, 'LotArea')
# print(df_train.shape)

In [None]:
# Per-column count and fraction of missing values, worst columns first.
null_flags = df_train.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = (null_flags.sum() / null_flags.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
#get columns function
def _transformer_output_names(transformer, input_names, pass_input_to_legacy):
    """Return the output feature names reported by a fitted transformer.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names``. ``pass_input_to_legacy`` says whether the legacy
    method should receive ``input_names`` or be called without arguments.

    Raises AttributeError when the transformer exposes neither method.
    """
    modern = getattr(transformer, 'get_feature_names_out', None)
    if modern is not None:
        return list(modern(input_names))
    legacy = transformer.get_feature_names  # AttributeError if absent
    if pass_input_to_legacy:
        return list(legacy(input_names))
    return list(legacy())


def get_column_names_from_ColumnTransformer(column_transformer):
    """Collect the output column names of a fitted ColumnTransformer.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer
        Only its ``transformers_`` attribute is read; each entry is unpacked
        as ``(name, transformer, columns)``.

    Returns
    -------
    list
        Names contributed by each entry, concatenated in ``transformers_``
        order. For a Pipeline entry the names come from its last step.
        A transformer that cannot report names contributes its input column
        names unchanged.

    Notes
    -----
    The entry named ``'remainder'`` and entries whose transformer is the string
    ``'drop'`` are skipped. The remainder is skipped by name rather than by
    slicing off the last entry, so the last real transformer is kept when
    ``transformers_`` contains no remainder entry.
    """
    col_name = []

    for name, step, columns in column_transformer.transformers_:
        if name == 'remainder':
            continue
        if isinstance(step, str) and step == 'drop':
            # dropped columns contribute nothing to the output
            continue

        raw_col_name = list(columns)

        # if pipeline, the last step determines the output names
        transformer = step.steps[-1][1] if isinstance(step, Pipeline) else step

        try:
            if isinstance(transformer, OneHotEncoder):
                names = _transformer_output_names(transformer, raw_col_name, True)

            elif isinstance(transformer, SimpleImputer) and transformer.add_indicator:
                # the imputer appends one indicator column per feature listed
                # in indicator_.features_
                missing_indicator_indices = transformer.indicator_.features_
                missing_indicators = [raw_col_name[idx] + '_missing_flag'
                                      for idx in missing_indicator_indices]
                names = raw_col_name + missing_indicators

            else:
                names = _transformer_output_names(transformer, raw_col_name, False)

        except AttributeError:
            # the transformer cannot report names: keep the input names
            names = raw_col_name

        col_name.extend(names)

    return col_name

### Checking feature skewness

In [None]:
from scipy.stats import skew
# Skewness of every non-object column of the training frame, most
# right-skewed first.
numeric_feats = df_train.dtypes[df_train.dtypes != "object"].index

# Missing values are dropped per column first: with its default settings
# scipy's skew() returns NaN for any column that contains a NaN.
skewed_feats = df_train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

skewed_feats

In [None]:
def fixing_skewness(df, num_col, threshold=0.5):
    """
    Reduce the skew of numeric columns of ``df`` with a Box-Cox transform, in place.

    Every column in ``num_col`` whose absolute skewness is greater than
    ``threshold`` is replaced by ``boxcox1p(column, lmbda)``, where ``lmbda`` is
    estimated by ``boxcox_normmax`` on ``column + 1``.

    Missing values are ignored when measuring skewness and estimating ``lmbda``
    and stay NaN in the transformed column. Without this, skew() returns NaN
    for a column containing NaN and the column is silently never transformed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; its columns are overwritten in place.
    num_col : list
        Labels of the numeric columns to inspect.
    threshold : float, default 0.5
        Minimum absolute skewness for a column to be transformed.

    Returns
    -------
    None
    """
    ## Import necessary modules
    from scipy.stats import skew
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    for feat in num_col:
        observed = df[feat].dropna()

        # A NaN skewness (e.g. no usable observations) fails this comparison
        # as well, so such columns are left untouched.
        if not abs(skew(observed)) > threshold:
            continue

        lmbda = boxcox_normmax(observed + 1)
        # boxcox1p works element-wise, so NaN entries stay NaN
        df[feat] = boxcox1p(df[feat], lmbda)

#fixing_skewness(alldf,num_columns)

In [None]:
# Distribution of PoolArea in the training data.
# histplot(kde=True) replaces the deprecated sns.distplot.
sns.histplot(df_train['PoolArea'], kde=True);

### Merge both test and train data and start feature engineering

In [None]:
# Stack the training rows on top of the test rows so the feature engineering
# below is applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Both frames
# come straight from read_csv, so without it the test row labels repeat the
# training ones.
alldf = pd.concat([df_train, df_test], ignore_index=True)
alldf.head()

In [None]:
# Convert MSSubClass to strings so the dtype-based column selection below
# treats it as a categorical column.
alldf['MSSubClass'] = alldf['MSSubClass'].map(str)

In [None]:
# Remove columns judged mostly missing or uninformative.
# The frame is re-assigned instead of mutated with inplace=True, and
# errors='ignore' tolerates columns that are already gone, so re-running this
# cell no longer raises KeyError.
# NOTE(review): errors='ignore' also hides a misspelled column name.
columns_to_drop = ['Utilities', 'Street', 'PoolQC', 'MiscFeature', 'Alley',
                   'GarageYrBlt', 'YearRemodAdd']
alldf = alldf.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# New 0/1 flags: 1 when the corresponding area is greater than zero, else 0
# (a missing area gives 0 as well).
alldf['haspool'] = (alldf['PoolArea'] > 0).astype('int64')
alldf['has2ndfloor'] = (alldf['2ndFlrSF'] > 0).astype('int64')
# alldf['hasgarage'] = alldf['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
# alldf['hasbsmt'] = alldf['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
# alldf['hasfireplace'] = alldf['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Names of the engineered 0/1 flags. The column-selection cells below put
# these in the categorical list and keep them out of the numeric list.
newfet = ['haspool','has2ndfloor']

In [None]:
# Above-ground living area per "room", counting rooms, bathrooms and kitchens.
room_count = (alldf["TotRmsAbvGrd"] +
              alldf["FullBath"] +
              alldf["HalfBath"] +
              alldf["KitchenAbvGr"])
alldf["SqFtPerRoom"] = alldf["GrLivArea"] / room_count

# Overall quality and overall condition added into one score.
alldf['Total_Home_Quality'] = alldf['OverallQual'] + alldf['OverallCond']

# Bathroom count with half baths weighted 0.5, basement bathrooms included.
alldf['Total_Bathrooms'] = (alldf['FullBath'] + (0.5 * alldf['HalfBath']) +
                            alldf['BsmtFullBath'] + (0.5 * alldf['BsmtHalfBath']))

# First-floor plus second-floor square footage.
alldf["HighQualSF"] = alldf["1stFlrSF"] + alldf["2ndFlrSF"]

In [None]:
# Categorical inputs: every column that is not int64/float64, plus the
# engineered flags listed in newfet.
cat_columns = [
    col
    for col, dtype in alldf.dtypes.items()
    if not (dtype == 'int64' or dtype == 'float64') or col in newfet
]
cat_columns

In [None]:
# Numeric inputs: int64/float64 columns, excluding the target, the Id key and
# the engineered flags listed in newfet.
not_numeric_inputs = {'SalePrice', 'Id', *newfet}
num_columns = [
    col
    for col, dtype in alldf.dtypes.items()
    if (dtype == 'int64' or dtype == 'float64') and col not in not_numeric_inputs
]
num_columns

In [None]:
# Transform the skewed numeric columns of alldf. fixing_skewness overwrites the
# columns of the frame it is given, so running this cell a second time
# transforms the already-transformed values again.
fixing_skewness(alldf,num_columns)

### Creating a data pipeline to transform numerical and categorical data

In [None]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

In [None]:
num_attribs = num_columns
cat_attribs = cat_columns

# Route the numeric and categorical column lists through their own pipelines.
# Columns in neither list (Id and SalePrice are excluded from both) are left to
# ColumnTransformer's remainder handling, which is not overridden here.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ],
)

In [None]:
# Fit the preprocessing on the combined train+test frame and transform it in
# one go, so the imputers, scaler and encoder see the rows of both splits.
# NOTE(review): presumably a SciPy sparse matrix because of the one-hot
# encoder's default output - confirm before indexing it like a dense array.
housing_f = full_pipeline.fit_transform(alldf)
housing_f.shape

In [None]:
# housing_f = full_pipeline.fit_transform(alldf)
# display(housing_f)

In [None]:
# Output column names of the fitted transformer, in the order the helper
# returns them; used later to label the Lasso coefficients.
a = get_column_names_from_ColumnTransformer(full_pipeline)
# Show the names from position 250 onwards.
a[250:]

In [None]:
#full_pipeline.transformers_[1][1]\
 #   ['encoder'].get_feature_names(cat_attribs)
#full_pipeline.get_feature_names()
# Log-transformed target taken from the last column of the training frame
# (kept as a one-column frame).
y_train2 = np.log1p(df_train.iloc[:, -1:])
# Shuffled 4-fold splitter shared by every cross-validation below.
kf = KFold(n_splits=4, shuffle=True, random_state=1)
# Number of training rows: the first t rows of the combined matrix are the training set.
t = len(df_train)

In [None]:
# First t rows of the transformed matrix: the training rows.
train = housing_f[:t]
train.shape

In [None]:
# Rows after the first t of the transformed matrix: the test rows.
test = housing_f[t:]
test.shape

### we've reached a much better score using lasso and our new features

In [None]:
# Twenty evenly spaced L1 penalties between 1e-4 and 1e-3.
alph = np.linspace(0.0001, 0.001, 20)

# Grid-search Lasso over alpha with the shared shuffled folds, scoring by
# negative RMSE on the log target.
lasso_param_grid = {'max_iter': [5000], 'alpha': alph}
clf2 = GridSearchCV(
    estimator=Lasso(random_state=1),
    param_grid=lasso_param_grid,
    cv=kf,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
clf2.fit(train, y_train2.to_numpy().ravel())

clf2.best_score_

# Prediction performance on test set is not as good as on train set
#clf2.score(train, y_train2.to_numpy().ravel())  #-0.13289254169400164 #skew -0.1267826684793217 #newfeatures -0.12673846624490706

In [None]:
# Show the Lasso configuration selected by the grid search.
clf2.best_estimator_

In [None]:
# Pair each transformed column name with its Lasso weight and show the fifty
# largest by absolute value.
model = clf2.best_estimator_
fet = list(a)
val = [model.coef_[idx] for idx in range(len(a))]

lasso_coef_table = pd.DataFrame({'features':fet, 'coff':val})
lasso_coef_table.sort_values(by='coff', key=abs, ascending=False).head(50)

In [None]:
# Predict the test rows with the grid search's refitted estimator.
y = clf2.predict(test)
# The model was trained on log1p(SalePrice); expm1 maps predictions back to prices.
yLasso = np.expm1(y)
yLasso

In [None]:
import xgboost as xgb
# Gradient-boosted trees with hand-picked hyper-parameters.
# `verbosity=0` and `n_jobs=-1` replace the legacy `silent=1` and `nthread=-1`
# arguments, which current XGBoost releases no longer declare on the
# scikit-learn wrapper.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state=7, n_jobs=-1)

In [None]:
# Fit on the full training matrix. The target is flattened to 1-D, matching
# every other fit / cross-validation call in this notebook.
model_xgb.fit(train, y_train2.to_numpy().ravel())

In [None]:
# Per-fold negative RMSE (on the log target) of the first XGBoost model, using the shared shuffled folds.
cross_val_score(model_xgb, train, y_train2.to_numpy().ravel(), cv=kf, scoring='neg_root_mean_squared_error')

In [None]:
# Test-set predictions of the first XGBoost model, mapped back from log1p to prices.
# Note: the final blend further down uses xgb_pred2, not this array.
xgb_pred = np.expm1(model_xgb.predict(test))
xgb_pred

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# xgb1 = XGBRegressor()
# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [3, 5, 7],
#               'min_child_weight': [1,2,3,4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500, 1000, 2000, 2500]}

# xgb_grid = GridSearchCV(xgb1,
#                         parameters,
#                         cv = kf,
#                         n_jobs = -1,
#                         verbose=True, scoring='neg_root_mean_squared_error')
# xgb_grid.fit(train, y_train2.to_numpy().ravel())

# print(xgb_grid.best_score_) #0.12339150836137816

In [None]:
#xgb_grid.best_estimator_

In [None]:
# NOTE(review): stray scratch expression with no effect on the analysis; this cell can be deleted.
'k'

In [None]:
# Second XGBoost configuration; this is the model used in the final blend.
# `verbosity=0` and `n_jobs=4` replace the legacy `silent=1` and `nthread=4`
# arguments, which current XGBoost releases no longer declare on the
# scikit-learn wrapper.
model_xgb2 = xgb.XGBRegressor(colsample_bytree=0.7,
                              learning_rate=0.03, max_depth=3,
                              min_child_weight=2, n_estimators=2200,
                              reg_alpha=0,
                              verbosity=0, subsample=0.7,
                              random_state=1, n_jobs=4)

In [None]:
# Per-fold negative RMSE (on the log target) of the second XGBoost model, using the shared shuffled folds.
cross_val_score(model_xgb2, train, y_train2.to_numpy().ravel(), cv=kf, scoring='neg_root_mean_squared_error')

In [None]:
# Refit the second XGBoost model on all training rows before predicting the test set.
model_xgb2.fit(train, y_train2.to_numpy().ravel())

In [None]:
# Test-set predictions of the second XGBoost model, mapped back from log1p to prices.
xgb_pred2 = np.expm1(model_xgb2.predict(test))
xgb_pred2

In [None]:
# Final prediction: equal-weight average of the Lasso and second XGBoost price predictions.
ynew = 0.5*yLasso + 0.5*xgb_pred2

In [None]:
# Submission table: the test-set Id column paired with the blended price prediction.
subm = pd.DataFrame({'Id': df_test.Id ,'SalePrice' : ynew})
subm

In [None]:
# Write the submission to submission.csv in the current directory, without the index column.
subm.to_csv('submission.csv',index=False)