In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Display settings: show up to 500 rows and render floats with five decimals.
pd.options.display.max_rows = 500
pd.options.display.float_format = lambda value: f'{value:.5f}'

In [None]:
# Locations of the competition's labelled (train) and unlabelled (test) CSV files.
train_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# Read both files into DataFrames and preview the first training rows.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
train.head()

In [None]:
test.head()

In [None]:
print(f'Training Data shape: {train.shape} \n Test Data shape: {test.shape}')

# Basic Data Exploration - Missing values

In [None]:
# Group the column names by dtype for a better overview:
# object columns count as categorical, all remaining columns as numerical.
categorical_part = train.select_dtypes(include=['object'])
numerical_part = train.select_dtypes(exclude=['object'])
cat_columns = list(categorical_part.columns)
num_columns = list(numerical_part.columns)
print(f'Categorical columns: \n {cat_columns} \n Numerical columns: \n {num_columns}')

In [None]:
# Id is only a row identifier and SalePrice is the target, so neither belongs in the
# list of numerical features. Filtering by name (instead of slicing by position)
# removes both and keeps this cell idempotent when it is run more than once.
num_columns = [col for col in num_columns if col not in ('Id', 'SalePrice')]

In [None]:
num_columns

### Missing values in categorical features

Have a look at the data description to see the available categories and whether they are ordinal or not.

In [None]:
#check for missing values
#train[cat_columns].isna().sum()

#function to only give back the columns with missing values to not have a long list with all other features that don't have missing values
def check_missing(df):
    """Count missing values per column, keeping only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one missing value;
        the count is stored in a single column named ``missing``.
    """
    na_mask = df.isna()
    has_missing = na_mask.any()
    return na_mask.sum()[has_missing].to_frame(name='missing')

check_missing(train[cat_columns])

* Replacing missing values with some value is important as the sklearn transformers don't work when there are missing values
* For most of the features a missing value means that this feature is not available for the house, e.g. the house has no Pool or no Fireplace
* According to the data description there is a "NA" category for these features so I'll replace the missing values with "NA"
* There are a few features with only a few missing values that do not have an "NA" category in the description. However, I'll also encode them with "NA" for simplicity.

### Missing values in numerical features

In [None]:
check_missing(train[num_columns])

* There are different ways how to impute numerical values. In this case it doesn't make sense to impute with mean or median values as this would probably not reflect what the features tell us about the house. It seems more reasonable to impute with 0 values to say that this feature is not available for the house.

# Splitting the Dataset into Train and Test Data

* This is done to be able to evaluate the model performance

In [None]:
# Separate the target (SalePrice) from the predictor columns.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

# Keep 80% of the labelled rows for fitting and hold out the rest for evaluation;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)
X_train.head()

In [None]:
X_test.head()

In [None]:
test.head()

# Preprocessing - Imputing, Encoding and Scaling the data

## Dealing with the Numerical Features - Imputing missing Values

In [None]:
# Numerical features: replace every missing value with the constant 0.
# The step name matches the one make_pipeline would generate automatically.
num_imputing = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value=0)),
])

## Dealing with the Categorical Features - Imputing - Ordinal Encoding and One Hot Encoding


In [None]:
# Categorical features: replace every missing value with the string 'NA'.
# The step name matches the one make_pipeline would generate automatically.
cat_imputing = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value='NA')),
])

* Use Ordinal Encoding for ordinal categorical features - features whose values have a meaningful order and thus can be encoded as numbers in one single column, instead of splitting each value into a separate column as is done for nominal categorical variables

* Use One-Hot Encoding for nominal categorical features - features that do not have a meaningful order - during encoding they are split into separate columns, one for each unique value in the feature

In [None]:
# find all the ordinal features according to the data description and put them in a list
ordinal_features = ['ExterQual','ExterCond','KitchenQual','BsmtQual','BsmtCond','HeatingQC','FireplaceQu','GarageQual','GarageCond','GarageFinish','BsmtExposure','BsmtFinType1','BsmtFinType2','Functional','CentralAir','LandSlope','PavedDrive','Fence','PoolQC','Alley','Street','Utilities']

# the rest of the categorical features goes into the list of nominal categorical features.
# Filtering cat_columns (instead of subtracting sets) preserves the original column
# order, so the resulting feature layout is the same on every kernel restart.
nominal_features = [col for col in cat_columns if col not in ordinal_features]

In [None]:
# lists with values for ordinal feature encoding, each ordered from lowest to highest.
# The "feature not present" level is spelled 'NA' because that is the value the
# categorical imputer writes for missing entries (fill_value='NA'); with any other
# spelling the OrdinalEncoder would reject the imputed value as an unknown category.
ql5 =['NA','Po','Fa','TA','Gd','Ex']
fin=['NA','Unf','RFn','Fin']
expo=['NA','No','Mn','Av','Gd']
fint=['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
func=['NA','Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ']
yn=['Y','N']
ls=['NA','Sev','Mod','Gtl']
pad=['N','P','Y']
fen=['NA','MnWw','GdWo','MnPrv','GdPrv']
ql4=['NA','Fa','TA','Gd','Ex']
al=['NA','Grvl','Pave']
st=['NA','Grvl','Pave']
util=['ELO','NoSeWa','NoSewr','AllPub']

# for each feature in the ordinal_feature list there has to be an encoding category in the following categories list to be passed to the OrdinalEncoder
# (same order as ordinal_features: position i here describes feature i there)
ordinal_categories = [ql5,ql5,ql5,ql5,ql5,ql5,ql5,ql5,ql5,fin,expo,fint,fint,func,yn,ls,pad,fen,ql4,al,st,util]

In [None]:
#check if all columns are in the column lists - should be 79 (81 original columns - 2 (removed Id and SalePrice))
len(ordinal_features)+len(nominal_features)+len(num_columns)

* Use dummy encoding for nominal categorical features, that means that there will be one column less than values in the variable because this can be explained with the other dummy variables from this feature
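* To achieve this, use the attribute `drop='first'` with the One Hot Encoder (note: in the encoder defined below `drop='first'` is currently commented out, so all dummy columns are kept)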

In [None]:
# create ordinal and one-hot encoding Pipeline steps

# Ordinal encoder: each ordinal feature is mapped to the integer position of its
# value inside the matching list of `ordinal_categories`.
ordinal_enc = Pipeline(steps=[
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories))
])

# One-hot encoder that returns a dense array; categories not seen during fit are
# encoded as all zeros (handle_unknown='ignore').
# The keyword requesting dense output was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back to the old one on older installations.
try:
    _one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    _one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')

one_hot_enc = Pipeline(steps=[
    ('one_hot_encoder', _one_hot) # drop='first' is currently not enabled
])

### Combine Imputing and Encoding into the Pipeline

In [None]:
# A ColumnTransformer applies a different transformer to each group of columns and
# concatenates the results into one feature matrix.
# NOTE(review): neither `imputing` nor `encoding` is part of a pipeline that is
# actually fitted in this notebook (the pipeline combining them is commented out).

# Imputation per column group.
imputing_steps = [
    ('imp_nums', num_imputing, num_columns),
    ('imp_cats', cat_imputing, cat_columns),
]
imputing = ColumnTransformer(transformers=imputing_steps)

# Encoding per column group; numerical columns are passed through unchanged.
encoding_steps = [
    ('enc_nums', "passthrough", num_columns),
    ('enc_ord', ordinal_enc, ordinal_features),
    ('enc_nom', one_hot_enc, nominal_features),
]
encoding = ColumnTransformer(transformers=encoding_steps)


## Scaling the Data

* scaling prevents features that have bigger/smaller values to have a bigger or smaller impact on the predictions

* Do the scaling for all features, although for encoded nominal features there are only 0 and 1 in each dummified column so the scaler doesn't do anything


In [None]:
# Rescale every feature with MinMaxScaler (default output range 0 to 1).
scaling = Pipeline(steps=[('scale', MinMaxScaler())])

## Full Preprocess of the dataset

In [None]:
# Simplified preprocessing - this is the version used by all models below.

# Alternative categorical encoder (ordinal + one-hot per feature group). It is
# defined here but not plugged into `cats`, which one-hot encodes everything.
cat_encoding = ColumnTransformer(transformers=[
    ('enc_ord', ordinal_enc, ordinal_features),
    ('enc_nom', one_hot_enc, nominal_features),
])

# Numerical branch: imputation only.
nums = Pipeline(steps=[('impute_nums', num_imputing)])

# Categorical branch: impute missing values, then one-hot encode every categorical
# column (the ordinal ones included).
cats = Pipeline(steps=[
    ('impute_cats', cat_imputing),
    ('encode_cats', one_hot_enc),
])

# Send each column group through its branch. The output holds the categorical
# block first, followed by the numerical block; unlisted columns are dropped.
preprocess = ColumnTransformer(transformers=[
    ('cats', cats, cat_columns),
    ('nums', nums, num_columns),
])

# Complete preprocessing: impute + encode, then scale every resulting column.
full_preprocess2 = Pipeline(steps=[
    ('preprocess', preprocess),
    ('scaling', scaling),
])

In [None]:
# full_preprocess =  Pipeline(steps=[
#      ('imputing', imputing),
#      ('encoding', encoding),
#      ('scaling', scaling)   
# ])

In [None]:
pd.DataFrame(full_preprocess2.fit_transform(X_train))

# First Model Prediction

## Using a simple Linear Regression

In [None]:
# Baseline: full preprocessing followed by an ordinary least-squares regression.
lm_pipeline = Pipeline([
    ('full_preprocess', full_preprocess2),
    ('model', LinearRegression()),
])

In [None]:
lm_pipeline.fit(X_train, y_train)

In [None]:
predictions = lm_pipeline.predict(X_test)

In [None]:
# Plain root mean squared error on the original price scale - no logarithm is
# applied in this cell. The log-based variant (RMSLE), which is the evaluation
# measure of the competition, is computed in the next cell.
rmse = mean_squared_error(y_test, predictions)**0.5
rmse

In [None]:
# RMSLE is not defined for negative predictions, and the linear model can produce
# them. Clip those to 0 instead of taking the absolute value: abs() would turn a
# prediction of e.g. -50000 into a plausible-looking +50000 and understate the error.
rmsle = mean_squared_log_error(y_test, np.clip(predictions, 0, None))**0.5
rmsle

Until now there is no feature selection implemented. Maybe this is the problem of the model and the really high root mean squared error for the predictions.

In [None]:
#evaluate the results
results = pd.DataFrame({
    'predictions':predictions, 
    'true_values':y_test
})

results['diff'] = abs(round(results['predictions'] - results['true_values']))
results.sort_values(by='diff', ascending = True).tail(5)#head(5)

In [None]:
# Copy of the hold-out features extended with true price, prediction and the
# signed error (true price minus prediction).
t = X_test.assign(Price=y_test, preds=predictions)
t['diff'] = t['Price'] - t['preds']

# Five rows with the smallest signed error, i.e. the most over-predicted houses.
t.sort_values(by='diff').head(5)

# Feature Selection

Add different feature selection approaches to improve predictions

## Remove features with low variance using VarianceThreshold

Features with low variance (i.e. features that have the same value in most rows) don't add much information and thus can be removed.

In [None]:
# VarianceThreshold drops every feature whose variance is not above the threshold.
# 0.9 * (1 - 0.9) is the variance of a 0/1 column in which one value fills 90% of
# the rows, so 0/1 columns that are at least that lopsided get removed.
variance_cutoff = 0.9*(1-0.9)

select_vt = Pipeline(steps=[
    ('variance_threshold', VarianceThreshold(threshold=variance_cutoff)),
])

In [None]:
# Preprocessing -> low-variance filter -> linear regression.
vt_pipeline = Pipeline([
    ('full_preprocess', full_preprocess2),
    ('feature_selection', select_vt),
    ('model', LinearRegression()),
])

In [None]:
vt_pipeline.fit(X_train, y_train)

In [None]:
vt_predictions =vt_pipeline.predict(X_test)

In [None]:
# RMSLE is not defined for negative predictions. Clip them to 0 rather than using
# abs(), which would mirror a negative price into a plausible positive one and
# understate the error.
rmsle = mean_squared_log_error(y_test, np.clip(vt_predictions, 0, None))**0.5
rmsle

## Try SelectKBest for Feature Selection

In [None]:
# Keep the highest-scoring features (SelectKBest's default k), scored by mutual
# information with the target.
# NOTE(review): mutual_info_regression is a randomized estimator, so the selected
# features may vary slightly between runs unless a random_state is supplied.
select_kbest = Pipeline([
    ('select_kbest', SelectKBest(score_func=mutual_info_regression)),
])

In [None]:
# Preprocessing -> k-best feature selection -> linear regression.
kbest_pipeline = Pipeline([
    ('full_preprocess', full_preprocess2),
    ('feature_selection', select_kbest),
    ('model', LinearRegression()),
])

In [None]:
kbest_pipeline.fit(X_train, y_train)

In [None]:
kbest_predictions =kbest_pipeline.predict(X_test)

In [None]:
# RMSLE is not defined for negative predictions. Clip them to 0 rather than using
# abs(), which would mirror a negative price into a plausible positive one and
# understate the error.
rmsle = mean_squared_log_error(y_test, np.clip(kbest_predictions, 0, None))**0.5
rmsle

## Try also SelectFromModel for Feature Selection

In [None]:
# Preprocessing -> features selected by random-forest importance -> linear regression.
# The forest is seeded (same seed as the other seeded estimators in this notebook)
# so that the selected features, and therefore the score, are reproducible.
from_model_pipeline = Pipeline(steps=[
    ('full_preprocess', full_preprocess2),
    ('feature_selection', SelectFromModel(RandomForestRegressor(random_state=42))),
    ('model', LinearRegression()) #fit_intercept=False - regression lines starts at 0,0 to prevent negative price predictions but also influences rest of predictions that get worse
])


In [None]:
from_model_pipeline.fit(X_train, y_train)

In [None]:
from_model_predictions =from_model_pipeline.predict(X_test)

In [None]:
sum(i < 0 for i in from_model_predictions)

In [None]:
# RMSLE is not defined for negative predictions (the cell above counts how many
# there are). Clip them to 0 rather than using abs(), which would mirror a negative
# price into a plausible positive one and understate the error.
rmsle = mean_squared_log_error(y_test, np.clip(from_model_predictions, 0, None))**0.5
rmsle

* The Linear Regression Model sometimes predicts negative prices for houses. To prevent this I tried to set the beginning of the regression line to 0,0 but this also influences the rest of the predictions a lot and thus is not wanted

* try some other models

# Random Forest Regressor

This already worked well as the model for Feature Selection - now try it as the predicting Model

In [None]:
# Preprocessing -> features selected by random-forest importance -> random forest.
# Both forests are seeded (same seed as the other seeded estimators in this
# notebook) so that repeated runs give the same predictions and score.
random_forest_pipeline = Pipeline(steps=[
    ('full_preprocess', full_preprocess2),
    ('feature_selection', SelectFromModel(RandomForestRegressor(random_state=42))),
    ('model', RandomForestRegressor(random_state=42))
])

In [None]:
random_forest_pipeline.fit(X_train, y_train)

In [None]:
random_forest_predictions =random_forest_pipeline.predict(X_test)

In [None]:
sum(i < 0 for i in random_forest_predictions)

In [None]:
# Same guard as for the linear models: clip (rather than mirror with abs()) any
# negative prediction before computing the RMSLE, so all models are scored alike.
rmsle = mean_squared_log_error(y_test, np.clip(random_forest_predictions, 0, None))**0.5
rmsle

In [None]:
test_df = pd.DataFrame(full_preprocess2.transform(test))

In [None]:
test_df

In [None]:
test.head()

In [None]:
X_train.head()

In [None]:
test_preds = random_forest_pipeline.predict(test)

In [None]:
# Submission table: the Id column of the test set next to the predicted SalePrice.
output = test[['Id']].assign(SalePrice=test_preds)
output.head(2)

In [None]:
#output.to_csv('submission.csv', index=False)

# Use a PCA for dimension reduction - with and without additional feature selection

In [None]:
# PCA with default settings, wrapped in a one-step pipeline.
# The step name matches the one make_pipeline would generate automatically.
pca = Pipeline(steps=[('pca', PCA())])

In [None]:
# Preprocessing -> PCA -> components selected by random-forest importance -> random forest.
# Both forests are seeded (same seed as the other seeded estimators in this
# notebook) so that repeated runs give the same predictions and score.
pca_pipeline = Pipeline(steps=[
    ('full_preprocess', full_preprocess2),
    ('pca',pca),
    ('feature_selection', SelectFromModel(RandomForestRegressor(random_state=42))),
    ('model', RandomForestRegressor(random_state=42))
])

In [None]:
pca_pipeline.fit(X_train, y_train)

In [None]:
pca_predictions =pca_pipeline.predict(X_test)

In [None]:
rmsle = mean_squared_log_error(y_test, pca_predictions)**0.5
rmsle

PCA doesn't improve the predictions

# Try some other Model

In [None]:
# AdaBoost over depth-6 regression trees, applied directly to the preprocessed
# features (no feature-selection step in this pipeline).
boosted_trees = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators=450, random_state=42)
adaboost_pipeline = Pipeline([
    ('full_preprocess', full_preprocess2),
    ('model', boosted_trees),
])

In [None]:
adaboost_pipeline.fit(X_train, y_train)

In [None]:
adaboost_predictions =adaboost_pipeline.predict(X_test)

In [None]:
rmsle = mean_squared_log_error(y_test, adaboost_predictions)**0.5
rmsle

In [None]:
test_preds = adaboost_pipeline.predict(test)

In [None]:
# Submission table: the Id column of the test set next to the predicted SalePrice.
output = test[['Id']].assign(SalePrice=test_preds)
output.head(2)

In [None]:
output.to_csv('submission.csv', index=False)

# Use Grid Search for Parameter Optimization

In [None]:
# Preprocessing -> features selected by random-forest importance -> single decision tree.
# The selector forest is seeded like the tree itself so the result is reproducible.
treereg_pipeline = Pipeline(steps=[
    ('full_preprocess', full_preprocess2),
    ('feature_selection', SelectFromModel(RandomForestRegressor(random_state=42))),
    ('model', DecisionTreeRegressor(max_depth = 6, random_state=42))
])

# Grid search sketch (kept disabled). Compared with the earlier draft it
#  - searches treereg_pipeline, whose 'model' step is the DecisionTreeRegressor
#    that owns the model__* parameters listed below,
#  - uses a regression scorer instead of the classification metric 'accuracy',
#  - drops max_features='auto', which newer scikit-learn releases no longer accept.
# param_grid = {
#     'model__max_depth': range(1, 10),
#     'model__min_samples_leaf': range(1, 10),
#     'model__min_samples_split': range(2, 10),
#    # 'model__criterion':['squared_error','friedman_mse','absolute_error','poisson'],
#     'model__max_features':[None, 'sqrt','log2']
#     }
# search = GridSearchCV(treereg_pipeline, param_grid, cv=15, scoring='neg_mean_squared_log_error', verbose=1, refit=True, n_jobs=-1)

# search.fit(X_train,y_train)

treereg_pipeline.fit(X_train, y_train)

In [None]:
#search.best_params_

In [None]:
treereg_predictions =treereg_pipeline.predict(X_test)

In [None]:
rmsle = mean_squared_log_error(y_test, treereg_predictions)**0.5
rmsle

Using different models does not really improve the predictions. Better try to do some more feature engineering.

In [None]:
# Hand-picked subset of columns (plus the target) for a visual inspection.
plot_columns = ['LotArea','OverallQual','TotalBsmtSF','1stFlrSF','GrLivArea','GarageCars','YrSold','SalePrice']
df_plot = train.filter(items=plot_columns)
df_plot.head()

In [None]:
# Pairwise overview: histograms on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(df_plot).map_diag(sns.histplot).map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
train.SalePrice.describe()

In [None]:
# Scatter plot of LotArea against the sale price for the training split.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x=X_train['LotArea'], y=y_train, ax=ax)
plt.show()