In [None]:
# data analysis and wrangling
import numpy as np 
import pandas as pd 
import random as rnd
import math
import statsmodels.api as sm

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

#Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# machine learning
import sklearn
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)
# Keep both frames together in one list.
combine = [train_df, test_df]

# 1. Looking at data

In [None]:
# First and last five rows of the training frame, rendered one after the other.
display(train_df.head(), train_df.tail())

In [None]:
# Column dtypes and non-null counts for the training frame, followed by a
# 40-character separator line.
train_df.info()
print('_'*40)
# test_df.info()

In [None]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
train_df.describe(include=['O'])

# 2. Finding correlations and necessary features

There are a lot of features, and studying them one by one is impractical. Instead, we will look at the correlations between them and exclude features that are highly correlated with one another.

In [None]:
# Work on a copy so the encoding and imputation applied to correlation_df
# leave train_df untouched.
correlation_df = train_df.copy()
display(correlation_df.head(10))

In [None]:
# Missing-value count per column, sorted ascending, so tail(20) lists the
# 20 columns with the most NaNs.
correlation_df.isna().sum().sort_values().tail(20)

**In the majority of cases, an empty cell means the house does not have that feature. For categorical columns we will label these cells as a class of their own.**

In [None]:
# Columns handled as numeric in the correlation study; the list includes the
# target 'SalePrice'.
num_attribs = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'] 
# Columns handled as categorical in the correlation study. 'OverallQual' and
# 'OverallCond' are listed here, whereas the later num_attrib list treats them
# as numeric.
cat_attribs = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Displayed as a quick check of how many columns the two lists cover together.
len(num_attribs) + len(cat_attribs)

In [None]:
# Cast every categorical column to str. Missing values become the literal
# string 'nan', so they form a category of their own for the encoder.
correlation_df[cat_attribs] = correlation_df[cat_attribs].astype(str)

In [None]:
# Replace each categorical value with an ordinal code so the columns can take
# part in the numeric correlation matrix.
encoder = OrdinalEncoder()
correlation_df[cat_attribs] = encoder.fit_transform(correlation_df[cat_attribs])

In [None]:
# Fill missing numeric values with the per-column median. num_attribs includes
# 'SalePrice', so the target column passes through the imputer as well.
imputer = SimpleImputer(strategy="median")
correlation_df[num_attribs] = imputer.fit_transform(correlation_df[num_attribs])

In [None]:
# Preview the encoded frame and show the remaining NaN count per numeric column.
display(correlation_df.head(10))
correlation_df[num_attribs].isna().sum()

In [None]:
# Pairwise correlation matrix of correlation_df, drawn as a heatmap.
corr = correlation_df.corr()
sns.heatmap(corr)

In [None]:
# Boolean keep-mask over the columns of `corr`: column j is dropped when any
# column i positioned before it has corr[i, j] >= 0.7. Only the strict upper
# triangle is inspected, so the diagonal never counts.
# NOTE(review): only positive correlations reach the threshold (no abs()), and
# neither selected_columns nor correlation_df_table is read again in the
# visible cells.
strong_pairs_upper = np.triu(corr.values >= 0.7, k=1)
columns = ~strong_pairs_upper.any(axis=0)
selected_columns = correlation_df.columns[columns]
correlation_df_table = correlation_df[selected_columns]

In [None]:
# Show the encoded and imputed working frame.
correlation_df

In [None]:
import statsmodels.api as sm

# Baseline fit: regress SalePrice on every other column of correlation_df.
# The design matrix is passed as a raw array and no constant column is added.
regressor_OLS = sm.OLS(
    endog=correlation_df['SalePrice'],
    exog=correlation_df.drop(columns='SalePrice').values,
).fit()
regressor_OLS.summary()

**So, we built a simple linear regression model and obtained information about its parameters.
Now we perform backward elimination. In each iteration of the loop a linear regression model is fitted, and the parameter with the highest p-value is identified and eliminated. This repeats until the highest remaining p-value is no greater than 0.05 (the significance threshold). In the end we get a truncated table containing far fewer columns.**

In [None]:
def find_max_index(list_of_falues):
    """Return the position of the largest value in ``list_of_falues``.

    Values are visited in iteration order, so the result is positional for
    lists, NumPy arrays and pandas Series alike (no label-based lookup is
    involved). Ties resolve to the earliest position and NaN entries are
    never selected.

    Args:
        list_of_falues: Iterable of mutually comparable numbers.

    Returns:
        int: Zero-based position of the first maximum.

    Raises:
        ValueError: If the input is empty or holds only NaN values.
    """
    best_index = None
    best_value = None
    for position, value in enumerate(list_of_falues):
        if value != value:  # NaN is the only value that differs from itself
            continue
        # No numeric sentinel: the first usable value always becomes the
        # initial maximum, however small it is.
        if best_index is None or value > best_value:
            best_index = position
            best_value = value
    if best_index is None:
        raise ValueError("find_max_index() requires at least one non-NaN value")
    return best_index

# Backward elimination: refit OLS on the remaining columns, drop the feature
# with the largest p-value, and repeat until that p-value is no longer above 0.05.
max_p_value = 1

while max_p_value > 0.05:
    # Resolve the candidate feature names explicitly so the column that gets
    # dropped never depends on where 'SalePrice' sits in the frame.
    candidate_columns = correlation_df.columns[correlation_df.columns != 'SalePrice']
    if len(candidate_columns) == 0:
        # Nothing left to fit; keep the last fitted model for the summary.
        break
    regressor_OLS = sm.OLS(endog = correlation_df['SalePrice'], exog = correlation_df.loc[:, candidate_columns].values).fit()
    # A plain array gives purely positional p-value lookups, whatever container
    # statsmodels returns.
    p_values = np.asarray(regressor_OLS.pvalues)
    worst_position = find_max_index(p_values)
    max_p_value = p_values[worst_position]
    if max_p_value > 0.05:
        # Mutate in place: later cells read the surviving columns from correlation_df.
        correlation_df.drop(columns=[candidate_columns[worst_position]], inplace = True)

regressor_OLS.summary()

In [None]:
# Columns left in correlation_df after the elimination loop. The next section
# takes the last entry as the target, so 'SalePrice' is expected to be last.
interesting_columns = correlation_df.columns

In [None]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

# 3. Extracting values and data preprocessing

In [None]:
# Split on the target's name instead of assuming it is the last entry of
# interesting_columns.
target_column = 'SalePrice'
selected_feature_columns = [column for column in interesting_columns if column != target_column]

# Select first, then copy: only the retained columns are duplicated, and each
# working frame is an independent object that later cells can modify.
working_df_train = train_df[selected_feature_columns].copy()
working_df_test = test_df[selected_feature_columns].copy()

y_train = train_df[target_column].copy()
y_train

In [None]:
# Dtypes and non-null counts of the reduced training frame.
working_df_train.info()
# Numeric and categorical columns fed to the preprocessing pipeline.
# NOTE(review): both lists are hard-coded; they presumably mirror the columns
# reported by info() above and must be updated if the elimination step keeps a
# different set. Verify after any upstream change.
num_attrib = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'PoolArea', 'YrSold']
cat_attrib = ['Street', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'KitchenQual', 'Functional', 'FireplaceQu', 'PoolQC', 'SaleCondition']

In [None]:
# Most frequent value of each of three test-set columns.
print(working_df_test['Exterior1st'].mode())
print(working_df_test['KitchenQual'].mode())
print(working_df_test['Functional'].mode())

In [None]:
# NOTE(review): mode_exterior1st is computed here but not read again in the
# visible cells; the fill values below are literals.
mode_exterior1st = working_df_test['Exterior1st'].mode()

# Fill the missing entries of three test-set columns with a fixed value each.
test_fill_values = {
    'Exterior1st': 'VinylSd',
    'KitchenQual': 'TA',
    'Functional': 'Typ',
}
for column_name, fill_value in test_fill_values.items():
    working_df_test[column_name] = working_df_test[column_name].fillna(fill_value)

working_df_test.info()

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns: missing values become the explicit category 'empty',
# then one-hot encode. handle_unknown='ignore' encodes a category that appears
# only in the test data as an all-zero row instead of raising in transform().
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant", fill_value = 'empty')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# That replaces the encoder's `sparse=False` switch, which newer scikit-learn
# releases no longer accept, without depending on the newer `sparse_output` name.
full_pipeline = ColumnTransformer([
        ("cat", cat_pipeline, cat_attrib),
        ("num", num_pipeline, num_attrib),
    ], sparse_threshold=0)

# Fit on the training frame only; the test frame reuses the fitted statistics.
X_train = full_pipeline.fit_transform(working_df_train)
X_test = full_pipeline.transform(working_df_test)

In [None]:
# Preview the transformed training matrix through the notebook's rich table
# display instead of a plain-text print.
pd.DataFrame(X_train).head()

In [None]:
# Preview the transformed test matrix through the notebook's rich table
# display instead of a plain-text print.
pd.DataFrame(X_test).head()

# 4. Building the model

## 4.1 Random forest

In [None]:
from sklearn.metrics import r2_score, mean_squared_log_error

# Random forest fitted on the full preprocessed training matrix.
rf = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=0)
rf.fit(X_train, y_train)

# Both scores are measured on the same rows the model was fitted on.
rf_train_predictions = rf.predict(X_train)
print(r2_score(y_train, rf_train_predictions))
print(mean_squared_log_error(y_train, rf_train_predictions))

# Disabled hyper-parameter search, kept for reference:
#from sklearn.model_selection import GridSearchCV
#parameters_rf = [{'n_estimators': [200, 1000], 
#                  'max_depth': [14, 20]}]
#grid_search_rf = GridSearchCV(estimator = rf,
#                           param_grid = parameters_rf,
#                           scoring = 'neg_mean_squared_log_error',
#                           cv = 5,
#                           verbose = 1)
#grid_search_rf.fit(X_train, y_train)
#best_log_error_rf = grid_search_rf.best_score_
#best_parameters_rf = grid_search_rf.best_params_
#print("Best error: {:.2f} ".format(best_log_error_rf))
#print("Best Parameters:", best_parameters_rf)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_log_error
from sklearn.model_selection import GridSearchCV

# Gradient-boosted trees fitted on the full preprocessed training matrix.
boost = XGBRegressor(n_estimators=550, max_depth=3, eta=0.1, random_state=0)
boost.fit(X_train, y_train)

# Both scores are measured on the same rows the model was fitted on.
boost_train_predictions = boost.predict(X_train)
print(r2_score(y_train, boost_train_predictions))
print(mean_squared_log_error(y_train, boost_train_predictions))

# The search object is built, but the search itself stays disabled below.
parameters_boost = [{'n_estimators': [400, 500, 600],
                     'max_depth': [2, 3, 4]}]
grid_search_boost = GridSearchCV(
    estimator=boost,
    param_grid=parameters_boost,
    scoring='neg_mean_squared_log_error',
    cv=10,
    verbose=1,
)
# grid_search_boost.fit(X_train, y_train)
#best_log_error_boost = grid_search_boost.best_score_
#best_parameters_boost = grid_search_boost.best_params_
#print("Best error: {:.4f} ".format(best_log_error_boost))
#print("Best Parameters:", best_parameters_boost)

#print("Grid scores on development set:")
#print()
#means = grid_search_boost.cv_results_['mean_test_score']
#stds = grid_search_boost.cv_results_['std_test_score']
#for mean, std, params in zip(means, stds, grid_search_boost.cv_results_['params']):
#    print("%0.5f (+/-%0.05f) for %r"
#            % (mean, std * 2, params))

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_log_error
from sklearn.model_selection import GridSearchCV

# k-nearest-neighbours regressor (6 neighbours, Minkowski metric with p=2).
knn = KNeighborsRegressor(n_neighbors=6, metric='minkowski', p=2)
knn.fit(X_train, y_train)

# Both scores are measured on the same rows the model was fitted on.
knn_train_predictions = knn.predict(X_train)
print(r2_score(y_train, knn_train_predictions))
print(mean_squared_log_error(y_train, knn_train_predictions))

# The search object is built, but the search itself stays disabled below.
parameters_knn = [{'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 2, 10, 15]}]
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=parameters_knn,
    scoring='neg_mean_squared_log_error',
    cv=10,
    verbose=1,
)
#grid_search_knn.fit(X_train, y_train)
#best_log_error_knn = grid_search_knn.best_score_
#best_parameters_knn = grid_search_knn.best_params_
#print("Best error: {:.4f} ".format(best_log_error_knn))
#print("Best Parameters:", best_parameters_knn)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_log_error

# Ordinary least-squares baseline on the preprocessed training matrix.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Both scores are measured on the same rows the model was fitted on.
regressor_train_predictions = regressor.predict(X_train)
print(r2_score(y_train, regressor_train_predictions))
print(mean_squared_log_error(y_train, regressor_train_predictions))

In [None]:
# y_pred_rf = rf.predict(X_test)

In [None]:
# y_pred_boost = boost.predict(X_test)

In [None]:
#submission_house_price_rf = pd.DataFrame({'Id' : test_df['Id'],
#                                         'SalePrice' : y_pred_rf})
#submission_house_price_rf.to_csv('/kaggle/working/submission_house_price_rf_2.csv', index=False)

In [None]:
#submission_house_price_boost = pd.DataFrame({'Id' : test_df['Id'],
#                                         'SalePrice' : y_pred_boost})
#submission_house_price_boost.to_csv('/kaggle/working/submission_house_price_boost_no_scaling_depth4_n500.csv', index=False)

# 5. Blending

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_log_error
from xgboost import XGBRegressor

# Hold out 20% of the training rows. The level-one models are fitted on the
# other 80%, and their predictions on the held-out rows become the blender's
# training features.
X_first_lvl, X_2nd_level, y_first_lvl, y_2nd_level = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

rf_1 = RandomForestRegressor(n_estimators = 200, max_depth = 20, random_state = 0)
boost_1 = XGBRegressor(n_estimators = 550, max_depth = 3, eta = 0.1, random_state = 0)
knn_1 = KNeighborsRegressor(n_neighbors = 6, metric = 'minkowski', p = 2)
regressor_1 = LinearRegression()

rf_1.fit(X_first_lvl, y_first_lvl)
boost_1.fit(X_first_lvl, y_first_lvl)
knn_1.fit(X_first_lvl, y_first_lvl)
regressor_1.fit(X_first_lvl, y_first_lvl)

y_pred_rf_1 = rf_1.predict(X_2nd_level)
y_pred_boost_1 = boost_1.predict(X_2nd_level)
y_pred_knn_1 = knn_1.predict(X_2nd_level)
y_pred_regressor_1 = regressor_1.predict(X_2nd_level)

# NOTE(review): the KNN predictions are computed but are not part of the
# blender's features. Confirm that this omission is intentional.
X_train_predicted_from_previous = pd.DataFrame({'predicted_rf' : y_pred_rf_1,
                                                'predicted_boost' : y_pred_boost_1,
                                                'predicted_linear' : y_pred_regressor_1})

# Untuned blender, kept as a baseline. Its scores are measured on the same
# rows it was fitted on.
boost_blending = XGBRegressor(n_estimators = 75, max_depth = 3, eta = 0.1, random_state = 0)
boost_blending.fit(X_train_predicted_from_previous, y_2nd_level)

print(r2_score(y_2nd_level, boost_blending.predict(X_train_predicted_from_previous)))
print(mean_squared_log_error(y_2nd_level, boost_blending.predict(X_train_predicted_from_previous)))

# Cross-validated search over the blender's hyper-parameters.
parameters_blend = [{'n_estimators': [50, 100, 75, 125, 150, 200, 500],
                     'max_depth' : [2, 3, 4],
                     'eta' : [0.1, 0.05, 0.2]}]
grid_search_blend = GridSearchCV(estimator = boost_blending,
                           param_grid = parameters_blend,
                           scoring = 'neg_mean_squared_log_error',
                           cv = 5,
                           verbose = 1)
grid_search_blend.fit(X_train_predicted_from_previous, y_2nd_level)
best_log_error_blend = grid_search_blend.best_score_
best_parameters_blend = grid_search_blend.best_params_
print("Best error: {:.4f} ".format(best_log_error_blend))
print("Best Parameters:", best_parameters_blend)

y_pred_rf_1_test = rf_1.predict(X_test)
y_pred_boost_1_test = boost_1.predict(X_test)
y_pred_knn_1_test = knn_1.predict(X_test)
y_pred_regressor_1_test = regressor_1.predict(X_test)

# Column names and order must match the frame the blender was trained on.
X_test_predicted_from_previous = pd.DataFrame({'predicted_rf' : y_pred_rf_1_test,
                                               'predicted_boost' : y_pred_boost_1_test,
                                               'predicted_linear' : y_pred_regressor_1_test})

# GridSearchCV works on clones, so boost_blending still carries its original
# hyper-parameters. Predict with the refitted best estimator so the tuned
# settings are the ones that produce the submission.
y_pred_blend = grid_search_blend.best_estimator_.predict(X_test_predicted_from_previous)

submission_house_price_blend = pd.DataFrame({'Id' : test_df['Id'],
                                             'SalePrice' : y_pred_blend})
submission_house_price_blend.to_csv('/kaggle/working/submission_house_price_blending.csv', index=False)

# 6. ANN

### **<font color = 'green'>6.1 Initializing the ANN</font>**

In [None]:
number_neurons = 175
number_of_layers_relu = 3

# Layer stack: one tanh layer, `number_of_layers_relu` ReLU layers of the same
# width, and a single linear output unit for the regression target.
hidden_relu_layers = [
    tf.keras.layers.Dense(units=number_neurons, activation='relu')
    for _ in range(number_of_layers_relu)
]
ann = tf.keras.models.Sequential(
    [tf.keras.layers.Dense(units=number_neurons, activation='tanh')]
    + hidden_relu_layers
    + [tf.keras.layers.Dense(units=1, activation='linear')]
)

### **<font color = 'green'>6.2 Training the ANN</font>**

In [None]:
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

# Stop once val_loss has gone 50 epochs without improving, and roll the
# in-memory model back to its best-epoch weights. Without restore_best_weights
# the later ann.predict(...) call would use the weights from the final epoch,
# i.e. 50 epochs past the best validation loss.
es = EarlyStopping(monitor='val_loss', mode='min', patience = 50, verbose=1, restore_best_weights=True)
# Also persist the best model seen so far to disk.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) 

opt = keras.optimizers.Adam(learning_rate=0.005)
# MSLE is both the optimisation loss and the tracked metric.
ann.compile(optimizer = opt, loss = 'mean_squared_logarithmic_error', metrics = [tf.keras.metrics.MeanSquaredLogarithmicError()])
# 10% of the training rows are held out for validation; epochs=3000 is an upper
# bound that early stopping is expected to cut short.
history = ann.fit(X_train, y_train, validation_split = 0.10, batch_size = 32, epochs = 3000, callbacks=[es, mc])

### **<font color = 'green'>6.3 Plot the loss vs epochs</font>**

In [None]:
# Per-epoch curves recorded by Keras during training.
history_dict = history.history
loss_values = history_dict['loss']
log_loss_values = history_dict['mean_squared_logarithmic_error']
val_loss_values = history_dict['val_loss']
val_log_loss_values = history_dict['val_mean_squared_logarithmic_error']
epochs = range(1, len(loss_values) + 1)

# One figure per entry: (training curve, validation curve, title, y-label, y-limits or None).
curve_specs = [
    (loss_values, val_loss_values, 'Training & Validation Loss', 'Loss', None),
    (log_loss_values, val_log_loss_values, 'Training & Validation Log Loss', 'Log Loss', (0, 0.1)),
]
for train_curve, val_curve, plot_title, y_label, y_limits in curve_specs:
    plt.plot(epochs, train_curve, color='blue', label='Training loss')
    plt.scatter(epochs, val_curve, color='red', s=5, label='Validation loss')
    plt.title(plot_title, fontsize=16)
    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.legend()
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.show()

In [None]:
# NOTE(review): this cell draws the same two figures, from the same `history`
# object, as the plotting cell directly before it. It looks like an accidental
# duplicate and is a candidate for deletion.
history_dict = history.history
loss_values = history_dict['loss']
log_loss_values = history_dict['mean_squared_logarithmic_error']
val_loss_values = history_dict['val_loss']
val_log_loss_values = history_dict['val_mean_squared_logarithmic_error']
epochs = range(1, len(loss_values) + 1)

# Figure 1: training loss (line) against validation loss (points) per epoch.
plt.plot(epochs, loss_values, color = 'blue', label='Training loss')
plt.scatter(epochs, val_loss_values, color = 'red', s = 5, label='Validation loss')
plt.title('Training & Validation Loss', fontsize=16)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Loss', fontsize=16)
plt.legend()
#plt.ylim(0, 200)
plt.show()

# Figure 2: the MSLE metric curves, with the y-axis clipped to [0, 0.1].
plt.plot(epochs, log_loss_values, color = 'blue', label='Training loss')
plt.scatter(epochs, val_log_loss_values, color = 'red', s = 5, label='Validation loss')
plt.title('Training & Validation Log Loss', fontsize=16)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Log Loss', fontsize=16)
plt.legend()
plt.ylim(0, 0.1)
plt.show()

In [None]:
# Predict on the test matrix and drop axis 1 of the output to get a flat vector.
ann_test_output = ann.predict(X_test)
y_pred_ann = ann_test_output.squeeze(axis=1)

In [None]:
# Pair each test Id with its ANN prediction and write the submission CSV.
submission_house_price_ann = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_pred_ann,
})
submission_house_price_ann.to_csv(
    '/kaggle/working/submission_house_price_ann.csv',
    index=False,
)