## Load the data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

# Exploratory Data Analysis

In [None]:
# Print pandas' summary of the training DataFrame.
train_data.info()

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of pairwise correlations between the numeric columns of train_data.
# The frame is restricted to numeric dtypes first: since pandas 2.0,
# DataFrame.corr() no longer silently drops non-numeric columns and raises on
# string data instead. train_data is expected to hold such columns (see cat_cols).
plt.figure(figsize=(10,8))
sns.heatmap(train_data.select_dtypes(include="number").corr(), center = 0)
plt.title("Correlations Between Columns")
plt.show()

## Split input and target variables

In [None]:
# Target vector: the SalePrice column.
y = train_data["SalePrice"]
# Feature frame: every column except SalePrice.
X = train_data.drop(columns="SalePrice")

In [None]:
# Show the shapes of the target, the training features and the test set.
y.shape, X.shape, test_data.shape

# Feature Engineering

## Choose only the significant features: discard those whose absolute correlation with the target variable is below 0.5

In [None]:
# Pairwise correlations between the numeric columns of train_data (SalePrice included).
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string data from pandas 2.0 onwards.
corr_matrix = train_data.select_dtypes(include="number").corr()

In [None]:
# Show the SalePrice correlations whose magnitude exceeds 0.40.
corr_matrix['SalePrice'].loc[lambda coefs: coefs.abs() > 0.40]

In [None]:
# Keep the columns whose correlation with SalePrice is above 0.5 in magnitude,
# leaving out SalePrice itself.
important_num_cols = [
    name
    for name, coef in corr_matrix['SalePrice'].items()
    if abs(coef) > 0.5 and name != 'SalePrice'
]
len(important_num_cols)

In [None]:
# Display the selected numeric column names.
important_num_cols

In [None]:
# Restrict the features to the selected numeric columns.
X_num_only = X[important_num_cols]

In [None]:
# Shape of the numeric-only feature frame.
X_num_only.shape

## Remove the features which are highly correlated with each other

In [None]:
# Heatmap of the correlations among the selected numeric features.
plt.figure(figsize=(10,8))
sns.heatmap(X_num_only.corr(), center = 0)
plt.title("Correlations Between Columns")
plt.show()

In [None]:
# Correlation matrix of the selected numeric features; its length is the number of features.
corr_X = X_num_only.corr()
len(corr_X)

In [None]:

# Scan the upper triangle of corr_X and report every feature pair whose
# correlation exceeds 0.6 in magnitude: coefficient, both positions, both names.
n_features = len(corr_X)
for row in range(n_features - 1):
    for col in range(row + 1, n_features):
        coef = corr_X.iloc[row, col]
        if abs(coef) > 0.6:
            print(coef, row, col, corr_X.index[row], corr_X.index[col])
            

In [None]:
# Following the pair report above, leave out 1stFlrSF, FullBath, TotRmsAbvGrd
# and GarageArea and keep the remaining selected numeric columns.
num_cols = [
    col
    for col in X_num_only.columns
    if col not in ('1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'GarageArea')
]


In [None]:
# Hand-picked categorical columns to use alongside the numeric features.
cat_cols = [
    "MSZoning",
    "Utilities",
    "BldgType",
    "Heating",
    "KitchenQual",
    "SaleCondition",
    "LandSlope",
]

In [None]:
# Working feature frame with the retained numeric columns.
# An explicit copy is taken because columns of X_final are assigned to later;
# writing into a bare selection of X triggers pandas' SettingWithCopyWarning.
X_final = X[num_cols].copy()

In [None]:
# Shape of the numeric feature frame.
X_final.shape

## Modify 'YearRemodAdd' feature - make it more informative

In [None]:
# Overwrite YearRemodAdd with the difference YearRemodAdd - YearBuilt.
X_final['YearRemodAdd'] = X_final['YearRemodAdd'] - X_final['YearBuilt']

In [None]:
# Preview the first rows after the YearRemodAdd change.
X_final.head()

# Handling missing data

In [None]:
# Count missing values per numeric feature.
X_final.isna().sum()

In [None]:
#X_final['MasVnrArea'] = X_final['MasVnrArea'].fillna(X_final['MasVnrArea'].median())

In [None]:
# Count missing values per selected categorical column.
X[cat_cols].isna().sum()

# Encoding Categorical data

In [None]:
# One-hot encode the selected categorical columns (one indicator column per category value).
X_categorical_df = pd.get_dummies(X[cat_cols], columns=cat_cols)

In [None]:
# Display the one-hot encoded frame.
X_categorical_df

In [None]:
# Create final dataframe

In [None]:
# Append the one-hot columns to the numeric features (index-aligned join).
X_final = X_final.join(X_categorical_df)

In [None]:
# Display the combined feature frame.
X_final

# Normalizing the data

In [None]:
from sklearn import preprocessing
# Fit a standard scaler on the numeric columns only; the one-hot columns are left unscaled.
# NOTE(review): the scaler is fitted on all rows before the train/validation split
# performed further down, so validation rows contribute to the scaling statistics.
standardize = preprocessing.StandardScaler().fit(X_final[num_cols])

In [None]:
# Per-column means learned by the scaler (one entry per column in num_cols).
standardize.mean_

In [None]:
# Replace the numeric columns with their standardized values.
X_final[num_cols] = standardize.transform(X_final[num_cols])

In [None]:
# Display the feature frame after scaling.
X_final

In [None]:
# Preview the first rows of the scaled feature frame.
X_final.head()

## Split training data into training and validation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(X_final, y, test_size=0.2, random_state=1)

In [None]:
# Shapes of the four split pieces.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

# Regression Using Machine Learning 

In [None]:
from sklearn.metrics import r2_score 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Parallel accumulators filled by the model cells: RMSLE values and the matching model labels.
perf, method = [], []

In [None]:
from sklearn.metrics import mean_squared_log_error

In [None]:
# Linear Regression: fit on the training split, evaluate on the validation split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first for the score to be measured against the real prices.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)
# Root mean squared logarithmic error (same argument convention).
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
method.append('Linear Regression')
perf.append(rmsle)


In [None]:
# Ridge regression: fit on the training split, evaluate on the validation split.
ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)
method.append('Ridge Regression')

# Root mean squared logarithmic error (same argument convention).
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
perf.append(rmsle)

In [None]:
# Lasso regression: fit on the training split, evaluate on the validation split.
lasso = Lasso()
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)
method.append('Lasso Regression')

# Root mean squared logarithmic error (same argument convention).
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
perf.append(rmsle)

In [None]:
# Support vector regression with the default kernel and a very large C.
# This run is only printed; it is deliberately not added to the method/perf comparison.
from sklearn.svm import SVR
svr = SVR(C=1000000)
svr.fit(X_train, y_train)
predictions = svr.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)
#method.append('SVM')
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
#perf.append(rmsle)

In [None]:
# Support vector regression with an explicit RBF kernel; this is the model
# recorded under 'SVR' in the comparison lists.
svr_rbf = SVR(kernel="rbf", C=1000000, gamma=0.01, epsilon=0.1)
svr_rbf.fit(X_train, y_train)
predictions = svr_rbf.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)

method.append('SVR')
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
perf.append(rmsle)

In [None]:
# Random forest regressor, swept over n_estimators = 50, 100, ..., 450.
for i in range(50 , 500, 50):
    random_forest = RandomForestRegressor(n_estimators=i)
    random_forest.fit(X_train, y_train)
    predictions = random_forest.predict(X_val)

    # r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground
    # truth must come first.
    r_squared = r2_score(y_val, predictions)

    print("R2 Score:", r_squared)
    # Give every run its own label: identical labels land on the same
    # category in the barh comparison plot and the bars overlap.
    method.append(f'Random Forest Regressor (n_estimators={i})')
    rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
    print("RMSLE:", rmsle)
    perf.append(rmsle)

In [None]:
# XGBoost regressor: 1000 trees with a small learning rate.
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_val)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val, predictions)

print("R2 Score:", r_squared)
method.append('XGBoost Regressor')
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)
perf.append(rmsle)

In [None]:
# ANN
'''
import math
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError



hidden_units1 = 400
#hidden_units2 = 480
hidden_units3 = 256
learning_rate = 0.01
# Creating model using the Sequential in tensorflow
def build_model_using_sequential():
    model = Sequential([
        Dense(hidden_units1, kernel_initializer='normal', activation='relu'),
        Dropout(0.2),
        Dense(hidden_units3, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='linear')
      ])
    return model
# build the model
model = build_model_using_sequential()

# loss function
msle = MeanSquaredLogarithmicError()
model.compile(
    loss=msle, 
    optimizer=Adam(learning_rate=learning_rate), 
    metrics=[msle]
)

# train the model
history = model.fit(
    X_final.values, 
    y.values, 
    epochs=1000, 
    batch_size=64,
    validation_split=0.2
)
predictions = model.predict(X_val)
rmsle = np.sqrt(mean_squared_log_error(predictions, y_val))
print("RMSLE:", rmsle)
method.append('ANN')
perf.append(rmsle)
'''

In [None]:
# Compare performances of models: one horizontal bar per label in `method`,
# with the bar length taken from the matching RMSLE in `perf`.
plt.barh(method, perf)
plt.title('RMSLE comparison of models')

# Testing

In [None]:
# Test data preprocessing: take the same numeric and categorical columns as
# used for training. An explicit copy is taken because a column is assigned
# right below; writing into a bare selection of test_data triggers pandas'
# SettingWithCopyWarning.
X_test = test_data[num_cols + cat_cols].copy()
# Same YearRemodAdd change as applied to the training features.
X_test['YearRemodAdd'] = X_test['YearRemodAdd'] - X_test['YearBuilt']

In [None]:
# Shape of the selected test features.
X_test.shape

In [None]:
# One-hot encode the categorical columns of the test features, as was done for train.
X_test = pd.get_dummies(X_test)

In [None]:
# Display the encoded test features.
X_test

In [None]:
# Align X_test's columns with X_final's: columns missing from the test
# encoding are added filled with 0, and columns not present in X_final are dropped.
X_test = X_test.reindex(columns = X_final.columns, fill_value=0)

In [None]:
# Display the column-aligned test features.
X_test

In [None]:
# Apply the already-fitted scaler to the numeric columns of the test features.
X_test[num_cols] = standardize.transform(X_test[num_cols])

In [None]:
# Display the scaled test features.
X_test

## Handling missing values in test data

In [None]:
# Count missing values per column in the prepared test features.
X_test.isna().sum()

In [None]:
# Fill missing TotalBsmtSF with the training median.
# X_test's numeric columns have already been standardized at this point, so the
# raw median is first mapped onto the scaler's scale using the mean_/scale_
# fitted for this column; filling with the raw value would insert an extreme outlier.
bsmt_pos = num_cols.index('TotalBsmtSF')
bsmt_fill = (train_data['TotalBsmtSF'].median() - standardize.mean_[bsmt_pos]) / standardize.scale_[bsmt_pos]
X_test['TotalBsmtSF'] = X_test['TotalBsmtSF'].fillna(bsmt_fill)

In [None]:
# Fill missing GarageCars with the training mode.
# X_test's numeric columns have already been standardized at this point, so the
# raw mode is first mapped onto the scaler's scale using the mean_/scale_
# fitted for this column; filling with the raw value would be on the wrong scale.
cars_pos = num_cols.index('GarageCars')
cars_fill = (train_data['GarageCars'].mode()[0] - standardize.mean_[cars_pos]) / standardize.scale_[cars_pos]
X_test['GarageCars'] = X_test['GarageCars'].fillna(cars_fill)

In [None]:
# Submission using SVR

# Predict with the RBF-kernel SVR and pair each prediction with the test row's Id.
preds = svr_rbf.predict(X_test)
submit = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': preds})
# Write the two-column submission file without the DataFrame index.
submit.to_csv('submission.csv',index=False)


In [None]:
# Submission using ANN
'''
preds = model.predict(X_test)
preds_2 = [i[0] for i in preds]
out = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': preds_2}) 
out.to_csv('submission.csv',index=False)
'''

# Effect of Label Encodings for Categorical Features

We used one-hot encoding for the categorical features. Let's see how performance changes with label encoding, which can be used with Random Forest and XGBoost.

In [None]:
# Numeric features taken again from X (unscaled, original YearRemodAdd).
X_numerical = X[num_cols]

In [None]:
# Display the numeric features.
X_numerical

In [None]:
# Raw categorical columns, to be label-encoded in place further down.
# An explicit copy is taken because the columns are overwritten afterwards;
# writing into a bare selection of X triggers pandas' SettingWithCopyWarning.
X_categorical_df = X[cat_cols].copy()

In [None]:
# Display the raw categorical columns.
X_categorical_df

In [None]:
# Label-encode every column of X_categorical_df that is not int64/float64:
# each such column is converted to a pandas categorical and replaced by its
# integer category codes. The codes are also kept per column in `encodings`.
datatypes = X_categorical_df.dtypes
encodings = {}

# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, dtype) pairs and is available in older versions as well.
for col, dt in datatypes.items():
    if(str(dt) not in ['float64', 'int64']):
        X_categorical_df[col] = X_categorical_df[col].astype("category")
        encodings[col] = X_categorical_df[col].cat.codes
        X_categorical_df[col] = encodings[col]

In [None]:
# Display the label-encoded categorical columns.
X_categorical_df


In [None]:
# Combine the numeric features with the label-encoded categorical columns (index-aligned join).
X_final_2 = X_numerical.join(X_categorical_df)

In [None]:
# Display the combined label-encoded feature frame.
X_final_2

In [None]:
# Overwrite YearRemodAdd with the difference YearRemodAdd - YearBuilt, as in the one-hot pipeline.
X_final_2['YearRemodAdd'] = X_final_2['YearRemodAdd'] - X_final_2['YearBuilt']

In [None]:
# Fit a scaler on every column of X_final_2 (numeric and label-encoded alike).
# NOTE(review): this rebinds `standardize`, the name the test-data scaling cell
# also uses; re-running that cell after this one would pick up this scaler
# instead — confirm that is acceptable.
standardize = preprocessing.StandardScaler().fit(X_final_2)
# Replace X_final_2 with the transformed values returned by the scaler.
X_final_2 = standardize.transform(X_final_2)
# Same test_size and random_state as the earlier split.
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X_final_2, y, test_size=0.2, random_state=1)

In [None]:
# Display the label-encoded training split.
X_train_2

In [None]:
# Random forest regressor on the label-encoded features, swept over
# n_estimators = 50, 100, ..., 450.
for i in range(50 , 500, 50):
    random_forest = RandomForestRegressor(n_estimators=i)
    random_forest.fit(X_train_2, y_train_2)
    predictions = random_forest.predict(X_val_2)

    # r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground
    # truth must come first.
    r_squared = r2_score(y_val_2, predictions)

    print("R2 Score:", r_squared)
    # Label each run distinctly so these entries do not collide with each
    # other or with the one-hot runs already stored in `method`.
    method.append(f'Random Forest Regressor (label enc., n_estimators={i})')
    rmsle = np.sqrt(mean_squared_log_error(y_val_2, predictions))
    print("RMSLE:", rmsle)
    perf.append(rmsle)

In [None]:
# XGBoost regressor on the label-encoded features.
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train_2, y_train_2)
predictions = xgb.predict(X_val_2)

# r2_score takes (y_true, y_pred). R^2 is not symmetric, so the ground truth
# must come first.
r_squared = r2_score(y_val_2, predictions)

print("R2 Score:", r_squared)
# Distinct label so this entry does not collide with the one-hot XGBoost run.
method.append('XGBoost Regressor (label enc.)')
# Score against y_val_2, the target split that belongs to X_val_2.
rmsle = np.sqrt(mean_squared_log_error(y_val_2, predictions))
print("RMSLE:", rmsle)
perf.append(rmsle)

## There is not much change in the performance when we change the categorical labelling method.

# Alternatively: Using LazyRegressor to compare all the models
Trying LazyRegressor Library to compare performance of different regression models

In [None]:
#pip install lazypredict

In [None]:
'''
from lazypredict.Supervised import LazyRegressor
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_val, y_train, y_val)
print(models)'''

In [None]:
'''                               Adjusted R-Squared  R-Squared       RMSE  \
Model                                                                     
GradientBoostingRegressor                    0.89       0.91   25630.04   
BaggingRegressor                             0.89       0.90   26123.02   
RandomForestRegressor                        0.89       0.90   26324.15   
PoissonRegressor                             0.87       0.89   28297.38   
ExtraTreesRegressor                          0.86       0.88   29108.04   
LGBMRegressor                                0.86       0.88   29311.39   
HistGradientBoostingRegressor                0.86       0.88   29370.53   
XGBRegressor                                 0.86       0.87   30029.86   
AdaBoostRegressor                            0.83       0.85   32620.73   
LassoCV                                      0.81       0.84   34059.91   
LassoLarsCV                                  0.81       0.84   34064.17   
LarsCV                                       0.81       0.84   34094.51   
LassoLarsIC                                  0.81       0.84   34171.60   
LassoLars                                    0.81       0.83   34353.80   
Lars                                         0.81       0.83   34356.05   
HuberRegressor                               0.81       0.83   34389.33   
TransformedTargetRegressor                   0.81       0.83   34394.09   
LinearRegression                             0.81       0.83   34394.09   
Lasso                                        0.81       0.83   34398.37   
Ridge                                        0.81       0.83   34403.95   
RidgeCV                                      0.81       0.83   34449.75   
PassiveAggressiveRegressor                   0.81       0.83   34450.98   
BayesianRidge                                0.81       0.83   34521.54   
OrthogonalMatchingPursuitCV                  0.80       0.83   34962.43   
RANSACRegressor                              0.80       0.83   35026.20   
OrthogonalMatchingPursuit                    0.78       0.81   36603.98   
GammaRegressor                               0.78       0.80   37373.70   
ElasticNet                                   0.77       0.80   38043.20   
DecisionTreeRegressor                        0.75       0.78   39247.87   
KNeighborsRegressor                          0.72       0.76   41542.03   
TweedieRegressor                             0.72       0.76   41548.79   
GeneralizedLinearRegressor                   0.72       0.76   41548.79   
ExtraTreeRegressor                           0.62       0.67   48387.85   
ElasticNetCV                                -0.05       0.09   80708.65   
SGDRegressor                                -0.05       0.08   80970.68   
NuSVR                                       -0.15      -0.00   84465.81   
DummyRegressor                              -0.15      -0.01   84695.70   
SVR                                         -0.16      -0.02   85098.78   
KernelRidge                                 -4.55      -3.85  185944.47   
MLPRegressor                                -5.02      -4.25  193564.51   
LinearSVR                                   -5.04      -4.27  193955.44   
GaussianProcessRegressor                 -2305.38   -2012.13 3789135.85   
'''