In [None]:
# importing required libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, LinearRegression
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

In [None]:
# Load the train and test CSVs, remember the test 'Id' column (it is reused
# when the submission frame is built) and show both frame shapes.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)
id_test = df_test['Id']
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Notebook-wide display settings: cap printed rows at 40, show every column,
# and use a wide default figure size for all matplotlib plots.
pd.set_option("display.max_rows", 40)
pd.set_option("display.max_columns", None)
plt.rcParams.update({"figure.figsize": (18, 8)})

# examining datasets

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Remove the 'Id' column from both frames (the test Ids are already kept in
# `id_test`). Rebinding instead of inplace=True, together with
# errors='ignore', lets this cell be re-run without raising KeyError.
df_train = df_train.drop(columns='Id', errors='ignore')
df_test = df_test.drop(columns='Id', errors='ignore')

In [None]:
# Per-column dtype and non-null count summary of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
df_train.describe()

In [None]:
# Shapes of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Heatmap of the training frame's null mask, with one x tick label per column.
sns.heatmap(
    data=df_train.isnull(),
    xticklabels=df_train.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
)

In [None]:
# The columns 'Alley', 'MiscFeature', 'PoolQC', 'FireplaceQu' and 'Fence' have a
# large number of missing values, so they are dropped from both frames.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
sparse_columns = ['Alley', 'MiscFeature', 'PoolQC', 'FireplaceQu', 'Fence']
df_train = df_train.drop(columns=sparse_columns, errors='ignore')
df_test = df_test.drop(columns=sparse_columns, errors='ignore')

In [None]:
# Split the training columns into target / features, and the features into
# object-dtype (categorical) and everything else (numeric).
target_col = ['SalePrice']
feature_cols = [name for name in df_train.columns if name != 'SalePrice']
categorical_cols = [name for name in feature_cols if df_train[name].dtype == 'O']
numeric_cols = [name for name in feature_cols if name not in categorical_cols]

In [None]:
# Number of missing values in each numeric feature column.
df_train[numeric_cols].isnull().sum()

In [None]:
# Number of missing values in each categorical (object-dtype) feature column.
df_train[categorical_cols].isnull().sum()

In [None]:
# Create the numerical part of the original dataframe: only the int64 and
# float64 columns are kept. SalePrice is still part of df_train here, so it is
# included and available for the correlation analysis below.
df_train_numerical = df_train.select_dtypes(include=['int64', 'float64'])
df_train_numerical.head()

In [None]:
# Pairwise correlation matrix of the numeric columns, drawn as a heatmap to
# see how the numeric features relate to each other and to SalePrice.
corrmat = df_train_numerical.corr()
sns.heatmap(data=corrmat, cmap='RdYlGn')

In [None]:
# LotFrontage is only weakly correlated with SalePrice and also contains many
# missing values, so it is removed from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable after the column has already been dropped.
df_train = df_train.drop(columns='LotFrontage', errors='ignore')
df_test = df_test.drop(columns='LotFrontage', errors='ignore')

In [None]:
# Columns whose absolute correlation with SalePrice exceeds 0.5, and an
# annotated heatmap of the correlations among just those columns.
t_corr = corrmat.index[corrmat['SalePrice'].abs() > 0.5]
sns.heatmap(
    df_train_numerical[t_corr].corr(),
    cmap="RdYlGn",
    annot=True,
)

# Removing Outliers

**for feature - GrLivArea**

In [None]:
# GrLivArea against SalePrice, before any rows are removed.
plt.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice'])
plt.xlabel('GrLivArea')
plt.title('GrLivArea vs SalePrice')
plt.ylabel('SalePrice')

In [None]:
# Treat rows with GrLivArea above 4000 as outliers: drop them by index label
# and redraw the scatter plot on the remaining rows.
df_train = df_train.drop(index=df_train.index[df_train['GrLivArea'] > 4000])
plt.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice'])
plt.xlabel('GrLivArea')
plt.title('GrLivArea vs SalePrice')
plt.ylabel('SalePrice')

**for feature - TotalBsmtSF**

In [None]:
# TotalBsmtSF against SalePrice, before the TotalBsmtSF filter is applied.
plt.scatter(x=df_train['TotalBsmtSF'], y=df_train['SalePrice'])
plt.xlabel('TotalBsmtSF')
plt.title('TotalBsmtSF vs SalePrice')
plt.ylabel('SalePrice')

In [None]:
# Treat rows with TotalBsmtSF above 3000 as outliers: drop them by index label
# and redraw the scatter plot on the remaining rows.
df_train = df_train.drop(index=df_train.index[df_train['TotalBsmtSF'] > 3000])
plt.scatter(x=df_train['TotalBsmtSF'], y=df_train['SalePrice'])
plt.xlabel('TotalBsmtSF')
plt.title('TotalBsmtSF vs SalePrice')
plt.ylabel('SalePrice')

**for feature - 1stFlrSF**

In [None]:
# 1stFlrSF against SalePrice, before the 1stFlrSF filter is applied.
plt.scatter(x=df_train['1stFlrSF'], y=df_train['SalePrice'])
plt.xlabel('1stFlrSF')
plt.title('1stFlrSF vs SalePrice')
plt.ylabel('SalePrice')

In [None]:
# Treat rows with 1stFlrSF above 2750 as outliers: drop them by index label
# and redraw the scatter plot on the remaining rows.
df_train = df_train.drop(index=df_train.index[df_train['1stFlrSF'] > 2750])
plt.scatter(x=df_train['1stFlrSF'], y=df_train['SalePrice'])
plt.xlabel('1stFlrSF')
plt.title('1stFlrSF vs SalePrice')
plt.ylabel('SalePrice')

**for feature - GarageArea**

In [None]:
# GarageArea against SalePrice, before the GarageArea filter is applied.
plt.scatter(x=df_train['GarageArea'], y=df_train['SalePrice'])
plt.xlabel('GarageArea')
plt.title('GarageArea vs SalePrice')
plt.ylabel('SalePrice')

In [None]:
# Treat rows with GarageArea above 1200 as outliers: drop them by index label
# and redraw the scatter plot on the remaining rows.
df_train = df_train.drop(index=df_train.index[df_train['GarageArea'] > 1200])
plt.scatter(x=df_train['GarageArea'], y=df_train['SalePrice'])
plt.xlabel('GarageArea')
plt.title('GarageArea vs SalePrice')
plt.ylabel('SalePrice')

In [None]:
# Shape of the training frame after the outlier rows were removed.
print(df_train.shape)

**Creating copies of datasets**

In [None]:
# Training feature matrix (columns 'MSSubClass' through 'SaleCondition',
# inclusive) and target. The explicit .copy() makes X_train an independent
# frame: it is modified later (imputation, new features, column drops), and
# doing that on a bare .loc slice of df_train is ambiguous in pandas
# (SettingWithCopy) and may or may not write through to df_train.
X_train = df_train.loc[:, 'MSSubClass':'SaleCondition'].copy()
y_train = df_train['SalePrice']
print(X_train.shape, y_train.shape)

In [None]:
# Test feature matrix over the same column range as the training features.
# .copy() detaches it from df_test so the later in-place edits of X_test are
# unambiguous (no SettingWithCopy behaviour on a .loc slice).
X_test = df_test.loc[:, 'MSSubClass':'SaleCondition'].copy()
print(X_test.shape)

# Handling missing values

**For training data**

In [None]:
# Column labels that remain in the training frame at this point.
df_train.columns

In [None]:
# Percentage of missing cells over the whole training frame.
missing_cells = df_train.isnull().sum().sum()
# np.prod replaces the np.product alias, which was removed in NumPy 2.0.
total_cells = np.prod(df_train.shape)

percent_missing = (missing_cells/total_cells)*100
percent_missing

In [None]:
# Missing-value count per training column, for the label range
# 'MSSubClass' through 'Heating' (both ends inclusive).
df_train.loc[:, 'MSSubClass':'Heating'].isnull().sum()

In [None]:
# Missing-value count per training column, for the label range
# 'HeatingQC' through 'SalePrice' (both ends inclusive).
df_train.loc[:, 'HeatingQC':'SalePrice'].isnull().sum()

In [None]:
# Column groups for imputing the training set, one group per fill strategy.

# NaN is replaced by the string 'None'.
columns_na_to_None = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]

# NaN is replaced by the column's most frequent value.
columns_na_to_mf = ['MasVnrType', 'Electrical']

# NaN is replaced by the column mean.
columns_na_to_avg = ['MasVnrArea', 'GarageYrBlt']

In [None]:
# Impute the missing training values, one imputer per strategy.
# Each imputer is fitted and applied once to its whole column group; the
# previous per-column loops never used their loop variable and simply refitted
# the same imputer on the same columns on every iteration.

# NaN -> the string 'None'
imputer1 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
X_train.loc[:, columns_na_to_None] = imputer1.fit_transform(X_train.loc[:, columns_na_to_None])

# NaN -> most frequent value of the column
imputer2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train.loc[:, columns_na_to_mf] = imputer2.fit_transform(X_train.loc[:, columns_na_to_mf])

# NaN -> column mean
imputer3 = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train.loc[:, columns_na_to_avg] = imputer3.fit_transform(X_train.loc[:, columns_na_to_avg])

In [None]:
# Null-mask heatmap of the training features after imputation.
sns.heatmap(
    data=X_train.isnull(),
    xticklabels=X_train.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
)

**for testing data**

In [None]:
# Percentage of missing cells over the whole test frame.
missing_cells = df_test.isnull().sum().sum()
# np.prod replaces the np.product alias, which was removed in NumPy 2.0.
total_cells = np.prod(df_test.shape)

percent_missing = (missing_cells/total_cells)*100
percent_missing

In [None]:
# Missing-value count per test column, for the label range
# 'MSSubClass' through 'Heating' (both ends inclusive).
df_test.loc[:, 'MSSubClass':'Heating'].isnull().sum()

In [None]:
# Missing-value count per test column, for the label range
# 'HeatingQC' through 'SaleCondition' (both ends inclusive).
df_test.loc[:, 'HeatingQC':'SaleCondition'].isnull().sum()

In [None]:
# Column groups for imputing the test set, one group per fill strategy.
# These rebind the names used for the training set with the test-set lists.

# NaN is replaced by the string 'None'.
columns_na_to_None = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]

# NaN is replaced by the column's most frequent value.
columns_na_to_mf = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'KitchenQual', 'Functional', 'SaleType',
]

# NaN is replaced by the column mean.
columns_na_to_avg = [
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'GarageYrBlt',
]

# NaN is replaced by 0.
columns_na_to_0 = ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']

In [None]:
# Impute the missing test values, one imputer per strategy.
#
# Two changes compared with a per-column loop fitted on X_test:
#   * each imputer runs once on its whole column group (the loop variable was
#     never used, so every iteration repeated identical work);
#   * the imputers are fitted on X_train, so the fill statistics (mode / mean)
#     are learned from the training data only and merely applied to the test
#     set, as is done for the scaler later in the notebook.

# NaN -> the string 'None'
imputer4 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
imputer4.fit(X_train.loc[:, columns_na_to_None])
X_test.loc[:, columns_na_to_None] = imputer4.transform(X_test.loc[:, columns_na_to_None])

# NaN -> most frequent value observed in the training column
imputer5 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer5.fit(X_train.loc[:, columns_na_to_mf])
X_test.loc[:, columns_na_to_mf] = imputer5.transform(X_test.loc[:, columns_na_to_mf])

# NaN -> mean of the training column
imputer6 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer6.fit(X_train.loc[:, columns_na_to_avg])
X_test.loc[:, columns_na_to_avg] = imputer6.transform(X_test.loc[:, columns_na_to_avg])

# NaN -> 0
imputer7 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer7.fit(X_train.loc[:, columns_na_to_0])
X_test.loc[:, columns_na_to_0] = imputer7.transform(X_test.loc[:, columns_na_to_0])

In [None]:
# Null-mask heatmap of the test features after imputation.
sns.heatmap(
    data=X_test.isnull(),
    xticklabels=X_test.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
)

# feature extraction

In [None]:
# Derive the same two features on the training and the test features:
#   Total_Bathrooms - full baths plus half of the half baths (basement included)
#   House_year      - midpoint between construction year and remodel year
for frame in (X_train, X_test):
    full_baths = frame['BsmtFullBath'] + frame['FullBath']
    half_baths = frame['BsmtHalfBath'] + frame['HalfBath']
    frame['Total_Bathrooms'] = full_baths + 0.5 * half_baths
    frame['House_year'] = 0.5 * (frame['YearBuilt'] + frame['YearRemodAdd'])

In [None]:
# The raw columns that were folded into Total_Bathrooms and House_year are no
# longer needed. Rebinding with errors='ignore' (instead of inplace=True)
# keeps this cell re-runnable once the columns have been removed.
engineered_sources = ['BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath', 'YearBuilt', 'YearRemodAdd']
X_train = X_train.drop(columns=engineered_sources, errors='ignore')
X_test = X_test.drop(columns=engineered_sources, errors='ignore')
print(X_train.shape)
print(X_test.shape)

# Categorical encoding

In [None]:
# Keep a snapshot of the cleaned training features (it is used later to tell
# numeric columns from object columns) and one-hot encode the object columns.
# The former intermediate `X_train_encoded = X_train_copy.copy()` was dead code:
# it was overwritten on the very next line.
X_train_copy = X_train.copy()
X_train_encoded = pd.get_dummies(X_train_copy)

In [None]:
# Keep a snapshot of the cleaned test features and one-hot encode the object
# columns. The former intermediate `X_test_encoded = X_test_copy.copy()` was
# dead code: it was overwritten on the very next line.
X_test_copy = X_test.copy()
X_test_encoded = pd.get_dummies(X_test_copy)

In [None]:
# Shapes of the one-hot encoded train and test matrices, one per line.
print(X_train_encoded.shape, X_test_encoded.shape, sep='\n')

In [None]:
# Train and test were one-hot encoded separately, so a category that occurs in
# only one of them produces a dummy column the other frame lacks. Keep only the
# columns present in both frames, in the training frame's order, so the two
# matrices line up column-for-column.

list1 = list(X_train_encoded.columns)
list2 = list(X_test_encoded.columns)

# Set membership is O(1), and selecting the shared columns once replaces the
# repeated in-place single-column drops.
test_column_set = set(list2)
shared_columns = [column for column in list1 if column in test_column_set]

# .copy() yields independent frames that can be modified safely afterwards.
X_train_encoded = X_train_encoded.loc[:, shared_columns].copy()
X_test_encoded = X_test_encoded.loc[:, shared_columns].copy()

print(X_train_encoded.shape)
print(X_test_encoded.shape)

# feature scaling

In [None]:
# Some features have large values, so the numeric features are scaled later.
# To do that, the pre-encoding snapshot is used to separate object-dtype
# columns from all the others (treated as numeric).
object_columns = [
    name for name in X_train_copy.columns
    if X_train_copy[name].dtype == object
]
numeric_columns = [
    name for name in X_train_copy.columns
    if not X_train_copy[name].dtype == object
]

In [None]:
# sc_x standardises the numeric feature columns; sc_y is created here and used
# for the target in the following cells.
sc_x = StandardScaler()
sc_y = StandardScaler()

# The scaler is fitted on the training rows only and then applied to the test
# rows. Whole-column assignment (frame[cols] = ...) replaces the columns with
# the float result; `.loc[:, cols] = ...` would instead try to write floats
# into the existing integer columns, an upcast that pandas 2.x deprecates.
X_train_encoded[numeric_columns] = sc_x.fit_transform(X_train_encoded[numeric_columns])
X_test_encoded[numeric_columns] = sc_x.transform(X_test_encoded[numeric_columns])

In [None]:
# Standardise the target as an (n, 1) column, the 2-D shape StandardScaler
# expects. The values are read from df_train['SalePrice'] (the Series y_train
# was created from) rather than from y_train itself, so re-running this cell
# does not fail once y_train has been rebound to a NumPy array.
y_train = sc_y.fit_transform(df_train['SalePrice'].to_numpy().reshape(-1, 1))

# Model selection and training

In [None]:
# Final design matrix and flattened (1-D) scaled target, followed by an
# 80/20 hold-out split with a fixed seed.
X, y = X_train_encoded, y_train.ravel()

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Candidate models to train and compare.
# The scikit-learn estimators that draw random numbers during fitting are
# given random_state=0 (the same seed used for train_test_split) so that
# their scores are reproducible from one run to the next.

models = {
    "ridge": Ridge(),
    "SGD": SGDRegressor(random_state=0),
    "xg": XGBRegressor(),
    "lightGBM": LGBMRegressor(),
    "catboost": CatBoostRegressor(),
    "gradientB": GradientBoostingRegressor(random_state=0),
    "DecisionTree": DecisionTreeRegressor(random_state=0),
    "Adaboost": AdaBoostRegressor(random_state=0),
    "randomforest": RandomForestRegressor(random_state=0),
    "svr_rbf": SVR(kernel="rbf"),
    "svr_lin": SVR(kernel="linear"),
    "svr_poly": SVR(kernel="poly"),
}

In [None]:
# Applying 10-fold cross-validation to every candidate model.
# For regressors, cross_val_score's default scorer is the estimator's own
# .score(), i.e. the R^2 coefficient of determination - not a classification
# accuracy. The metric is therefore requested explicitly and labelled as R^2.
# `y` is already 1-D, so no extra ravel() is needed.

for name, model in models.items():
  r2_scores = cross_val_score(estimator=model, X=X, y=y, scoring="r2", cv=10)
  print(name, "R^2: {:.2f} %".format(r2_scores.mean()*100))
  print("Standard Deviation: {:.2f} %".format(r2_scores.std()*100))

In [None]:
# Fit every model on the 80% split and report its root-mean-squared error
# (RMSE) on the 20% hold-out split. The target was standardised, so the error
# is expressed in standard deviations of SalePrice, not in currency.

for desc, model in models.items():
    model.fit(X_train_final, y_train_final)
    prediction = model.predict(X_test_final)
    # RMSE = sqrt(MSE). The `squared=False` keyword of mean_squared_error is
    # deprecated and removed in recent scikit-learn releases; taking the
    # square root explicitly works on every version.
    result = np.sqrt(mean_squared_error(y_test_final, prediction))
    print(f'{desc}: {result}')


In [None]:
# Model, mean R^2 over 10-fold CV (reported as "Accuracy"), Standard Deviation of R^2, hold-out RMSE on the scaled target

# ridge, 90.69%, 0.91%, 0.29375050629020977
# SGD, 90.76%, 1.06%, 0.29483574914497945
# xg, 89.43%, 1.63%, 0.34289184578641235
# lightGBM, 91.08%, 1.68%, 0.30059592483255493
# catboost, 92.45%, 0.90%, 0.2713554006017869
# gradientB, 91.51%, 1.22%, 0.2903356551726021
# DecisionTree, 75.46%, 3.70%, 0.48797232970835924
# AdaBoost, 83.47%, 2.31%, 0.3943029926325227
# randomforest, 89.97%, 1.76%, 0.3408896939104737
# svr_rbf, 89.79%, 2.35%, 0.3453209972625774
# svr_lin, 90.86%, 0.86%, 0.3000789771202684
# svr_poly, 92.70%, 1.42%, 0.25634571066802375

# Submission

In [None]:
# CatBoostRegressor is the model chosen for the submission (note that the
# results table above lists svr_poly with slightly better numbers).
# Steps: refit on all training rows, predict the encoded test set, map the
# predictions back from the standardised scale to prices, write the CSV.

model = CatBoostRegressor()
model.fit(X, y.ravel())

scaled_pred = model.predict(X_test_encoded)
sub_pred = sc_y.inverse_transform(scaled_pred.reshape(-1, 1))

my_submission = pd.DataFrame({'Id': id_test, 'SalePrice': sub_pred.ravel()})
my_submission.to_csv('submission.csv', index=False)