# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import warnings
import os

# Silence library warnings and let pandas display every column of wide frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Data Preprocessing

In [None]:
# Read the training CSV and preview its first ten rows.
data = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
data.head(n=10)

In [None]:
# Percentage of missing values, restricted to columns that contain any NaN.
data.loc[:, data.isna().any()].isna().mean().mul(100)

In [None]:
# Remove the listed columns in place, then re-check the remaining NaN percentages.
data.drop(
    columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id'],
    inplace=True,
)
data.loc[:, data.isna().any()].isna().mean().mul(100)

In [None]:
# Target is SalePrice; every other remaining column is a feature.
y = data['SalePrice']
x = data.drop(columns='SalePrice')

## Outlier Detection

In [None]:
# Draw one box plot per non-object column to eyeball outliers.
for col in x.columns:
    if x[col].dtype != 'object':
        # Pass the Series by keyword: newer seaborn treats the first positional
        # argument as `data`, which would change the plot orientation.
        sns.boxplot(x=x[col])
        plt.title(col)
        plt.show()

In [None]:
# Plot the distribution of the z-scores of every non-object column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the rendered figures stay the same.
non_object_columns = [name for name in x.columns if x[name].dtype != 'object']
for name in non_object_columns:
    column = x[name]
    value_z = column.sub(column.mean()).div(column.std())
    sns.distplot(value_z)
    plt.show()

In [None]:
# Cap extreme values at fixed thresholds.
# The caps are applied with Series.clip and assigned back to the column rather
# than through chained indexing (x[col][mask] = value), which writes to a
# temporary object and is a silent no-op under pandas Copy-on-Write.
upper_caps = {
    'MSSubClass': 170,
    'LotFrontage': 190,
    'LotArea': 60000,
    'OverallCond': 8,
    'MasVnrArea': 1050,
    'BsmtFinSF1': 3000,
    'BsmtUnfSF': 2200,
    'TotalBsmtSF': 4000,
    '1stFlrSF': 3000,
    'GrLivArea': 4100,
    'BsmtFullBath': 2.5,
    'GarageArea': 1300,
    'WoodDeckSF': 650,
    'OpenPorchSF': 400,
}
for col, cap in upper_caps.items():
    # NaN entries are left untouched by clip, as with the original mask.
    x[col] = x[col].clip(upper=cap)

# YearBuilt is the only column bounded from below.
x['YearBuilt'] = x['YearBuilt'].clip(lower=1879)

## Handle Missing Data

In [None]:
# Show only the feature columns that still contain missing values.
x.loc[:, x.isna().any()]

In [None]:
# Histograms of the columns that contain missing values.
x.loc[:, x.isna().any()].hist(figsize=(20, 20))

In [None]:
numerical = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
categorical = [
    'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond',
]

# Numeric gaps get the column median; categorical gaps get the first mode.
fill_values = {name: x[name].median() for name in numerical}
fill_values.update({name: x[name].mode()[0] for name in categorical})
x = x.fillna(value=fill_values)

# Percentage of NaN left in any column that still has missing values.
x.loc[:, x.isna().any()].isna().mean().mul(100)

## Dummy Variable Encoding

In [None]:
# One-hot encode the features, dropping the first level of each category.
x_en = pd.get_dummies(data=x, drop_first=True)
x_en.head(n=5)

## Data Normalization

In [None]:
# Fit a min-max scaler on the encoded features and keep the column labels.
mm_scaler = MinMaxScaler()
mm_scaler.fit(x_en)
x_scaled = pd.DataFrame(data=mm_scaler.transform(x_en), columns=x_en.columns)
x_scaled.head(n=5)

In [None]:
# Scale the target with its own scaler so predictions can be inverted later.
y_data = y.to_frame()
target_scaler = MinMaxScaler()
y_scaled = target_scaler.fit(y_data).transform(y_data)
y_scaled

# Feature Selection

## RFE

In [None]:
# Recursive feature elimination down to 15 features using a decision tree.
# n_features_to_select is passed by keyword because it is keyword-only in
# current scikit-learn. The tree is not fitted beforehand: RFE fits its own
# clone of the estimator, so a separate fit would be discarded.
dtr = DecisionTreeRegressor()
rfe = RFE(estimator=dtr, n_features_to_select=15)
rfe = rfe.fit(x_scaled, y_scaled)

In [None]:
# Names of the columns retained by RFE (those flagged in rfe.support_).
x_scaled.columns[rfe.support_]

In [None]:
# Hard-coded list of 15 features used for the rest of the notebook.
# NOTE(review): presumably copied from an earlier RFE run — confirm it still
# matches rfe.support_, since the tree above has no fixed random_state.
selected_list = [
    'LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'MoSold', 'CentralAir_Y',
]
x_selected = x_scaled.loc[:, selected_list]
x_selected.head(n=5)

## VIF

In [None]:
def checkVIF(x):
    """Print the variance inflation factor of each column of ``x``.

    The table has a ``features`` and a ``VIF`` column, with VIF rounded to
    three decimals and rows sorted from highest to lowest VIF. Nothing is
    returned.
    """
    matrix = x.values
    scores = [variance_inflation_factor(matrix, idx) for idx in range(x.shape[1])]
    table = pd.DataFrame({'features': x.columns, 'VIF': scores})
    table['VIF'] = table['VIF'].round(3)
    print(table.sort_values(by="VIF", ascending=False))

In [None]:
# Print the VIF table for the currently selected features.
checkVIF(x_selected)

In [None]:
def tableOLS(x,y):
    """Fit an OLS regression of ``y`` on ``x`` plus a constant and print its summary."""
    design_matrix = sm.add_constant(x)
    print(sm.OLS(y, design_matrix).fit().summary())

In [None]:
# Print the OLS summary of the scaled target against the selected features.
tableOLS(x_selected,y_scaled)

In [None]:
# Discard five features, then print the OLS summary again.
x_selected = x_selected.drop(
    columns=['GrLivArea', 'TotRmsAbvGrd', 'GarageArea', 'MoSold', 'CentralAir_Y']
)
tableOLS(x_selected, y_scaled)

In [None]:
# Print the VIF table again for the reduced feature set.
checkVIF(x_selected)

In [None]:
# Discard two more features and print the updated VIF table.
x_selected = x_selected.drop(columns=['OverallQual', 'TotalBsmtSF'])
checkVIF(x_selected)

In [None]:
# Discard YearBuilt and print the updated VIF table.
x_selected = x_selected.drop(columns='YearBuilt')
checkVIF(x_selected)

In [None]:
# Keep the final feature names; they are reused to select the test-set columns.
x_features=x_selected.columns
x_features

# Model Evaluation

## Linear Regression

In [None]:
# Shared cross-validation setup: shuffled 10-fold splits scored by negative MSE.
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=10, shuffle=True, random_state=76)

model_LR = LinearRegression()
results_LR = cross_val_score(
    estimator=model_LR, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_LR.mean())

## Ridge Regression

In [None]:
# Cross-validated negative MSE of a default Ridge model.
model_Ridge = Ridge()
results_Ridge = cross_val_score(
    estimator=model_Ridge, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_Ridge.mean())

## Lasso Regression

In [None]:
# Cross-validated negative MSE of a default Lasso model.
model_Lasso = Lasso()
results_Lasso = cross_val_score(
    estimator=model_Lasso, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_Lasso.mean())

## ElasticNet Regression

In [None]:
# Cross-validated negative MSE of a default ElasticNet model.
model_EN = ElasticNet()
results_EN = cross_val_score(
    estimator=model_EN, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_EN.mean())

## KNN

In [None]:
# Grid-search the neighbour count (11 through 28) for a KNN regressor.
model_KNN = KNeighborsRegressor()
params_KNN = {'n_neighbors': list(range(11, 29))}
grid_KNN = GridSearchCV(
    estimator=model_KNN, param_grid=params_KNN, cv=kfold, scoring=scoring
)
grid_KNN.fit(x_selected, y_scaled)
best_knn = (grid_KNN.best_score_, grid_KNN.best_params_)
print("Best Score: %f use parameters: %s" % best_knn)

## Decision Tree Regressor (CART)

In [None]:
# Grid-search the depth (2 through 16, plus 18) of a single decision tree.
model_cart = DecisionTreeRegressor()
params_cart = {'max_depth': [*range(2, 17), 18]}
grid_cart = GridSearchCV(
    estimator=model_cart, param_grid=params_cart, cv=kfold, scoring=scoring
)
grid_cart.fit(x_selected, y_scaled)
best_cart = (grid_cart.best_score_, grid_cart.best_params_)
print("Best Score: %f use parameters: %s" % best_cart)

## SVM

In [None]:
# Grid-search gamma and C for an RBF-kernel support vector regressor.
params_svr = {
    'kernel': ['rbf'],
    'gamma': [0.6, 0.4, 0.5],
    'C': [13, 14, 15, 16, 17, 18],
}
model_SVR = SVR()
grid_svm = GridSearchCV(
    estimator=model_SVR, param_grid=params_svr, cv=kfold, scoring=scoring, n_jobs=-1
)
grid_svm.fit(x_selected, y_scaled)
best_svm = (grid_svm.best_score_, grid_svm.best_params_)
print("Best Score: %f use parameters: %s" % best_svm)

## Bagging Decision Tree Regressor

In [None]:
# Bagged decision trees (depth 13, 10 estimators).
# The error is estimated with the same 10-fold cross-validation and scoring as
# every other model. Measuring it on the data the model was fitted on would
# understate it and bias the model comparison.
model_cart_bagging = DecisionTreeRegressor(max_depth=13)
# The base estimator is passed positionally so the call works whether the
# installed scikit-learn names this parameter `base_estimator` or `estimator`.
model_bagging = BaggingRegressor(model_cart_bagging, n_estimators=10, random_state=76)
results_bagging = cross_val_score(
    model_bagging, x_selected, y_scaled.ravel(), cv=kfold, scoring=scoring
)
# Scores are negative MSE; store the positive error for the comparison chart.
error_bagging = abs(results_bagging.mean())
error_bagging

## Random Forest

In [None]:
# Grid-search the depth (7 through 16, plus 18) of a random forest.
params_rt = {'max_depth': [*range(7, 17), 18]}
model_randomforest = RandomForestRegressor()
grid_rt = GridSearchCV(
    estimator=model_randomforest, param_grid=params_rt, cv=kfold,
    scoring=scoring, n_jobs=-1,
)
grid_rt.fit(x_selected, y_scaled)
best_rt = (grid_rt.best_score_, grid_rt.best_params_)
print("Best Score: %f use parameters: %s" % best_rt)

## Extra Trees

In [None]:
# Grid-search the depth (50, 60, 70, 80, 90) of an extra-trees ensemble.
params_extratrees = {'max_depth': range(50, 100, 10)}
model_extratrees = ExtraTreesRegressor()
grid_extratrees = GridSearchCV(
    estimator=model_extratrees, param_grid=params_extratrees, cv=kfold,
    scoring=scoring, n_jobs=-1,
)
grid_extratrees.fit(x_selected, y_scaled)
best_extratrees = (grid_extratrees.best_score_, grid_extratrees.best_params_)
print("Best Score: %f use parameters: %s" % best_extratrees)

## Ada Boost

In [None]:
# Grid-search the number of estimators (30 through 44) for AdaBoost over
# depth-6 decision trees.
# The base tree is passed positionally so the call works whether the installed
# scikit-learn names this parameter `base_estimator` or `estimator`.
model_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6))
params_ada = dict(n_estimators=range(30, 45, 1))
grid_ada = GridSearchCV(estimator=model_ada, param_grid=params_ada, scoring=scoring, cv=kfold, n_jobs=-1)
grid_ada.fit(x_selected, y_scaled)
print("Best Score: %f use parameters: %s" % (grid_ada.best_score_, grid_ada.best_params_))

## Gradient Boosting

In [None]:
# Grid-search estimators, learning rate and depth for gradient boosting.
params_gb = {
    'n_estimators': range(45, 55, 1),
    'learning_rate': [0.1, 0.09, 0.08],
    'max_depth': range(4, 6, 1),
}
model_gradientBoosting = GradientBoostingRegressor()
grid_gb = GridSearchCV(
    estimator=model_gradientBoosting, param_grid=params_gb, cv=kfold,
    scoring=scoring, n_jobs=-1,
)
grid_gb.fit(x_selected, y_scaled)
best_gb = (grid_gb.best_score_, grid_gb.best_params_)
print("Best Score: %f use parameters: %s" % best_gb)

## Compare Models

In [None]:
# Pair each model label with its score, then split into the two parallel lists
# used by the bar chart. Scores are made positive with abs().
score_by_model = {
    'Linear Regression': results_LR.mean(),
    'Ridge': results_Ridge.mean(),
    'Lasso': results_Lasso.mean(),
    'Elastic Net': results_EN.mean(),
    'KNN': grid_KNN.best_score_,
    'CART': grid_cart.best_score_,
    'SVM': grid_svm.best_score_,
    'Bagging CART': error_bagging,
    'Random Forest': grid_rt.best_score_,
    'Extra Trees': grid_extratrees.best_score_,
    'Ada Boost': grid_ada.best_score_,
    'Gradient Boosting': grid_gb.best_score_,
}
model_name = list(score_by_model)
error_model = [abs(score) for score in score_by_model.values()]

In [None]:
# Bar chart of the error of every evaluated model.
fig = plt.figure(figsize=(20, 8), facecolor='gainsboro')
ax = fig.add_subplot(111)
fig.suptitle('Errors of the Algorithms')
ax.bar(x=model_name, height=error_model, color='lightgrey', edgecolor='cyan')
plt.show()

# Final Model with Bagging Decision Tree

In [None]:
# Final model: 10 bagged decision trees of depth 13, fitted on all training rows.
final_cart = DecisionTreeRegressor(max_depth=13)
# Use the tree created here, so this cell does not depend on an estimator built
# in another cell. It is passed positionally so the call works whether the
# installed scikit-learn names this parameter `base_estimator` or `estimator`.
final_model = BaggingRegressor(final_cart, n_estimators=10, random_state=76)
final_model.fit(x_selected, y_scaled.ravel())

In [None]:
# Read the test CSV and preview its first rows.
test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)
test.head(n=5)

In [None]:
# Set the Id column aside for the submission, drop the same columns that were
# dropped from the training data, and one-hot encode the rest.
Id_pred = test.pop('Id')
test.drop(
    columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],
    inplace=True,
)
test_en = pd.get_dummies(data=test, drop_first=True)
test_en.shape

In [None]:
# Stack the encoded train and test frames so they share one set of columns;
# columns missing on either side are filled with NaN.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
result_test = pd.concat([x_en, test_en], sort=False)
result_test.shape

In [None]:
# Recover the test rows: everything stacked after the training rows.
# The offset comes from len(x_en) rather than hard-coded row numbers. Columns
# are restricted to the ones the feature scaler was fitted on, so test-only
# dummy columns cannot break the later transform. The explicit copy makes the
# later column assignments act on an independent frame.
test_en_2 = result_test.iloc[len(x_en):].loc[:, x_en.columns].copy()
test_en_2.shape

In [None]:
# Percentage of missing values in the test columns that contain any NaN.
test_en_2.loc[:, test_en_2.isna().any()].isna().mean().mul(100)

In [None]:
# Listed dummy columns get 0 wherever they are NaN in the test rows.
list_test = [
    'Utilities_NoSeWa', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
    'HouseStyle_2.5Fin', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal',
    'RoofMatl_Roll', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
    'Exterior2nd_Other', 'Heating_GasA', 'Heating_OthW', 'Electrical_Mix',
    'GarageQual_Fa',
]
test_en_2[list_test] = test_en_2[list_test].fillna(0)
test_en_2.loc[:, test_en_2.isna().any()].isna().mean().mul(100)

In [None]:
# Fill the listed numeric test columns with their own (test-set) medians.
numerical = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars',
    'GarageArea',
]
test_medians = {name: test_en_2[name].median() for name in numerical}
for name, median_value in test_medians.items():
    test_en_2[name] = test_en_2[name].fillna(median_value)
test_en_2.loc[:, test_en_2.isna().any()].isna().mean().mul(100)

In [None]:
# Apply the scaler fitted on the training features to the test rows.
scaled_values = mm_scaler.transform(test_en_2)
test_scaled = pd.DataFrame(data=scaled_values, columns=test_en_2.columns)
test_scaled.head(n=5)

In [None]:
# Keep only the features the final model was trained on.
test_selected = test_scaled.loc[:, x_features]
test_selected.head(n=5)

In [None]:
# Predict on the scaled test features and map the predictions back through
# the target scaler's inverse transform.
ypred_scale = pd.DataFrame(final_model.predict(test_selected))
ypred = target_scaler.inverse_transform(ypred_scale)
ypred

In [None]:
# Put the saved Id column next to the predicted SalePrice column.
pred_data = pd.DataFrame(data=ypred, columns=['SalePrice'])
target_pred = pd.concat(objs=[Id_pred, pred_data], axis=1)
target_pred.head(n=5)

In [None]:
# Display the full Id / SalePrice prediction table.
target_pred

In [None]:
# Write the predictions to disk without the index column.
target_pred.to_csv(path_or_buf="submission.csv", index=False)
print("Submission was successfully saved!")