# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
import warnings
# Notebook-wide settings: silence library warnings and let pandas show every
# column when a wide frame is displayed.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
import os

# Print the full path of each file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Data Preprocessing

In [None]:
# Read the training split of the House Prices competition and preview the
# first ten rows.
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
data = pd.read_csv(train_csv)
data.head(10)

In [None]:
# Percentage of missing values per column, shown only for columns that
# contain at least one NaN.
na_share = data.isna().mean() * 100
na_share[data.isna().sum() > 0]

In [None]:
# Drop the row identifier and five feature columns in place
# (presumably the ones that are mostly missing -- confirm against the table above),
# then list the remaining columns with NaNs and their missing percentage.
cols_to_drop = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id']
data.drop(columns=cols_to_drop, inplace=True)
cols_with_na = data.columns[data.isna().sum() > 0]
data[cols_with_na].isna().mean() * 100

In [None]:
# Separate the regression target (SalePrice) from the predictor columns.
y = data['SalePrice']
X = data.drop(columns='SalePrice')

## KNN Imputation

In [None]:
# One-hot encode the categorical columns (dropping the first level of each),
# then fill the remaining NaNs with a default-configured KNNImputer.
# `imputer` stays fitted on the training frame so it can be reused later.
x_en = pd.get_dummies(X, drop_first=True)
imputer = KNNImputer()
x = pd.DataFrame(imputer.fit_transform(x_en), columns=x_en.columns)

In [None]:
# Sanity check after imputation: list any column that still holds NaNs,
# together with its missing percentage.
still_missing = x.columns[x.isna().any()]
x[still_missing].isna().mean() * 100

## Outlier Detection

In [None]:
# Flag outliers with a default LocalOutlierFactor. `fit_predict` labels
# outliers as -1, so `mask` is True for the rows to keep.
lof = LocalOutlierFactor()
feature_matrix = x.to_numpy()
yhat = lof.fit_predict(feature_matrix)
mask = yhat != -1

In [None]:
# Keep only the rows LOF considered inliers. Features are rebuilt as a
# DataFrame with a fresh index; the target keeps its original index labels.
x_train = pd.DataFrame(x.to_numpy()[mask], columns=x.columns)
y_train = y[mask]
print(x_train.shape, y_train.shape)

## Data Normalization

In [None]:
# Min-max scale every feature; `mm_scaler` stays fitted on the training
# features so the same transformation can be applied to the test set.
mm_scaler = MinMaxScaler()
scaled_values = mm_scaler.fit_transform(x_train)
x_scaled = pd.DataFrame(scaled_values, columns=x_train.columns)
x_scaled.head()

In [None]:
# Min-max scale the target with its own scaler, so predictions can be mapped
# back to prices via `target_scaler.inverse_transform`.
# The scaler needs 2-D input, hence the single-column frame.
target_scaler = MinMaxScaler()
y_data = y_train.to_frame()
y_scaled = target_scaler.fit_transform(y_data)
y_scaled

# Feature Selection

## RFE

In [None]:
# Recursive feature elimination: rank the scaled features with a decision
# tree and keep the 16 strongest.
# RFE fits a clone of the estimator it is given, so the tree does not need
# to be fitted beforehand.
dtr = DecisionTreeRegressor()
# `n_features_to_select` is passed by keyword: scikit-learn >= 1.0 no longer
# accepts it positionally, and older releases accept the keyword as well.
rfe = RFE(dtr, n_features_to_select=16)
rfe = rfe.fit(x_scaled, y_scaled)

In [None]:
# Names of the features RFE decided to keep.
x_scaled.columns[rfe.get_support()]

In [None]:
# Feature subset used for all models, written out explicitly rather than
# read from `rfe.support_`.
selected_list = [
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'MoSold',
    'CentralAir_Y',
]
x_selected = x_scaled.loc[:, selected_list]
x_selected.head()

In [None]:
# Remember the selected column names; they are reused to pick the same
# features from the test set.
x_features=x_selected.columns
x_features

# Model Evaluation

## Linear Regression

In [None]:
# Evaluation protocol shared by all model cells: shuffled 10-fold CV with a
# fixed seed, scored by negative mean squared error (closer to 0 is better).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=10, shuffle=True, random_state=76)

# Baseline: ordinary least squares.
model_LR = LinearRegression()
results_LR = cross_val_score(
    estimator=model_LR, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_LR.mean())

## Ridge Regression

In [None]:
# Ridge regression with default settings, scored with the shared CV setup.
model_Ridge = Ridge()
results_Ridge = cross_val_score(
    estimator=model_Ridge, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_Ridge.mean())

## Lasso Regression

In [None]:
# Lasso regression with default settings, scored with the shared CV setup.
model_Lasso = Lasso()
results_Lasso = cross_val_score(
    estimator=model_Lasso, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_Lasso.mean())

## ElasticNet Regression

In [None]:
# Elastic net with default settings, scored with the shared CV setup.
model_EN = ElasticNet()
results_EN = cross_val_score(
    estimator=model_EN, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_EN.mean())

## KNN

In [None]:
# Grid-search the neighbourhood size of a KNN regressor over 11..28.
params_KNN = {'n_neighbors': list(range(11, 29))}
model_KNN = KNeighborsRegressor()
grid_KNN = GridSearchCV(
    estimator=model_KNN,
    param_grid=params_KNN,
    scoring=scoring,
    cv=kfold,
)
grid_KNN.fit(x_selected, y_scaled)
print("Best Score: %f use parameters: %s" % (grid_KNN.best_score_, grid_KNN.best_params_))

## Decision Tree Regressor (CART)

In [None]:
# Grid-search the depth of a single decision tree.
# Candidate depths are 2..16 plus 18 (17 is not part of the grid).
params_cart = {'max_depth': [*range(2, 17), 18]}
model_cart = DecisionTreeRegressor()
grid_cart = GridSearchCV(
    estimator=model_cart,
    param_grid=params_cart,
    scoring=scoring,
    cv=kfold,
)
grid_cart.fit(x_selected, y_scaled)
print("Best Score: %f use parameters: %s" % (grid_cart.best_score_, grid_cart.best_params_))

## SVM

In [None]:
# RBF-kernel support vector regressor with fixed hyper-parameters.
model_svm = SVR(kernel='rbf', C=13, gamma=0.4)
results_svm = cross_val_score(
    estimator=model_svm, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_svm.mean())

## Bagging Decision Tree Regressor

In [None]:
# Bagging ensemble of ten depth-9 decision trees, seeded for reproducibility.
model_cart_bagging = DecisionTreeRegressor(max_depth=9)
# The base learner is passed positionally: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old name was
# removed in 1.4), while the first positional slot works on every version.
model_bagging = BaggingRegressor(model_cart_bagging, n_estimators=10, random_state=76)
results_bagging = cross_val_score(model_bagging, x_selected, y_scaled, cv=kfold, scoring=scoring)
print(results_bagging.mean())

## Random Forest

In [None]:
# Grid-search the tree depth of a random forest, using all CPU cores.
# Candidate depths are 7..16 plus 18 (17 is not part of the grid).
params_rt = {'max_depth': [*range(7, 17), 18]}
model_randomforest = RandomForestRegressor()
grid_rt = GridSearchCV(
    estimator=model_randomforest,
    param_grid=params_rt,
    scoring=scoring,
    cv=kfold,
    n_jobs=-1,
)
grid_rt.fit(x_selected, y_scaled)
print("Best Score: %f use parameters: %s" % (grid_rt.best_score_, grid_rt.best_params_))

## Extra Trees

In [None]:
# Extremely randomized trees with a depth cap of 60.
model_extratrees = ExtraTreesRegressor(max_depth=60)
results_extratrees = cross_val_score(
    estimator=model_extratrees, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_extratrees.mean())

## AdaBoost

In [None]:
# AdaBoost over 40 depth-9 decision trees.
# The base learner is passed positionally: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old name was
# removed in 1.4), while the first positional slot works on every version.
model_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=9), n_estimators=40)
results_ada = cross_val_score(model_ada, x_selected, y_scaled, cv=kfold, scoring=scoring)
print(results_ada.mean())

## Gradient Boosting

In [None]:
# Gradient boosting with hand-picked hyper-parameters.
model_gradientBoosting = GradientBoostingRegressor(
    learning_rate=0.14997,
    max_depth=4,
    n_estimators=49,
)
results_gradientBoosting = cross_val_score(
    estimator=model_gradientBoosting, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_gradientBoosting.mean())

In [None]:
# Fine-grained grid search around the gradient-boosting settings above.
# Note: this rebinds `model_gradientBoosting` to a default-configured estimator.
model_gradientBoosting = GradientBoostingRegressor()
params_gb = {
    'n_estimators': range(45, 57),
    'learning_rate': [0.149968, 0.14997, 0.149967, 0.149966],
    'max_depth': range(3, 6),
}
grid_gb = GridSearchCV(
    estimator=model_gradientBoosting,
    param_grid=params_gb,
    scoring=scoring,
    cv=kfold,
    n_jobs=-1,
)
grid_gb.fit(x_selected, y_scaled)
print("Best Score: %f use parameters: %s" % (grid_gb.best_score_, grid_gb.best_params_))

## XGBoost

In [None]:
# XGBoost regressor with hand-picked hyper-parameters.
model_xgb = XGBRegressor(
    learning_rate=0.200879,
    max_depth=5,
    n_estimators=31,
)
results_xgb = cross_val_score(
    estimator=model_xgb, X=x_selected, y=y_scaled, scoring=scoring, cv=kfold
)
print(results_xgb.mean())

## Model Comparison

In [None]:
# Pair each model label with its cross-validated score. Scores are negative
# MSE values, so the absolute value is taken to get a positive error.
cv_scores = [
    ('Linear Regression', results_LR.mean()),
    ('Ridge', results_Ridge.mean()),
    ('Lasso', results_Lasso.mean()),
    ('Elastic Net', results_EN.mean()),
    ('KNN', grid_KNN.best_score_),
    ('CART', grid_cart.best_score_),
    ('SVM', results_svm.mean()),
    ('Bagging CART', results_bagging.mean()),
    ('Random Forest', grid_rt.best_score_),
    ('Extra Trees', results_extratrees.mean()),
    ('Ada Boost', results_ada.mean()),
    ('Gradient Boosting', results_gradientBoosting.mean()),
    ('XGBoost', results_xgb.mean()),
]
model_name = [label for label, _score in cv_scores]
error_model = [abs(score) for _label, score in cv_scores]

In [None]:
# Bar chart of the cross-validated error of every model.
fig = plt.figure(figsize=(20, 8), facecolor='gainsboro')
fig.suptitle('Errors of the Algorithms')
ax = fig.add_subplot(1, 1, 1)
ax.bar(x=model_name, height=error_model, color='lightgrey', edgecolor='cyan')
plt.show()

# Final Model with Gradient Boosting

In [None]:
# Train the final gradient-boosting model on all selected training features.
final_gradient = GradientBoostingRegressor(
    n_estimators=49,
    max_depth=4,
    learning_rate=0.14997,
)
final_gradient.fit(x_selected, y_scaled)

In [None]:
# Read the competition's test split and preview it.
test_csv = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test = pd.read_csv(test_csv)
test.head()

In [None]:
# Keep the Ids for the submission file, then apply the same column drops and
# one-hot encoding that were used on the training data.
Id_pred = test['Id']
test_cols_to_drop = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id']
test.drop(columns=test_cols_to_drop, inplace=True)
test_en = pd.get_dummies(test, drop_first=True)
test_en.shape

In [None]:
# Stack the encoded test rows under the encoded training rows so the test
# frame picks up every dummy column seen in training (absent ones become NaN).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent
# call and works on older pandas versions too.
result_test = pd.concat([x_en, test_en], sort=False)
result_test.shape

In [None]:
# Recover the test rows from the stacked frame. The split point comes from
# the training frame's length instead of hard-coded row numbers, and only the
# training columns are kept (in training order) so the matrix matches what
# `imputer` and `mm_scaler` were fitted on.
test_en_2 = result_test.iloc[len(x_en):][x_en.columns]
test_en_2.shape

In [None]:
# Percentage of missing values per test column, shown only for columns that
# contain at least one NaN.
test_na_share = test_en_2.isna().mean() * 100
test_na_share[test_en_2.isna().sum() > 0]

In [None]:
# Fill the test set's NaNs with the imputer that was fitted on the training data.
test_en_3 = imputer.transform(X=test_en_2)

In [None]:
# Wrap the imputed array back into a DataFrame and confirm which columns,
# if any, still contain NaNs.
test_en_3 = pd.DataFrame(test_en_3, columns=test_en_2.columns)
test_still_missing = test_en_3.columns[test_en_3.isna().any()]
test_en_3[test_still_missing].isna().mean() * 100

In [None]:
# Scale the test features with the scaler fitted on the training features.
test_scaled_values = mm_scaler.transform(test_en_3)
test_scaled = pd.DataFrame(test_scaled_values, columns=test_en_3.columns)
test_scaled.head()

In [None]:
# Keep the same feature subset the final model was trained on.
test_selected = test_scaled.loc[:, x_features]
test_selected.head()

In [None]:
# Predict in the scaled target space, then map back to sale prices.
# The predictions are wrapped in a one-column frame because the scaler
# expects 2-D input.
ypred_scale = pd.DataFrame(final_gradient.predict(test_selected))
ypred = target_scaler.inverse_transform(ypred_scale)
ypred

In [None]:
# Assemble the submission table: Id next to the predicted SalePrice.
pred_data = pd.DataFrame(data=ypred, columns=['SalePrice'])
target_pred = pd.concat([Id_pred, pred_data], axis='columns')
target_pred.head()

In [None]:
# Display the submission table (Id and predicted SalePrice).
target_pred

In [None]:
# Write the submission file without the DataFrame index column.
target_pred.to_csv(path_or_buf="submission.csv", index=False)
print("Submission was successfully saved!")