In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploratory Data Analysis:

In [None]:
pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
#Here we learn that SalePrice is the target column

In [None]:
pd.read_fwf("../input/house-prices-advanced-regression-techniques/data_description.txt")


In [None]:
# Load the training split and summarise its columns (dtype and non-null count).
df= pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df.info()

In [None]:
# Load the test split and summarise its columns (dtype and non-null count).
test=pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
test.info()

In [None]:
df.describe() # we get all the statistical information of all the numerical data

SalePrice column is our target 

In [None]:
# Target vector: the SalePrice column of the training frame as a NumPy array.
y=df["SalePrice"].values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of the target on an explicit figure/axes pair.
fig = plt.figure(figsize=(12,10))
sns.set_style("darkgrid")
ax = fig.add_subplot()
sns.histplot(data=df, x="SalePrice", bins=50, cbar=True, ax=ax)
# Most sale prices appear to lie between 100,000 and 250,000.

<font color="blue">
The SalePrice distribution is skewed to the right. SalePrice is not normally distributed, so we need to adjust it.
</font>

In [None]:
# Report the sample skewness and kurtosis of the target column.
price_skew = df['SalePrice'].skew()
price_kurt = df['SalePrice'].kurt()
print("Skewness: %f" % price_skew)
print("Kurtosis: %f" % price_kurt)

In [None]:
# Replace the SalePrice column with log1p(SalePrice), i.e. log(1 + x), to reduce
# its right skew.
# NOTE(review): `y` was extracted from df["SalePrice"] in an earlier cell, before
# this reassignment, and is not refreshed here — confirm whether the models are
# meant to be trained on raw or on log-transformed prices.
df['SalePrice'] = np.log1p(df['SalePrice'])

In [None]:
# Histogram of the log-transformed target.
plt.figure(figsize=(12,10))
sns.set_style("darkgrid")
sns.histplot(data=df, x="SalePrice", bins=50,cbar=True,color="red")
# After the log transform the distribution looks much closer to normal.

In [None]:
# Box plot of the (log-transformed) target on an explicit axes.
fig, ax = plt.subplots(figsize=(12,10))
sns.boxplot(data=df, x="SalePrice", color="green", ax=ax)


In [None]:
df["SalePrice"].describe() 
# NOTE: SalePrice now holds log1p(price), so the statistics shown here are on the
# log scale. On the original scale the mean price is about 180,921 and the standard
# deviation about 79,442, which is very high.

In [None]:
# Correlation of every numeric feature with the target, from the highest positive
# value down to the negative ones. Numeric columns are selected explicitly because
# DataFrame.corr() raises on object columns in recent pandas versions instead of
# silently skipping them.
df.select_dtypes(include="number").corr()["SalePrice"].sort_values(ascending=False)
# Overall quality shows the highest positive correlation with the target.

In [None]:
# Heatmap of pairwise correlations between the numeric columns. Object columns are
# excluded up front because DataFrame.corr() raises on them in recent pandas versions.
plt.figure(figsize=(12,10))
sns.heatmap(df.select_dtypes(include="number").corr(),cmap="jet",annot=False,linewidths=1,robust=True)

In [None]:
# OverallQual is the feature with the highest positive correlation with the target.
fig, ax = plt.subplots(figsize=(12,10))
sns.regplot(x="OverallQual", y="SalePrice", data=df, ax=ax)

In [None]:
# KitchenAbvGr is the feature with the strongest negative correlation with the target.
fig, ax = plt.subplots(figsize=(12,10))
sns.regplot(x="KitchenAbvGr", y="SalePrice", data=df, color="red", ax=ax)

In [None]:
# Scatter plot of every numeric predictor against the target, three panels per row.
train_num = df.select_dtypes(include=['int64','float64'])
n_panel_cols = 3
# Ceiling division so that every numeric column gets a panel (a fixed 12x3 grid
# silently dropped any column beyond the 36th).
n_panel_rows = -(-len(train_num.columns) // n_panel_cols)
fig, axs = plt.subplots(n_panel_rows, n_panel_cols, figsize=(20,80))
# Adjust the vertical space between rows of panels.
fig.subplots_adjust(hspace=0.6)
panels = axs.flatten()
for i, ax in zip(train_num.columns, panels):
    sns.scatterplot(x=i, y='SalePrice', hue='SalePrice', data=train_num, ax=ax, palette='coolwarm')
    # Label the panel being drawn; plt.xlabel/plt.ylabel would only ever touch the
    # current (last created) axes.
    ax.set_xlabel(i, fontsize=12)
    ax.set_ylabel('SalePrice', fontsize=12)
    ax.set_title('SalePrice'+' - '+str(i), fontweight='bold', size=20)
# Hide any unused panels in the last row.
for ax in panels[len(train_num.columns):]:
    ax.set_visible(False)

In [None]:
# Box plots of SalePrice against every categorical predictor, one facet per column.
categorical = df.select_dtypes(exclude=['int64','float64'])
def facetgrid_boxplot(x, y, **kwargs):
    """Draw a box plot of ``y`` grouped by ``x`` on the current axes.

    Used as the callback for ``FacetGrid.map``. Extra keyword arguments passed by
    the grid are accepted and ignored. The x tick labels are rotated by 90 degrees.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: one row per (SalePrice, categorical column name, category value).
f = pd.melt(df, id_vars=['SalePrice'], value_vars=sorted(categorical.columns))
# `height` is the current name of the facet-size argument; the older `size`
# keyword is no longer accepted by recent seaborn versions.
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)
g = g.map(facetgrid_boxplot, "value", "SalePrice")

## 2. Prepare the Data Before Applying Machine Learning Algorithms

<font color="red">
2.1. Handling the Missing Values:

In [None]:
# Split the column names of each frame into object-typed and non-object-typed groups.
train_is_object = df.dtypes == "object"
test_is_object = test.dtypes == "object"
num_cols = df.columns[~train_is_object]
cat_cols = df.columns[train_is_object]
test_num_cols = test.columns[~test_is_object]
test_cat_cols = test.columns[test_is_object]

In [None]:
df[num_cols].isnull().sum().sort_values(ascending=False)
#We have only 3 numerical columns with missing values

In [None]:
test[test_num_cols].isnull().sum().sort_values(ascending=False)

In [None]:
df[num_cols].isnull().sum().sort_values(ascending=False)/len(df)
#Their percentage is not so high and we can fill the missing values

In [None]:
#Lets begin with the one with highest missing value among numerical columns
df["LotFrontage"].value_counts() #There 110 different types

In [None]:
df["LotFrontage"].describe()

In [None]:
# Fill missing LotFrontage values with the column mean of the respective frame.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which newer pandas versions warn about
# and which does not update the frame when Copy-on-Write is enabled.
df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].mean())
test["LotFrontage"] = test["LotFrontage"].fillna(test["LotFrontage"].mean())
df["LotFrontage"].isnull().sum()

In [None]:
test["LotFrontage"].isnull().sum()

In [None]:
#Let look at another numerical column with missing value:
df["GarageYrBlt"] #These are the years of the garage built: 

In [None]:
# Fill missing garage build years with the median of the respective frame.
# Assigned back rather than fillna(inplace=True) on a column selection, which is
# chained assignment and does not reliably update the frame in newer pandas.
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(df["GarageYrBlt"].median())
test["GarageYrBlt"] = test["GarageYrBlt"].fillna(test["GarageYrBlt"].median())
print(df["GarageYrBlt"].isnull().sum())
print(test["GarageYrBlt"].isnull().sum())

In [None]:
# Fill missing masonry veneer areas with the median of the respective frame.
# Assigned back rather than fillna(inplace=True) on a column selection, which is
# chained assignment and does not reliably update the frame in newer pandas.
df["MasVnrArea"] = df["MasVnrArea"].fillna(df["MasVnrArea"].median())
test["MasVnrArea"] = test["MasVnrArea"].fillna(test["MasVnrArea"].median())


In [None]:
df[num_cols].isnull().sum().sort_values(ascending=False)
#Now we do not have any missing value with the numerical columns

In [None]:
test[test_num_cols].isnull().sum().sort_values(ascending=False) # a few numeric test columns still have missing values, but not many

In [None]:
# Fill the remaining numeric gaps in the test set: medians for the two basement
# bathroom counts, means for the other columns. Results are assigned back rather
# than using fillna(inplace=True) on a column selection, which is chained assignment
# and does not reliably update the frame in newer pandas.
for col in ("BsmtHalfBath", "BsmtFullBath"):
    test[col] = test[col].fillna(test[col].median())
for col in ("BsmtFinSF1", "GarageCars", "GarageArea",
            "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF2"):
    test[col] = test[col].fillna(test[col].mean())
test[test_num_cols].isnull().sum().sort_values(ascending=False)
# All numeric columns of the test set have now been filled.



In [None]:
df[cat_cols].isnull().sum().sort_values(ascending=False)/len(df[cat_cols])
#We have 16 categorical columns with missing values

In [None]:
test[test_cat_cols].isnull().sum().sort_values(ascending=False)/len(test[test_cat_cols])

In [None]:
# Remove the columns identified as having more than 80% missing values.
sparse_cols = ["PoolQC","MiscFeature","Alley","Fence"]
df = df.drop(columns=sparse_cols)
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Remove the same mostly-empty columns from the test frame.
sparse_cols = ["PoolQC","MiscFeature","Alley","Fence"]
test = test.drop(columns=sparse_cols)
test.isnull().sum().sort_values(ascending=False)

In [None]:
#FireplaceQu  has % over %40 missing value, but it has a couple of categories that we can fill
df["FireplaceQu"].value_counts()

In [None]:
# Fill missing FireplaceQu values with "Gd". Assigned back rather than using
# fillna(inplace=True) on a column selection (chained assignment; not reliable in
# newer pandas).
df["FireplaceQu"] = df["FireplaceQu"].fillna("Gd")
df["FireplaceQu"].isnull().sum()

In [None]:
# Fill missing FireplaceQu values in the test set with "Gd". Assigned back rather
# than using fillna(inplace=True) on a column selection (chained assignment; not
# reliable in newer pandas).
test["FireplaceQu"] = test["FireplaceQu"].fillna("Gd")
test["FireplaceQu"].isnull().sum()

In [None]:
df["GarageType"].value_counts()

In [None]:
# Fill missing GarageType values with the most common value. Assigned back rather
# than fillna(inplace=True) on a column selection (chained assignment).
df["GarageType"] = df["GarageType"].fillna("Attchd")


In [None]:
# Fill missing GarageType values in the test set with "Attchd". Assigned back
# rather than fillna(inplace=True) on a column selection (chained assignment).
test["GarageType"] = test["GarageType"].fillna("Attchd")

In [None]:
# Fill missing GarageFinish values with "Unf" in both frames. Assigned back rather
# than fillna(inplace=True) on a column selection (chained assignment).
df["GarageFinish"] = df["GarageFinish"].fillna("Unf")
test["GarageFinish"] = test["GarageFinish"].fillna("Unf")

In [None]:
# Fill missing GarageCond values with "TA" in both frames. Assigned back rather
# than fillna(inplace=True) on a column selection (chained assignment).
df["GarageCond"] = df["GarageCond"].fillna("TA")
test["GarageCond"] = test["GarageCond"].fillna("TA")

In [None]:
# Fill missing GarageQual values with "TA" in both frames. Assigned back rather
# than fillna(inplace=True) on a column selection (chained assignment).
df["GarageQual"] = df["GarageQual"].fillna("TA")
test["GarageQual"] = test["GarageQual"].fillna("TA")


In [None]:
df.isnull().sum().sort_values(ascending=False) # The remaining columns have only a few missing values; the following cells fill them with fixed values

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["BsmtFinType2"] = df["BsmtFinType2"].fillna("Unf")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["BsmtExposure"] = df["BsmtExposure"].fillna("No")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["BsmtFinType1"] = df["BsmtFinType1"].fillna("Unf")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["BsmtQual"] = df["BsmtQual"].fillna("TA")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["BsmtCond"] = df["BsmtCond"].fillna("TA")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["MasVnrType"] = df["MasVnrType"].fillna("None")

In [None]:
# Assigned back rather than fillna(inplace=True) on a column selection (chained assignment).
df["Electrical"] = df["Electrical"].fillna("SBrkr")

In [None]:
df.isnull().sum().sort_values(ascending=False) # Now we do not have any missing values in the categorical columns of the training set

In [None]:
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Fill the basement-related categorical gaps in the test set with fixed values.
# Results are assigned back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and not reliable in newer pandas.
basement_fills = {
    "BsmtCond": "TA",
    "BsmtQual": "TA",
    "BsmtExposure": "No",
    "BsmtFinType2": "Unf",
    "BsmtFinType1": "GLQ",
}
for col, fill_value in basement_fills.items():
    test[col] = test[col].fillna(fill_value)
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Fill further categorical gaps in the test set with fixed values.
# "Functional" is filled with "Typ" (typical functionality): "No" is not one of the
# levels of that feature, so it would have introduced a spurious category.
# Results are assigned back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and not reliable in newer pandas.
misc_fills = {
    "MasVnrType": "None",
    "MSZoning": "RL",
    "Functional": "Typ",
    "Utilities": "AllPub",
    "Exterior2nd": "VinylSd",
}
for col, fill_value in misc_fills.items():
    test[col] = test[col].fillna(fill_value)
test.isnull().sum().sort_values(ascending=False)
# Three columns with a single missing value each still remain.

In [None]:
# Fill the last three categorical gaps in the test set. Results are assigned back
# rather than using fillna(inplace=True) on a column selection, which is chained
# assignment and not reliable in newer pandas.
test["KitchenQual"] = test["KitchenQual"].fillna("TA")
test["Exterior1st"] = test["Exterior1st"].fillna("VinylSd")
test["SaleType"] = test["SaleType"].fillna("WD")
test.isnull().sum().sort_values(ascending=False)
# There are no missing values left in the test data now.

<font color= "red">
2.2. Fixing Skewed Numerical Columns:

In [None]:
# Horizontal box plots of all numeric training columns on a log-scaled x axis.
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=df[num_cols], orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(
    ylabel="Feature names",
    xlabel="Numeric values",
    title="Numeric Distribution of Features",
)
sns.despine(trim=True, left=True)

In [None]:
# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


skew(a, axis=0, bias=True, nan_policy='propagate')
    Compute the sample skewness of a data set.
    
    For normally distributed data, the skewness should be about zero. For
    unimodal continuous distributions, a skewness value greater than zero means
    that there is more weight in the right tail of the distribution. The
    function `skewtest` can be used to determine if the skewness value
    is close enough to zero, statistically speaking.

In [None]:
# Finding skewed numerical columns:
# num_cols was built from every non-object column of df, so at this point it still
# includes "Id" and the already log-transformed "SalePrice".
skew_columns = df[num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)

# Keep only the columns whose sample skewness exceeds 0.5.
high_skew = skew_columns[skew_columns > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
# NOTE(review): `skewness` is not read again in any of the visible cells.
skewness = pd.DataFrame({'Skew' :high_skew})
skew_columns

boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr')
    Compute optimal Box-Cox transform parameter for input data.
    
    Parameters
    ----------
    x : array_like
        Input array.
    brack : 2-tuple, optional
        The starting interval for a downhill bracket search with
        `optimize.brent`.  Note that this is in most cases not critical; the
        final result is allowed to be outside this bracket.
    method : str, optional
        The method to determine the optimal transform parameter (`boxcox`
        ``lmbda`` parameter). Options are:
    
        'pearsonr'  (default)
            Maximizes the Pearson correlation coefficient between
            ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
            normally-distributed.
    
        'mle'
            Minimizes the log-likelihood `boxcox_llf`.  This is the method used
            in `boxcox`.
    
        'all'
            Use all optimization methods available, and return all results.
            Useful to compare different methods.

In [None]:
# Normalize the highly skewed training features with a Box-Cox transform of (1 + x).
# For each column, boxcox_normmax estimates the transform parameter from the shifted
# values and boxcox1p applies the transform with that parameter.
# The columns are overwritten in place, so re-running this cell transforms the
# already-transformed values a second time.
for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

In [None]:
# Re-plot the transformed columns to check that the skew has been reduced.
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=df[skew_index], orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(
    ylabel="Feature names",
    xlabel="Numeric values",
    title="Numeric Distribution of Features",
)
sns.despine(trim=True, left=True)
# The transformed numeric features now look much closer to normally distributed.

In [None]:
# Same check for the test data: horizontal box plots of all numeric test columns
# on a log-scaled x axis.
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=test[test_num_cols], orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(
    ylabel="Feature names",
    xlabel="Numeric values",
    title="Numeric Distribution of Features",
)
sns.despine(trim=True, left=True)

In [None]:
# Finding skewed numerical columns in the test set:
# NOTE(review): this overwrites skew_columns, high_skew and skew_index, which were
# previously computed from the training frame. The set of columns selected here
# comes from the test data alone and may differ from the set transformed in df.
skew_columns = test[test_num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)

# Keep only the columns whose sample skewness exceeds 0.5.
high_skew = skew_columns[skew_columns > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_columns

In [None]:
# Normalize the skewed test features with a Box-Cox transform of (1 + x).
# NOTE(review): the transform parameter is re-estimated from the test column here,
# while the training columns were transformed with parameters estimated from df.
# The same feature can therefore end up on a different scale in train and test —
# consider reusing the training parameters and column list instead.
for i in skew_index:
    test[i] = boxcox1p(test[i], boxcox_normmax(test[i] + 1))
# Re-plot the transformed test columns to check that the skew has been reduced.
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=test[skew_index] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
# The transformed numeric features now look much closer to normally distributed.

In [None]:
df["SalePrice"]

<font color= "red">
2.3. Handling the Text and Categorical Columns:

In [None]:

cat_cols=df.columns[df.dtypes== "object"]
cat_cols

In [None]:
test_cat_cols = test.columns[test.dtypes=="object"]
test_cat_cols

In [None]:
df[cat_cols].head()

In [None]:
# One-hot encode the selected categorical columns of the training frame, dropping
# the first level of each encoded column.
train_dummy_cols = [
    "MSZoning", "Street", "LotShape", "LandContour", "LandSlope",
    "LotConfig", "Neighborhood", "Condition1", "BldgType",
    "RoofStyle", "MasVnrType", "ExterQual",
    "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "HeatingQC", "CentralAir",
    "KitchenQual", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageCond", "PavedDrive", "SaleType", "SaleCondition",
]
df = pd.get_dummies(df, columns=train_dummy_cols, drop_first=True)
df.head()

In [None]:
# Drop the 11 categorical columns that are not one-hot encoded.
train_unencoded_cols = [
    "Exterior2nd", "Condition2", "BsmtFinType2", "Utilities", "HouseStyle",
    "RoofMatl", "Exterior1st", "Heating", "Electrical", "Functional",
    "GarageQual",
]
df = df.drop(columns=train_unencoded_cols)
df.head()

In [None]:
df.info() # Now all the columns are numerical

In [None]:
#Lets do the same operation for test data:
test[test_cat_cols].head()

Columns that are dropped from the test data below instead of being one-hot encoded include: Utilities, HouseStyle, RoofMatl, Exterior1st, Heating, Electrical, Functional, GarageQual.

In [None]:
# One-hot encode the same categorical columns in the test frame, dropping the
# first level of each encoded column.
test_dummy_cols = [
    "MSZoning", "Street", "LotShape", "LandContour", "LandSlope",
    "LotConfig", "Neighborhood", "Condition1", "BldgType",
    "RoofStyle", "MasVnrType", "ExterQual",
    "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "HeatingQC", "CentralAir",
    "KitchenQual", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageCond", "PavedDrive", "SaleType", "SaleCondition",
]
test = pd.get_dummies(test, columns=test_dummy_cols, drop_first=True)
test.head()

In [None]:
# Drop the 11 categorical columns that are not one-hot encoded.
test_unencoded_cols = [
    "Exterior2nd", "Condition2", "BsmtFinType2", "Utilities",
    "HouseStyle", "RoofMatl", "Exterior1st", "Heating",
    "Electrical", "Functional", "GarageQual",
]
test = test.drop(columns=test_unencoded_cols)
test.head()

In [None]:
test.info() # Now all the columns are numerical for test data and both train and test data is ready for ML algorithms

In [None]:
# The Id column is only a row identifier, so remove it from the training frame.
df = df.drop(columns="Id")
df.head()


In [None]:
test_id = test["Id"]
test_id

In [None]:

# Remove the row identifier from the test frame.
test = test.drop(columns="Id")
test.head()

## 3. Select and Train a Model:

In [None]:
# Feature matrix: every remaining column except the target, as a NumPy array.
X = df.drop(columns="SalePrice").values
X.shape

In [None]:

y.shape

In [None]:
test= test.values
test.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the training rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
predictions1 = model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# MAE, MSE and RMSE of the linear model on the held-out split.
linear_mae = mean_absolute_error(y_test, predictions1)
linear_mse = mean_squared_error(y_test, predictions1)
print(linear_mae)
print(linear_mse)
print(np.sqrt(linear_mse))

In [None]:
import seaborn as sns
# Predicted vs. actual values for the linear model on the held-out split.
sns.set_style("darkgrid")
plt.figure(figsize=(12,10))
# x and y are passed by keyword: recent seaborn versions no longer accept them
# as positional arguments.
sns.regplot(x=predictions1, y=y_test)

<font color="green">
Lets use another model:

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a decision tree with default settings and report MAE, MSE and RMSE on the
# held-out split.
dtree = DecisionTreeRegressor()
dtree.fit(X_train, y_train)
predictions2 = dtree.predict(X_test)
dtree_mae = mean_absolute_error(y_test, predictions2)
dtree_mse = mean_squared_error(y_test, predictions2)
print(dtree_mae)
print(dtree_mse)
print(np.sqrt(dtree_mse))

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. With no `scoring` argument the
# estimator's own score method is used, which for a regressor is R^2 — not an
# accuracy — so the output is labelled accordingly.
accuracies = cross_val_score(estimator = dtree, X = X_train, y = y_train, cv = 10)
print("Mean R^2: {:.4f}".format(accuracies.mean()))
print("Standard Deviation of R^2: {:.4f}".format(accuracies.std()))

<font color="green">
Lets use another model:

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest with default settings and report MAE, MSE and RMSE on the
# held-out split.
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
predictions_rforest = random_forest.predict(X_test)
rforest_mae = mean_absolute_error(y_test, predictions_rforest)
rforest_mse = mean_squared_error(y_test, predictions_rforest)
print(rforest_mae)
print(rforest_mse)
print(np.sqrt(rforest_mse))

In [None]:
# 10-fold cross-validation of the random forest. With no `scoring` argument the
# estimator's own score method is used, which for a regressor is R^2 — not an
# accuracy — so the output is labelled accordingly.
accuracies = cross_val_score(estimator = random_forest, X = X_train, y = y_train, cv = 10)
print("Mean R^2: {:.4f}".format(accuracies.mean()))
print("Standard Deviation of R^2: {:.4f}".format(accuracies.std()))
# The random forest has the best cross-validated score of the models tried so far.

<font color="green">
Let's tune the Random Forest hyperparameters with a grid search:
</font>

In [None]:
from sklearn.model_selection import GridSearchCV
# Two parameter grids: one with the default bootstrapping and larger forests, one
# without bootstrapping and small forests.
param_grid = [
    {'n_estimators': [100, 120, 150], 'max_features': [12, 14, 16]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train, y_train)
# With scoring='neg_mean_squared_error', best_score_ is the negated mean squared
# error of the best candidate — an error measure, not an accuracy or a percentage.
# The variable name is kept for compatibility with any later use.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best CV RMSE: {:.2f}".format(np.sqrt(-best_accuracy)))
print("Best Parameters:", best_parameters)

In [None]:
# Evaluate the best grid-search estimator on the held-out split.
grid_predictions = grid_search.predict(X_test)
grid_mae = mean_absolute_error(y_test, grid_predictions)
grid_mse = mean_squared_error(y_test, grid_predictions)
print(grid_mae)
print(grid_mse)
print(np.sqrt(grid_mse))  # The result is very close to the untuned Random Forest.

<font color="green">
Let's try other ensemble and boosting models to get better performance:
</font>

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.linear_model import LassoCV,RidgeCV,ElasticNetCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV,KFold,RandomizedSearchCV,StratifiedKFold,cross_val_score

In [None]:
kfold= KFold(n_splits=11,random_state=42,shuffle=True) #kfold cross validation

Light Gradient Boosting Regressor:

Gradient boosting refers to a class of ensemble machine learning algorithms that can be used for classification or regression predictive modeling problems.

Ensembles are constructed from decision tree models. Trees are added one at a time to the ensemble and fit to correct the prediction errors made by prior models. This is a type of ensemble machine learning model referred to as boosting.

In [None]:
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

In [None]:
# Fit the LightGBM regressor and report MAE, MSE and RMSE on the held-out split.
lightgbm.fit(X_train, y_train)
lightgbm_predictions = lightgbm.predict(X_test)
lightgbm_mae = mean_absolute_error(y_test, lightgbm_predictions)
lightgbm_mse = mean_squared_error(y_test, lightgbm_predictions)
print(lightgbm_mae)
print(lightgbm_mse)
print(np.sqrt(lightgbm_mse))
# This is better than both the Random Forest and the grid-search results.

You could use RobustScaler if you have outliers and want to reduce their influence. However, you might be better off removing the outliers, instead. Use StandardScaler if you need a relatively normal distribution.
So we will use RobustScaler with Support Vector Regressor

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.00015))

In [None]:
# Fit the scaled SVR pipeline and report MAE, MSE and RMSE on the held-out split.
svr.fit(X_train, y_train)
svr_predictions = svr.predict(X_test)
svr_mae = mean_absolute_error(y_test, svr_predictions)
svr_mse = mean_squared_error(y_test, svr_predictions)
print(svr_mae)
print(svr_mse)
print(np.sqrt(svr_mse))

<font color="green">
Lets try XGBoost:

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor with a small learning rate and many boosting rounds.
# NOTE(review): both `seed` (27) and `random_state` (42) are passed with different
# values; which of the two takes effect depends on the installed xgboost version —
# confirm and keep only one.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

In [None]:
# Fit the XGBoost regressor, then predict on the held-out split.
xgboost.fit(X_train, y_train)
xgboost_predictions = xgboost.predict(X_test)

In [None]:
# MAE, MSE and RMSE of the XGBoost predictions on the held-out split.
xgboost_mae = mean_absolute_error(y_test, xgboost_predictions)
xgboost_mse = mean_squared_error(y_test, xgboost_predictions)
print(xgboost_mae)
print(xgboost_mse)
print(np.sqrt(xgboost_mse))

<font color="green">
Lets try GradientBoostingRegressor:

In [None]:
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42) 

In [None]:
# Fit the gradient boosting regressor and report MAE, MSE and RMSE on the
# held-out split.
gbr.fit(X_train, y_train)
gbr_predictions = gbr.predict(X_test)
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
gbr_mse = mean_squared_error(y_test, gbr_predictions)
print(gbr_mae)
print(gbr_mse)
print(np.sqrt(gbr_mse))

<font color="green">
Lets try ElasticNetCV:

In [None]:
# Candidate regularisation strengths and L1/L2 mixing ratios for ElasticNetCV.
alpha_elnet = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
l1ratio_elnet = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
# Robust-scale the features, then let ElasticNetCV choose among the candidates
# using the shared k-fold splitter.
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=1000000, alphas=alpha_elnet,
                 cv=kfold, l1_ratio=l1ratio_elnet),
)

In [None]:
# Fit the ElasticNet pipeline and report MAE, MSE and RMSE on the held-out split.
elasticnet.fit(X_train, y_train)
elastic_predictions = elasticnet.predict(X_test)
elastic_mae = mean_absolute_error(y_test, elastic_predictions)
elastic_mse = mean_squared_error(y_test, elastic_predictions)
print(elastic_mae)
print(elastic_mse)
print(np.sqrt(elastic_mse))

<font color="green">
Lets try LassoCV:

In [None]:
alphas_lasso = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008] #Best value of alpha parmaters for lasso
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=alphas_lasso, cv=kfold))

In [None]:
# Fit the Lasso pipeline and report MAE, MSE and RMSE on the held-out split.
lasso.fit(X_train, y_train)
lasso_predictions = lasso.predict(X_test)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)
print(lasso_mae)
print(lasso_mse)
print(np.sqrt(lasso_mse))

<font color="green">
Lets try StackingCVRegressor:

In [None]:
# Stack the better-performing models, with XGBoost as the meta-regressor that also
# sees the original features.
base_regressors = (xgboost, lightgbm, random_forest, gbr, svr, lasso, elasticnet)
stack_reg = StackingCVRegressor(regressors=base_regressors,
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)
stack_reg.fit(X_train, y_train)
stack_predictions = stack_reg.predict(X_test)
stack_mae = mean_absolute_error(y_test, stack_predictions)
stack_mse = mean_squared_error(y_test, stack_predictions)
print(stack_mae)
print(stack_mse)
print(np.sqrt(stack_mse))

<font color="green">
Lets try a blending version of best models we have used above:

In [None]:
# Blend the fitted models with fixed weights (summing to 1) to make the final
# predictions more robust to overfitting. Terms are added in the listed order.
blend_terms = [
    0.025 * elasticnet.predict(X_test),
    0.025 * lasso.predict(X_test),
    0.025 * random_forest.predict(X_test),
    0.025 * svr.predict(X_test),
    0.62 * gbr.predict(X_test),
    0.03 * xgboost.predict(X_test),
    0.03 * lightgbm.predict(X_test),
    0.22 * stack_reg.predict(np.array(X_test)),
]
blend_predictions = blend_terms[0]
for blend_term in blend_terms[1:]:
    blend_predictions = blend_predictions + blend_term

In [None]:
# MAE, MSE and RMSE of the blended predictions on the held-out split.
blend_mae = mean_absolute_error(y_test, blend_predictions)
blend_mse = mean_squared_error(y_test, blend_predictions)
print(blend_mae)
print(blend_mse)
print(np.sqrt(blend_mse))
# As we can see, the blended version outperforms every single model.

Lets try ANNs:

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [None]:

rs=RobustScaler()

In [None]:
X_deep_train=rs.fit_transform(X_train)
X_deep_train

In [None]:
X_deep_test =rs.transform(X_test)
X_deep_test

In [None]:

model = Sequential()
model.add(Dense(200, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(50, kernel_initializer='normal', activation='relu'))
model.add(Dense(25, kernel_initializer='normal', activation='relu'))
model.add(Dense(1))
# Compile model
model.compile(loss='mean_squared_error', optimizer="adam")



model.fit(np.array(X_train), np.array(y_train), epochs=100, batch_size=10)


In [None]:
deep_predictions=model.predict(X_test)

In [None]:
# MAE, MSE and RMSE of the network predictions on the held-out split.
deep_mae = mean_absolute_error(y_test, deep_predictions)
deep_mse = mean_squared_error(y_test, deep_predictions)
print(deep_mae)
print(deep_mse)
print(np.sqrt(deep_mse))

In [None]:
submission_sample = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
submission_sample.shape

In [None]:
pd.DataFrame(X_train)

In [None]:
def latest_predictions(features):
    """Return the weighted blend of all fitted models' predictions for ``features``.

    The weights are fixed (0.025 each for elasticnet, lasso, random_forest and svr;
    0.62 for gbr; 0.03 each for xgboost and lightgbm; 0.22 for stack_reg) and the
    terms are added in that order.
    """
    weighted_terms = [
        0.025 * elasticnet.predict(features),
        0.025 * lasso.predict(features),
        0.025 * random_forest.predict(features),
        0.025 * svr.predict(features),
        0.62 * gbr.predict(features),
        0.03 * xgboost.predict(features),
        0.03 * lightgbm.predict(features),
        0.22 * stack_reg.predict(np.array(features)),
    ]
    blended = weighted_terms[0]
    for term in weighted_terms[1:]:
        blended = blended + term
    return blended

In [None]:
submission_sample.iloc[:,1] = latest_predictions(test)
submission_sample

In [None]:
# Shrink predictions at or below the 0.5% quantile by a factor of 0.77 and inflate
# those at or above the 99.5% quantile by a factor of 1.1. Both quantiles are taken
# before any adjustment is applied.
q1 = submission_sample['SalePrice'].quantile(0.005)
q2 = submission_sample['SalePrice'].quantile(0.995)
low_adjusted = submission_sample['SalePrice'].where(
    submission_sample['SalePrice'] > q1, submission_sample['SalePrice'] * 0.77)
submission_sample['SalePrice'] = low_adjusted
submission_sample['SalePrice'] = submission_sample['SalePrice'].where(
    submission_sample['SalePrice'] < q2, submission_sample['SalePrice'] * 1.1)


In [None]:
submission_sample.to_csv("new_submission_regression4.csv", index=False)