In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing other libraries that are required for our study

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('bmh') #Setting matplot style option to 'Bayesian Methods for Hackers style'

#setting max number of columns to display == 100 in pandas options.
pd.options.display.max_columns = 100

Importing data:

In [None]:
# Load the competition's train and test splits (in that order) from the Kaggle input directory.
train_df, test_df = map(
    pd.read_csv,
    [
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    ],
)

In [None]:
# Training split: print (rows, columns), then display the first five records.
print(train_df.shape)

train_df.head()

We have 1460 records in the train set and 81 features; the target variable is **'SalePrice'**.

In [None]:
# Test split: print (rows, columns), then display the first five records.
print(test_df.shape)

test_df.head()

We have 1459 records in the test set and 80 features (the target feature 'SalePrice' needs to be predicted).

Now let's clean the data in both the train and test datasets.
> For that we will concatenate train_df and test_df, so that both datasets go through exactly the same cleaning steps.

In [None]:
# Stack train and test into one frame so cleaning and encoding are applied identically to both.
# ignore_index=True gives every row a unique label; keeping the source labels would repeat
# 0..1458 for train and test rows, which silently breaks index-aligned operations.
# The other arguments of the previous call were pandas defaults and are omitted.
df_train_test = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(df_train_test.shape)

df_train_test.head(10)

In [None]:
# Column names, non-null counts and dtypes of the combined frame.
df_train_test.info()

# **Observations:**

**From the above we can see that, there are *2919 rows and 81 columns* in combined dataset(both train and test together).**

Some columns have very few non-NaN values.

For example:
* >Alley has 198 non-NaN values.
* >FireplaceQu has 1499 non-NaN values.
* >PoolQC has 10, Fence has 571, MiscFeature has 105, etc.
* >SalePrice shows only 1460 non-NaN values because it is the target feature to be predicted for the test dataset; the test set has no SalePrice column (observed above), so we won't treat these as missing values.

This means we have to clean our data, as it contains missing values.

# Data Cleaning

Let's identify and drop the columns in which 30% or more of the values are NaN, because that many missing values might affect the aggregations and analysis in our study.

The remaining columns, which have fewer missing values, can be imputed (median for numerical features, mode for categorical features).

# **Checking the percentage of null values in every feature of this df**

In [None]:
# Percentage of missing values per column, rounded to two decimals.
df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)

In [None]:
# Number of columns (train and test together) whose rounded missing percentage is above zero.
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
print('No of columns with atleast 1 NaN Values:', null_pct[null_pct > 0.00].count())

In [None]:
# Features whose missing percentage is 30% or more.
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
null_pct[null_pct >= 30.00]

In [None]:
# Names of the features with 30% or more missing values; these can be dropped for now.
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
cols_with_30pct_n_more = null_pct[null_pct >= 30.00].index.to_list()

# SalePrice is the target: its NaNs belong to the test rows, so it must not be dropped.
cols_with_30pct_n_more.remove('SalePrice')

cols_with_30pct_n_more

In [None]:
# Columns to remove: the 'Id' column plus every column dropped for its high share of NaNs.
cols_to_drop = ['Id', *cols_with_30pct_n_more]

cols_to_drop

In [None]:
# Remove the selected columns from the combined frame.

df_train_test = df_train_test.drop(columns=cols_to_drop)

In [None]:
# (rows, columns) of the combined frame after the columns above were dropped.

df_train_test.shape

After dropping 'Id' and the other columns with a high number of NaN values, we are left with 75 columns.

In [None]:
# Number of columns that still contain NaNs (train and test together),
# now that the columns with the most missing values are gone.
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
print('No of columns with atleast 1 NaN Values:', null_pct[null_pct > 0.00].count())

In [None]:
# Remaining columns with NaNs and their missing percentage, largest first.
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
print('Columns with atleast 1 NaN Values:',
      null_pct[null_pct > 0.00].sort_values(ascending = False))

Now we can deal with these missing values in two steps, identifying the categorical and the numerical features separately,

and then imputing their NaN values with an appropriate method: the median for numerical features and the mode for categorical features.

In [None]:
# Names of the columns that still contain NaNs, ordered by missing percentage (largest first).
null_pct = df_train_test.isnull().sum().div(len(df_train_test)).mul(100).round(2)
nan_cols = null_pct[null_pct > 0.00].sort_values(ascending = False).index.to_list()

nan_cols

In [None]:
# Boolean flag per NaN-containing column: True where the column dtype is object.

t_f_obj_nan_cols = df_train_test.dtypes.loc[nan_cols] == object
t_f_obj_nan_cols

So we have 18 object type feature, 12 numeric.

In [None]:
# Turn the boolean flags into the strings 'True' / 'False'.
t_f_obj_nan_cols = t_f_obj_nan_cols.astype(str)

t_f_obj_nan_cols

In [None]:
# Split the NaN-containing columns by the stringified dtype flag:
# 'True' marks object-type features, 'False' marks numeric ones.
obj_nan_cols = [name for name, flag in t_f_obj_nan_cols.items() if flag == 'True']
num_nan_cols = [name for name, flag in t_f_obj_nan_cols.items() if flag == 'False']

print("object type features:")
print(obj_nan_cols)
print('\n')
print("numeric type features:")
print(num_nan_cols)

Now we have two lists, one with the object-type feature names and the other with the numerical feature names.

Now let's impute the median for the numerical features and the mode for the object-type features.

Note: We have to exclude 'SalePrice', as discussed before.

***Firstly, let's convert the string values of the object features to integer category codes (note that this step also gives missing values a code of their own)***

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object (string) column of the combined frame.
# - Columns are assigned by name instead of through `.iloc[:, i] = ...`: since pandas 2.0 a
#   positional setitem writes into the existing object array, so the column would keep
#   dtype `object` and be rejected later by estimators that require numeric dtypes.
# - astype(str) turns missing values into the string 'nan', so they receive a class of
#   their own and the encoded columns contain no NaN afterwards.
for column in df_train_test.columns[df_train_test.dtypes == object]:
    df_train_test[column] = LabelEncoder().fit_transform(df_train_test[column].astype(str))

print(df_train_test['SaleCondition'].unique())

In [None]:
# Exclude the target from the numeric columns to impute: its NaNs mark the test rows.
# Filtering (instead of list.remove) keeps this cell safe to run more than once.
num_nan_cols = [col for col in num_nan_cols if col != 'SalePrice']

In [None]:
# Impute missing values: column median for the numeric features, most frequent value
# for the categorical features.

df_train_test[num_nan_cols] = df_train_test[num_nan_cols].fillna(df_train_test[num_nan_cols].median())

for column in obj_nan_cols:
    # Series.mode() returns a Series (ties yield several values). Passing that Series to
    # fillna aligns on index labels and would only fill rows labelled 0, 1, ..., so the
    # first mode is taken as a scalar instead.
    modes = df_train_test[column].mode()
    if not modes.empty:
        df_train_test[column] = df_train_test[column].fillna(modes.iloc[0])

In [None]:
# Columns that still contain NaNs, with their NaN counts.
remaining_nulls = df_train_test.isnull().sum()
print(remaining_nulls[remaining_nulls > 0])

Now, as we can see, we have successfully dealt with the NaN values in all features (only the target 'SalePrice' still has NaNs, for the test rows).

# Feature Engineering

Adding a new feature, TotalSFA, which is the total surface area of the house for sale. This is generally the first thing people think of when they learn that a house is for sale.

It is simply 'TotalBsmtSF' + '1stFlrSF' + '2ndFlrSF'.

In [None]:
# Total surface area = basement + first floor + second floor.
area_parts = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
df_train_test['TotalSFA'] = (
    df_train_test['TotalBsmtSF']
    .add(df_train_test['1stFlrSF'])
    .add(df_train_test['2ndFlrSF'])
)

df_train_test[area_parts + ['TotalSFA']].head()

**Spliting data back into train_df and test_df**

> As we know, we can do that in different ways, lets use those NaN values under SalePrice to this.

In [None]:
# Shape of the rows that have no SalePrice value.
df_train_test[df_train_test['SalePrice'].isnull()].shape

Here we can see the test set after dealing with the NaNs. Let's store it as **df_test**.

In [None]:
# Rows without a SalePrice form the test set; the all-NaN target column is dropped from it.
df_test = (
    df_train_test
    .loc[df_train_test['SalePrice'].isnull()]
    .drop(columns='SalePrice')
)

df_test.columns

In [None]:
# Training rows are the ones that have a target value. Selecting on SalePrice (rather than
# dropping every row containing any NaN) cannot silently discard training rows if another
# column still has gaps, and .copy() makes df_train an independent frame so later column
# assignments write to it unambiguously.
df_train = df_train_test[df_train_test['SalePrice'].notna()].copy()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

Here we go, now we have our train set as well.

*From here lets call our train set as **'df'***

In [None]:
# `df` is a second name bound to the same DataFrame object as df_train (no copy is made),
# so changes made through `df` are visible through `df_train` as well.
df = df_train

----------------------
# **Let's now understand how the housing price ('SalePrice') is distributed**

In [None]:
# Target variable -> SalePrice: display its first five values.

df.SalePrice.head()

These are the first 5 recorded values of the SalePrice column, and it is a **float** type variable (the column became float when the test rows, which have no SalePrice, were concatenated).

In [None]:
# Summary statistics of the target, followed by its histogram with a density curve.
sale_price = df['SalePrice']
print(sale_price.describe().round(2))
plt.figure(figsize=(9, 8))
# The trailing ';' suppresses the textual repr of the returned Axes object.
sns.distplot(sale_price, color='orange', bins=100, hist_kws={'alpha': 0.4});

# Observations:

From describe()
---
* No NaN (null) values; the count is **1460**.
* The **average SalePrice** of a house in the available data is **180921.2**
* **Middle value of the ordered SalePrice = 163000.0** (the 'median', i.e. the second quartile 'Q2')
* **Minimum SalePrice** = **34900.0** and **Maximum SalePrice** = **755000.0**

From distplot()
----
* The prices in **SalePrice** are observed to be **Right-Skewed**.
* There are outliers above ~500,000 

To bring the SalePrice data closer to a normal distribution, we will treat this skew and these outliers with an appropriate approach.
----

--------------------

We use log-transform to make them normally distributed.

In [None]:
# Replace SalePrice with its natural logarithm to reduce the right skew.
# NOTE: the column is overwritten in place, so running this cell again applies the log a second time.
df['SalePrice'] = np.log(df['SalePrice'])

# Histogram of the transformed target.
plt.figure(figsize=(9, 8))
sns.distplot(df['SalePrice'], color='orange', bins=100, hist_kws={'alpha': 0.4});

Yes, now our target variable is approximately following normal distribution.

# **Checking Feature data distribution**

In [None]:
# First five rows of the training frame.
df.head()

In [None]:
# Distinct column dtypes present in the training frame.
df.dtypes.unique()

In [None]:
# Summary statistics for the columns of the training frame.
df.describe()

---------------
***Lets look at the distribution of all of the features by ploting them***

In [None]:
# One 50-bin histogram per column; the trailing ';' hides the returned array of Axes.
df.hist(figsize = (30, 35), bins = 50, xlabelsize = 8, ylabelsize = 8, color='orange');

-------------------
# What to analyse from the above histogram graphs?
The first thing that came to my mind on observing these graphs was: 'Okay! I understand that a few features are approximately normally distributed and some are skewed. But what about the other shapes? What do the other feature distributions say, and what should I analyse and understand from those graphs?' While searching for answers to these questions, I came across the following points.

1. What is Histogram?
> **Histogram:** A graphic summary of variation in a set of data. The pictorial nature of a histogram lets people see patterns that are difficult to detect in a simple table of      numbers.
> A histogram is the most commonly used graph to show frequency distributions. It looks very much like a bar chart, but there are important differences between them. 


>> Answers to my questions: How to analyze the meaning of your histogram's shape. [Typical histogram shapes and what they mean](http://asq.org/quality-resources/histogram#Shapes).
     
   ------------------------------ 

# **Observations from the above Histograms:**



* 1stFlrSF, GrLivArea, LotFrontage, TotalBsmtSF and 'TotalSFA' are all distributed much like our SalePrice was previously: right-skewed. There are outliers in these features.

* Some features, such as GarageCars or Fireplaces, show a particular pattern of isolated vertical bars, roughly meaning that they are discrete variables with a short range.

* In most of the other features the mode of the data points lies at 0; this might be because those features are not available for the majority of the records (houses).

# Correlation between features

Pandas **dataframe.corr()** is used to find the *pairwise correlation* of all columns in the dataframe. Any **na values are automatically excluded.** For any *non-numeric data type columns in the dataframe it is ignored*.

> **Syntax: DataFrame.corr(self, method=’pearson’, min_periods=1)**

In [None]:
# Correlation coefficient of every other feature with SalePrice;
# the SalePrice-to-SalePrice entry is not needed and is dropped.
df_corr = df.corr()['SalePrice'].drop('SalePrice')

We can find the **features** which are *strongly correlated with SalePrice* from the above. And now we will store those Feature names in a list called, ***strong_corr_features***.

In [None]:
# Features whose correlation with SalePrice exceeds 0.6 in absolute value (sign ignored),
# sorted by correlation in descending order.
is_strong = df_corr.abs() > 0.6
strong_corr_features = df_corr[is_strong].sort_values(ascending = False)

print('There are {} strongly correlated features with SalePrice:\n{}'.format(len(strong_corr_features), strong_corr_features))

By looking at the correlations between the features and the target, we discovered 7 features which have a strong relationship with the house price.

Let's check the
# Feature to Feature correlation

This will help us with feature reduction.

In [None]:
# Feature-to-feature correlations (SalePrice correlations were examined already).
corr = df.drop(columns='SalePrice').corr()
plt.figure(figsize=(25, 25))

# Show only the pairs with |correlation| >= 0.8; every other cell is masked to NaN.
strong_pairs = corr.where(corr.abs() >= 0.8)
sns.heatmap(strong_pairs,
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

# Observations:

* A lot of features seem to be correlated with each other.
* Some of them, such as YearBuilt -> GarageYrBlt, may just indicate price inflation over the years.
* As for 1stFlrSF -> TotalBsmtSF, it is normal that the larger the 1st floor is (considering many houses have only 1 floor), the larger the total basement will be.
* Very high correlation is found between [TotalSFA] -> [TotalBsmtSF, 1stFlrSF, GrLivArea]. The relation with the first two features is obvious, but the relation TotalSFA -> GrLivArea is interesting.

Likewise, there are many other interesting things to be found in the heatmap above.

-------------

Now let's move ahead and check for
# Feature Importance

Which of the features influence the target variable the most?
By understanding this, instead of using all 75 features, we can use only the most influential features to train the model.

*We will use a random forest regressor to do that.*

In [None]:
# Separate the target from the features for training; the test features are df_test as-is.
X_train, y_train = df.drop(columns='SalePrice'), df['SalePrice']

X_test = df_test

In [None]:
# feature importance using random forest
from sklearn.ensemble import RandomForestRegressor

# max_features=None considers every feature at each split, which is what the removed
# 'auto' option meant for regressors. random_state pins the forest so the importance
# ranking (and the top-30 selection built on it) is reproducible between runs.
rf = RandomForestRegressor(n_estimators=80, max_features=None, random_state=42)
rf.fit(X_train, y_train)
print('Training done using Random Forest')

# Column positions ordered from most to least important.
ranking = np.argsort(-rf.feature_importances_)
f, ax = plt.subplots(figsize=(18, 12))
sns.barplot(x=rf.feature_importances_[ranking], y=X_train.columns.values[ranking], orient='h', ax=ax)
ax.set_xlabel("Feature Importance")
plt.tight_layout()
plt.show()

Surprisingly, only 2 features are dominant: **'OverallQual'** and **'TotalSFA'**. So instead of using all 75 features, maybe just using the top 30 features is good enough (dimensionality reduction, in a way).

Here, we make a new feature called 'Interaction': simply the product of the top 2 features. We also normalize the data via z-scoring.

In [None]:
# use the top 30 features only
# Columns are selected by name so train and test cannot get out of step, and .copy()
# gives independent frames before a new column is added to them.
top_features = X_train.columns[ranking[:30]]
X_train = X_train[top_features].copy()
X_test = X_test[top_features].copy()

# interaction between the top 2
X_train["Interaction"] = X_train["TotalSFA"]*X_train["OverallQual"]
X_test["Interaction"] = X_test["TotalSFA"]*X_test["OverallQual"]

# zscoring
# Both splits are standardized with the TRAINING mean and std. Scaling the test set with
# its own statistics would map the same raw value to different inputs at predict time.
train_mean = X_train.mean()
train_std = X_train.std()
X_train = (X_train - train_mean)/train_std
X_test = (X_test - train_mean)/train_std

# heatmap
f, ax = plt.subplots(figsize=(11, 5))
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.heatmap(X_train, cmap=cmap, ax=ax)
plt.show()

Now that we have our most influential features, let's check again, visually, how they are related to 'SalePrice'.

In [None]:
# One regression plot per feature against the target, for the first 30 columns, on a 5x6 grid.
fig, axes = plt.subplots(5, 6, figsize=(12,7))
for ax, position in zip(axes.ravel(), range(30)):
    sns.regplot(x=X_train.iloc[:,position], y=y_train, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Larger regression plot of the feature at column position 3 against the target.
fig, ax = plt.subplots(figsize=(12,7))
sns.regplot(x=X_train.iloc[:,3], y=y_train, ax=ax)
plt.show()

In [None]:
# Larger regression plot of the feature at column position 7 against the target.
fig, ax = plt.subplots(figsize=(12,7))
sns.regplot(x=X_train.iloc[:,7], y=y_train, ax=ax)
plt.show()

# Observations:
* TotalSFA shows a good linear relation with SalePrice; we can observe a few outliers in the TotalSFA panel, which we can remove for better results.
* It is the same case with GrLivArea: it shows a linear relation and also has a few outliers.
* 'GarageArea' and 'BsmtFinSF1' also show a few outlying points.

Let's treat these outliers.

In [None]:
# Remove the outliers spotted in the regression plots.
# NOTE(review): the thresholds assume z-scored features and a log-scale SalePrice.
# assign() builds a new frame, so X_train itself never gains a temporary 'SalePrice'
# column (plain `X_temp = X_train` would only alias the same object).
X_temp = X_train.assign(SalePrice=y_train)

# A row is dropped when it matches any of the feature/price conditions.
is_outlier = (
    ((X_temp['TotalSFA']>5) & (X_temp['SalePrice']<12.5))
    | ((X_temp['GrLivArea']>5) & (X_temp['SalePrice']<13))
    | ((X_temp['GarageArea']>3) & (X_temp['SalePrice']<12.5))
    | ((X_temp['BsmtFinSF1']>2) & (X_temp['SalePrice']>13.25))
    | ((X_temp['BsmtFinSF1']>-1) & (X_temp['SalePrice']<11.2))
)
X_temp = X_temp[~is_outlier]

# recover
y_train = X_temp['SalePrice']
X_train = X_temp.drop(columns='SalePrice')

Now it's time to use an ensemble ML model:
# XGBOOST

In [None]:
# XGBoost: grid search over tree depth and number of boosting rounds.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

print("Parameter optimization")
xgb_param_grid = {
    'max_depth': [2,4,6],
    'n_estimators': [50,100,200],
}
xgb_model = xgb.XGBRegressor()
reg_xgb = GridSearchCV(xgb_model, xgb_param_grid, verbose=1)
reg_xgb.fit(X_train, y_train)

# Best cross-validated score and the parameters that achieved it.
print(reg_xgb.best_score_)
print(reg_xgb.best_params_)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

def create_model(optimizer='adam'):
    """Build and compile the feed-forward regressor used in the grid search.

    The network has one ReLU layer as wide as the input (X_train.shape[1], read from the
    notebook-level X_train when the function is called), a 16-unit ReLU layer and a
    single output unit without activation. It is compiled with mean-squared-error loss.

    Parameters
    ----------
    optimizer : str
        Optimizer passed to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    layers = [
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(16, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)

    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

# Wrap the Keras network as a scikit-learn estimator so it can be grid-searched.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Search space: optimizer, mini-batch size and number of training epochs.
optimizer = ['SGD','Adam']
batch_size = [10, 30, 50]
epochs = [10, 50, 100]
param_grid = {'optimizer': optimizer, 'batch_size': batch_size, 'epochs': epochs}

reg_dl = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
reg_dl.fit(X_train, y_train)

# Best cross-validated score and the parameters that achieved it.
print(reg_dl.best_score_)
print(reg_dl.best_params_)

In [None]:
# SVR: 5-fold grid search over C and gamma for an RBF-kernel support vector regressor.
from sklearn.svm import SVR

svr_param_grid = {"C": [1e0, 1e1, 1e2, 1e3],
                  "gamma": np.logspace(-2, 2, 5)}
reg_svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), param_grid=svr_param_grid, cv=5)
reg_svr.fit(X_train, y_train)

# Best cross-validated score and the parameters that achieved it.
print(reg_svr.best_score_)
print(reg_svr.best_params_)

In [None]:
# Second-level feature matrix: each base model's predictions on the training features.
xgb_train_pred = reg_xgb.predict(X_train)
nn_train_pred = reg_dl.predict(X_train).ravel()
svr_train_pred = reg_svr.predict(X_train)

X_train2 = pd.DataFrame({'XGB': xgb_train_pred, 'NN': nn_train_pred, 'SVR': svr_train_pred})
X_train2.head()

In [None]:
# second-feature modeling using linear regression
from sklearn import linear_model

# Meta-model: a linear regression fitted on the base models' training predictions.
reg = linear_model.LinearRegression()
reg.fit(X_train2, y_train)

# prediction using the test set
# Column names and order must match X_train2 ('XGB', 'NN', 'SVR'): scikit-learn checks
# feature names at predict time, so labelling the network column 'DL' here is rejected
# (or warned about, depending on the version).
X_test2 = pd.DataFrame( {'XGB': reg_xgb.predict(X_test),
     'NN': reg_dl.predict(X_test).ravel(),
     'SVR': reg_svr.predict(X_test),
    })

# Don't forget to convert the prediction back to non-log scale
y_pred = np.exp(reg.predict(X_test2))

In [None]:
#y_pred = np.exp(reg_xgb.predict(X_test))

In [None]:
# Display the predicted sale prices (already converted back from log scale).
y_pred

In [None]:
# 'Id' was dropped from the working frame, so it is taken from the raw test frame.
test_Id = test_df['Id']

# Submission

In [None]:
# Assemble the two-column submission table and write it without the index column.
submission_columns = {"Id": test_Id, "SalePrice": y_pred}
submission = pd.DataFrame(submission_columns)

submission.to_csv('houseprice_111.csv', index=False)