In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly 
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
import cufflinks as cf
import plotly.figure_factory as ff 
from plotly.offline import iplot
from plotly import tools


# Machine Learning
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, Lasso, Ridge

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor,GradientBoostingRegressor, RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, KFold, cross_val_score
from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")


# You can go offline on demand by using
cf.go_offline() 
# initiate notebook for offline plot
init_notebook_mode(connected=False)         

# set some display options:
plt.rcParams['figure.dpi'] = 100
colors = px.colors.qualitative.Prism
pio.templates.default = "plotly_white"

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### BIG thanks to @Alaa Sedeeq with this notebook who helped me a lot ! --> https://www.kaggle.com/alaasedeeq/house-price-prediction-top-8

# GOAL OF THE NOTEBOOK
Predict sales prices and practice feature engineering, RFs, and gradient boosting

### File descriptions
train.csv - the training set
test.csv - the test set
data_description.txt - full description of each column, originally prepared by Dean De Cock but lightly edited to match the column names used here
sample_submission.csv - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms

### Data fields
**SalePrice** - the property's sale price in dollars. This is the target variable that you're trying to predict.
**MSSubClass**: The building class
**MSZoning**: The general zoning classification
**LotFrontage**: Linear feet of street connected to property
**LotArea**: Lot size in square feet
**Street**: Type of road access
**Alley**: Type of alley access
**LotShape**: General shape of property
**LandContour**: Flatness of the property
**Utilities**: Type of utilities available
**LotConfig**: Lot configuration
**LandSlope**: Slope of property
**Neighborhood**: Physical locations within Ames city limits
**Condition1**: Proximity to main road or railroad
**Condition2**: Proximity to main road or railroad (if a second is present)
**BldgType**: Type of dwelling
**HouseStyle**: Style of dwelling
**OverallQual**: Overall material and finish quality
**OverallCond**: Overall condition rating
**YearBuilt**: Original construction date
**YearRemodAdd**: Remodel date
**RoofStyle**: Type of roof
**RoofMatl**: Roof material
**Exterior1st**: Exterior covering on house
**Exterior2nd**: Exterior covering on house (if more than one material)
**MasVnrType**: Masonry veneer type
**MasVnrArea**: Masonry veneer area in square feet
**ExterQual**: Exterior material quality
**ExterCond**: Present condition of the material on the exterior
**Foundation**: Type of foundation
**BsmtQual**: Height of the basement
**BsmtCond**: General condition of the basement
**BsmtExposure**: Walkout or garden level basement walls
**BsmtFinType1**: Quality of basement finished area
**BsmtFinSF1**: Type 1 finished square feet
**BsmtFinType2**: Quality of second finished area (if present)
**BsmtFinSF2**: Type 2 finished square feet
**BsmtUnfSF**: Unfinished square feet of basement area
**TotalBsmtSF**: Total square feet of basement area
**Heating**: Type of heating
**HeatingQC**: Heating quality and condition
**CentralAir**: Central air conditioning
**Electrical**: Electrical system
**1stFlrSF**: First Floor square feet
**2ndFlrSF**: Second floor square feet
**LowQualFinSF**: Low quality finished square feet (all floors)
**GrLivArea**: Above grade (ground) living area square feet
**BsmtFullBath**: Basement full bathrooms
**BsmtHalfBath**: Basement half bathrooms
**FullBath**: Full bathrooms above grade
**HalfBath**: Half baths above grade
**BedroomAbvGr**: Number of bedrooms above basement level
**KitchenAbvGr**: Number of kitchens
**KitchenQual**: Kitchen quality
**TotRmsAbvGrd**: Total rooms above grade (does not include bathrooms)
**Functional**: Home functionality rating
**Fireplaces**: Number of fireplaces
**FireplaceQu**: Fireplace quality
**GarageType**: Garage location
**GarageYrBlt**: Year garage was built
**GarageFinish**: Interior finish of the garage
**GarageCars**: Size of garage in car capacity
**GarageArea**: Size of garage in square feet
**GarageQual**: Garage quality
**GarageCond**: Garage condition
**PavedDrive**: Paved driveway
**WoodDeckSF**: Wood deck area in square feet
**OpenPorchSF**: Open porch area in square feet
**EnclosedPorch**: Enclosed porch area in square feet
**3SsnPorch**: Three season porch area in square feet
**ScreenPorch**: Screen porch area in square feet
**PoolArea**: Pool area in square feet
**PoolQC**: Pool quality
**Fence**: Fence quality
**MiscFeature**: Miscellaneous feature not covered in other categories
**MiscVal**: Value of miscellaneous feature
**MoSold**: Month Sold
**YrSold**: Year Sold
**SaleType**: Type of sale
**SaleCondition**: Condition of sale

# I - BASIC EDA

In [None]:
# Load the train set, the test set and the sample submission from the competition folder
INPUT_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
df_train = pd.read_csv(f'{INPUT_DIR}/train.csv')
df_test = pd.read_csv(f'{INPUT_DIR}/test.csv')
df_submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')
# Keep an independent, 0..n-1 indexed copy of the test ids
test_id = df_test['Id'].reset_index(drop=True)

In [None]:
# Take a look of the shape
print(f'Train shape : {df_train.shape}')
print(f'Test shape : {df_test.shape}')
print(f'Submission shape : {df_submission.shape}')

In [None]:
# Show up to 111 rows / columns so that every feature is visible in the outputs.
# The full option names are used: abbreviated names are resolved by pattern
# matching and stop working as soon as they match more than one option.
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)

In [None]:
# Take a look on train set
df_train.head()

In [None]:
# Take a look on test set
df_test.head()

In [None]:
# Take a look on submission exemple
df_submission.head()

### First Step : We will concatenate the Train and Test sets
    - 1 : Merge df_test + df_submission
    - 2 : Concatenate df_train + df_test_full

In [None]:
# Combine df_test + df_submission: inner join on the columns the two frames share
df_test_full = df_test.merge(df_submission, how='inner')
df_test_full.shape

In [None]:
# Concat df_train + df_test_full (rows stacked, axis=0)
# NOTE(review): ignore_index is not set, so each frame keeps its own row labels;
# if both frames are labelled from 0 the combined index contains duplicates —
# confirm before selecting or dropping rows of `data` by label.
data = pd.concat([df_train,df_test_full], axis=0)
data.shape



In [None]:
#Copy the dataset for secure
df = data.copy()

### First Look on your Data

#### CHECKLIST :

##### Shape Analysis :
- target variable : **'SalePrice'**
- shape of your dataset : **row : 2919, columns : 81**
- Features types : **float64(12), int64(26), object(43)**
- Missing Values analysis : We have a lot of variables with NaN 
    - 1 **group : > 80% NaN**
            - PoolQC	99.657417
            - MiscFeature	96.402878
            - Alley	93.216855
            - Fence	80.438506
     - **2 group : >15 % <50%**
            - FireplaceQu	48.646797
            - LotFrontage	16.649538
     - **3 group : Garage Option**
            - GarageFinish	5.447071
            - GarageQual	5.447071
            - GarageCond	5.447071
            - GarageYrBlt	5.447071
            - GarageType	5.378554
     - **4 group : < 3%**
            - BsmtExposure	2.809181
            - BsmtCond	2.809181
            - BsmtQual	2.774923
            - BsmtFinType2	2.740665
            - BsmtFinType1	2.706406
     - **5 group :**
            - MasVnrType	0.822199
            - MasVnrArea	0.787941
            - MSZoning	0.137033
      - **6 group : Condition Optional**
            - Functional	0.068517
            - Utilities	0.068517
            - BsmtHalfBath	0.068517
            - BsmtFullBath	0.068517
      - **7 group : Condition Optimal**
            - GarageArea	0.034258
            - BsmtFinSF1	0.034258
            - SaleType	0.034258
            - GarageCars	0.034258
            - BsmtUnfSF	0.034258
            - Electrical	0.034258
            - Exterior2nd	0.034258
            - Exterior1st	0.034258
            - KitchenQual	0.034258
            - TotalBsmtSF	0.034258
            - BsmtFinSF2	0.034258
      - **8 group : Complete Features**
              - TotRmsAbvGrd	0.000000
              - Fireplaces	0.000000
              - BedroomAbvGr	0.000000
              - PavedDrive	0.000000
              - WoodDeckSF	0.000000
              - OpenPorchSF	0.000000
              - EnclosedPorch	0.000000
              - 3SsnPorch	0.000000
              - ScreenPorch	0.000000
              - PoolArea	0.000000
              - MiscVal	0.000000
              - MoSold	0.000000
              - YrSold	0.000000
              - SaleCondition	0.000000
              - KitchenAbvGr	0.000000
              - HeatingQC	0.000000
              - HalfBath	0.000000
              - FullBath	0.000000
              - LotArea	0.000000
              - Street	0.000000
              - LotShape	0.000000
              - LandContour	0.000000
              - LotConfig	0.000000
              - LandSlope	0.000000
              - Neighborhood	0.000000
              - Condition1	0.000000
              - Condition2	0.000000
              - BldgType	0.000000
              - HouseStyle	0.000000
              - OverallQual	0.000000
              - OverallCond	0.000000
              - YearBuilt	0.000000
              - YearRemodAdd	0.000000
              - RoofStyle	0.000000
              - RoofMatl	0.000000
              - ExterQual	0.000000
              - ExterCond	0.000000
              - Foundation	0.000000
              -  Heating	0.000000
              - MSSubClass	0.000000
              - CentralAir	0.000000
              - 1stFlrSF	0.000000
              - 2ndFlrSF	0.000000
              - LowQualFinSF	0.000000
              - GrLivArea	0.000000
              - SalePrice	0.000000 

In [None]:
# Take a look on the differents variables
df.info()

In [None]:
# Type representation
df.dtypes.value_counts().plot.pie()

In [None]:
# Number of distinct values (NaN counted as a value) in each column
for col, n_unique in df.nunique(dropna=False).items():
    print('We have {} unique values in {} column'.format(n_unique,col))
    print('__'*30)

In [None]:
#describe our data
df[df.select_dtypes(exclude='object').columns].drop('Id',axis=1).describe().\
style.background_gradient(axis=1,cmap=sns.light_palette('green', as_cmap=True))

In [None]:
#find the null values in each column
(df.isnull().sum()/df.shape[0]*100).sort_values(ascending=False).to_frame().rename(columns={0:'Null values'})

In [None]:
#null percentage for each column

null_df = round(100*(df.isnull().sum().sort_values(ascending=False)/len(df.index)),2)\
                    .to_frame().rename(columns={0:'Null values percentage'})[:15]
null_df

In [None]:
#Pie plot for the percentage values

null_df.reset_index().iplot(kind='pie',
                            labels='index',
                            title='Null values percentage',
                            textinfo='label+text+percent',
                            values='Null values percentage')

In [None]:
#visuaize the null values in each column
plt.figure(figsize=(20,8));
sns.heatmap(df.isnull(), cmap='viridis');

In [None]:
# Correlation between every numeric column and the target column.
# Only the numeric columns are kept: DataFrame.corr() raises on object (string)
# columns in pandas >= 2.0 instead of silently ignoring them.
corr = df.select_dtypes(include='number').corr()
# [1:] skips the top entry of the descending sort (SalePrice correlated with itself)
corr['SalePrice'].sort_values(ascending=False)[1:].to_frame()\
.style.background_gradient(axis=1,cmap=sns.light_palette('green', as_cmap=True))

In [None]:
# Build a table with the 10 most positively and the 10 most negatively skewed numeric columns

num_cols = df.select_dtypes(exclude='object').columns.tolist()

# Skewness of every numeric column except the identifier
skew_values = df[num_cols].drop('Id',axis=1).skew()

# Left half: highest skewness first; right half: lowest skewness first
skewness = pd.concat(
    [skew_values.sort_values(ascending=False)[:10].reset_index(),
     skew_values.sort_values(ascending=True)[:10].reset_index()],
    axis=1)

skewness.columns = pd.MultiIndex.from_tuples([('Positive Skewness', 'Columns'), ('Positive Skewness', 'Skewness'),
                                              ('Negative Skewness', 'Columns'), ('Negative Skewness', 'Skewness')])
skewness

In [None]:
liste_skewness = ['MasVnrArea','BsmtHalfBath','ScreenPorch',
                  'EnclosedPorch','BsmtFinSF2','KitchenAbvGr','3SsnPorch','LowQualFinSF',
                  'LotArea','PoolArea','MiscVal']

### First Conclusion :

- A lot of NaN Values mostly on : 

      - PoolQC    99.657417
      - MiscFeature    96.402878
      - Alley    93.216855
      - Fence    80.438506
      
- SalePrice get a high correlation mostly with :  

    - GrLivArea	0.588010
    - OverallQual	0.550911
    - TotRmsAbvGrd	0.469800
    - GarageCars	0.469249
    - GarageArea	0.464809
    - 1stFlrSF	0.462865
    - TotalBsmtSF	0.453224
    - FullBath	0.433710
    - YearBuilt	0.362066
    - MasVnrArea	0.355608
    - Fireplaces	0.353567
    - YearRemodAdd	0.350032
    - GarageYrBlt	0.325297
    - LotFrontage	0.318084
    - LotArea	0.296497
    
- We got a high skewness with :  

    - 3SsnPorch	11.381914
    - LowQualFinSF	12.094977
    - LotArea	12.829025
    - PoolArea	16.907017
    - MiscVal	21.958480

### Take a look on your target

In [None]:
fig, axes = plt.subplots(1, 2, sharex=False, figsize=(14,5))
sns.histplot(ax=axes[0],data=df, x="SalePrice", kde=True, color='orange')
axes[0].set_title('Normal SalePrice')
sns.histplot(ax=axes[1],data=df, x=np.log1p(df['SalePrice']), kde=True, color='g')
axes[1].set_title('Log SalePrice')

Using Logarithms helps us to have a normal distribution which could help us in a number of different ways such as outlier detection.

In this data We have a right skewed distribution in which most Sales are between 0 and 340K.

### Background ANALYSIS

 ##### 1 - TARGET / TARGET : Numerical Variables

In [None]:
# Distribution (KDE) of every float column, drawn on a 3x4 grid
sns.set_style('whitegrid')
fig, axes = plt.subplots(3,4, figsize=(18, 8));
plt.subplots_adjust(hspace = 0.7, wspace=0.2)
fig.suptitle('Numerical Float Distributions', fontsize=20)

float_cols = df.select_dtypes('float').columns
a = len(float_cols)  # number of float columns to plot

# Fill the grid row by row: column i goes to row i//4, position i%4
for i, col in enumerate(float_cols):
    ax = axes[i//4][i%4]
    sns.kdeplot(df[col], ax=ax, fill= True);
    ax.set_title(col+' Distribution')

In [None]:
# Distribution (KDE) of every int column, drawn on a 9x3 grid
sns.set_style('whitegrid')
fig, axes = plt.subplots(9,3, figsize=(18, 12));
plt.subplots_adjust(hspace = 1.5, wspace=0.2)
fig.suptitle('Numerical Int Distributions', fontsize=20)

int_cols = df.select_dtypes('int').columns
a = len(int_cols)  # number of int columns to plot

# Fill the grid row by row: column i goes to row i//3, position i%3
for i, col in enumerate(int_cols):
    ax = axes[i//3][i%3]
    sns.kdeplot(df[col], ax=ax, fill= True, color='g');
    ax.set_title(col+' Distribution')

 ##### 2 - TARGET / TARGET : Categorical Variables

In [None]:
for col in df.select_dtypes('object'):
    print(f'{col :-<50} {df[col].unique()}')

In [None]:
#for col in df.select_dtypes('object'):
#    plt.figure()
#    df[col].value_counts().plot.pie()

### Relationship Target/Features :

     1 - Numericals Features (float/int)

In [None]:
# Regression plot of SalePrice against every float feature, drawn on a 3x4 grid
sns.set_style('whitegrid')
fig, axes = plt.subplots(3,4, figsize=(18, 8));
plt.subplots_adjust(hspace = 0.7, wspace=0.2)
fig.suptitle('Numerical float Regression', fontsize=20)

a = len(df.select_dtypes('float').columns)  # number of float columns (target included)

# The target itself is excluded from the features that are plotted
float_features = df.drop('SalePrice',axis=1).select_dtypes('float').columns

for i, col in enumerate(float_features):
    ax = axes[i//4][i%4]
    sns.regplot(x=df['SalePrice'],y=df[col],marker="+", ax=ax);
    ax.set_title(col+' Regression')

In [None]:
# Regression plot of SalePrice against every int feature, drawn on a 9x3 grid
sns.set_style('whitegrid')
fig, axes = plt.subplots(9,3, figsize=(16, 20));
plt.subplots_adjust(hspace = 1.2, wspace=0.4)
fig.suptitle('Numerical Int Regression', fontsize=20)

a = len(df.select_dtypes('int').columns)  # number of int columns (Id included)

# The identifier and the target are excluded from the features that are plotted
int_features = df.drop(['Id','SalePrice'],axis=1).select_dtypes('int').columns

for i, col in enumerate(int_features):
    ax = axes[i//3][i%3]
    sns.regplot(x=df['SalePrice'],y=df[col],marker="+",color='g', ax=ax);
    ax.set_title(col+' Regression')

2 - Categoricals Features (object)

In [None]:
# Relationship between SaleCondition and SalePrice, using GrLivArea
# (listed in the first conclusion as the feature most correlated with SalePrice).
# First figure: all sale conditions on one plot; second: one panel per sale condition.
g = sns.lmplot(x="SalePrice", y="GrLivArea", hue="SaleCondition", data=df)
h = sns.lmplot(x="SalePrice", y="GrLivArea", col="SaleCondition", hue="SaleCondition",
               data=df, col_wrap=2, height=3)

In [None]:
# Heatmap of the SaleCondition x category counts for every object column, drawn on a 15x3 grid
sns.set_style('whitegrid')
fig, axes = plt.subplots(15,3, figsize=(14, 24));
plt.subplots_adjust(hspace = 1.2, wspace=0.6)
fig.suptitle('Categorical Features Visualizations', fontsize=20)

object_cols = df.select_dtypes(include='object').columns
a = len(object_cols)  # number of categorical columns to plot

# Fill the grid row by row: column i goes to row i//3, position i%3
for i, col in enumerate(object_cols):
    ax = axes[i//3][i%3]
    counts = pd.crosstab(df['SaleCondition'], df[col])
    sns.heatmap(counts, annot=True, fmt='d', ax=ax);
    ax.set_title(col)

We will create a new variable, `SalePrice_bins`, to get a better view of the target.

In [None]:
# New feature: bucket SalePrice into 6 bins; labels=False stores the integer bin index.
# With an integer `bins`, pd.cut builds equal-width bins over the observed SalePrice range.
df['SalePrice_bins'] = pd.cut(df['SalePrice'],bins=6, labels=False)
# Number of houses falling in each bin
df['SalePrice_bins'].value_counts()

In [None]:
sns.scatterplot(x='SalePrice_bins',y='SalePrice',data=df)

In [None]:
# Human-readable label for each integer bin index stored in 'SalePrice_bins'.
# NOTE(review): the bins come from pd.cut(..., bins=6), i.e. equal-width bins over
# the observed price range, so these 100k-step labels are presumably only
# approximate — confirm against the actual bin edges.
dico_bins_sale = {
    0 : '<200k',
    1 : '<300k',
    2 : "<400k",
    3 : "<500k",
    4 : "<600k",
    5 : ">700k"
}

# Replace each bin index by its label. Values that are not keys of the dictionary
# become NaN, so running this cell a second time (labels already applied) blanks the column.
df['SalePrice_bins'] = df['SalePrice_bins'].map(dico_bins_sale)

### We will take a look at the groups of features that we identified earlier

In [None]:
def visualisation_data(dataset,xlabel):
    """Plot every categorical column of `dataset` against the sale price.

    For each object-dtype column other than 'SalePrice_bins', one row with two
    panels is drawn:

    - left: heatmap of the counts of 'SalePrice_bins' x the column;
    - right: scatter plot of `xlabel` against 'SalePrice', coloured by the column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'SalePrice', 'SalePrice_bins' and the column named `xlabel`.
    xlabel : str
        Name of the column used on the x axis of the scatter plots.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # 'SalePrice_bins' is the grouping variable, not a feature to plot, so it is
    # left out of both the loop and the number of grid rows.
    cat_cols = [col for col in dataset.select_dtypes(include='object')
                if col != 'SalePrice_bins']

    n_rows = len(cat_cols)  # one row per categorical column
    n_cols = 2              # heatmap + scatter plot

    fig = plt.figure(figsize=(14,22))

    for row, col in enumerate(cat_cols):
        # Left panel: counts per price bin. The bins are read from `dataset`
        # itself, so the function does not depend on a notebook-level frame.
        plt.subplot(n_rows, n_cols, n_cols*row + 1)
        plt.xlabel(xlabel)
        sns.heatmap(pd.crosstab(dataset['SalePrice_bins'], dataset[col]), annot=True, fmt='d')

        # Right panel: price against `xlabel`, coloured by the category
        plt.subplot(n_rows, n_cols, n_cols*row + 2)
        plt.xlabel(xlabel)
        sns.scatterplot(x=xlabel, y="SalePrice", hue=col, alpha=.5, palette="muted", data=dataset)

    plt.show()

In [None]:
# 1 - Group Garage Option
garage_df = df.loc[:,['GarageFinish','GarageQual','GarageCond','GarageYrBlt','GarageType','SalePrice','SalePrice_bins']]
garage_df.info()

In [None]:
visualisation_data(garage_df,'GarageYrBlt')

## Conclusion : 
Garage type could be an argument for the sale price, but 'GarageYrBlt' has a bad outlier that we will remove later

In [None]:
# Group 2 : BSM features
bsmt_df = df.loc[:,['BsmtExposure','BsmtCond','BsmtQual','BsmtFinType2','BsmtFinType1','SalePrice','SalePrice_bins','YearBuilt']]
bsmt_df.info()

In [None]:
visualisation_data(bsmt_df, 'YearBuilt')

## Conclusion : 
BsmtQual --> we can see a linearity with year of build and salePrice!

In [None]:
# Group 3 : other features
oth_df = df.loc[:,['MasVnrType','MasVnrArea','MSZoning','SalePrice','SalePrice_bins','YearBuilt']]
oth_df.info()

In [None]:
#MasVnrAre vs Target
g = sns.regplot(data=oth_df, x="MasVnrArea", y="SalePrice",marker='x')

In [None]:
visualisation_data(oth_df,'YearBuilt')

## Conclusion :

MSZoning and MasVnrArea look good too

In [None]:
# Group 4 : Condtion Optional
optional_df = df.loc[:,['Functional','Utilities','BsmtHalfBath','BsmtFullBath','SalePrice','SalePrice_bins','YearBuilt']]
optional_df.info()

In [None]:
# SalePrice against YearBuilt, coloured by the number of basement half / full baths.
# Each panel is titled after the variable used for the colour, so the figure can be
# read on its own.
fig, axes = plt.subplots(1, 2, figsize=(12,8))

for ax, bath_col in zip(axes, ['BsmtHalfBath', 'BsmtFullBath']):
    sns.scatterplot(data=optional_df, x="YearBuilt", y="SalePrice", hue=bath_col, ax=ax)
    ax.set_title(f'SalePrice vs YearBuilt by {bath_col}')

plt.show()

In [None]:
visualisation_data(optional_df,'YearBuilt')

We can see that "Utilities" is useless to our model, so we will drop it later

In [None]:
# Group 5 : Condition Optimal
optimal_df = df.loc[:,['GarageArea','BsmtFinSF1','SaleType','GarageCars','BsmtUnfSF',
                       'Electrical','Exterior2nd',
                       'Exterior1st','KitchenQual','TotalBsmtSF','BsmtFinSF2',
                       'SalePrice','SalePrice_bins','YearBuilt']]
optimal_df.info()

In [None]:
fig = plt.figure(figsize=(12,10))

# GarageArea
plt.subplot(321)
sns.scatterplot(data=optimal_df, x='GarageArea', y="SalePrice")

# GarageCars
plt.subplot(322)
sns.scatterplot(data=optimal_df, x='YearBuilt', y="SalePrice", hue='GarageCars')

# BsmtFinSF1
plt.subplot(323)
sns.scatterplot(data=optimal_df, x='BsmtFinSF1', y="SalePrice")

# BsmtFinSF2
plt.subplot(324)
sns.scatterplot(data=optimal_df, x='BsmtFinSF2', y="SalePrice")

# BsmtUnfSF
plt.subplot(325)
sns.scatterplot(data=optimal_df, x='BsmtUnfSF', y="SalePrice")

# TotalBsmtSF
plt.subplot(326)
sns.scatterplot(data=optimal_df, x='TotalBsmtSF', y="SalePrice")

In [None]:
visualisation_data(optimal_df,'YearBuilt')

In [None]:
# Group 8 : last (first part of the complete features)
last_df = df.loc[:,['PavedDrive','HeatingQC','HalfBath','FullBath','YearBuilt','LotArea','Street','LotShape','LandContour','LotConfig','LandSlope',
              'Condition1','Condition2','SalePrice_bins','SalePrice']]

# No trailing comma here: it would turn the expression into a tuple and make the
# cell display "(None,)" after the summary.
last_df.info()

In [None]:
visualisation_data(last_df,'YearBuilt')

Street is not very useful because its values are heavily unbalanced, so we will probably drop it too

In [None]:
# Group 8 : last (second part of the complete features)
last_df_bis = df.loc[:,['BldgType','HouseStyle','OverallQual','OverallCond','YearBuilt','YearRemodAdd',
              'RoofStyle','RoofMatl','ExterQual','ExterCond','Foundation','Heating','MSSubClass','CentralAir','1stFlrSF','2ndFlrSF',
              'LowQualFinSF','GrLivArea','SalePrice_bins','SalePrice']]

# No trailing comma here: it would turn the expression into a tuple and make the
# cell display "(None,)" after the summary.
last_df_bis.info()

In [None]:
visualisation_data(last_df_bis,'YearBuilt')

## Conclusion :

- GarageArea / GarageCars / BsmtFinSF1 / TotalBsmtSF show a nice correlation too, as we saw earlier

## Multivariate Visualisation

### Scatter Matrix

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric columns are kept: `df` also holds string columns (the categorical
# features and 'SalePrice_bins'), and DataFrame.corr() raises on them in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype=bool))

cmap = sns.diverging_palette(180, 30, as_cmap=True)

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(25, 25))
    sns.heatmap(corr,  mask=mask, cmap=cmap, annot=True, center=0, vmin=-1, vmax=0.8,
                square=True, cbar_kws={'shrink':.5, 'orientation': 'vertical'}, linewidth=.02);

In [None]:
#Correlation Map
ax = sns.clustermap(df.select_dtypes(exclude='object').corr())

# PHASE II : PREPROCESSING

In [None]:
sns.scatterplot(x='YearBuilt', y="SalePrice", hue='Fence', alpha=.5, palette="muted", data=df)

## Procedure : 

1 - We will drop the columns with more than 90% of missing values
    -> Fence doesn't look like a good feature for our model, so we will drop it too
    
2 - Split the dataset into a numerical and a categorical set 
 
     Look for Outliers

3 - Encoding 

4 - feature_engineering

5 - Imputation

6 - First look with a basic model

#### 1 - Drop the missing value

In [None]:
df = data.copy()

In [None]:
# Drop the columns with 80 + missing Value
df = df.loc[:,(df.isnull().sum()/df.shape[0]*100) < 80]
df.shape

### 2 - Split the dataset

In [None]:
# Numerical features: every column whose dtype is not object
num_cols = df.select_dtypes(exclude='object').columns.tolist()

# Categorical features: the object-dtype columns
cat_cols = df.select_dtypes(include='object').columns.tolist()

In [None]:
print(f'Numerical features : {num_cols}')
print('-'*180)
print(f'Categorical features : {cat_cols}')

In [None]:
# Split our Dataset
df_cat = df[cat_cols]
df_num = df[num_cols]

In [None]:
print(f'Categorical shape : {df_cat.shape}')
print(f'Numerical shape : {df_num.shape}')

### LOOK ON OUTLIERS

In [None]:
# Scatter plots of SalePrice against the features most correlated with it

# Rank the correlations, skip the first entry (SalePrice itself) and keep the next 13 names
ranked_corr = corr['SalePrice'].sort_values(ascending=False)
high_corr = ranked_corr.iloc[1:14].index.tolist()

fig, axes = plt.subplots(4,3, figsize=(20, 10), sharey=True);
plt.subplots_adjust(hspace = 0.7, wspace=0.1)
fig.suptitle('Highest Correlation with sale price', fontsize=20);

# The 4x3 grid holds 12 panels, so only the first 12 features are drawn
for i, col in enumerate(high_corr[:12]):
    ax = axes[i//3][i%3]
    sns.scatterplot(y=df['SalePrice'], x=df[col], ax=ax)
    ax.set_title('SalesPrice with '+col)

In [None]:
# Detect the outliers on the training rows only.
# The labels collected here are later removed from X_train / y_train, so they must
# come from df_train. Selecting them from `df` (train and test stacked without
# resetting the index) would also pick labels of test rows and remove unrelated
# training rows that happen to carry the same label.
outlier_mask = (((df_train['GarageArea']>1200) & (df_train['SalePrice']<300000))|
                ((df_train['GrLivArea']>3000) & (df_train['SalePrice']<300000))|
                ((df_train['1stFlrSF']>3000) & (df_train['SalePrice']<300000))|
                ((df_train['TotalBsmtSF']>5000) & (df_train['SalePrice']<300000))|
                ((df_train['MasVnrArea']>1200) & (df_train['SalePrice']<700000))|
                ((df_train['SalePrice']>600000)))

drop_index = df_train.index[outlier_mask]

In [None]:
drop_index

## 3 - Preprocessing step : encodage, normalization, feature_engineering ...

In [None]:
df_cat.head()

In [None]:
for col in df_cat:
    print(f'{col :-<50} {df[col].unique()}')

In [None]:
def encodage(df):
    """Integer-encode every column of `df` with pandas category codes.

    Each column is cast to the pandas ``category`` dtype and replaced by its
    integer codes; missing values get the code -1. This is an ordinal (label)
    encoding, not a one-hot encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all to be encoded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns holding the codes.
        `df` itself is left untouched.

    Notes
    -----
    The codes are derived from the values present in `df`, so two frames
    encoded by separate calls only share the same codes if they contain the
    same set of categories.
    """
    # Work on a copy: the caller usually passes a slice of a bigger frame, and
    # writing into it would either warn or silently alter the caller's data.
    encoded = df.copy()

    for col in encoded:
        encoded[col] = encoded[col].astype('category').cat.codes

    return encoded

In [None]:
def normalisation(df):
    """Scale the numeric features with RobustScaler and log-transform the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns only; must contain 'Id' and 'SalePrice'.

    Returns
    -------
    pandas.DataFrame
        The robust-scaled feature columns followed by 'SalePrice', which is
        replaced by ``log1p(SalePrice)`` and is not scaled. 'Id' is dropped.
        The result keeps the row index of `df`, so it can be joined back to
        other frames built from the same rows.

    Raises
    ------
    KeyError
        If 'Id' or 'SalePrice' is missing from `df`.

    Notes
    -----
    The scaler is fitted on the frame passed in, so two frames normalised by
    separate calls are each scaled with their own statistics.
    """
    # 'Id' and 'SalePrice' are kept out of the scaling: the first is dropped and
    # the second is only log-transformed. RobustScaler works column by column,
    # so leaving them out does not change the scaled features.
    features = df.drop(['Id','SalePrice'], axis=1)

    #Init our Scaler
    #scaler = StandardScaler()
    scaler = RobustScaler()

    #FitTransform our data
    scaled_values = scaler.fit_transform(features)

    df_norm_final = pd.DataFrame(scaled_values, columns=features.columns, index=df.index)

    # Target on a log scale; assigned by position so that the row order of `df` is kept
    df_norm_final['SalePrice'] = np.log1p(df['SalePrice'].to_numpy())

    # Reindexing: fixed output column order with the target last. Columns that are
    # not in this list are dropped, and listed columns missing from `df` come out
    # as all-NaN columns.
    df_norm_final = df_norm_final.reindex(columns=['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'])

    return df_norm_final

In [None]:
def imputation(df):
    """Replace the missing values of every column by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and the same row index as `df`, in
        which each NaN is replaced by the mean of its column. No row is removed.

    Notes
    -----
    NOTE(review): with its default settings SimpleImputer discards columns that
    are entirely NaN, in which case rebuilding the frame with the original
    column names fails — confirm no such column can reach this function.
    """
    # Init Imputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_values = imputer.fit_transform(df)

    # Keep the caller's row labels so the result stays aligned with other frames
    # built from the same rows.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [None]:
def feature_engineering(df):
    """Add three aggregated surface features.

    New columns, appended in this order:

    - 'TotalSF'      = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    - 'TotalAreaExt' = GrLivArea + GarageArea
    - 'TotalAreaInt' = GrLivArea + TotalBsmtSF

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea' and
        'GarageArea'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; `df` itself is not modified, so
        the function can be re-run safely on the same input.
    """
    return df.assign(
        TotalSF=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'],
        TotalAreaExt=df['GrLivArea'] + df['GarageArea'],
        TotalAreaInt=df['GrLivArea'] + df['TotalBsmtSF'],
    )

In [None]:
def preprocessing(df, missing_threshold=90, useless_cols=('Utilities','Street')):
    """Run the whole preprocessing pipeline and split features from target.

    Steps: drop the mostly-empty columns and the columns listed in
    `useless_cols`, integer-encode the categorical columns, scale the numeric
    ones (the target is log-transformed), impute the missing values by the
    column mean and add the engineered features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing 'Id', 'SalePrice' and the feature columns.
    missing_threshold : float, default 90
        Columns whose percentage of missing values is greater than or equal to
        this value are dropped.
    useless_cols : sequence of str, default ('Utilities', 'Street')
        Columns dropped because they are judged useless for the model.

    Returns
    -------
    X : pandas.DataFrame
        The preprocessed features, indexed 0..n-1 in the row order of `df`.
    y : pandas.Series
        ``log1p(SalePrice)`` for the same rows.

    Notes
    -----
    Encoding, scaling and imputation are all fitted on the frame passed in, so
    a train and a test frame processed by two calls are transformed
    independently of each other.
    Outliers are not removed here; the notebook drops them from X_train and
    y_train after this function has run.
    """
    # Use a clean 0..n-1 index so that the encoded and the normalised parts are
    # joined row by row, whatever index `df` arrived with.
    df = df.reset_index(drop=True)

    # Drop the columns with at least `missing_threshold` % of missing values
    missing_pct = df.isnull().mean() * 100
    df = df.loc[:, missing_pct < missing_threshold]
    df = df.drop(list(useless_cols), axis = 1) # Useless feature to your model

    # Split the dataset into object (categorical) and non-object (numerical)
    # columns; copies keep the helpers from writing into `df`.
    df_cat = df.select_dtypes(include='object').copy()
    df_num = df.select_dtypes(exclude='object').copy()

    print(f'Categorical shape : {df_cat.shape}')
    print(f'Numerical shape : {df_num.shape}')

    # Preprocessing
    df_encode = encodage(df_cat)
    df_normalize = normalisation(df_num)

    # Join Dataset
    df = df_encode.join(df_normalize)

    df = imputation(df)
    print(f'After Imputation shape : {df.shape}, So : {round((df_num.shape[0]-df.shape[0])/df_num.shape[0] * 100,2)} % of rows deleted')

    # Feature engineering
    df = feature_engineering(df)

    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']

    return X, y

In [None]:
# Preprocessing of our train set
X_train, y_train = preprocessing(df_train)

In [None]:
# Preprocessing of our test set
X_test, y_test = preprocessing(df_test_full)

In [None]:
drop_index

In [None]:
# We will drop our outliers detected early
X_train = X_train.drop(drop_index)
y_train = y_train.drop(drop_index)

## PRE - MODELING

In [None]:
# First basic model 
model_1 = make_pipeline(LinearRegression())

In [None]:
# Evaluation Modeling
def evaluation(model,name_model):
    """Fit `model`, print its scores and plot its learning curve.

    The function reads the notebook-level X_train, y_train, X_test and y_test.
    It prints the score of the fitted model on the training set, the mean
    training / validation RMSE over the learning curve, and the MAE, MSE and
    RMSE of the predictions on the test set, then draws the learning curve.

    Parameters
    ----------
    model : estimator
        Object exposing fit / score / predict, usable by learning_curve.
    name_model : str
        Name printed in the report and used as the plot title.
    """
    # Fit on the whole training set and predict the test set
    model.fit(X_train, y_train)
    r2_train = model.score(X_train,y_train)
    predictions = model.predict(X_test)

    # 5-fold learning curve on 10 training sizes, scored with the negated RMSE
    sizes, train_curve, val_curve = learning_curve(
        model, X_train, y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
        train_sizes=np.linspace(0.1, 1, 10))

    # Average over the folds; these (negative) curves are the ones that get plotted
    train_curve_mean = train_curve.mean(axis=1)
    val_curve_mean = val_curve.mean(axis=1)

    # Sign flipped back so that the printed values are plain RMSE
    train_rmse_by_size = -train_curve_mean
    val_rmse_by_size = -val_curve_mean

    test_mse = mean_squared_error(y_test, predictions)

    print(f'Model :{name_model}')
    print(f'Score R2 : {r2_train}')
    print('Mean training scores : ', train_rmse_by_size.mean())
    print('Mean Validation scores : ', val_rmse_by_size.mean())
    print('MAE:', mean_absolute_error(y_test, predictions))
    print('MSE:', test_mse)
    print('RMSE:', np.sqrt(test_mse))

    print('\n','-' * 20) # separator

    # Learning curve
    plt.figure(figsize=(12, 8))
    plt.plot(sizes, train_curve_mean, label='train score')
    plt.plot(sizes, val_curve_mean, label='validation score')
    plt.xlabel('Number of train size')
    plt.ylabel('neg_root_mean_squared_error')
    plt.title(name_model)
    plt.legend()

In [None]:
# Evaluate the baseline linear model (prints the metrics, plots the learning curve).
evaluation(model_1,'LinearModel')

### TEST MORE MODELS

In [None]:
# Init preprocessor: univariate feature selection keeping the 46 features with
# the best f_regression scores. This step is placed in front of every model below.
preprocessor = make_pipeline(SelectKBest(f_regression, k=46))

In [None]:
def _with_selection(regressor):
    """Return a pipeline chaining the shared `preprocessor` step with `regressor`."""
    # Every pipeline receives the very same `preprocessor` object (not a copy)
    return make_pipeline(preprocessor, regressor)


# Linear models
Elastic = _with_selection(ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=0))
Lasso_model = _with_selection(Lasso(alpha=0.0005, random_state=0))
Ridge_model = _with_selection(Ridge(random_state=0))

# Kernel model
SVR_model = _with_selection(SVR())

# Tree ensembles
RandomForest = _with_selection(RandomForestRegressor(random_state=0))
Adaboost = _with_selection(AdaBoostRegressor(random_state=0))
XGboost = _with_selection(XGBRegressor())
GradientBoosting = _with_selection(GradientBoostingRegressor(random_state=0))

In [None]:
# Candidate models keyed by the label shown in the printed metrics and the plots.
# The insertion order is the order in which the models are evaluated.
dict_of_models = {
    'Elastic': Elastic,
    'Lasso_model': Lasso_model,
    'Ridge_model': Ridge_model,
    'SVR_model': SVR_model,
    'RandomForest': RandomForest,  # label spelling fixed (was 'RamdomForest')
    'Adaboost': Adaboost,
    'XGboost': XGboost,
    'GradientBoosting': GradientBoosting,
}

In [None]:
# Fit every candidate model in turn, print its metrics and plot its learning curve.
for name, model in dict_of_models.items():
    evaluation(model, name)

### Conclusion : Gradient Boosting looks like the best model at first glance, but Elastic, Lasso and Ridge also look good when
### we look at the evolution of the validation curve.
### We will compare these models in more detail with visualisations.

In [None]:
# Cross-validated comparison of every candidate model

# Two dictionaries store the results of R-Squared and RMSE:
# the per-fold scores of each model, plus their mean and standard deviation
r_2_results = {'R-Squared':{},'Mean':{},'std':{}}
rmse_results = {'RMSE':{},'Mean':{},'std':{}}

n_folds = 5
# Pass the KFold object itself as `cv`. Calling `.get_n_splits(X_train)` on it
# only returns the integer number of folds, which makes scikit-learn fall back
# to unshuffled folds and silently ignore shuffle / random_state.
kfold = KFold(n_folds, shuffle=True, random_state=0)

for name, model in dict_of_models.items():
    # A single cross-validation pass yields both metrics on identical folds
    cv_scores = cross_validate(model, X_train, y_train, cv=kfold,
                               scoring=('r2', 'neg_mean_squared_error'))
    r_2 = cv_scores['test_r2']                                   #R-Squared
    rms = np.sqrt(-cv_scores['test_neg_mean_squared_error'])     #RMSE

    #save the R-Squared results
    r_2_results['R-Squared'][name] = r_2
    r_2_results['Mean'][name] = r_2.mean()
    r_2_results['std'][name] = r_2.std()

    #save the RMSE results
    rmse_results['RMSE'][name] = rms
    rmse_results['Mean'][name] = rms.mean()
    rmse_results['std'][name] = rms.std()

#### Visualization

R-Squared

In [None]:
# Summary table of the cross-validated R-Squared: one row per model,
# columns Max / Mean / Min / std, sorted from best to worst mean

r2_folds = r_2_results['R-Squared']   # per-fold scores keyed by model name

r_2_cv_results = pd.DataFrame(
    {
        'Max': [fold_scores.max() for fold_scores in r2_folds.values()],
        'Mean': list(r_2_results['Mean'].values()),
        'Min': [fold_scores.min() for fold_scores in r2_folds.values()],
        'std': list(r_2_results['std'].values()),
    },
    index=list(r2_folds),
).sort_values(by='Mean', ascending=False)

# Grouped bar chart of the four statistics
r_2_cv_results.iplot(
    kind='bar',
    title='Max, Min, Mean, and standard deviation <br>For R-Squared values for each model',
)

In [None]:
# Box plot of the spread of the per-fold R-Squared values.
# Each column of `scores` holds the fold scores of one model, so there is one box per model.

scores = pd.DataFrame(r_2_results['R-Squared'])
scores.iplot(kind='box',
             title='Box plot for the variation of R-Squared for each model')

RMSE

In [None]:
# Summary table of the cross-validated RMSE: one row per model

rmse_cv_results = pd.DataFrame(index=rmse_results['RMSE'].keys())

# worst (largest) fold RMSE of each model
rmse_cv_results['Max'] = [rmse_results['RMSE'][m].max() for m in rmse_results['RMSE'].keys()]
# mean RMSE over the folds of each model
rmse_cv_results['Mean'] = [rmse_results['Mean'][m] for m in rmse_results['Mean'].keys()]
# best (smallest) fold RMSE of each model
rmse_cv_results['Min'] = [rmse_results['RMSE'][m].min() for m in rmse_results['RMSE'].keys()]
# standard deviation of the fold RMSE of each model
rmse_cv_results['std'] = [rmse_results['std'][m] for m in rmse_results['std'].keys()]

# Lower RMSE is better, hence the ascending sort
rmse_cv_results = rmse_cv_results.sort_values(by='Mean',ascending=True)
rmse_cv_results.iplot(kind='bar',
                 title='Maximum, Minimum, Mean values and standard deviation <br>For RMSE values for each model')

In [None]:
# Box plot of the spread of the per-fold RMSE values.
# Each column of `scores` holds the fold RMSE of one model; this rebinds the
# `scores` name already used for the R-Squared box plot.

scores = pd.DataFrame(rmse_results['RMSE'])
scores.iplot(kind='box',
             title='Box plot for the variation of RMSE values for each model')

### Conclusion : 
When we look at the differences between the models, we can draw some conclusions:
- RandomForest reaches the highest score but still shows a sizeable RMSE --> GradientBoosting is better
- The standard deviation of SVR is too large to continue with it
- Elastic, Lasso and Ridge do not reach the highest score but look good overall



## Focus on Elastic model : try to optimize it

In [None]:
# Inspect the hyper-parameter names of the Elastic pipeline; the grid search
# below addresses the ElasticNet step through its `elasticnet__` prefix.
Elastic.get_params()

In [None]:
# Search space for the ElasticNet step of the pipeline (4 x 8 x 10 = 320 combinations).
# np.arange(0.0, 1.0, 0.1) yields ten l1_ratio values from 0.0 up to about 0.9;
# the end point 1.0 is excluded.
parametersGrid = {"elasticnet__max_iter": [1, 5, 10, 100],
                  "elasticnet__alpha": [0.0005, 0.005, 0.001, 0.01, 0.1, 1, 10, 100],
                  "elasticnet__l1_ratio": np.arange(0.0, 1.0, 0.1)}

In [None]:
# 10-fold splitter without shuffling. This rebinds the notebook-level `kfold`,
# which the stacking model further down also receives as its `cv` argument.
kfold = KFold(n_splits=10)

# Exhaustive search over parametersGrid, scored with the negated RMSE
Elastic_grid = GridSearchCV(Elastic, parametersGrid, scoring='neg_root_mean_squared_error', cv=kfold)

Elastic_grid.fit(X_train, y_train)

# Best parameter combination found by the search
print(Elastic_grid.best_params_)

In [None]:
# Evaluate the best pipeline found by the grid search.
# evaluation() fits the estimator it receives again on (X_train, y_train).
evaluation(Elastic_grid.best_estimator_,'Elastic')

In [None]:
# Results of the tuned Elastic pipeline on the training data

# estimator.score() on the training set, expressed as a percentage
Elastic_score = round(Elastic_grid.best_estimator_.score(X_train, y_train)*100, 3)
predic = Elastic_grid.best_estimator_.predict(X_train)
# mean_squared_error already returns a single scalar for a 1-D target, so no
# extra .mean() is applied (it would fail on a plain Python float)
# NOTE(review): the RMSE is multiplied by 100 and printed with a '%' sign, but
# it is expressed in the units of y_train, not as a true percentage.
Elastic_rmse = round(np.sqrt(mean_squared_error(y_train, predic))*100, 3)
print(' _'*15)
print('\nElastic Results for training set : \n')
print(f'Score : {Elastic_score}%')
print(f'RMSE  : {Elastic_rmse}%')
print(' _'*15)

### We will try to stack the models :

In [None]:
# First-level regressors of the stack: a kernel ridge model, two sparse linear
# models and a gradient-boosting model. The last three are wrapped in
# single-step pipelines.
base_models = (KernelRidge(),
               make_pipeline(Lasso(alpha=0.0005, random_state=0)),
               make_pipeline(ElasticNet(alpha=0.0005, l1_ratio=0.9)),             
               make_pipeline(GradientBoostingRegressor(learning_rate=0.005, 
                                                                        loss='huber',
                                                                        max_depth=4, 
                                                                        max_features='sqrt',
                                                                        min_samples_leaf=15,
                                                                        min_samples_split=10,
                                                                        n_estimators=3000,
                                                                        random_state=0)))

# Second-level (meta) regressor of the stack: a LightGBM regressor with
# hand-picked hyper-parameters.
meta_model = LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, 
                           feature_fraction=0.2319, feature_fraction_seed=9,
                           learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
                           min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
                           bagging_seed=9,objective='regression')

In [None]:
# Building the stacking model: base_models are the first-level regressors and
# meta_model is the regressor trained on top of them.
# NOTE(review): `cv=kfold` uses whichever `kfold` was assigned last in the
# notebook (the 10-fold splitter of the grid-search cell on a top-to-bottom
# run) - confirm this is the intended splitter.

stack = StackingCVRegressor(regressors=base_models,
                            meta_regressor=meta_model, 
                            use_features_in_secondary=True,
                            store_train_meta_features=True,
                            shuffle=False,cv=kfold,
                            random_state=0)

In [None]:
# Fit the stacking model on the training data.
stack.fit(X_train,y_train)

In [None]:
# Results of the stacking model on the training data

# estimator.score() on the training set, expressed as a percentage
stack_score = round(stack.score(X_train, y_train)*100, 3)
predictions = stack.predict(X_train)
# mean_squared_error already returns a single scalar for a 1-D target, so no
# extra .mean() is applied (it would fail on a plain Python float)
# NOTE(review): the RMSE is multiplied by 100 and printed with a '%' sign, but
# it is expressed in the units of y_train, not as a true percentage.
stack_rmse = round(np.sqrt(mean_squared_error(y_train, predictions))*100, 3)
print(' _'*15)
print('\nStacking Results for training set : \n')
print(f'Score : {stack_score}%')
print(f'RMSE  : {stack_rmse}%')
print(' _'*15)

## CONCLUSION : Stacking improves our score considerably!

In [None]:
# Predictions for the submission.
# np.expm1 is the inverse of np.log1p.
# NOTE(review): this assumes the target was log1p-transformed earlier in the notebook - confirm.

y_stacking = np.expm1(stack.predict(X_test)) #using expm1 (The inverse of log1p)

In [None]:
# Build the submission table: one row per test Id with its predicted SalePrice.
# NOTE(review): assumes y_stacking has exactly one prediction per row of
# df_test, in the same order - verify that preprocessing removed no test rows.
submission = pd.DataFrame({
        "Id": df_test.Id,
        "SalePrice": y_stacking
    })

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
submission.to_csv('submission_Stacking.csv', index=False)