In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import  matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every column when a DataFrame is displayed (None removes the column cap).
# Call set_option on the public `pd` namespace rather than via `pd.pandas`.
pd.set_option('display.max_columns', None)

import warnings

### Understanding the data

In [None]:
# Both competition CSVs are read from the same input folder.
DATA_DIR = '../input/house-prices-advanced-regression-techniques'

df_train = pd.read_csv(DATA_DIR + '/train.csv')
print('Training set shape:', df_train.shape)

df_test = pd.read_csv(DATA_DIR + '/test.csv')
print('Testing set shape:', df_test.shape)

In [None]:
df_train.head()

In [None]:
def df_characteristics(df):
    """Print the shape of `df` and its numeric / non-numeric column counts.

    Columns are split with `select_dtypes` on `np.number`; every column that
    is not numeric is reported as categorical. Nothing is returned.
    """
    print(f'Shape of the dataset: {df.shape}')

    n_numeric = len(df.select_dtypes(include=[np.number]).columns)
    print(f'Number of Numerical Features: {n_numeric}')

    n_other = len(df.select_dtypes(exclude=[np.number]).columns)
    print(f'Number of Categorical Features: {n_other}')

In [None]:
df_characteristics(df_train)

In [None]:
df_characteristics(df_test)

List of Numerical Features:

In [None]:
# Numeric columns of the training set: the sub-frame and its column names.
df_numerical = df_train.select_dtypes(include='number')
numerical_features = df_numerical.columns
numerical_features

List of Categorical Features

In [None]:
# Non-numeric columns of the training set: the sub-frame and its column names.
df_categorical = df_train.select_dtypes(exclude='number')
categorical_features = df_categorical.columns
categorical_features

## Exploratory Data Analysis

### Missing Data

Define a function to find out the percentage of missing values in respective features in both training and test sets.

In [None]:
def check_null(df):
    """Return the percentage of missing values for each column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Percentage (0-100) of null entries per column, restricted to columns
        whose percentage is non-zero and sorted from most to least missing.
        The Series is empty when `df` has no nulls; a short notice is printed
        in that case.
    """
    null_percent = (df.isnull().sum() / len(df)) * 100

    # Keep only the columns that actually contain nulls. Filtering never raises
    # when nothing matches, so the former try/except could not detect the
    # "no nulls" case; it is tested for explicitly below instead.
    null_percent = null_percent[null_percent != 0].sort_values(ascending=False)

    if null_percent.empty:
        print('There is No null values in the dataset')

    return null_percent

Representation of missing values in percentage

In [None]:
# Percentage of missing values per column, computed separately for each set.
train_nan = check_null(df_train)
test_nan = check_null(df_test)

# Combine the two results into one table with a column per set, then order
# the rows by the training-set percentage, highest first.
nan = pd.DataFrame({'Train(%)': train_nan, 'Test(%)': test_nan})
nan.sort_values(by='Train(%)', ascending=False)

### Handling the missing data

#### 1. Dropping the irrelevant features
`PoolQC`, `MiscFeature`, `Alley` and `Fence` are almost entirely null (above 90%), so we drop them. `Id` is dropped along with them.

In [None]:
# Columns removed from both sets, in place.
columns_drop = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence']

for frame in (df_train, df_test):
    frame.drop(columns=columns_drop, inplace=True)

#### 2. FireplaceQu : Fireplace Quality


Ex  - Excellent  
Gd  - Good  
TA  - Average  
Fa  - Fair  
Po  - Poor  
NA  - No Fireplace

In [None]:
df_train.FireplaceQu.value_counts()

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='FireplaceQu', data=df_train)

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='FireplaceQu')

As this feature is directly related to our target column, it would be better to fill its missing values with **0**.

In [None]:
# Missing FireplaceQu entries are replaced with 0 in both sets.
for frame in (df_train, df_test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna(0)

sns.boxplot(x='SalePrice', y='FireplaceQu', data=df_train)

#### 3. LotFrontage: Linear feet of street connected to property

In [None]:
sns.distplot(df_train.LotFrontage)

In [None]:
def LotFrontage_Stats(df):
    """Print the mean and median of the `LotFrontage` column of `df`.

    Nothing is returned; the two statistics are written to stdout on one line.
    """
    lot_frontage = df['LotFrontage']
    print('Mean: {}, Median: {}'.format(lot_frontage.mean(),
                                        lot_frontage.median()))
    
LotFrontage_Stats(df_train)  # training set
LotFrontage_Stats(df_test)  # testing set

In [None]:
sns.regplot(data=df_train, x='SalePrice', y='LotFrontage')

The difference between the mean and the median is very small for both the training and test sets, so let's fill the missing values with the median.

In [None]:
# Each set is filled with its own LotFrontage median.
for frame in (df_train, df_test):
    own_median = frame['LotFrontage'].median()
    frame['LotFrontage'] = frame['LotFrontage'].fillna(own_median)

#### 4. GarageQual: Garage Quality

In [None]:
df_train['GarageQual'].value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='GarageQual')

In [None]:
sns.distplot(df_train[df_train['GarageQual'] == 'TA'].SalePrice)

In [None]:
sns.distplot(df_train[df_train['GarageQual'] == 'Fa'].SalePrice)

Dropping this feature will not affect our model badly. So let's drop it.

In [None]:
# Remove GarageQual from both sets, in place.
for frame in (df_train, df_test):
    frame.drop(columns='GarageQual', inplace=True)

#### 5. GarageFinish: Interior finish of the garage

Fin - Finished  
RFn - Rough Finished  
Unf - Unfinished  
Nog  - No Garage

In [None]:
df_train.GarageFinish.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='GarageFinish')

This feature is good as the value difference in the categories is not too big. Let's create a new category for NaN values called `Nog`

In [None]:
# Missing GarageFinish entries become the new 'Nog' category in both sets.
for frame in (df_train, df_test):
    frame['GarageFinish'] = frame['GarageFinish'].fillna('Nog')

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='GarageFinish')

#### 6. GarageCond : Garage Condition

In [None]:
df_train.GarageCond.value_counts()

Here too we will replace NaN values with `Nog`, i.e. no garage, and see the results.

In [None]:
# Only the training set is filled here; the result is inspected with a boxplot.
garage_cond_filled = df_train['GarageCond'].fillna('Nog')
df_train['GarageCond'] = garage_cond_filled

sns.boxplot(x='SalePrice', y='GarageCond', data=df_train)

I don't think this feature is very helpful to our model, so let's drop it.

In [None]:
# Remove GarageCond from both sets, in place.
for frame in (df_train, df_test):
    frame.drop(columns='GarageCond', inplace=True)

#### 7. GarageYrBlt: Year garage was built

In [None]:
sns.distplot(df_train.GarageYrBlt)

In [None]:
# Range of the garage construction year in the training set.
garage_years = df_train['GarageYrBlt']
print('Maximum value: {}'.format(garage_years.max()))
print('Minimum value: {}'.format(garage_years.min()))

In [None]:
sns.regplot(data=df_train, x='SalePrice', y='GarageYrBlt')

Let's fill it with the minimum value.

In [None]:
# Each set is filled with its own minimum GarageYrBlt.
for frame in (df_train, df_test):
    earliest_year = frame['GarageYrBlt'].min()
    frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(earliest_year)

#### 8. GarageType: Garage Location

In [None]:
df_train.GarageType.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='GarageType')

Here too we will replace NaN values with `Nog`, i.e. no garage, and see the results.

In [None]:
# Missing GarageType entries become the 'Nog' category in both sets.
for frame in (df_train, df_test):
    frame['GarageType'] = frame['GarageType'].fillna('Nog')

sns.boxplot(x='SalePrice', y='GarageType', data=df_train)

#### 9. BsmtQual : Evaluates the height of basement

In [None]:
df_train.BsmtQual.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='BsmtQual')

Create a new category `NoBsmt` for NaN values.

In [None]:
# Missing BsmtQual entries become the 'NoBsmt' category in both sets.
for frame in (df_train, df_test):
    frame['BsmtQual'] = frame['BsmtQual'].fillna('NoBsmt')

sns.boxplot(x='SalePrice', y='BsmtQual', data=df_train)

#### 10. BsmtCond : Evaluates the general condition of the basement

In [None]:
df_train.BsmtCond.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='BsmtCond')

This feature won't help much to our model. So, let's drop it.

In [None]:
# Remove BsmtCond from both sets, in place.
for frame in (df_train, df_test):
    frame.drop(columns='BsmtCond', inplace=True)

#### 11. BsmtExposure : Refers to walkout or garden level walls

   Gd -  Good Exposure  
   Av -  Average Exposure   
   Mn -  Minimum Exposure  
   No -  No Exposure  
   NA -  No Basement  

In [None]:
df_train.BsmtExposure.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='BsmtExposure')

Creating a new category `NoBsmt` for NaN values.

In [None]:
# Missing BsmtExposure entries become the 'NoBsmt' category in both sets.
for frame in (df_train, df_test):
    frame['BsmtExposure'] = frame['BsmtExposure'].fillna('NoBsmt')

sns.boxplot(x='SalePrice', y='BsmtExposure', data=df_train)

#### 12.BsmtFinType1: Rating of basement finished area

   GLQ - Good Living Quarters  
   ALQ - Average Living Quarters  
   BLQ - Below Average Living Quarters     
   Rec - Average Rec Room  
   LwQ - Low Quality  
   Unf - Unfinished  
   NA  - No Basement  

In [None]:
df_train.BsmtFinType1.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='BsmtFinType1')

Creating a new category `NoBsmt` for NaN values.

In [None]:
# Missing BsmtFinType1 entries become the 'NoBsmt' category in both sets.
for frame in (df_train, df_test):
    frame['BsmtFinType1'] = frame['BsmtFinType1'].fillna('NoBsmt')

sns.boxplot(x='SalePrice', y='BsmtFinType1', data=df_train)

#### 13.BsmtFinType2: Rating of basement finished area (if multiple types)

In [None]:
df_train.BsmtFinType2.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='BsmtFinType2')

Dropping this one would be okay because it is not going to help our model

In [None]:
# Remove BsmtFinType2 from both sets, in place.
for frame in (df_train, df_test):
    frame.drop(columns='BsmtFinType2', inplace=True)

#### 14. MasVnrType : Masonry Veneer Type

   BrkCmn  - Brick Common  
   BrkFace - Brick Face  
   CBlock  - Cinder Block  
   Stone   - Stone  
   None - None  
   

In [None]:
df_train.MasVnrType.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='MasVnrType')

Fill the missing values with `None`

In [None]:
# Missing MasVnrType entries are filled with the string 'None' in both sets.
for frame in (df_train, df_test):
    frame['MasVnrType'] = frame['MasVnrType'].fillna('None')

sns.boxplot(x='SalePrice', y='MasVnrType', data=df_train)

#### 15. MasVnrArea : Masonry Veneer Area in square feet

In [None]:
sns.distplot(df_train.MasVnrArea)

In [None]:
# Range of the masonry veneer area in the training set.
veneer_area = df_train['MasVnrArea']
print('Maximum value: {}'.format(veneer_area.max()))
print('Minimum value: {}'.format(veneer_area.min()))

Let's fill the missing values with the minimum.

In [None]:
# Each set is filled with its own minimum MasVnrArea.
for frame in (df_train, df_test):
    smallest_area = frame['MasVnrArea'].min()
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(smallest_area)

#### 16. Electrical

SBrkr  -  Standard Circuit Breakers & Romex  
FuseA  -  Fuse Box over 60 AMP and all Romex wiring (Average)   
FuseF  -  60 AMP Fuse Box and mostly Romex wiring (Fair)  
FuseP  -  60 AMP Fuse Box and mostly knob & tube wiring (poor)  
Mix - Mixed  

In [None]:
df_train.Electrical.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='Electrical')

We will fill the missing values with the most frequent category, `SBrkr`. This is done in the training set only, as there are no missing values for this feature in the test set.


In [None]:
# Only the training set is filled; 'SBrkr' is the replacement category.
electrical_filled = df_train['Electrical'].fillna('SBrkr')
df_train['Electrical'] = electrical_filled

sns.boxplot(x='SalePrice', y='Electrical', data=df_train)

In [None]:
check_null(df_train)

Here we have finished replacing the missing values in the features that had them in both the training and test sets. There are now no missing values left in the training set, but there are still a few in the test set, which we will replace now.

#### 17. MSZoning : Identifies the general zoning classification of the sale.  

   A  -  Agriculture  
   C  -  Commercial  
   FV -  Floating Village Residential  
   I  -  Industrial  
   RH -  Residential High Density  
   RL -  Residential Low Density  
   RP -  Residential Low Density Park   
   RM -  Residential Medium Density  

In [None]:
df_test.MSZoning.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='MSZoning')

Filling the missing values with the most frequent one in test set only as there is no missing value for this feature in training set. ie.,`RL`

In [None]:
df_test['MSZoning'] = df_test['MSZoning'].fillna('RL')

#### 18. Functional : Home Functionality (Assume typical unless deductions are warranted)

   Typ - Typical Functionality  
   Min1 - Minor Deductions 1  
   Min2 - Minor Deductions 2  
   Mod - Moderate Deductions  
   Maj1 - Major Deductions 1  
   Maj2 - Major Deductions 2  
   Sev - Severely Damaged  
   Sal - Salvage only

In [None]:
df_test.Functional.value_counts() 

In [None]:
df_train.Functional.value_counts()

Fill the missing value with most frequent value, which is `Typ`

In [None]:
df_test['Functional'] = df_test['Functional'].fillna('Typ')

#### 19.BsmtFullBath : Basement full bathrooms

In [None]:
df_test.BsmtFullBath.value_counts()

In [None]:
df_train.BsmtFullBath.value_counts()

Fill the missing value with most frequent value, which is `0`

In [None]:
# BsmtFullBath is a numeric count, so fill with the number 0. The string '0'
# would turn the column into object dtype, and the later
# select_dtypes(exclude=[np.number]) / get_dummies step would then one-hot
# encode it as if it were categorical.
df_test['BsmtFullBath'] = df_test['BsmtFullBath'].fillna(0)

#### 20. BsmtHalfBath : Basement half bathrooms

In [None]:
df_test.BsmtHalfBath.value_counts()

In [None]:
df_train.BsmtHalfBath.value_counts()

Fill the missing value with most frequent value, which is `0`

In [None]:
# BsmtHalfBath is a numeric count, so fill with the number 0. The string '0'
# would turn the column into object dtype, and the later
# select_dtypes(exclude=[np.number]) / get_dummies step would then one-hot
# encode it as if it were categorical.
df_test['BsmtHalfBath'] = df_test['BsmtHalfBath'].fillna(0)

#### 21. Utilities: Type of utilities available

   AllPub -  All public Utilities (E,G,W,& S)    
   NoSewr -  Electricity, Gas, and Water (Septic Tank)  
   NoSeWa -  Electricity and Gas Only  
   ELO - Electricity only     

In [None]:
df_test.Utilities.value_counts()

In [None]:
df_train.Utilities.value_counts()

Dropping this feature as it won't help our model.

In [None]:
# Remove Utilities from both sets, in place.
for frame in (df_train, df_test):
    frame.drop(columns='Utilities', inplace=True)

#### 22. SaleType: Type of sale

   WD -  Warranty Deed - Conventional  
   CWD - Warranty Deed - Cash  
   VWD - Warranty Deed - VA Loan  
   New - Home just constructed and sold  
   COD - Court Officer Deed/Estate  
   Con - Contract 15% Down payment regular terms  
   ConLw  -  Contract Low Down payment and low interest  
   ConLI  -  Contract Low Interest  
   ConLD  -  Contract Low Down  
   Oth - Other  

In [None]:
df_test.SaleType.value_counts()

In [None]:
df_train.SaleType.value_counts()

In [None]:
sns.boxplot(data=df_train, x='SalePrice', y='SaleType')

Replacing the missing value with the most frequent value, that is `WD`

In [None]:
df_test['SaleType'] = df_test['SaleType'].fillna('WD')

#### 23. GarageArea: Size of garage in square feet

In [None]:
sns.distplot(df_test.GarageArea)

Fill the missing value with minimum value

In [None]:
df_test['GarageArea'] = df_test['GarageArea'].fillna(df_test.GarageArea.min())

#### 24.GarageCars : Size of garage in car capacity

In [None]:
sns.distplot(df_test.GarageCars)

Fill the missing value with minimum value

In [None]:
df_test['GarageCars'] = df_test['GarageCars'].fillna(df_test.GarageCars.min())

#### 25. KitchenQual: Kitchen quality

In [None]:
df_test.KitchenQual.value_counts()

Replacing with the most frequent value, `TA`

In [None]:
df_test['KitchenQual'] = df_test['KitchenQual'].fillna('TA')

#### 26. TotalBsmtSF: Total square feet of basement area

In [None]:
sns.distplot(df_test.TotalBsmtSF)

Filling the missing value with minimum value

In [None]:
df_test['TotalBsmtSF'] = df_test['TotalBsmtSF'].fillna(df_test.TotalBsmtSF.min())

**27. BsmtUnfSF: Unfinished square feet of basement area**
* BsmtFinSF1: Type 1 finished square feet  
* BsmtFinSF2: Type 2 finished square feet

We will replace the missing values with minimum values in all three features.

In [None]:
# Fill each basement area column of the test set with that column's own minimum.
for column in ('BsmtUnfSF', 'BsmtFinSF1', 'BsmtFinSF2'):
    df_test[column] = df_test[column].fillna(df_test[column].min())

#### 28. Exterior1st: Exterior covering on house

   AsbShng - Asbestos Shingles  
   AsphShn - Asphalt Shingles  
   BrkComm - Brick Common  
   BrkFace - Brick Face  
   CBlock  - Cinder Block  
   CemntBd - Cement Board  
   HdBoard - Hard Board  
   ImStucc - Imitation Stucco  
   MetalSd - Metal Siding  
   Other   - Other  
   Plywood - Plywood  
   PreCast - PreCast   
   Stone   - Stone  
   Stucco  - Stucco  
   VinylSd - Vinyl Siding  
   Wd Sdng - Wood Siding  
   WdShing - Wood Shingles  

In [None]:
df_test.Exterior1st.value_counts()

In [None]:
df_test['Exterior1st'] = df_test['Exterior1st'].fillna('VinylSd')

#### 29. Exterior2nd: Exterior covering on house (if more than one material)

In [None]:
df_test.Exterior2nd.value_counts()

In [None]:
df_test['Exterior2nd'] = df_test['Exterior2nd'].fillna('VinylSd')

In [None]:
# Show the remaining null percentages of both sets side by side. Only the last
# expression of a cell is displayed, so a bare check_null(df_train) on its own
# line would have its result discarded.
pd.DataFrame({'Train(%)': check_null(df_train), 'Test(%)': check_null(df_test)})

So, we have now filled all the missing values.

### Temporal Variables (DateTime Variables)

In [None]:
# Numeric columns whose name mentions a year ('Yr' or 'Year').
year_features = [column for column in df_numerical.columns
                 if ('Yr' in column) or ('Year' in column)]
year_features

In [None]:
# List the distinct values of every year column in the training set.
for feature in year_features:
    distinct_values = df_train[feature].unique()
    print('\n', feature, '\n', distinct_values)

In [None]:
# Median sale price for each year of sale.
df_train.groupby('YrSold')['SalePrice'].median().plot()
plt.xlabel('Year Sold')
plt.ylabel('Median House Price')
plt.title('House Price vs YearSold')
plt.show()  # finish this figure so the first scatter below gets its own axes

# For every other year column, plot the number of years between that event and
# the sale against the sale price. The difference is kept in a local Series:
# writing it back into df_train would change the training columns only (df_test
# would keep raw years) and would make the cell give different results on re-run.
for feature in year_features:
    if feature == 'YrSold':
        continue

    years_before_sale = df_train['YrSold'] - df_train[feature]

    plt.scatter(years_before_sale, df_train['SalePrice'])
    plt.xlabel('YrSold - {}'.format(feature))
    plt.ylabel('SalePrice')
    plt.show()

## Feature Engineering

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
def concat_df(train, test):
    """Stack `train` on top of `test` and return the result with a fresh 0..n-1 index.

    Columns are aligned by name and sorted (`sort=True`); a column present in
    only one of the inputs is kept and holds NaN for the other input's rows.
    """
    stacked = pd.concat([train, test], sort=True)
    stacked = stacked.reset_index(drop=True)
    return stacked

In [None]:
df_all = concat_df(df_train, df_test)

In [None]:
df_all = df_all.drop(['SalePrice'], axis=1)

In [None]:
df_cat = df_all.select_dtypes(exclude = [np.number])

In [None]:
# One-hot encode the columns of df_cat (the non-numeric columns selected from
# the combined frame) and preview the first rows of the result.
df_cat_dummies = pd.get_dummies(df_cat)

df_cat_dummies.head()

In [None]:
# Shapes of the combined frame, its non-numeric part, and the encoded dummies.
for frame in (df_all, df_cat, df_cat_dummies):
    print(frame.shape)

In [None]:
df_all_features = df_all.join(df_cat_dummies)  # combined all features

In [None]:
df_all_features = df_all_features.drop(df_cat, axis=1)  # dropped original categorical features

In [None]:
df_all_features.head()

In [None]:
df_all_features.shape

## Model Training

In [None]:
def divide_df(df, n_train=1460):
    """Split a stacked train+test frame back into its two parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first `n_train` rows are the training rows, followed by
        the test rows.
    n_train : int, default 1460
        Number of leading rows that belong to the training part. The default
        keeps the split point that used to be hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train_part, test_part)`, taken as positional slices of `df`.
    """
    return df.iloc[:n_train], df.iloc[n_train:]

In [None]:
X_train, X_test = divide_df(df_all_features)

In [None]:
y_train = df_train['SalePrice']

In [None]:
# Report the shapes of the model inputs and the target.
for label, data in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test)):
    print(label + ' shape:', data.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score,mean_squared_error,make_scorer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor behind a StandardScaler, fitted on the training split.
# max_samples=None lets every tree draw as many bootstrap rows as X_train has.
# That is the same as the old hard-coded 1460 when X_train has 1460 rows, and
# it no longer fails if the number of training rows differs.
rf = make_pipeline(StandardScaler(),
                   RandomForestRegressor(max_samples=None,
                                         n_estimators=5000,
                                         min_samples_leaf=1,
                                         random_state=14))
rf.fit(X_train, y_train)

In [None]:
check_null(X_train)

In [None]:
check_null(X_test)

In [None]:
pred = rf.predict(X_test)

### To CSV

In [None]:
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred })

In [None]:
submission.to_csv('Submission.csv', index=False)

In [None]:
submission.head()