In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8' and
# later releases dropped the old 'seaborn' alias, so prefer the new name whenever
# the installed Matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/../input/house-prices-dataset'):
    for filename in filenames:
            print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import matplotlib
import matplotlib.cbook

# Blanket-suppress warnings so library notices do not clutter the cell outputs.
warnings.simplefilter('ignore')
# `matplotlib.cbook.mplDeprecation` was an alias that newer Matplotlib releases
# removed; `matplotlib.MatplotlibDeprecationWarning` is the public class to filter on.
warnings.filterwarnings("ignore",category=matplotlib.MatplotlibDeprecationWarning)

## Setting the evaluation functions

In [None]:
# Defining the evaluation metric: root-mean-squared error computed on log-transformed values
def metrics(actuals,predictions):
    """Return the root-mean-squared error between the logs of two value sets.

    Both `actuals` and `predictions` are passed through `np.log` before the
    mean squared error is taken, and the square root of that error is returned.
    """
    log_actuals = np.log(actuals)
    log_predictions = np.log(predictions)
    return np.sqrt(mean_squared_error(log_actuals, log_predictions))

from sklearn.metrics import make_scorer
rmse = make_scorer(metrics, greater_is_better=False)

In [None]:
# Load the training data and discard the 'Id' column in a single chained expression
x_train = pd.read_csv('../input/house-prices-dataset/train.csv').drop(columns=['Id'])
x_train.head()

## Splitting the training data into train,validation 

In [None]:
# Since I don't have a labelled dataset to test the model on unseen data, I will
# split the training data into train and validation sets so that I can check the
# performance of the model on data it was not trained on.
x_train,x_val,y_train,y_val=train_test_split(x_train,x_train['SalePrice'],
                                             test_size=0.3,random_state=34)

In [None]:
# removing the target variable from the validation dataset
x_val.drop(columns=['SalePrice'],axis=1,inplace=True)
x_val.head()

In [None]:
# Reading the test dataset
x_test=pd.read_csv('../input/house-prices-dataset/test.csv')
x_test.head()

In [None]:
# These are the respective dimensions of my datasets
x_train.shape,x_val.shape,x_test.shape

In [None]:
#Numeric_col means columns with numeric values
#Cat_col means columns with categorical values
numeric_col=[var for var in x_train.columns if x_train[var].dtypes!='O'and var!='SalePrice']
cat_col=[var for var in x_train.columns if x_train[var].dtypes=='O']

In [None]:
# Number of unique values for each numeric variable.
x_train[numeric_col].nunique()

In [None]:
# Let us find the discrete variables. The condition used here is that any
# numeric variable with fewer than 10 unique values is classified as discrete
discrete_cols=[var for var in numeric_col if x_train[var].nunique()<10]
discrete_cols

In [None]:
# Similarly for the continuous variables
continous_cols=[var for var in numeric_col if var not in discrete_cols]
continous_cols

In [None]:
# missing_num means numeric columns with missing value
# missing_cat means categorical columns with missing value
missing_num=[ var for var in numeric_col if x_train[var].isnull().mean()>0]
missing_cat=[ var for var in cat_col if x_train[var].isnull().mean()>0]

In [None]:
missing_num

In [None]:
missing_cat

## Missing Value Imputation

###  Replace missing value with median imputation.

1. Data is missing completely at random<br>
2. No more than 5% of the variable contains missing data

In [None]:
# Distribution of numeric columns having missing values
x_train[missing_num].hist(figsize=(10,6),bins=50);

In [None]:
# % of missing values in numeric columns
x_train[missing_num].isnull().mean()

In [None]:
def median_impute(var):
  """Fill missing values of column `var` with the training-set median.

  The median is computed on the module-level `x_train` only and then applied to
  `x_train`, `x_val` and `x_test`, which are modified and returned as a tuple.
  """
  fill_value = x_train[var].median()
  for frame in (x_train, x_val, x_test):
    frame[var] = frame[var].fillna(fill_value)
  return x_train, x_val, x_test

In [None]:
# Assuming all the MasVnrArea values are missing completely at random,
# imputing 'MasVnrArea' with the median value
x_train,x_val,x_test=median_impute('MasVnrArea')
x_train['MasVnrArea'].isnull().mean()

### Replace missing value with end tail imputation-
The rationale is that if the value is missing, it is for a reason, therefore, NA 
would not be replaced by the mean which makes them look like the majority 
of the observations. Instead, NA are flagged as different by assigning a value 
at the tail of the distribution, where observations are rarely represented in 
the population.

In [None]:
def end_sample(var,distance):
  """Return an upper-tail value for column `var` of the module-level `x_train`.

  The value is the 75th percentile plus `distance` times the inter-quartile range.
  """
  third_quartile = x_train[var].quantile(0.75)
  first_quartile = x_train[var].quantile(0.25)
  return third_quartile + distance * (third_quartile - first_quartile)

In [None]:
for var in ['GarageYrBlt','LotFrontage']:
    upper_value=end_sample(var,3)
    x_train[var]=x_train[var].fillna(upper_value)
    x_val[var]=x_val[var].fillna(upper_value)
    x_test[var]=x_test[var].fillna(upper_value)

In [None]:
x_train[missing_num].isnull().mean()

In [None]:
# After imputation the distribution of the variables are as follows
x_train[missing_num].hist(figsize=(10,6),bins=50);

## Add a Category for Missing Data
This next method is quite straightforward and only works for categorical data. You would create a separate label for missing values — ‘missing’ or it could be anything relevant. The idea is to flag missing values and understand the importance of being missing.<br>
Assumptions: No assumption<br>
Advantages: Quick and easy to implement; Helps understand importance of missing data<br>
Disadvantage: Potentially misunderstood data; Number of missing data should be large enough

In [None]:
x_train[missing_cat].isnull().mean()

In [None]:
# Now for Categorical Variable
for var in ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']:
    x_train[var]=x_train[var].fillna('Missing')
    x_val[var]=x_val[var].fillna('Missing')
    x_test[var]=x_test[var].fillna('Missing')

In [None]:
# Categorical columns whose missing values are filled with the training-set mode.
# The original list named 'GarageQual' twice; the second entry is assumed to have
# been meant as 'GarageCond' so that each garage column is listed exactly once.
columns_remaining=['MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
                   'BsmtFinType2','GarageType','GarageFinish','GarageQual','GarageCond']


In [None]:
for var in columns_remaining:
    # Series.mode() returns a Series (ties are possible). Passing that Series to
    # fillna aligns on the index, which leaves nearly every NaN untouched, so the
    # first modal value is extracted as a scalar before filling.
    modes=x_train[var].mode()
    if modes.empty:
        # No observed value in the training split, hence nothing to impute with.
        continue
    mode=modes.iloc[0]
    x_train[var]=x_train[var].fillna(mode)
    x_val[var]=x_val[var].fillna(mode)
    x_test[var]=x_test[var].fillna(mode)

In [None]:
x_train[columns_remaining].isnull().mean()

## Create group of discrete variables

In [None]:
x_train[discrete_cols].nunique().plot.bar()

In [None]:
x_train[discrete_cols].apply(lambda x:x.unique())

In [None]:
plt.figure(figsize=(25,5))
plt.subplot(1,6,1)
x_train.groupby('OverallCond')['SalePrice'].mean().plot(color='r')
plt.ylabel('SalePrice')
plt.subplot(1,6,2)
x_train.groupby('BsmtFullBath')['SalePrice'].mean().plot(label='BsmtFullBath',color='r')
x_train.groupby('BsmtHalfBath')['SalePrice'].mean().plot(label='BsmtHalfBath',color='k')
plt.legend(loc='upper left')
plt.subplot(1,6,3)
x_train.groupby('FullBath')['SalePrice'].mean().plot(label='FullBath',color='r')
x_train.groupby('HalfBath')['SalePrice'].mean().plot(label='HalfBath',color='k')
plt.legend()
plt.subplot(1,6,4)
x_train.groupby('BedroomAbvGr')['SalePrice'].mean().plot(label='BedroomAbvGr',color='r')
x_train.groupby('KitchenAbvGr')['SalePrice'].mean().plot(label='KitchenAbvGr',color='k')
plt.legend(loc='upper right')
plt.tight_layout()

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,4,1)
x_train.groupby('Fireplaces')['SalePrice'].mean().plot(color='r')
plt.ylabel('SalePrice')
plt.subplot(1,4,2)
x_train.groupby('GarageCars')['SalePrice'].mean().plot(color='r')
plt.subplot(1,4,3)
x_train.groupby('PoolArea')['SalePrice'].mean().plot(color='r')
plt.subplot(1,4,4)
x_train.groupby('YrSold')['SalePrice'].mean().plot(color='r')
plt.tight_layout()

We can see that most of the variables show no monotonic relationship with the target variable. In order to achieve monotonicity, we will first discretize the variables into custom groups and then encode them with respect to the target variable, i.e. SalePrice.

In [None]:
# Discretizing the discrete variable according to custom categories
#
# For every variable, `<name>` holds the bin labels and `<name>_int` holds the bin
# edges that are later passed to `pd.cut`; k labels therefore pair with k+1 edges.
# NOTE(review): `pd.cut` yields NaN for values outside the outermost edges
# (for example an OverallCond below 2 or above 8) — confirm the edges span the data.


OverallCond=['Poor','Average','Good']
OverallCond_int=[2,4,6,8]
BsmtFullBath=['0-1','1-2','2-3']
BsmtFullBath_int=[0,1,2,3]
FullBath=['0-1','1-2','2-3']
FullBath_int=[0,1,2,3]
BsmtHalfBath=['0-1','1-2']
BsmtHalfBath_int=[0,1,2]
HalfBath=['0-1','1-2']
HalfBath_int=[0,1,2]
BedroomAbvGr=['0-2','2-4','4-6','6-8']
BedroomAbvGr_int=[0,2,4,6,8]
KitchenAbvGr=['0-1','1-2']
KitchenAbvGr_int=[0,1,2]
Fireplaces=['0-1','1-2','2-3']
Fireplaces_int=[0, 1, 2, 3]
GarageCars=['0-2','2-4']
GarageCars_int=[0,2,4]
PoolArea=['Below 500','Between 500-600','Above 600']
PoolArea_int=[0, 500, 600,800]
YrSold=['2006-2008','2008-2010']
YrSold_int=[2006, 2008, 2010]


In [None]:

def discrete(col,train,val,test,interval,label):
    """Bin column `col` of the three frames into labelled intervals, in place.

    `interval` supplies the bin edges and `label` the names of the resulting
    bins; the lowest edge is inclusive. The same three frames are returned.
    """
    for frame in (train, val, test):
        frame[col] = pd.cut(frame[col], bins=interval, labels=label, include_lowest=True)
    return train, val, test

In [None]:
discrete('OverallCond',x_train,x_val,x_test,OverallCond_int,OverallCond);
discrete('BsmtFullBath',x_train,x_val,x_test,BsmtFullBath_int,BsmtFullBath);
discrete('BsmtHalfBath',x_train,x_val,x_test,BsmtHalfBath_int,BsmtHalfBath);
discrete('FullBath',x_train,x_val,x_test,FullBath_int,FullBath);
discrete('HalfBath',x_train,x_val,x_test,HalfBath_int,HalfBath);
discrete('BedroomAbvGr',x_train,x_val,x_test,BedroomAbvGr_int,BedroomAbvGr);
discrete('KitchenAbvGr',x_train,x_val,x_test,KitchenAbvGr_int,KitchenAbvGr);
discrete('Fireplaces',x_train,x_val,x_test,Fireplaces_int,Fireplaces);
discrete('GarageCars',x_train,x_val,x_test,GarageCars_int,GarageCars);
discrete('PoolArea',x_train,x_val,x_test,PoolArea_int,PoolArea);
discrete('YrSold',x_train,x_val,x_test,YrSold_int,YrSold);

In [None]:
cat_col.extend(discrete_cols)

In [None]:
x_train[discrete_cols].head()

In [None]:
x_val[discrete_cols].head()

## Rare label Encoding

In [None]:
# Identifying variables with a large number of different categories
# i.e variables that have high cardinality.
for col in cat_col:
    labels=len(x_train[col].value_counts())
    print('{} has {} different category'.format(col,labels))

High cardinality may pose the following problems:

Variables with too many labels tend to dominate over those with only a few labels, particularly in Tree based algorithms.

A big number of labels within a variable may introduce noise with little, if any, information, therefore making machine learning models prone to over-fit.

Some of the labels may only be present in the training data set, but not in the test set, therefore machine learning algorithms may over-fit to the training set.

Contrarily, some labels may appear only in the test set, therefore leaving the machine learning algorithms unable to perform a calculation over the new (unseen) observation.

In [None]:
plt.figure(figsize=(10,5))
x_train[cat_col].nunique().plot.bar();

In [None]:
# Percentage of observations in each category in the respective categorical columns
for col in cat_col:
    
    print(x_train.groupby(col)[col].count() / len(x_train)) # frequency
    print()

We can see that some categories occur more frequently than others. We will set a threshold on the percentage of observations in a category; any category that falls below it will be treated as a rare label.

In [None]:
def non_rare_labels(data, var, tolerance):
    """List the categories of `var` whose share of rows in `data` exceeds `tolerance`.

    The share of a category is its row count divided by the total number of
    rows in `data`; categories are returned in group-key order.
    """
    shares = data.groupby([var])[var].count() / len(data)
    frequent = shares.loc[shares > tolerance]
    return list(frequent.index.values)

In [None]:
def rare_encoding(x_train,x_val,x_test,var,tolerance):
    """Replace infrequent categories of `var` with the label 'Rare'.

    The frequent categories are determined from `x_train` via `non_rare_labels`;
    any value of `var` outside that list becomes 'Rare' in all three frames.
    Works on copies, so the frames passed in are left untouched, and returns
    the encoded (train, val, test) copies.
    """
    x_train, x_val, x_test = x_train.copy(), x_val.copy(), x_test.copy()

    frequent = non_rare_labels(x_train, var, tolerance)
    for frame in (x_train, x_val, x_test):
        frame[var] = np.where(frame[var].isin(frequent), frame[var], 'Rare')

    return x_train, x_val, x_test

In [None]:
# Setting the threshold to 0.05: categories whose share of observations is not
# above it are renamed to the 'Rare' label, i.e. they occur rarely
for col in cat_col:
    x_train,x_val,x_test=rare_encoding(x_train,x_val,x_test,col,tolerance=0.05)

## Target Encoding the categorical variables

In [None]:
# Encoding the categories with the target variable, i.e. the mean SalePrice.
# The categories are ordered by their mean target value and each one is assigned
# an integer from 0 to k-1, where k is the number of distinct categories in the
# variable; the category with the highest mean SalePrice receives 0.

def maps(df,var,target):
  """Build a category -> rank dictionary for `var` ordered by mean `target`.

  Categories are sorted by their mean `target` value in descending order, so
  the category with the highest mean is mapped to 0, the next to 1, and so on.
  """
  mean_by_category = df.groupby([var])[target].mean()
  ranked = mean_by_category.sort_values(ascending=False).index
  return dict(zip(ranked, range(len(ranked))))

def target_encode(train,val,test,var,mappings):
    """Replace the categories of column `var` with their encoded values, in place.

    Parameters
    ----------
    train, val, test : DataFrame
        Frames whose `var` column is overwritten with the mapped values.
    var : str
        Name of the column to encode.
    mappings : dict
        Category -> value lookup; categories missing from it become NaN.

    The frames passed as arguments are the ones modified (the previous version
    ignored them and always wrote to the module-level x_train/x_val/x_test).
    Returns None.
    """
    for frame in (train, val, test):
        frame[var] = frame[var].map(mappings)

In [None]:
for var in cat_col:
    mappings=maps(x_train,var,'SalePrice')
    target_encode(x_train,x_val,x_test,var,mappings)

In [None]:
x_train.head()

In [None]:
x_val.head()

In [None]:
plt.figure(figsize=(25,5))
plt.subplot(1,6,1)
x_train.groupby('OverallCond')['SalePrice'].mean().plot(color='r')
plt.ylabel('SalePrice')
plt.subplot(1,6,2)
x_train.groupby('BsmtFullBath')['SalePrice'].mean().plot(label='BsmtFullBath',color='r')
x_train.groupby('BsmtHalfBath')['SalePrice'].mean().plot(label='BsmtHalfBath',color='k')
plt.legend(loc='upper left')
plt.subplot(1,6,3)
x_train.groupby('FullBath')['SalePrice'].mean().plot(label='FullBath',color='r')
x_train.groupby('HalfBath')['SalePrice'].mean().plot(label='HalfBath',color='k')
plt.legend()
plt.subplot(1,6,4)
x_train.groupby('BedroomAbvGr')['SalePrice'].mean().plot(label='BedroomAbvGr',color='r')
x_train.groupby('KitchenAbvGr')['SalePrice'].mean().plot(label='KitchenAbvGr',color='k')
plt.legend(loc='upper right')
plt.tight_layout()

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,4,1)
x_train.groupby('Fireplaces')['SalePrice'].mean().plot(color='r')
plt.ylabel('SalePrice')
plt.subplot(1,4,2)
x_train.groupby('GarageCars')['SalePrice'].mean().plot(color='r')
plt.subplot(1,4,3)
x_train.groupby('PoolArea')['SalePrice'].mean().plot(color='r')
plt.subplot(1,4,4)
x_train.groupby('YrSold')['SalePrice'].mean().plot(color='r')
plt.tight_layout()

We can see that most of the variables now show a somewhat monotonic relationship with the target variable, compared with the plots we visualised previously.

In [None]:
# Droping the target column i.e SalePrice
x_train.drop(columns=['SalePrice'],axis=1,inplace=True)

In [None]:
# Still some missing values present in the test data
missing_values=[var for var in x_test.columns if x_test[var].isnull().mean()>0]
missing_values

In [None]:
# Droping the columns containing missing data  also in train and validation data to 
# prevent overfitting
x_train.drop(columns=missing_values,axis=1,inplace=True)
x_val.drop(columns=missing_values,axis=1,inplace=True)
x_test.drop(columns=missing_values,axis=1,inplace=True)

In [None]:
# Storing the id of the test columns and using the remaining columns for prediction
Id=x_test['Id']
x_test=x_test.iloc[:,1:]

In [None]:
x_train.shape,x_val.shape,x_test.shape

## Training the model

In [None]:
# setting the grid -search parameters 

nestimators=[100,500,2500]

params_grid={'n_estimators':nestimators}

# n_estimators is deliberately not set on the base estimator: it expects a single
# integer, and the candidate values are supplied by the grid search through params_grid.
regressor=RandomForestRegressor(max_depth=None,random_state=34,max_features=None,oob_score=True)

# `rmse` was built with greater_is_better=False, so the search selects the lowest error.
gridsearch=GridSearchCV(estimator=regressor,param_grid=params_grid,scoring=rmse,n_jobs=-1,cv=5)

gridsearch.fit(x_train,y_train)


In [None]:
gridsearch.best_params_

## Prediction on training and validation data and testing the results

In [None]:
metrics(y_train,gridsearch.predict(x_train))

In [None]:
predictions=gridsearch.predict(x_val)
metrics(y_val,predictions)

In [None]:
test_predictions=gridsearch.predict(x_test)

## Submission

In [None]:
# Assemble the submission frame (test Ids next to their predicted prices) in one
# step and write it to disk without the index column.
sub = pd.DataFrame({'Id': Id, 'SalePrice': test_predictions})
sub.to_csv('submission.csv',index=False)