In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## 1. Frequent Category imputation

In [None]:
#Read only the four columns used in this notebook from the training file
df = pd.read_csv(
    '../input/house-prices-dataset/train.csv',
    usecols=['BsmtQual', 'FireplaceQu', 'GarageType', 'SalePrice'],
)
df.head()

In [None]:
#Count the missing values in each column
df.isna().sum()

In [None]:
#Frequency of each category in one column (only BsmtQual is shown here as an example).
#Note: value_counts() leaves NaN out by default, so missing values are not counted below.
df['BsmtQual'].value_counts()

In [None]:
#The same category frequencies, drawn as a bar chart
df['BsmtQual'].value_counts().plot(kind='bar')

In [None]:
#value_counts() sorts by descending frequency, so its first index label is the most frequent category
most_freq_variable = df['BsmtQual'].value_counts().index[0]
most_freq_variable

In [None]:
#Function to replace the null values of a column with that column's own most frequent value
def most_freq(df,variable):
    """Fill the NaN entries of ``df[variable]`` with that column's most frequent value.

    The frame is modified in place (the column is reassigned); nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute.
    variable : str
        Name of the column whose missing values are filled.
    """
    # The mode must come from the column being imputed, not from a fixed column,
    # otherwise every column would be filled with another column's label.
    counts = df[variable].value_counts()
    if counts.empty:
        # Column has no non-null value, so there is no mode to impute with; leave it as is.
        return
    df[variable] = df[variable].fillna(counts.index[0])

In [None]:
#Apply most_freq to each of the three columns that contain NaN values
for column in ('BsmtQual', 'FireplaceQu', 'GarageType'):
    most_freq(df, column)

In [None]:
#Re-count the NaN values per column after the imputation
df.isna().sum()

### Advantages and disadvantages of replacing NaN with the mode
#### Advantages
1. Easy To implement
2. Faster way to implement

#### Disadvantages
1. Because the NaNs are filled with the most frequent label, that label can become over-represented if there are many NaNs
2. It distorts the relationship between the most frequent label and the other variables

## 2. Adding a variable to capture nan

In [None]:
#Reload the file so this section starts again from the un-imputed columns
df = pd.read_csv(
    '../input/house-prices-dataset/train.csv',
    usecols=['BsmtQual', 'FireplaceQu', 'GarageType', 'SalePrice'],
)
df.head()

In [None]:
#Add a 0/1 missing-value indicator column for `variable`, then fill its NaN values with `frequent`
def impute_nan(df, variable, frequent):
    """Flag and fill the missing values of one column, modifying ``df`` in place.

    A new column ``variable + '_new'`` is added, holding 1 where the original
    value was NaN and 0 elsewhere; the NaN values of ``df[variable]`` are then
    replaced by ``frequent``. Nothing is returned.

    Note: a later cell of this notebook defines another ``impute_nan`` with a
    different signature, so this cell must be re-run before this version is used again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute.
    variable : str
        Name of the column to flag and fill.
    frequent : scalar
        Value written in place of NaN (the callers pass the column's mode).
    """
    # The indicator must be computed before filling, while the NaN values still exist.
    df[variable+'_new'] = np.where(df[variable].isnull(),1,0)
    # Assign the result back instead of calling fillna(inplace=True) on df[variable]:
    # the chained in-place form acts on a temporary object and does not update df
    # when pandas Copy-on-Write is active.
    df[variable] = df[variable].fillna(frequent)

In [None]:
#For each column, take its mode and pass it to impute_nan, which adds the NaN flag and fills the NaN values
for feature in ('BsmtQual', 'FireplaceQu', 'GarageType'):
    column_modes = df[feature].mode()
    frequent = column_modes[0]
    impute_nan(df, feature, frequent)

In [None]:
#Preview: each imputed column now has a companion '<column>_new' 0/1 flag
df.head(n=5)

### Advantages and Disadvantages
#### Advantages
1. Easy to implement
2. Capture the importance of missing values

#### Disadvantage
1. Creates an additional feature for each imputed column (curse of dimensionality)

## 3. If you have several frequent categories, just replace NaN with a new category

### Note: This is the most used technique

In [None]:
#Reload the file once more so the columns contain their original NaN values
df = pd.read_csv(
    '../input/house-prices-dataset/train.csv',
    usecols=['BsmtQual', 'FireplaceQu', 'GarageType', 'SalePrice'],
)
df.head()

In [None]:
#Build a '<column>_new_var' copy of a column in which every NaN is replaced by the label 'missing'
def impute_nan(df, variable):
    """Add ``variable + '_new_var'`` to ``df``: 'missing' where the value is NaN, the original value otherwise.

    The source column itself is left untouched; the frame is modified in place
    by adding the new column, and nothing is returned.
    """
    is_missing = df[variable].isna()
    df[variable + '_new_var'] = np.where(is_missing, 'missing', df[variable])

In [None]:
#Create the '_new_var' version of each of the three columns
for feature in ('BsmtQual', 'FireplaceQu', 'GarageType'):
    impute_nan(df, feature)

In [None]:
#Preview the frame with the added '_new_var' columns
df.head(n=5)

In [None]:
#The '_new_var' copies contain no NaN, so the original columns are no longer needed
df = df.drop(columns=['BsmtQual', 'FireplaceQu', 'GarageType'])

In [None]:
#Preview: only SalePrice and the three '_new_var' columns remain
df.head(n=5)