## House Price Prediction Assignment

### Problem Statement:

A US-based housing company named Surprise Housing has decided to enter the Australian market. The company uses data analytics to purchase houses at a price below their actual values and flip them on at a higher price. 

We are required to build a regression model using regularisation in order to predict the actual value of the prospective properties and decide whether to invest in them or not.


### Business Goals:

The company wants to know:
1. Which variables are significant in predicting the price of a house, and
2. How well those variables describe the price of a house.

Also, determine the optimal value of lambda for ridge and lasso regression.


### Steps:

#### 1. Data Sourcing

    1. Checking the encoding of the file
    2. Loading the data		

#### 2. Data Exploring & Cleaning

    A. Null Values Analysis
        1. Identify and drop columns with 100% missing data
        2. Identify and drop columns with more than 80% missing data
        3. Identify and drop columns having a single unique value, as they will not add any value to the analysis
        4. Identify and drop unnecessary columns (like text based, Applicant Loan Behaviour)

    B. Datatype Check 

    C. Datatype Conversion
        1. Converting int to object
    
    D. Drop Records
        1. Drop Duplicates
    
    E. Impute Null Values
    
    F. Populating the categorical columns with correct mapping
    
    G. Outliers handling
    
    H. Derived Metrics
    
#### 3. Data Visualisation
#### 4. Data Preparation
#### 5. Splitting and Scaling the data
#### 6. Model Building & Evaluation
#### 7. Regularisation using Ridge and Lasso
#### 8. Making Predictions Using the Final Model on the test data
######################################################################

In [1]:
# Importing all required packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from pandas.api.types import is_object_dtype,is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE,RFECV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting and display configuration.
sns.set(style="whitegrid")
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 10000)
py.offline.init_notebook_mode(connected=True) # plotting in offline mode
# NOTE(review): offline=False here while plotly is initialised in offline mode above — confirm intended
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')
# None disables truncation of wide cell contents; a width of 1 would cut every value short
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:20,.2f}'.format # floats shown with thousands separators and 2 decimals

In [2]:
# Generic Functions

# Method to get Meta-Data about any dataframe passed 
def getMetadata(dataframe) :
    """Build a per-column summary of ``dataframe``.

    Returns a DataFrame indexed by column name with the columns
    ``Datatype``, ``Total_Element`` (non-null count), ``Null_Count``,
    ``Null_Percentage`` (rounded to 2 decimals) and ``Unique_Value``.
    """
    null_counts = dataframe.isnull().sum()
    null_share = round(null_counts / len(dataframe) * 100, 2)
    summary = {
        'Datatype': dataframe.dtypes,
        'Total_Element': dataframe.count(),
        'Null_Count': null_counts,
        'Null_Percentage': null_share,
        'Unique_Value': dataframe.nunique(),
    }
    return pd.DataFrame(summary)

def getVIF(X):
    """Return the variance inflation factor of every column of ``X``.

    The result has the columns ``Features`` and ``VIF`` (rounded to two
    decimals) and is sorted from the highest VIF to the lowest.
    """
    design = X.values
    scores = [variance_inflation_factor(design, position) for position in range(X.shape[1])]
    table = pd.DataFrame({'Features': X.columns, 'VIF': scores})
    table['VIF'] = round(table['VIF'], 2)
    return table.sort_values(by="VIF", ascending=False)

def binary_map(x):
    """Map 'yes' to 1 and 'no' to 0 via ``x.map``; other values become NaN."""
    yes_no_codes = {'yes': 1, "no": 0}
    return x.map(yes_no_codes)

def cross_validation(X_train,y_train,lm):
    """Grid-search the number of features kept by RFE and plot the scores.

    Parameters
    ----------
    X_train : DataFrame of training features.
    y_train : training target.
    lm : estimator wrapped by RFE; it is also fitted in place on the
        full training data.

    Draws mean train/test r-squared against the number of selected
    features (2-fold shuffled CV). Returns nothing.
    """
    folds = KFold(n_splits = 2, shuffle = True, random_state = 100)
    # RFE must keep at least one feature, so search 1..n_features inclusive
    # (the previous range started at 0 and never tried the full feature set).
    n_features = len(X_train.columns)
    hyper_params = [{'n_features_to_select': list(range(1, n_features + 1))}]
    # Kept for callers that rely on `lm` being fitted after this call.
    lm.fit(X_train, y_train)
    rfe = RFE(lm)
    model_cv = GridSearchCV(estimator = rfe,
                            param_grid = hyper_params,
                            scoring= 'r2',
                            cv = folds,
                            verbose = 1,
                            return_train_score=True)

    model_cv.fit(X_train, y_train)
    cv_results = pd.DataFrame(model_cv.cv_results_)
    plt.figure(figsize=(16,6))
    plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
    plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
    plt.xlabel('number of features')
    plt.ylabel('r-squared')
    plt.title("Optimal Number of Features")
    plt.legend(['test score', 'train score'], loc='upper left')
    
def plot_bar_chart(plotting_frame,x_column,y_column) :
    """Draw an interactive plotly bar chart of ``y_column`` against ``x_column``.

    Axis titles and the chart title are the title-cased column names.
    """
    x_label = x_column.title()
    y_label = y_column.title()

    bars = go.Bar(x=plotting_frame[x_column], y=plotting_frame[y_column])
    chart_layout = go.Layout(
        title="Bar Chart [" + x_label + " Vs " + y_label + "]",
        yaxis=dict(title=y_label),
        xaxis=dict(title=x_label),
    )
    py.offline.iplot(go.Figure(data=[bars], layout=chart_layout))
        
        
def plot_pie_chart(plotting_frame,x_column,y_column) :
    """Draw an interactive plotly pie chart.

    ``x_column`` supplies the slice labels and ``y_column`` the slice values.
    """
    slices = go.Pie(
        labels=plotting_frame[x_column].tolist(),
        values=plotting_frame[y_column].tolist(),
    )
    py.offline.iplot([slices])

        
def plot_box_chart(dataframe) :
    """Draw one plotly box trace per column of ``dataframe`` in a grouped layout."""
    # Columns are read by position so the trace data always matches its label.
    traces = [
        go.Box(y=dataframe.iloc[:, position], name=label)
        for position, label in enumerate(dataframe)
    ]
    box_layout = go.Layout(
        yaxis=dict(title="Frequency", zeroline=False),
        boxmode='group',
    )
    py.offline.iplot(go.Figure(data=traces, layout=box_layout))
    
def plot_group_bar_chart(plot,col,hue) :
    """Draw a grouped count plot of ``col`` split by ``hue``, annotated with percentages.

    Parameters
    ----------
    plot : DataFrame holding the data to count.
    col : name of the column shown on the x axis.
    hue : grouping variable passed to seaborn's ``countplot``.

    Each bar is labelled with its share of all rows in ``plot``.
    """
    hue_col = pd.Series(data = hue)
    fig, ax = plt.subplots()
    # Widen the figure with the number of categories and hue levels.
    width = len(plot[col].unique()) + 6 + 5*len(hue_col.unique())
    fig.set_size_inches(width , 10)
    # Use the `plot` argument; the previous body read an undefined global `loan_plot`.
    ax = sns.countplot(data = plot, x= col, order=plot[col].value_counts().index,hue = hue,palette="Set2")

    total_rows = float(len(plot))
    for p in ax.patches:
        # Bars for missing segment/hue combinations have a NaN height; give them a tiny placeholder.
        temp_height = p.get_height()

        # np.isnan replaces math.isnan because `math` is never imported in this file.
        if np.isnan(temp_height):
            temp_height = 0.01

        ax.annotate('{:1.1f}%'.format((temp_height*100)/total_rows), (p.get_x()+0.05, temp_height+20))

    plt.show()

def col_list(df):
    """Split the columns of ``df`` into object-typed and numeric names.

    Returns ``(categorical_names, numeric_names)``; columns that are
    neither numeric nor object dtype appear in neither list.
    """
    numeric_columns, categorical_columns = [], []
    for name in df.columns:
        series = df[name]
        if is_numeric_dtype(series):
            numeric_columns.append(name)
            continue
        if is_object_dtype(series):
            categorical_columns.append(name)
    return categorical_columns, numeric_columns

def outliers(df,num_list):
    """Count 3-sigma outliers for every column named in ``num_list``.

    For each column the limits are mean -/+ 3 * standard deviation
    (``np.mean`` / ``np.std``). Returns a DataFrame with the columns
    ``Columns``, ``Outliers``, ``Non-Outliers``, ``Lower Limit`` and
    ``Upper Limit``, one row per entry of ``num_list``.
    """
    counts_out, counts_in, lows, highs = [], [], [], []
    for column in num_list:
        values = df[column].values
        centre = np.mean(values)
        spread = np.std(values) * 3
        low, high = centre - spread, centre + spread
        beyond = (values < low) | (values > high)
        within = (values >= low) & (values <= high)
        counts_out.append(int(np.count_nonzero(beyond)))
        counts_in.append(int(np.count_nonzero(within)))
        lows.append(low)
        highs.append(high)

    pieces = [
        pd.DataFrame(num_list, columns=['Columns']),
        pd.Series(counts_out, name='Outliers'),
        pd.Series(counts_in, name='Non-Outliers'),
        pd.Series(lows, name='Lower Limit'),
        pd.Series(highs, name='Upper Limit'),
    ]
    return pd.concat(pieces, axis=1)

def assumption_graph(y_train,y_pred_train):
    """Plot three regression diagnostics for one set of predictions.

    Parameters
    ----------
    y_train : actual target values.
    y_pred_train : predictions for the same observations.

    Shows residuals vs. predictions, the residual distribution, and
    predicted vs. actual values. Returns nothing.
    """
    ### Assumption of Error Terms Being Independent
    y_res_train = y_train - y_pred_train
    plt.scatter( y_pred_train , y_res_train)
    plt.axhline(y=0, color='r', linestyle=':')
    plt.xlabel("Predictions")
    plt.ylabel("Residual")
    plt.show()

    # Distribution of errors
    sns.distplot(y_res_train,kde=True)
    plt.title('Normality of error terms/residuals')
    plt.xlabel("Residuals")
    plt.show()

    #### Variance
    sns.regplot(x=y_train, y=y_pred_train)
    plt.title('Predicted Points Vs. Actual Points', fontdict={'fontsize': 20})
    plt.xlabel('Actual Points', fontdict={'fontsize': 15})
    plt.ylabel('Predicted Points', fontdict={'fontsize': 15})
    plt.show()
    # The former trailing print of the global `price_df` shape was unrelated to
    # these plots and failed with NameError when that global did not exist.

def _split_scores(model, X, y, y_pred):
    """Return (r2, adjusted r2, rss, mse) of ``model`` on one data split."""
    r2 = r2_score(y, y_pred)
    rss = np.sum(np.square(y - y_pred))
    mse = mean_squared_error(y, y_pred)
    n_obs, n_feat = len(y), X.shape[1]
    adjusted_r2 = (1 - (1-model.score(X, y))*(n_obs-1)/(n_obs-n_feat-1))
    return r2, adjusted_r2, rss, mse


def _diagnostic_row(row, label, y_true, y_pred, colors):
    """Draw the three diagnostic panels of one split on row ``row`` of a 2x3 grid."""
    residuals = y_true - y_pred
    first_panel = row * 3 + 1

    plt.subplot(2,3,first_panel)
    sns.scatterplot(x=y_pred,y=residuals,color=colors[0])
    plt.axhline(y=0, color='r', linestyle=':')
    plt.xlabel(label + " Predictions")
    plt.ylabel(label + " Residual")

    plt.subplot(2,3,first_panel + 1)
    sns.distplot(residuals,kde=True,color=colors[1])
    plt.title('Normality of error terms/residuals on ' + label + ' Data')
    plt.xlabel("Residuals")

    plt.subplot(2,3,first_panel + 2)
    sns.regplot(x=y_true, y=y_pred,color=colors[2])
    plt.title('Predicted Points Vs. Actual Points on ' + label + ' Data')
    plt.xlabel('Actual Points')
    plt.ylabel('Predicted Points')


def prediction_matrix(model,X_train,X_test,y_train,y_test):
    """Score a fitted model on train and test data and plot diagnostics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.
    X_train, X_test : feature frames for the two splits.
    y_train, y_test : matching targets.

    Returns
    -------
    list of eight values, in order: R2 train, R2 test, adjusted R2 train,
    adjusted R2 test, RSS train, RSS test, RMSE train, RMSE test.

    Also draws a 2x3 grid: the top row shows the train split and the
    bottom row the test split.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    r2_train, adj_r2_train, rss_train, mse_train = _split_scores(model, X_train, y_train, y_pred_train)
    r2_test, adj_r2_test, rss_test, mse_test = _split_scores(model, X_test, y_test, y_pred_test)

    # The sixth slot previously held mse_test, leaving rss_test computed but
    # unused and breaking the train/test pairing of the list.
    metric = [
        r2_train,
        r2_test,
        adj_r2_train,
        adj_r2_test,
        rss_train,
        rss_test,
        mse_train**0.5,
        mse_test**0.5,
    ]

    plt.figure(figsize=(20, 12))
    sns.set(font_scale= 1)
    sns.set_style('whitegrid')

    # The top-left panel previously plotted the test residuals under "Train" labels.
    _diagnostic_row(0, 'Train', y_train, y_pred_train, ('Blue', 'Blue', 'Green'))
    _diagnostic_row(1, 'Test', y_test, y_pred_test, ('Green', 'Green', 'Green'))

    sns.despine()
    return metric

In [5]:
def cleaning(df):
    """Run the exploratory cleaning report on ``df`` and drop useless columns.

    ``df`` is modified in place: duplicated rows, columns that are entirely
    null, and columns with a single unique value are removed. Columns with
    more than 80% nulls are only listed, not dropped. The same ``df``
    object is returned.
    """
    print("Records before dropping duplicates  : " + str(df.shape[0]))
    # NOTE(review): keep=False removes every copy of a duplicated row, not only the extras — confirm intended
    df.drop_duplicates(keep=False,inplace=True)
    print("Records after dropping duplicates  : " + str(df.shape[0]))

    print("Null Value Analysis")
    price_metadata = getMetadata(df)
    # Number of columns per distinct null percentage, highest percentage first.
    price_metadata_group = price_metadata.groupby("Null_Percentage").count().reset_index()
    price_metadata_group.sort_values(["Null_Percentage"], axis=0,ascending=False, inplace=True)
    plot_pie_chart(price_metadata_group,"Null_Percentage","Null_Count")

    print('Completely Missing Data')
    completly_missing_data = price_metadata[price_metadata["Null_Percentage"] == 100.0]
    drop_missing_column = completly_missing_data.index.tolist()
    print("Null Columns before deleting  : " + str(df.shape[1]))
    df.drop(drop_missing_column,inplace=True,axis=1)
    print("Null Columns after deleting : " + str(df.shape[1]))

    print('80%+ Missing Data')
    missing_data_greater_80 = price_metadata[(price_metadata["Null_Percentage"] > 80.0) &
                                         (price_metadata["Null_Percentage"] < 100.0)]
    drop_missing_column_80 = missing_data_greater_80.index.tolist()
    # These columns are deliberately kept; they are only listed for review.
    #df.drop(drop_missing_column_80, axis =1, inplace=True)
    display(drop_missing_column_80)
    # Message corrected: nothing is dropped in this step.
    print("Shape after reviewing 80%+ missing columns (none dropped) ",df.shape ,"rows & columns.")

    print('Identify and drop columns having single value as they will not add any value to our analysis')
    unique_value = df.nunique()
    col_with_only_one_value = unique_value[unique_value.values == 1]
    col_to_drop = col_with_only_one_value.index.tolist()
    display(col_to_drop)
    df.drop(col_to_drop, axis =1, inplace=True)
    print("Shape after deleting unique value columns ",df.shape ,"rows & columns.")

    print('Datatype Check')
    price_data_type = getMetadata(df)
    display(price_data_type["Datatype"].value_counts())

    object_columns = df.select_dtypes(include=['object'])
    print('Object Records:')
    display(object_columns.head(5))

    print('Non Object Records:')
    non_object_columns = df.select_dtypes(exclude=['object'])
    display(non_object_columns.head(5))

    print('Columns requiring imputation:')
    impute_columns = getMetadata(df)
    impute_columns = impute_columns[impute_columns["Null_Count"] > 0]
    display(impute_columns.sort_values(by="Null_Count",ascending = False))

    return df

def outlier_treatment(df,columns):
    """Cap the given columns at their outlier limits and box-plot the result.

    Parameters
    ----------
    df : DataFrame to modify in place.
    columns : list of column names to cap.

    Limits are read from the notebook-level ``outlier_info`` table
    (columns ``Columns``, ``Lower Limit``, ``Upper Limit``); the summary
    shown afterwards uses the notebook-level ``num_list``. Returns ``df``.
    """
    for col in columns:
        limits = outlier_info[outlier_info.Columns == col]
        lower_limit = limits["Lower Limit"].values[0]
        upper_limit = limits["Upper Limit"].values[0]
        df[col]=np.where(df[col]>upper_limit,upper_limit,df[col])
        df[col]=np.where(df[col]<lower_limit,lower_limit,df[col])
    display(df[num_list].describe())

    ### Post fixing outliers
    # Size the grid from the columns actually plotted. The old grid was sized
    # from an unrelated count and raised when it had too few cells.
    n_plots = len(columns)
    if n_plots:
        n_cols = int(np.ceil(np.sqrt(n_plots)))
        n_rows = int(np.ceil(n_plots / n_cols))
        plt.figure(figsize=(30,30))
        sns.set(font_scale=1.0)
        sns.set_style("whitegrid")
        for position, c in enumerate(columns, start=1):
            plt.subplot(n_rows, n_cols, position)
            sns.boxplot(y=df[c],orient="h")
            plt.ylabel(c)
        plt.show()
    print("Shape after outlier correction ",df.shape ,"rows & columns.")
    return df
    
def impute(df):
    """Fill remaining nulls in place: 0 for numeric columns, "None" for object columns.

    Returns the same ``df`` object.
    """
    fill_by_kind = (("number", 0), ("object", "None"))
    for kind, filler in fill_by_kind:
        for column in df.select_dtypes(kind).columns:
            df[column] = df[column].fillna(filler)
    return df

def encode(df,nominal_feature,ordinal_feature):
    """Label-encode the ordinal columns and one-hot encode the nominal columns.

    Parameters
    ----------
    df : DataFrame to encode; its ordinal columns are overwritten in place.
    nominal_feature : list of column names to replace with dummy columns
        (first level dropped).
    ordinal_feature : list of column names to replace with integer
        category codes.

    Returns a new DataFrame with the dummy columns first, followed by the
    remaining columns of ``df``.
    """
    numeric_feature=list(df.select_dtypes(exclude=['object']).columns)
    display(len(ordinal_feature),len(nominal_feature),len(numeric_feature))
    # Snapshot of the nominal columns, taken before anything is encoded.
    price_nominal=df[nominal_feature]
    #### Label Encoding
    # NOTE(review): codes follow pandas' default category ordering, which is
    # not necessarily the quality ranking of these levels — confirm acceptable.
    df[ordinal_feature]=df[ordinal_feature].astype('category')
    for catg in ordinal_feature:
        df[catg]=df[catg].cat.codes
    # Message corrected: this step is label encoding, not dummy encoding.
    display("Shape after label encoding ",df.shape ,"rows & columns.")
    ### One Hot Encoding
    price_dummies = pd.get_dummies(price_nominal, drop_first=True)
    df = df.drop(list(price_nominal.columns), axis=1)
    df = pd.concat([price_dummies,df],axis = 1)
    display(df.head())
    print("Shape after dummy encoding ",df.shape ,"rows & columns.")
    return df

def baseline(X_train,y_train,X_test,y_test,models):
    """Display mean 10-fold cross-validated R2 of each model on both data sets.

    Parameters
    ----------
    X_train, y_train : data for the ``Train`` column.
    X_test, y_test : data for the ``Test`` column (cross-validated
        independently of the training data).
    models : list of unfitted estimators; any length is accepted.

    The table has the columns ``Metric`` (model label), ``Train`` and
    ``Test``. Returns nothing.
    """
    # Keeps the original 'LR' label; other models are labelled by class name,
    # so Lasso and Ridge keep their previous labels too.
    short_names = {'LinearRegression': 'LR'}
    labels = []
    cross_metric_train = []
    cross_metric_test = []
    for model in models:
        class_name = type(model).__name__
        labels.append(short_names.get(class_name, class_name))
        train_scores = cross_val_score(model,X_train,y_train,cv=10,scoring='r2')
        test_scores = cross_val_score(model,X_test,y_test,cv=10,scoring='r2')
        cross_metric_train.append(train_scores.mean())
        cross_metric_test.append(test_scores.mean())
    baseline_metric = pd.DataFrame(
        {'Metric': labels, 'Train': cross_metric_train, 'Test': cross_metric_test},
        columns=["Metric", "Train", "Test"],
    )
    display(baseline_metric)


def loadData(train='../train.csv', test='../test.csv'):
    """Read the train and test CSVs, stack them, and run the cleaning report.

    Parameters
    ----------
    train : path of the training CSV (default ``'../train.csv'``).
    test : path of the test CSV (default ``'../test.csv'``).

    Both files are read with ``Id`` as the index and concatenated row-wise,
    train first. ``cleaning`` then modifies the combined frame in place.
    Returns the combined, cleaned DataFrame.
    """
    #input_path='../input/house-prices-advanced-regression-techniques/train.csv'
    train_df=pd.read_csv(train, index_col='Id')
    test_df=pd.read_csv(test,index_col='Id')
    display(train_df.shape,test_df.shape)
    df=pd.concat([train_df,test_df],axis=0)
    print('Top Five Records')
    display(df.head())
    # df.info() prints its own report and returns None, so it must not be
    # passed to print (that printed "Shape None").
    print('Shape',df.shape)
    df.info()
    price_metadata=getMetadata(df)
    display(price_metadata)
    cleaning(df)

    return df

## Step 1.a: Load Data and Perform Data Cleaning 

In [6]:
# Load train + test data via loadData() (which also runs the cleaning report)
# and keep the combined frame for all later cells.
price_df=loadData()

FileNotFoundError: ignored

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): this cell comes after loadData() above, which failed with
# FileNotFoundError — presumably the mount is needed first; confirm cell order.
from google.colab import drive
drive.mount('/content/drive')

## Step 1.b: Imputation
### 1.i: Imputing values

In [None]:
# Count rows without a SalePrice value
# (presumably the rows that came from test.csv — TODO confirm).
price_df["SalePrice"].isnull().sum()

In [None]:
# Column-wise fills are assigned back instead of using
# `price_df[col].fillna(..., inplace=True)`: the chained in-place form acts on
# an intermediate object and is not guaranteed to update price_df.

# Numeric gaps: fill with the column median.
for median_col in ["MasVnrArea", "LotFrontage"]:
    price_df[median_col] = price_df[median_col].fillna(price_df[median_col].median())
#price_df["GarageYrBlt"].fillna((price_df["YearBuilt"]),inplace=True)

# Categorical gaps filled with the literal level 'NA'.
na_fill_cols = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
                "GarageType", "GarageFinish", "GarageQual", "GarageCond",
                "BsmtExposure", "BsmtFinType2", "BsmtFinType1", "BsmtCond", "BsmtQual"]
for na_col in na_fill_cols:
    price_df[na_col] = price_df[na_col].fillna('NA')
price_df["MasVnrType"] = price_df["MasVnrType"].fillna('None')
price_df["Electrical"] = price_df["Electrical"].fillna(price_df["Electrical"].mode()[0])

# SalePrice and GarageYrBlt are set aside so the blanket impute() below leaves
# their nulls untouched; they are re-attached as the last two columns.
tmp2=price_df[['SalePrice','GarageYrBlt']]

col_to_drop=['SalePrice','GarageYrBlt']
price_df.drop(col_to_drop, axis=1, inplace=True)
tmp1=impute(price_df)
price_df=pd.concat([tmp1,tmp2],axis=1)

# Remaining columns that still contain nulls.
impute_columns = getMetadata(price_df)
impute_columns = impute_columns[impute_columns["Null_Count"] > 0]
impute_columns.sort_values(by="Null_Count",ascending = False)

#### Populating categorical values

In [None]:
# Replace numeric codes with readable labels so these columns are treated as
# categories. Results are assigned back rather than using chained
# `replace(..., inplace=True)`, which is not guaranteed to update price_df.
ms_subclass_labels = {20:"1-STORY 1946 & NEWER",
                      30:"1-STORY 1945 & OLDER",
                      40:"1-STORY W/FINISHED",
                      45:"1-1/2 STORY - UNFINISHED",
                      50:"1-1/2 STORY FINISHED",
                      60:"2-STORY 1946 & NEWER",
                      70:"2-STORY 1945 & OLDER",
                      75:"2-1/2 STORY ALL AGES",
                      80:"SPLIT OR MULTI-LEVEL",
                      85:"SPLIT FOYER",
                      90:"DUPLEX",
                      120:"1-STORY PUD",
                      150:"1-1/2 STORY PUD",
                      160:"2-STORY PUD",
                      180:"PUD - MULTILEVEL",
                      190:"2 FAMILY CONVERSION"
                     }
month_labels = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",
                7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
price_df['MSSubClass'] = price_df['MSSubClass'].replace(ms_subclass_labels)
price_df['MoSold'] = price_df['MoSold'].replace(month_labels)

print("Shape after correcting categorical columns ",price_df.shape ,"rows & columns.")
price_df.head(5)

## Step 1.c: Outlier Treatment

In [None]:
# Split columns by dtype, compute 3-sigma limits, then cap the affected columns.
cat_list,num_list=col_list(price_df)
print("Category Columns:",cat_list)
print("Continous Columns:",num_list)
# A bare expression in the middle of a cell is not shown; display it explicitly.
display(price_df[num_list].describe())
outlier_info=outliers(price_df,num_list)
display(outlier_info)
# Only columns with at least one outlier and an upper limit above 20 are capped.
outlier_columns_fix=outlier_info[(outlier_info["Upper Limit"]>20) & (outlier_info.Outliers>0)]["Columns"]
display(list(outlier_columns_fix))
price_df=outlier_treatment(price_df,list(outlier_columns_fix))

## Step 1.d: Feature Engineering

In [None]:
# Derived features.
price_df["Age"]=price_df["YrSold"]-price_df["YearBuilt"]
# Rows without a garage year get a fixed age of 99. The fill is part of the
# assignment instead of a chained in-place fillna on the column.
price_df["GarageAge"]=(price_df["YrSold"] - price_df["GarageYrBlt"]).fillna(99)
price_df["LivLotRatio"] = price_df.GrLivArea / price_df.LotArea
price_df["Spaciousness"] = (price_df["1stFlrSF"] + price_df["2ndFlrSF"]) / price_df.TotRmsAbvGrd
price_df["MedNhbdArea"] = price_df.groupby("Neighborhood")["GrLivArea"].transform("median")

# Reduce HouseStyle to a numeric value: strip the text tokens, then turn a
# leftover 'S' into '1' before casting to float.
for token in ('Story', 'Fin', 'Unf', 'Lvl', 'Foyer'):
    price_df["HouseStyle"]=price_df["HouseStyle"].str.replace(token, '', regex=False)
price_df["HouseStyle"]=price_df["HouseStyle"].str.replace('S', '1', regex=False)
price_df["HouseStyle"]=price_df["HouseStyle"].astype('float64')

# Merge rare MSSubClass (< 70 rows) and Neighborhood (< 50 rows) levels into "Others".
subclass_counts = price_df["MSSubClass"].value_counts()
MSClass=list(subclass_counts[subclass_counts < 70].index)
neighborhood_counts = price_df["Neighborhood"].value_counts()
Neig=list(neighborhood_counts[neighborhood_counts < 50].index)
price_df["MSSubClass"]=price_df["MSSubClass"].apply(lambda x: "Others" if x in MSClass else x)
price_df["Neighborhood"]=price_df["Neighborhood"].apply(lambda x: "Others" if x in Neig else x)

# The raw year columns are no longer needed once the ages are derived.
col_to_drop=["YrSold","YearBuilt","GarageYrBlt","YearRemodAdd"]
price_df.drop(col_to_drop,inplace=True,axis=1)
display(price_df.head())
print("Shape after dervived columns ",price_df.shape ,"rows & columns.")

# Target distribution before and after the log transform.
x = price_df["SalePrice"]
sns.set_style("whitegrid")
sns.distplot(x)
plt.show()
price_df["SalePrice_log"] = np.log(price_df.SalePrice)
x = price_df.SalePrice_log
sns.distplot(x)
plt.show()

## Step 2: Data Visualisation:

In [None]:
# List the columns that still contain nulls, most nulls first.
impute_columns = getMetadata(price_df)
impute_columns = impute_columns[impute_columns["Null_Count"] > 0]
impute_columns.sort_values(by="Null_Count",ascending = False)

In [None]:
# Recompute the column split (features were added/removed above) and draw the
# correlation heatmap of all numeric columns.
cat_list,num_list=col_list(price_df)
print("Category Columns:",cat_list)
print("Continous Columns:",num_list)
display(len(cat_list),len(num_list))
plt.figure(figsize = (25, 15))
sns.heatmap(price_df[num_list].corr(), annot = True, cmap="YlGnBu")
plt.show()

In [None]:
#### Visualising the continuous columns
# One scatter plot of SalePrice per feature, laid out on a 7x4 grid.
Quality_features = ['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal','Age','GarageAge']
plt.figure(figsize=(20, 30))
sns.set(font_scale= 1.2)
sns.set_style('darkgrid')

for i, feature in enumerate(Quality_features):
    plt.subplot(7, 4, i+1)
    # NOTE(review): palette is passed without a hue variable — confirm it has any effect
    sns.scatterplot(data=price_df, x=feature, y='SalePrice', palette="ch:.10")         
sns.despine()

In [None]:
# Joint histogram/scatter plots to look at the correlated features in more detail
y = price_df["SalePrice"]
features = [
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "FullBath",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "LotArea",
    "LotFrontage",
]

sns.set_style("whitegrid")
# The loop variable has its own name so the `features` list is not overwritten.
for feature in features:
    # jointplot creates its own figure; a preceding plt.figure() only produced
    # an empty extra figure, so the size is set through `height` instead.
    sns.jointplot(x=price_df[feature], y=y, height=10)

In [None]:
#### Visualising the categorical columns
# One bar plot of SalePrice per categorical feature, laid out on a 6x4 grid.
Quality_features = ['MSZoning','LandContour','Utilities','HouseStyle','OverallCond','RoofStyle','Exterior1st','ExterCond',
                    'RoofMatl', 'ExterQual', 'BsmtQual', 'HeatingQC', 'CentralAir', 
                    'Electrical', 'KitchenQual', 'GarageQual','GarageType','SaleCondition','PoolQC','Alley','Fence']

plt.figure(figsize=(30, 20))
sns.set(font_scale= 1.2)
sns.set_style('darkgrid')

for i, feature in enumerate(Quality_features):
    plt.subplot(6, 4, i+1)
    sns.barplot(data=price_df, x=feature, y='SalePrice', palette="ch:.10")         
sns.despine()

In [None]:
# Disabled exploration: the code below sits inside a string literal, so it is
# not executed.
'''
features = price_df.select_dtypes(include=['object']).columns
plt.figure(figsize=(30, 20))
sns.set_style('darkgrid')

for feature in features:
    g = sns.FacetGrid(price_df[~price_df.SalePrice.isnull()], col=feature)
    g.map(plt.hist, 'SalePrice');
    sns.despine()
'''

**Analysis** - Many categorical columns have data skewed towards one or two categories, for example:
'MSZoning','Street','Alley','LandContour','Utilities','LandSlope',
'Condition1','Condition2','BldgType','RoofStyle','RoofMatl','ExterCond','BsmtCond',
'BsmtFinType2','Heating','CentralAir','Electrical','Functional',
'GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition'.
These columns need to be binned properly.

In [None]:
# NOTE(review): binn_col is not used below; the thresholds dict decides which
# columns are binned (it also covers LandSlope and Functional).
binn_col=['MSZoning','Street','Alley','LandContour',
'Condition1','Condition2','BldgType','RoofStyle','RoofMatl','Heating','CentralAir','Electrical',
'PavedDrive','Fence','MiscFeature','SaleType','SaleCondition']
# Minimum number of rows a level needs to be kept; rarer levels become "Others".
x={
    "MSZoning":300,
    "Street":10,
    "Alley":60,
    "LandContour":70,
    "LandSlope":70,
    "Condition1":100,
    "Condition2":10,
    "BldgType":120,
    "RoofStyle":300,
    "RoofMatl":15,
    "Heating":20,
    'Electrical':100,
    'Functional':40,
    'PavedDrive':100,
    'Fence':160,
    'MiscFeature':50,
    'SaleType':130,
    'SaleCondition':130,
}
# Dedicated loop names avoid rebinding the global `y` and shadowing the dict
# `x` inside the lambda.
for column_name,min_count in x.items():
    level_counts=price_df[column_name].value_counts()
    val=list(level_counts.index[level_counts.values<min_count])
    price_df[column_name]=price_df[column_name].apply(lambda level: "Others" if level in val else level)
price_df.head()


## Step 3: Data Prep

In [None]:
# Columns whose levels are label-encoded (integer codes).
ordinal_feature=['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC',
                 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond','LotShape', 'LandSlope', 'BsmtExposure', 
                 'BsmtFinType1', 'BsmtFinType2', 'Functional','GarageFinish','Utilities','PoolQC'
                 ]
# Columns that are one-hot encoded.
nominal_feature= ["MSSubClass", "MSZoning", "Street", "LandContour", 
                "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", 
                 "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", 
                 "SaleType", "SaleCondition","PavedDrive",'Electrical','MoSold','Alley','Fence','MiscFeature']
# encode() is declared as encode(df, nominal_feature, ordinal_feature). The
# lists were previously passed positionally in the opposite order, so each
# list received the other's encoding. Keywords make the pairing explicit.
df=encode(price_df,nominal_feature=nominal_feature,ordinal_feature=ordinal_feature)

## Step 4: Splitting and Scaling the data

In [None]:
# Rows with a SalePrice form the labelled set; rows without one are held out
# as the unlabelled test set.
train=df[~df.SalePrice.isnull()]
test=df[df.SalePrice.isnull()]
display(train.shape,test.shape)

In [None]:
# 70/30 split of the labelled rows into training and validation sets
# (fixed random_state for reproducibility).
np.random.seed(0)
df_train, df_validation = train_test_split(train, train_size = 0.7, test_size = 0.3, random_state = 100)

#### Dividing into X and Y sets for the model building

In [None]:
# The model target is the log of SalePrice; both price columns are removed
# from the feature matrices.
X_train = df_train.drop(["SalePrice_log","SalePrice"], axis = 1)
y_train = df_train["SalePrice_log"]
X_validation = df_validation.drop(["SalePrice_log","SalePrice"], axis = 1)
y_validation = df_validation["SalePrice_log"]
# The unlabelled test set has features only.
X_test = test.drop(["SalePrice_log","SalePrice"], axis = 1)

In [None]:
# Every non-object feature column is a candidate for scaling.
num_list=list(X_train.select_dtypes(exclude=['object']).columns)

In [None]:
# LivLotRatio is excluded from scaling and keeps its raw values.
num_list.remove('LivLotRatio')

In [None]:
### Scaling
# The scaler is fitted on the training features only and then applied,
# unchanged, to the validation features.
scaler = RobustScaler()
#scaler=StandardScaler()
X_train[num_list] = scaler.fit_transform(X_train[num_list])
X_validation[num_list] = scaler.transform(X_validation[num_list])
X_train.head()

In [None]:
# Apply the scaler fitted on the training data to the unlabelled test features.
X_test[num_list]=scaler.transform(X_test[num_list])

## Step 5: Model Building & Evaluation

#### i. Baseline Score

In [None]:
# Cross-validated R2 of the three linear models with default hyper-parameters.
models=[LinearRegression(),Lasso(),Ridge()]
baseline(X_train,y_train,X_validation,y_validation,models)

#### ii. Linear Regression

In [None]:
# Plain linear regression on all features, scored on train and validation data.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_metric=prediction_matrix(lm,X_train,X_validation,y_train,y_validation)

#### iii. RFE Implementation for feature selection

In [None]:
#cross_validation(X_train,y_train,lm)
# Recursive feature elimination with 10-fold CV, using Ridge as the estimator.
min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(estimator=Ridge(), step=1, cv=10,
              scoring='r2',
              min_features_to_select=min_features_to_select)
rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
# Newer scikit-learn exposes the mean CV score per feature count through
# cv_results_; older releases only provide grid_scores_. Support both.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(min_features_to_select,
               len(cv_scores) + min_features_to_select),
         cv_scores)
plt.show()

In [None]:
### selecting columns post rfe
# Keep only the features RFECV marked as selected.
col = X_train.columns[rfecv.support_]
# Not the last expression of the cell, so this count is not displayed.
len(list(col))
X_train_rfe = X_train[col]
X_validation_rfe  = X_validation[col]
# Refit a plain linear regression on the reduced feature set and score it.
lm_rfe=LinearRegression()
lm_rfe.fit(X_train_rfe,y_train)
lm_rfe_metric=prediction_matrix(lm_rfe,X_train_rfe,X_validation_rfe,y_train,y_validation)

#### iv. Ridge Regression

In [None]:
# list of alphas to tune - if value too high it will lead to underfitting, if it is too low, 
# it will not handle the overfitting
params = {'alpha': [0.001, 0.01, 0.1, 1.0,10.0,20,50,100,150,200,500]}
estimator = Ridge()
# cross validation
folds = 5
# 5-fold grid search over alpha, scored by negative mean absolute error.
model_cv = GridSearchCV(estimator = estimator, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error',  
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
# Ridge is tuned and fitted on the RFE-selected features only.
model_cv.fit(X_train_rfe, y_train) 
display(model_cv.best_params_)
# Refit a fresh Ridge with the best alpha and score it.
alpha = model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)
ridge.fit(X_train_rfe, y_train)
lm_ridge_metric=prediction_matrix(ridge,X_train_rfe,X_validation_rfe,y_train,y_validation)

#### v. Lasso Regression

In [None]:
# list of alphas to tune - if value too high it will lead to underfitting, if it is too low, 
# it will not handle the overfitting
params = {'alpha': [0.00001,0.0001,0.001, 0.01, 0.1, 1.0,10.0,20,50,100,150,200,500]}
#params = {'alpha': [0.001, 0.0001, 0.0005, 0.005,0.003 ]}
estimator = Lasso()
# cross validation
folds = 5
# 5-fold grid search over alpha, scored by negative mean absolute error.
model_cv = GridSearchCV(estimator = estimator, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error',  
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
# Unlike the Ridge cell, Lasso is tuned and fitted on the full feature set.
model_cv.fit(X_train, y_train) 
display(model_cv.best_params_)
# Refit a fresh Lasso with the best alpha and score it.
alpha = model_cv.best_params_['alpha']
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train) 
lm_lasso_metric=prediction_matrix(lasso,X_train,X_validation,y_train,y_validation)

#### vi. Decision Tree

In [None]:
import time

# Hyper-parameter grid for the decision tree.
# The split criterion is left at the library default (squared error): its
# spelling changed from 'mse' to 'squared_error' in scikit-learn 1.0 and
# 'mse' was removed in 1.2, so omitting the key works on every version.
# NOTE(review): random_state is searched like a hyper-parameter, which doubles
# the grid and picks a seed by CV noise - consider fixing a single seed.
params = {
    'splitter': ['best'],
    'max_depth': [5, 10, 15, 20, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 50, 100],
    'min_samples_leaf': [1, 2, 3, 5, 10, 20],
    'random_state': [42, 100],
}
estimator = tree.DecisionTreeRegressor()
# cross validation
folds = 5
model_cv = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)

# `%%time` is a cell magic and is only valid as the first line of a cell, so
# the search is timed explicitly instead.
search_started = time.perf_counter()
model_cv.fit(X_train, y_train)
print(f"Wall time: {time.perf_counter() - search_started:.2f} s")
display(model_cv.best_estimator_)

# With the default refit=True, GridSearchCV has already refitted the best
# estimator on the whole of X_train, so no additional fit call is required.
lm_dt = model_cv.best_estimator_
lm_dt_metric = prediction_matrix(lm_dt, X_train, X_validation, y_train, y_validation)

#### vii. Random Forest

In [None]:
# Hyper-parameter grid for the random forest.
# The split criterion is left at the library default (squared error): its
# spelling changed from 'mse' to 'squared_error' in scikit-learn 1.0 and
# 'mse' was removed in 1.2, so omitting the key works on every version.
# NOTE(review): max_features values assume X_train has at least 140 columns,
# and random_state is searched like a hyper-parameter - confirm both.
params = {
    'max_depth': [5, 10, 15, 20, 40],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [3, 5, 10, 20],
    'random_state': [42, 100],
    'max_features': [50, 75, 100, 140],
    'n_estimators': [10, 30, 50, 100],
    'n_jobs': [-1],
    'oob_score': [True],
}
estimator = RandomForestRegressor()
# cross validation (3 folds here, fewer than the other searches in this section)
folds = 3
model_cv = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)
display(model_cv.best_estimator_)

# With the default refit=True, GridSearchCV has already refitted the best
# estimator on the whole of X_train, so no additional fit call is required.
lm_rfc = model_cv.best_estimator_
lm_rfc_metric = prediction_matrix(lm_rfc, X_train, X_validation, y_train, y_validation)

#### viii. KNN Regression

In [None]:
# K-nearest-neighbours regressor with k = 5, trained on the full feature set.
lm_knn = KNeighborsRegressor(n_neighbors=5)
lm_knn.fit(X_train, y_train)
# Metrics for the comparison table (train vs. validation).
lm_knn_metric = prediction_matrix(
    lm_knn, X_train, X_validation, y_train, y_validation
)

#### ix. Gradient Boosting

In [None]:
# Gradient boosting with 1000 depth-1 trees (stumps) and a small learning
# rate; the seed is fixed for reproducibility.
lm_gbr = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=1,
    random_state=31,
)
lm_gbr.fit(X_train, y_train)
# Metrics for the comparison table (train vs. validation).
lm_gbr_metric = prediction_matrix(
    lm_gbr, X_train, X_validation, y_train, y_validation
)

#### x. XGB Regressor

In [None]:
# Build one comparison table: a 'Metric' label column followed by one column
# of metric values per fitted model.
lr_table = {
    'Metric': [
        'R2 Score (Train)', 'R2 Score (Test)',
        'Adjusted R2 Score (Train)', 'Adjusted R2 Score (Test)',
        'RSS (Train)', 'RSS (Test)',
        'RMSE (Train)', 'RMSE (Test)',
    ],
    'Linear Regression': lm_metric,
}
metric_lm = pd.DataFrame(lr_table, columns=['Metric', 'Linear Regression'])

# Remaining models, in the order their columns appear in the table.
other_model_metrics = [
    ('RFE Linear Regression', lm_rfe_metric),
    ('Ridge Regression', lm_ridge_metric),
    ('Lasso Regression', lm_lasso_metric),
    ('Decision Tree Regression', lm_dt_metric),
    ('Random Forest Regression', lm_rfc_metric),
    ('KNN Regression', lm_knn_metric),
    ('Gradient Boosting Regression', lm_gbr_metric),
]
metric_columns = [
    pd.Series(values, name=label) for label, values in other_model_metrics
]

# Place the columns side by side, with the label column first.
final_metric = pd.concat([metric_lm, *metric_columns], axis=1)
print("Model Peformance Metric:")
display(final_metric)

## Step 6: Making Predictions Using the Final Model on the test data

In [None]:
# Show the dimensions of the test feature matrix before generating predictions.
X_test.shape

In [None]:
# Write one submission CSV per fitted model: '<i>_submission_file.csv', where
# i is the model's position in `models`.
# reset_index() must yield an 'Id' column - presumably X_test is indexed by
# 'Id'; verify against the data-preparation step.
output_df = X_test.reset_index()['Id']
models = [lm, lasso, lm_dt, lm_rfc, lm_knn, lm_gbr]
for i, m in enumerate(models):
    filename = str(i) + '_submission_file.csv'
    predicted = pd.Series(m.predict(X_test), name='SalePrice')
    # Vectorised exponential instead of a per-element lambda.
    # NOTE(review): this presumably undoes a log transform applied to the
    # target earlier in the notebook - confirm it matches (log vs. log1p).
    # A dedicated name is used so the model comparison table held in
    # `final_metric` is no longer overwritten by the last submission frame.
    submission = pd.concat([output_df, np.exp(predicted)], axis=1)
    submission.to_csv(filename, index=False)