# <font color='red'> Advanced Regression </font>

## Surprise Housing

A US-based housing company named **Surprise Housing** has decided to enter the Australian market. The company uses data analytics to purchase houses at a price below their actual values and flip them on at a higher price. For the same purpose, the company has collected a data set from the sale of houses in Australia. 

The company is looking at prospective properties to buy to enter the market. You are required to build a regression model using regularisation in order to predict the actual value of the prospective properties and decide whether to invest in them or not.

 

The company wants to know:

    - Which variables are significant in predicting the price of a house, and
    - How well those variables describe the price of a house.

## Business Goal 

You are required to model the price of houses with the available independent variables. This model will then be used by the management to understand how exactly the prices vary with the variables. They can accordingly manipulate the strategy of the firm and concentrate on areas that will yield high returns. Further, the model will be a good way for management to understand the pricing dynamics of a new market.

--------------------------------------------------------------------------------------------------------------

# Loading Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load train data
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Describe dataset 
df.describe().T

Looking at the Id column, we can see that there are no duplicate rows.

In [None]:
# Percentage of missing values, listed only for the columns that contain at least one null
null_counts = df.isnull().sum()
columns_with_nulls = df.columns[null_counts > 0]
missing_df = pd.DataFrame({
    "Columns": columns_with_nulls,
    "Values": df[columns_with_nulls].isnull().sum()/len(df)*100
}).reset_index(drop=True)
missing_df

In [None]:
all_missing_df = df.isnull().sum()
all_missing_df = all_missing_df.reset_index()
all_missing_df.T

In [None]:
df.info()

#### Define the numerical and categorical columns in the dataframe

In [None]:
var_numerical = []
var_categorical = []

In [None]:
var_numerical = df.select_dtypes(exclude=['object']).columns
var_categorical = df.select_dtypes(include=['object']).columns

In [None]:
print(var_numerical)

In [None]:
print(var_categorical)

# Exploratory Data Analysis

In [None]:
# Function to label the count on top of each bar in graph
def label_values(ax, spacing=5):
    """Annotate every patch of ``ax`` with its height and its share of the total.

    Each patch in ``ax.patches`` receives a label of the form
    ``"<height>, <percentage of the summed heights>"`` (both with two
    decimals), centred horizontally on the patch.

    Parameters
    ----------
    ax : object
        Anything exposing ``patches`` and ``annotate`` the way the axes
        returned by the seaborn plotting calls in this notebook do.
    spacing : int, optional
        Distance in points between the patch edge and its label.

    Returns
    -------
    None
        The axes object is annotated in place.
    """
    heights = [rect.get_height() for rect in ax.patches]
    total = sum(heights)

    for rect, y_value in zip(ax.patches, heights):
        x_value = rect.get_x() + rect.get_width() / 2

        # Labels sit above positive bars and below negative ones.
        if y_value < 0:
            space, va = -spacing, 'top'
        else:
            space, va = spacing, 'bottom'

        # A zero total (all bars empty) would otherwise divide by zero.
        share = y_value / total * 100 if total else 0.0

        ax.annotate(
            "{:.2f}, {:.2f}".format(y_value, share),
            (x_value, y_value),
            xytext=(0, space),
            textcoords="offset points",
            ha='center',
            va=va)

# i. Univariate Analysis

### Target Variable

In [None]:
# Plot the target variable
sns.distplot(x = df["SalePrice"])
plt.show()
sns.boxplot(y = df["SalePrice"])
plt.show()

We will log-transform the target variable to reduce its skew and the influence of outlier values.

In [None]:
# Plot the target variable
sns.distplot(x = np.log(df["SalePrice"]))
plt.show()
sns.boxplot(y = np.log(df["SalePrice"]))
plt.show()

In [None]:
df[(df["SalePrice"]> 600000)].T

We can see that there are no abnormal values here (no variable is outside its expected range).

In [None]:
# Replace SalePrice with its natural logarithm, computed on the whole column at once
df["SalePrice"] = np.log(df["SalePrice"])

### Categorical Variables

In [None]:
# Countplot for each categorical variable
for column in var_categorical:
    print(column.title())
    plt.figure(figsize=(16, 7))
    ax = sns.countplot(x = df[column])
    label_values(ax)
    plt.show()

### Numerical Variables

In [None]:
# Boxplot for all the numerical variables
for column in var_numerical:
    print(column.title())
    plt.figure(figsize=(16, 7))
    ax = sns.boxplot(x = df[column])
    label_values(ax)
    plt.show()

In [None]:
# Distplot for all the numerical variables
for column in var_numerical:
    print(column.title())
    plt.figure(figsize=(16, 6))
    ax = sns.distplot(x = df[column])
    label_values(ax)
    plt.show()

In [None]:
#### Drop the Id column
df = df.drop(['Id'], axis = 1)
df_test_id = df_test["Id"]
df_test = df_test.drop(['Id'], axis = 1)

In [None]:
var_numerical = list(set(var_numerical) - set(['Id']))

# Log Transform the skewed columns

In [None]:
log_transform_columns = ['MSSubClass',
 'LotFrontage',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal']

In [None]:
# Log-transform the skewed columns in both the train and the test frame.
# Values below 1 (including 0, whose log is -inf) are raised to 1 first, so they
# map to log(1) = 0; this matches the previous "0 if log(x) < 0" rule for
# non-negative inputs without calling np.log twice per value.
# Missing values stay missing and are imputed later.
for col in log_transform_columns:
    for frame in (df, df_test):
        frame[col] = np.log(frame[col].clip(lower=1))

# Handle Missing Values in columns (High missing values only for now)

In [None]:
high_missing_columns = list(missing_df[missing_df["Values"] > 50]["Columns"].values)

In [None]:
high_missing_columns

#### PoolQC: Pool quality <br>
       Ex   Excellent <br>
       Gd	Good <br>
       TA	Average/Typical <br>
       Fa	Fair <br>
       NA	No Pool

In [None]:
# Fill null value with NA as pool is not available in some home. It can be important for our analysis so we will not drop it
df["PoolQC"] = df["PoolQC"].fillna('NA')
df_test["PoolQC"] = df_test["PoolQC"].fillna('NA')

#### Alley: Type of alley access to property

       Grvl	Gravel
       Pave	Paved
       NA 	No alley access

In [None]:
# Fill null value with NA as Alley is not available in some home. It can be important for our analysis so we will not drop it
df["Alley"] = df["Alley"].fillna('NA')
df_test["Alley"] = df_test["Alley"].fillna('NA')

#### Fence: Fence quality
		
       GdPrv	Good Privacy
       MnPrv	Minimum Privacy
       GdWo	Good Wood
       MnWw	Minimum Wood/Wire
       NA	No Fence

In [None]:
# Fill null value with NA as fence is not available in some home. It can be important for our analysis so we will not drop it
df["Fence"] = df["Fence"].fillna('NA')
df_test["Fence"] = df_test["Fence"].fillna('NA')

#### MiscFeature: Miscellaneous feature not covered in other categories
		
       Elev	Elevator
       Gar2	2nd Garage (if not described in garage section)
       Othr	Other
       Shed	Shed (over 100 SF)
       TenC	Tennis Court
       NA	None

In [None]:
# Fill null value with NA as MiscFeature is not available in some home. It can be important for our analysis so we will not drop it
df["MiscFeature"] = df["MiscFeature"].fillna('NA')
df_test["MiscFeature"] = df_test["MiscFeature"].fillna('NA')

# ii. Segmented Univariate Analysis

In [None]:
for column in var_categorical:
    print(column.title())
    plt.figure(figsize=(16, 6))
    ax = sns.boxplot(x = df[column], y = df["SalePrice"])
    label_values(ax)
    plt.show()

# iii. Bivariate Analysis 

In [None]:
for column in var_numerical:
    print(column.title())
    plt.figure(figsize=(16, 6))
    ax = sns.scatterplot(x = df[column], y = df["SalePrice"])
    label_values(ax)
    plt.show()

In [None]:
corr = df[var_numerical].corr()

In [None]:
plt.figure(figsize = (16, 20))
sns.heatmap(corr, annot = True)
plt.show()

There are no highly negatively correlated features.

Highly positively correlated features: <br/>
1. GarageArea v/s GarageCars (0.89) <br/>
2. GarageYrBlt v/s YearBuilt (0.83) <br/>
3. 1stFlrSF v/s TotalBsmtSF (0.81)

So we will drop one variable from each pair of highly correlated features.

In [None]:
df = df.drop(["GarageCars", "GarageYrBlt", "1stFlrSF"], axis = 1)
df_test = df_test.drop(["GarageCars", "GarageYrBlt", "1stFlrSF"], axis = 1)

In [None]:
# We will remove the column name from var_numerical and var_categorical columns 
for col in ["GarageCars", "GarageYrBlt", "1stFlrSF"]:
    if col in var_numerical:
        var_numerical = list(set(var_numerical) - set([col]))
    elif col in var_categorical:
        var_categorical = list(set(var_categorical) - set([col]))

As we can see, some variables show a linear relationship with the target variable. Hence, we can use a regression method.

# Impute missing values based on our analysis

In [None]:
sns.boxplot(x = df["FireplaceQu"], y = df["SalePrice"])
plt.show()

#### FireplaceQu: Fireplace quality

       Ex	Excellent - Exceptional Masonry Fireplace
       Gd	Good - Masonry Fireplace in main level
       TA	Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
       Fa	Fair - Prefabricated Fireplace in basement
       Po	Poor - Ben Franklin Stove
       NA	No Fireplace

In [None]:
# Based on the description of the column, a null value means the home has no fireplace,
# so we impute null values with the 'NA' category in both the train and the test frame
df["FireplaceQu"] = df["FireplaceQu"].fillna('NA')
df_test["FireplaceQu"] = df_test["FireplaceQu"].fillna('NA')

In [None]:
plt.figure(figsize=(16, 10))
sns.boxplot(x = df["Neighborhood"], y= df["LotFrontage"])
plt.show()

In [None]:
# LotFrontage - Numerical variable - Impute it based on the Neighborhood values
# As lot frontage is based on the neighborhood as we can see in the above box plot
# The medians are learned from the training frame only and reused for the test frame,
# so both frames are imputed with the same values. A neighborhood with no observed
# LotFrontage falls back to the overall training median instead of staying null.
lot_frontage_by_neighborhood = df.groupby("Neighborhood")["LotFrontage"].median()
lot_frontage_overall = df["LotFrontage"].median()
for frame in (df, df_test):
    fallback = frame["Neighborhood"].map(lot_frontage_by_neighborhood).fillna(lot_frontage_overall)
    frame["LotFrontage"] = frame["LotFrontage"].fillna(fallback)

In [None]:
remaining_missing_df = df[df.columns[df.isnull().sum()>0]].isnull().sum()/len(df)*100

In [None]:
remaining_missing_df

#### MasVnrType: Masonry veneer type

       BrkCmn	Brick Common
       BrkFace	Brick Face
       CBlock	Cinder Block
       None	None
       Stone	Stone
       
#### MasVnrArea: Masonry veneer area in square feet

#### BsmtExposure: Refers to walkout or garden level walls

       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Minimum Exposure
       No	No Exposure
       NA	No Basement
	
#### BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinished
       NA	No Basement

#### BsmtFinType2: Rating of basement finished area (if multiple types)

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinished
       NA	No Basement

#### BsmtCond: Evaluates the general condition of the basement

       Ex	Excellent
       Gd	Good
       TA	Typical - slight dampness allowed
       Fa	Fair - dampness or some cracking or settling
       Po	Poor - Severe cracking, settling, or wetness
       NA	No Basement
       
#### GarageQual: Garage quality

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor
       NA	No Garage
		
#### GarageCond: Garage condition

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor
       NA	No Garage

#### BsmtQual: Evaluates the height of the basement

       Ex	Excellent (100+ inches)	
       Gd	Good (90-99 inches)
       TA	Typical (80-89 inches)
       Fa	Fair (70-79 inches)
       Po	Poor (<70 inches)
       NA	No Basement
       
#### Electrical: Electrical system

       SBrkr	Standard Circuit Breakers & Romex
       FuseA	Fuse Box over 60 AMP and all Romex wiring (Average)	
       FuseF	60 AMP Fuse Box and mostly Romex wiring (Fair)
       FuseP	60 AMP Fuse Box and mostly knob & tube wiring (poor)
       Mix	Mixed
       
#### GarageType: Garage location
		
       2Types	More than one type of garage
       Attchd	Attached to home
       Basment	Basement Garage
       BuiltIn	Built-In (Garage part of house - typically has room above garage)
       CarPort	Car Port
       Detchd	Detached from home
       NA	No Garage
       
#### GarageFinish: Interior finish of the garage

       Fin	Finished
       RFn	Rough Finished	
       Unf	Unfinished
       NA	No Garage

Based on the column descriptions above, we will impute the missing values.
If a column lists NA as one of its categories, we will impute its null values with 'NA'.

In [None]:
remaining_missing_list = list(remaining_missing_df.keys())

In [None]:
column_to_replace_null_with_NA = ['BsmtExposure', 'GarageFinish', 'GarageType', 'BsmtQual', 'GarageCond', 
                                  'GarageQual', 'BsmtCond', 'BsmtFinType2', 'BsmtFinType1']
df[column_to_replace_null_with_NA] = df[column_to_replace_null_with_NA].fillna('NA')
df_test[column_to_replace_null_with_NA] = df_test[column_to_replace_null_with_NA].fillna('NA')

In [None]:
remaining_missing_list = list(set(remaining_missing_list) - set(column_to_replace_null_with_NA))

In [None]:
# Impute the categorical variables with the mode of its values
# Impute the numerical variables with the median of its values
for col in remaining_missing_list:
    if col in var_numerical:
        df[col] = df[col].fillna(df[col].median())
    elif col in var_categorical:
        df[col] = df[col].fillna(df[col].mode()[0])

# Derived Columns

In [None]:
# Drop the year built column and take age of house into consideration
# The age is measured against the newest build year in the training data. The
# reference year is computed once (not once per row) and shared by both frames,
# so the same house age means the same thing in train and test.
latest_build_year = df["YearBuilt"].max()
for frame in (df, df_test):
    frame["BuiltAge"] = latest_build_year - frame["YearBuilt"]

df = df.drop(['YearBuilt'], axis = 1)
df_test = df_test.drop(['YearBuilt'], axis = 1)

In [None]:
# Replace the remodel year with the number of years since the remodel.
# The reference year is the latest remodel year in the training data, computed
# once and shared by both frames so train and test use the same scale.
latest_remodel_year = df["YearRemodAdd"].max()
for frame in (df, df_test):
    frame["RemodAddAge"] = latest_remodel_year - frame["YearRemodAdd"]

df = df.drop(['YearRemodAdd'], axis = 1)
df_test = df_test.drop(['YearRemodAdd'], axis = 1)

In [None]:
def combine(x, years=None):
    """Build one month-start timestamp per row from a sale month and a sale year.

    Parameters
    ----------
    x : pandas.Series
        Sale months; its index labels are used to look up the matching year.
    years : pandas.Series, optional
        Sale years carrying the same index labels as ``x``. When omitted, the
        ``YrSold`` column of the global training frame ``df`` is used.

    Returns
    -------
    list
        One ``pandas.Timestamp`` per element of ``x``, in index order.
    """
    if years is None:
        years = df["YrSold"]
    # Each row must use its own month; reading x[0] would stamp the first
    # row's month onto every row.
    return [
        pd.to_datetime(str(x.loc[i]) + "/" + str(years.loc[i]), format='%m/%Y')
        for i in x.index
    ]

def combine_test(x, years=None):
    """Build one month-start timestamp per test row from sale month and sale year.

    Parameters
    ----------
    x : pandas.Series
        Sale months; its index labels are used to look up the matching year.
    years : pandas.Series, optional
        Sale years carrying the same index labels as ``x``. When omitted, the
        ``YrSold`` column of the global test frame ``df_test`` is used.

    Returns
    -------
    list
        One ``pandas.Timestamp`` per element of ``x``, in index order.
    """
    if years is None:
        years = df_test["YrSold"]
    # Each row must use its own month; reading x[0] would stamp the first
    # row's month onto every row.
    return [
        pd.to_datetime(str(x.loc[i]) + "/" + str(years.loc[i]), format='%m/%Y')
        for i in x.index
    ]

In [None]:
df["MonthYearSold"] = df[["MoSold"]].apply(combine)
df_test["MonthYearSold"] = df_test[["MoSold"]].apply(combine_test)

In [None]:
# We will take the number of days sold difference into consideration
# The latest sale date of each frame is looked up once instead of once per row;
# the resulting timedeltas are the same as before.
df["SoldDateDiff"] = df["MonthYearSold"].max() - df["MonthYearSold"]
df = df.drop(["MonthYearSold", "MoSold", "YrSold"], axis = 1)

df_test["SoldDateDiff"] = df_test["MonthYearSold"].max() - df_test["MonthYearSold"]
df_test = df_test.drop(["MonthYearSold", "MoSold", "YrSold"], axis = 1)

In [None]:
# Convert the timedeltas to an integer number of days.
# Slicing the string form of each timedelta leaves a text column, although
# SoldDateDiff is treated as a numerical feature further down.
df["SoldDateDiff"] = df["SoldDateDiff"].dt.days

df_test["SoldDateDiff"] = df_test["SoldDateDiff"].dt.days

In [None]:
# We will remove the column name from var_numerical and var_categorical columns 
for col in ["YearBuilt", "YearRemodAdd", "MonthYearSold", "MoSold", "YrSold"]:
    if col in var_numerical:
        var_numerical = list(set(var_numerical) - set([col]))
    elif col in var_categorical:
        var_categorical = list(set(var_categorical) - set([col]))

In [None]:
# Plot to the distribution of the new derived columns
sns.distplot(x = df["BuiltAge"])
plt.show()
sns.scatterplot(x = df["BuiltAge"], y = df["SalePrice"])
plt.show()

In [None]:
# Plot to the distribution of the new derived columns
sns.distplot(x = df["RemodAddAge"])
plt.show()
sns.scatterplot(x = df["RemodAddAge"], y = df["SalePrice"])
plt.show()

In [None]:
# Plot to the distribution of the new derived columns
sns.distplot(x = df["SoldDateDiff"])
plt.show()
sns.scatterplot(x = df["SoldDateDiff"], y = df["SalePrice"])
plt.show()

In [None]:
var_numerical = var_numerical + ['BuiltAge', 'RemodAddAge', 'SoldDateDiff']

In [None]:
# We will use the label encoding on the ordinal variable 
ordinal_col = ['BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1','BsmtFinType2',
 'HeatingQC','KitchenQual', 'FireplaceQu','GarageQual','GarageCond','PoolQC']
nominal_col = list(set(var_categorical) - set(ordinal_col))

In [None]:
df[nominal_col].nunique()

In [None]:
df["Condition2"].value_counts()

We can drop the Condition2 column because almost all of its rows have the same value.

In [None]:
df = df.drop(['Condition2'], axis = 1)

df_test = df_test.drop(['Condition2'], axis = 1)

In [None]:
df["Utilities"].value_counts()

In [None]:
df = df.drop(["Utilities"], axis = 1)

df_test = df_test.drop(['Utilities'], axis = 1)

In [None]:
var_categorical = list(set(var_categorical) - set(['Utilities', 'Condition2']))
nominal_col = list(set(nominal_col)-set(['Utilities', 'Condition2']))

For the columns 'Exterior1st', 'Exterior2nd' and 'Neighborhood', we will use one-hot encoding for multiclass variables. <br/>
Following the approach of the winning solution of the KDD Cup 2009, we limit the number of categories in these 3 variables to their most frequent labels (10 each for 'Exterior1st' and 'Neighborhood', 8 for 'Exterior2nd').

In [None]:
col_with_multiple_labels = ['Exterior1st', 'Exterior2nd', 'Neighborhood']

In [None]:
def top_labels(df, col, label_cnt):
    """One-hot encode the ``label_cnt`` most frequent labels of ``df[col]``.

    For each selected label a 0/1 column named ``"<col>_<label>"`` is added to
    ``df`` in place, and its name is printed. The list of selected labels is
    printed and returned so the same labels can be applied to another frame.
    """
    frequencies = df[col].value_counts().sort_values(ascending=False)
    top = list(frequencies.head(label_cnt).index)
    for label in top:
        indicator_name = col + "_" + str(label)
        df[indicator_name] = np.where(df[col] == label, 1, 0)
        print(indicator_name)
    print(top)
    return top

def top_labels_test(df_test, col, max_col, top):
    """Add to ``df_test`` the 0/1 indicator columns for the labels in ``top``.

    One column named ``"<col>_<label>"`` is created in place per label, and
    each column name is printed. ``max_col`` is accepted but not used.
    """
    print("Top Labels: ", top)
    for label in top:
        indicator_name = col + "_" + str(label)
        df_test[indicator_name] = np.where(df_test[col] == label, 1, 0)
        print(indicator_name)

In [None]:
df["Exterior1st"].value_counts()

In [None]:
top_exterior1st_labels = top_labels(df, 'Exterior1st', 10)
df = df.drop(['Exterior1st'], axis = 1)

top_labels_test(df_test, 'Exterior1st', 10, top_exterior1st_labels)
df_test = df_test.drop(['Exterior1st'], axis = 1)

In [None]:
df["Exterior2nd"].value_counts()

In [None]:
top_exterior2nd_label = top_labels(df, 'Exterior2nd', 8)
df = df.drop(['Exterior2nd'], axis = 1)

top_labels_test(df_test, 'Exterior2nd', 8, top_exterior2nd_label)
df_test = df_test.drop(['Exterior2nd'], axis = 1)

In [None]:
top_neighborhood_labels = top_labels(df, 'Neighborhood', 10)
df = df.drop(['Neighborhood'], axis = 1)

top_labels_test(df_test, 'Neighborhood', 10, top_neighborhood_labels)
df_test = df_test.drop(['Neighborhood'], axis = 1)

In [None]:
nominal_col = list(set(nominal_col) - set(['Exterior1st', 'Exterior2nd', 'Neighborhood']))

Using the same concept of multiclass one hot encoding, we will create one hot encoding for other categorical columns.

In [None]:
df["Functional"].value_counts()

In [None]:
top_functional_labels = top_labels(df, 'Functional', 3)
df = df.drop(['Functional'], axis = 1)

top_labels_test(df_test, 'Functional', 3, top_functional_labels)
df_test = df_test.drop(['Functional'], axis = 1)

In [None]:
df["Heating"].value_counts()

In [None]:
top_heating_labels = top_labels(df, 'Heating', 2)
df = df.drop(['Heating'], axis = 1)

top_labels_test(df_test, 'Heating', 2, top_heating_labels)
df_test = df_test.drop(['Heating'], axis = 1)

In [None]:
df["SaleType"].value_counts()

In [None]:
top_saletype_labels = top_labels(df, 'SaleType', 3)
df = df.drop(['SaleType'], axis = 1)

top_labels_test(df_test, 'SaleType', 3, top_saletype_labels)
df_test = df_test.drop(['SaleType'], axis = 1)

In [None]:
df["RoofMatl"].value_counts()

In [None]:
top_garagecond_labels = top_labels(df, 'RoofMatl', 2)
df = df.drop(['RoofMatl'], axis = 1)

top_labels_test(df_test, 'RoofMatl', 2, top_garagecond_labels)
df_test = df_test.drop(['RoofMatl'], axis = 1)

In [None]:
df["Condition1"].value_counts()

In [None]:
top_condition1_labels = top_labels(df, 'Condition1', 5)
df = df.drop(['Condition1'], axis = 1)

top_labels_test(df_test, 'Condition1', 5, top_condition1_labels)
df_test = df_test.drop(['Condition1'], axis = 1)

In [None]:
nominal_col = list(set(nominal_col)-set(['Condition1', 'RoofMatl','SaleType', 'Heating', 'Functional']))

In [None]:
# Make dummy variables for the nominal columns
df = pd.get_dummies(df, columns=nominal_col, drop_first=True)
df_test = pd.get_dummies(df_test, columns=nominal_col, drop_first=True)

In [None]:
df.head()

In [None]:
# Correlate only numeric and boolean columns: the frame still holds text-valued
# columns at this point, and recent pandas versions raise on them in corr().
corr = df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Calculate the highly correlated features
# https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on

def highly_correlated_features(dataset, threshold):
    """Return the columns whose correlation with an earlier column is >= threshold.

    Columns are scanned in order. A column is flagged when its correlation
    with any earlier column that has not itself been flagged reaches
    ``threshold``. Only numeric and boolean columns take part, so frames that
    still contain text columns are accepted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Minimum correlation for a column to be reported.

    Returns
    -------
    list
        Flagged column names in column order, each listed once.
    """
    corr_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    columns = corr_matrix.columns
    flagged = set()  # names already reported; they no longer act as partners
    high_correlated_features = []
    for i, colname in enumerate(columns):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in flagged:
                flagged.add(colname)
                high_correlated_features.append(colname)
                # One qualifying partner is enough; continuing would list the
                # same column once per partner.
                break
    return high_correlated_features

def highly_negative_correlated_features(dataset, threshold):
    """Return the columns whose correlation with an earlier column is <= threshold.

    Columns are scanned in order. A column is flagged when its correlation
    with any earlier column that has not itself been flagged is at or below
    ``threshold`` (a negative value such as -0.7). Only numeric and boolean
    columns take part, so frames that still contain text columns are accepted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Maximum correlation for a column to be reported.

    Returns
    -------
    list
        Flagged column names in column order, each listed once.
    """
    corr_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    columns = corr_matrix.columns
    flagged = set()  # names already reported; they no longer act as partners
    high_correlated_features = []
    for i, colname in enumerate(columns):
        for j in range(i):
            if corr_matrix.iloc[i, j] <= threshold and columns[j] not in flagged:
                flagged.add(colname)
                high_correlated_features.append(colname)
                # One qualifying partner is enough; continuing would list the
                # same column once per partner.
                break
    return high_correlated_features

In [None]:
positively_high_correlated_value = highly_correlated_features(df, 0.7)
negatively_high_correlated_value = highly_negative_correlated_features(df, -0.7)

In [None]:
plt.figure(figsize=(16, 16))
sns.heatmap(df[list(set(positively_high_correlated_value + negatively_high_correlated_value))].corr(), annot=True)
plt.show()

We can see that SaleType_New and SaleCondition_Partial have a very high correlation (0.99), so we will drop one of them.

In [None]:
df = df.drop(['SaleCondition_Partial'], axis = 1)
df_test = df_test.drop(['SaleCondition_Partial'], axis = 1)

# Impute test missing values

In [None]:
remaining_missing_df_test = df_test[df_test.columns[df_test.isnull().sum()>0]].isnull().sum()/len(df_test)*100
print(remaining_missing_df_test)
remaining_missing_df_test = list(remaining_missing_df_test.index)
remaining_missing_df_test

In [None]:
for col in remaining_missing_df_test:
    if col in var_numerical:
        df_test[col] = df_test[col].fillna(0.0)
    elif col in var_categorical:
        df_test[col] = df_test[col].fillna('NA')

# Label Encoding for ordinal columns

In [None]:
# Label Encoding categorical variables from x
from sklearn.preprocessing import LabelEncoder
# Feature scaling
from sklearn.preprocessing import MinMaxScaler, RobustScaler

In [None]:
def label_imputation(x):
    """Map an ordinal category label to its integer rank.

    Three label families used by the ordinal columns are covered:

    * quality / condition: Ex=5, Gd=4, TA=3, Fa=2, Po=1
    * basement exposure:   Gd=4, Av=3, Mn=2, No=1
    * basement finish:     GLQ=6, ALQ=5, BLQ=4, Rec=3, LwQ=2, Unf=1

    Any other value (including 'NA' and missing values) maps to 0. The
    exposure and finish labels previously all fell through to 0, which erased
    the ordering of those columns.

    Parameters
    ----------
    x : object
        A single category label.

    Returns
    -------
    int
        Rank of the label, or 0 when the label is not recognised.
    """
    ordinal_rank = {
        # quality / condition scale
        "Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1,
        # basement exposure scale ("Gd" is shared with the quality scale)
        "Av": 3, "Mn": 2, "No": 1,
        # basement finish type scale
        "GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1,
    }
    return ordinal_rank.get(x, 0)

In [None]:
for col in ordinal_col:
    df[col] = df[col].apply(label_imputation)
    df_test[col] = df_test[col].apply(label_imputation)

# Form target and predictor variables

In [None]:
# Divide the train data into X and y

y_train = df.pop('SalePrice')
X_train = df

In [None]:
# Divide the test data into X
X_test = df_test

In [None]:
X_train.shape, y_train.shape, X_test.shape

In [None]:
# Give the test matrix exactly the training columns, in the training order.
# The dummy columns were created separately for each frame, so the test frame
# can miss some training columns and carry extra ones of its own. Extra test
# columns are dropped, and dummy columns that never occur in the test data are
# added as all-zero columns, so no training feature has to be discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
var_numerical = list(set(var_numerical) - set(['SalePrice']))

# Scaling numerical variables

In [None]:
# We use RobustScaler for all the numerical variables, as some variables show a very high range of
# values and many of them are not normally distributed.
# RobustScaler is chosen here rather than MinMaxScaler because it is less sensitive to extreme values.
scaler = RobustScaler()

In [None]:
# Fit and Transform the data
# Fit learns the statistics RobustScaler needs (median and quantile range by default) from the training data
# Transform then applies that scaling to the training values
X_train[var_numerical] = scaler.fit_transform(X_train[var_numerical])
# The test data is only transformed, reusing the statistics fitted on the training data
X_test[var_numerical] = scaler.transform(X_test[var_numerical])

# Model Building

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
# Importing RFE (recursive feature elimination)
from sklearn.feature_selection import RFE

In [None]:
# Model evaluation function
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.model_selection import cross_val_score
# Grid Search CV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# K Fold cross validation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# i. Linear Regression

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

metric_lr = []
r2_train_lr = r2_score(y_train, y_pred_train_lr)
print("R2 Train Score: ", r2_train_lr)
metric_lr.append(r2_train_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train_lr)
print("Mean Squared Train Error: ", mse_train_lr)
metric_lr.append(mse_train_lr**0.5)

#### Linear Regression feature coefficient values in descending order

In [None]:
linear_beta = pd.DataFrame(index=X_train.columns)
linear_beta.rows = X_train.columns
linear_beta["Linear Regression"] = abs(lr.coef_)

In [None]:
pd.set_option('display.max_rows', None)
linear_beta.sort_values(by = 'Linear Regression', ascending=False)

# ii. Linear Regression with RFE 

In [None]:
# running RFE
# For the first model we keep 50 features.
# n_features_to_select is passed by keyword: current scikit-learn releases
# do not accept it as a positional argument.
rfe = RFE(lr, n_features_to_select=50)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Columns with RFE Support as True
col = X_train.columns[rfe.support_]
len(col)

In [None]:
lr_with_rfe = LinearRegression()
lr_with_rfe.fit(X_train[col], y_train)
y_train_pred_rfe = lr_with_rfe.predict(X_train[col])
y_test_pred_rfe = lr_with_rfe.predict(X_test[col])

print("Linear regression train r2_score: ", r2_score(y_train, y_train_pred_rfe))

We see a lower R2 score with 50 features, so we will not drop any more features.

# iii. Ridge Regression with hyperparameter tuning

In [None]:
seed = 50

In [None]:
# Initialisation of ridge linear regression model
ridge_lr = Ridge(random_state = seed)

In [None]:
# Create the param grid for ridge regression: candidate values for the regularisation strength alpha
param_ridge_lr = {
    'alpha': [0.0001, 0.0002, 0.0004, 0.0008, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]
}
print(param_ridge_lr)

In [None]:
folds_ridge = KFold(n_splits = 4, shuffle = True, random_state=100)

In [None]:
grid_ridge = GridSearchCV(estimator = ridge_lr, scoring= 'neg_root_mean_squared_error', param_grid = param_ridge_lr, cv = folds_ridge, 
                           verbose=0, return_train_score=True, n_jobs=3)
grid_ridge.fit(X_train, y_train)

In [None]:
pd.DataFrame(grid_ridge.cv_results_)[['param_alpha', 'mean_test_score', 'mean_train_score']]

#### Ridge regression with best parameter 

In [None]:
#Fitting Ridge model for best ridge parameter and printing coefficients which have been penalised
alpha = grid_ridge.best_estimator_.alpha
ridge = Ridge(alpha=alpha, random_state=seed)

ridge = ridge.fit(X_train, y_train)
ridge

In [None]:
y_pred_train = ridge.predict(X_train)
y_pred_test_ridge = ridge.predict(X_test)

metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print("R2 Train Score: ", r2_train_lr)
metric2.append(r2_train_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print("Mean Squared Train Error: ", mse_train_lr)
metric2.append(mse_train_lr**0.5)

In [None]:
sns.scatterplot(y = y_train - y_pred_train, x = y_pred_train)
plt.show()

In [None]:
sns.scatterplot(y = y_train , x = y_pred_train)
plt.ylabel("Y-Train")
plt.xlabel("Y-Train-Predict")
plt.title("Ridge Regression (train vs pred train)")
plt.show()

#### Ridge with best alpha feature coefficient values in descending order

In [None]:
ridge_beta = pd.DataFrame(index=X_train.columns)
ridge_beta.rows = X_train.columns
ridge_beta["Ridge with best alpha"] = abs(ridge.coef_)
pd.set_option('display.max_rows', None)
ridge_beta = ridge_beta.sort_values(by = 'Ridge with best alpha', ascending=False)
ridge_beta

In [None]:
len(ridge_beta[ridge_beta["Ridge with best alpha"]==0])

# iv. Lasso Regression with hyperparameter tuning

In [None]:
# Initialise the lasso model
lasso_lr = Lasso(random_state = 50)

In [None]:
# Create the param grid for lasso regression: candidate values for the regularisation strength alpha
param_lasso_lr = {
    'alpha': [0.0001, 0.0002, 0.0004, 0.0008, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]
}
print(param_lasso_lr)

In [None]:
folds_lasso = KFold(n_splits = 3, shuffle = True, random_state=100)

In [None]:
grid_lasso = GridSearchCV(estimator = lasso_lr, scoring= 'neg_root_mean_squared_error', param_grid = param_lasso_lr, 
                          cv = folds_lasso, verbose=0, return_train_score=True, n_jobs=3)
grid_lasso.fit(X_train, y_train)

In [None]:
pd.DataFrame(grid_lasso.cv_results_)[['param_alpha', 'mean_test_score', 'mean_train_score']]

#### Lasso regression with best parameter 

In [None]:
#Fitting lasso model for best alpha and printing coefficients which have been penalised
alpha = grid_lasso.best_estimator_.alpha
lasso = Lasso(alpha=alpha)

lasso = lasso.fit(X_train, y_train)
lasso

In [None]:
y_pred_train_lasso = lasso.predict(X_train)
y_pred_test_lasso = lasso.predict(X_test)

metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train_lasso)
print("R2 Train Score: ", r2_train_lr)
metric3.append(r2_train_lr)


mse_train_lr = mean_squared_error(y_train, y_pred_train_lasso)
print("Mean Squared Train Error: ", mse_train_lr)
metric3.append(mse_train_lr**0.5)

In [None]:
sns.scatterplot(y = y_train , x = y_pred_train_lasso)
plt.ylabel("Y-Train")
plt.xlabel("Y-Train-Predict")
plt.title("Lasso Regression (train vs pred train)")
plt.show()

#### Lasso with best parameter feature coefficient values in descending order

In [None]:
lasso_beta = pd.DataFrame(index=X_train.columns)
lasso_beta.rows = X_train.columns
lasso_beta["Lasso with the best parameter"] = abs(lasso.coef_)

In [None]:
pd.set_option('display.max_rows', None)
lasso_beta = lasso_beta.sort_values(by = 'Lasso with the best parameter', ascending=False)
lasso_beta

In [None]:
len(lasso_beta[lasso_beta["Lasso with the best parameter"]==0])

# v. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr = RandomForestRegressor(n_estimators=100, random_state=50)
rfr.fit(X_train, y_train)

In [None]:
y_pred_train_rfr = rfr.predict(X_train)
y_pred_test_rfr = rfr.predict(X_test)

metric4 = []
r2_train_rfr = r2_score(y_train, y_pred_train_rfr)
print("R2 Train Score: ", r2_train_rfr)
metric4.append(r2_train_rfr)


mse_train_rfr = mean_squared_error(y_train, y_pred_train_rfr)
print("Mean Squared Train Error: ", mse_train_rfr)
metric4.append(mse_train_rfr**0.5)

# vi. XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

In [None]:
xgb = XGBRegressor(n_estimators=100, random_state=50)
xgb.fit(X_train, y_train)

In [None]:
y_pred_train_xgb = xgb.predict(X_train)
y_pred_test_xgb = xgb.predict(X_test)

metric5 = []
r2_train_xgb = r2_score(y_train, y_pred_train_xgb)
print("R2 Train Score: ", r2_train_xgb)
metric5.append(r2_train_xgb)


mse_train_xgb = mean_squared_error(y_train, y_pred_train_xgb)
print("Mean Squared Train Error: ", mse_train_xgb)
metric5.append(mse_train_xgb**0.5)

#### Compare the training metrics of all the models

In [None]:
# Creating a table which contains all the metrics
# Each metric list holds the R2 score followed by the square root of the mean
# squared error, so the second row is labelled RMSE rather than MSE.

lr_table = {'Metric': ['R2 Score (Train)',
                       'RMSE (Train)'], 
            'Linear Regression': metric_lr
           }
lr_metric = pd.DataFrame(lr_table ,columns = ['Metric', 'Linear Regression'] )

rg_metric = pd.Series(metric2, name = 'Ridge Regression')
ls_metric = pd.Series(metric3, name = 'Lasso Regression')
rfr_metric = pd.Series(metric4, name = 'Random Forest Regression')
xgb_metric = pd.Series(metric5, name = 'XGBoost Regression')

final_metric = pd.concat([lr_metric, rg_metric, ls_metric, rfr_metric, xgb_metric], axis = 1)

final_metric

#### Lets observe the changes in the coefficients after regularization

In [None]:
betas = pd.DataFrame(index=X_train.columns)

In [None]:
betas.rows = X_train.columns

In [None]:
betas['Linear Regression'] = lr.coef_
betas['Ridge'] = ridge.coef_
betas['Lasso'] = lasso.coef_

In [None]:
pd.set_option('display.max_rows', None)
betas

While predicting we have to be careful about transforming the log prediction back to its normal form.

# Predictions

In [None]:
df_sub = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
df_sub.head()

In [None]:
df_sub_lr = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_pred_test_lr)
})
df_sub_lr.to_csv('submission_lr.csv', index = False)

In [None]:
df_sub_rfe = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_test_pred_rfe)
})
df_sub_rfe.to_csv('submission_rfe.csv', index = False)

In [None]:
# Build the lasso submission. The model predicts log(SalePrice), so the
# prediction is mapped back with np.exp, as in the other submission cells.
df_sub_lasso = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_pred_test_lasso)
})
df_sub_lasso.to_csv('submission_lasso.csv', index = False)

In [None]:
df_sub_ridge = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_pred_test_ridge)
})
df_sub_ridge.to_csv('submission_ridge.csv', index = False)

In [None]:
df_sub_rfr = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_pred_test_rfr)
})
df_sub_rfr.to_csv('submission_rfr.csv', index = False)

In [None]:
df_sub_xgb = pd.DataFrame({
    "Id": df_test_id,
    "SalePrice": np.exp(y_pred_test_xgb)
})
df_sub_xgb.to_csv('submission_xgb.csv', index = False)