In [733]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [734]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer

In [735]:
# Load the training data and discard the 'Id' column in the same expression.
df = pd.read_csv('../../../../Datasets/home-data-for-ml-course/train.csv').drop(columns=['Id'])

## Null Value Treatment

In [736]:
def get_null_count(df):
    """
        Summarise missing values per column.

        Parameters:
        df (pd.DataFrame): The DataFrame to inspect.

        Returns:
        pd.DataFrame: One row per column that has at least one null, with the
                      columns 'Nulls' (count) and '% Nulls' (share of rows,
                      rounded to 2 decimals), sorted by 'Nulls' descending.
    """
    n_rows = len(df.index)
    summary = {}
    per_column = ((name, df[name].isnull().sum()) for name in df.columns)
    for name, missing in per_column:
        # Columns without any missing value are left out of the report.
        if missing == 0:
            continue
        summary[name] = [missing, round((missing/n_rows)*100,2)]

    report = pd.DataFrame(summary, index=['Nulls', '% Nulls']).T
    return report.sort_values(by='Nulls', ascending=False)

In [737]:
def drop_columns(df, cols=None):
    '''
        Drops specified columns from a DataFrame.

        The input DataFrame is left untouched; a new DataFrame without the
        requested columns is returned.

        Parameters:
        df (pd.DataFrame): The DataFrame from which columns will be dropped.
        cols (list, optional): Column names to drop. When omitted, defaults to
            ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'PoolQC', 'MiscFeature'].

        Returns:
        df (pd.DataFrame): A copy of `df` without the dropped columns.

        Raises:
        KeyError: If any requested column is not present in `df`.
    '''
    # A None sentinel replaces the former mutable list default.
    if cols is None:
        cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'PoolQC', 'MiscFeature']
    # No inplace=True: the caller's frame must survive so the cell can be re-derived.
    return df.drop(columns=cols)


In [738]:
# Drop the columns named by drop_columns' default `cols` argument.
df = drop_columns(df)

In [739]:
def fill_empty(df):
    """
        Fills missing values in a fixed set of categorical columns with a
        column-specific placeholder label.

        The placeholders are:
            - 'Alley'       -> 'No Alley'
            - 'Fence'       -> 'No Fence'
            - 'FireplaceQu' -> 'No FirePlace'
            - the garage and basement descriptor columns -> 'No'

        Columns from that set which are absent from `df` are skipped, and all
        other columns are passed through unchanged. The input DataFrame is not
        modified.

        Parameters:
        df (pd.DataFrame): The DataFrame whose missing values will be filled.

        Returns:
        df (pd.DataFrame): A new DataFrame with the placeholders filled in.
    """
    # One mapping instead of two parallel lists, so a column and its
    # placeholder can never drift out of alignment.
    placeholders = {
        'Alley': 'No Alley',
        'Fence': 'No Fence',
        'FireplaceQu': 'No FirePlace',
        'GarageType': 'No',
        'GarageFinish': 'No',
        'GarageQual': 'No',
        'GarageCond': 'No',
        'BsmtFinType2': 'No',
        'BsmtExposure': 'No',
        'BsmtFinType1': 'No',
        'BsmtCond': 'No',
        'BsmtQual': 'No',
    }
    applicable = {col: label for col, label in placeholders.items() if col in df.columns}

    # DataFrame.fillna with a dict fills each listed column with its own value
    # and returns a new frame, avoiding the dtype-preserving `.loc` write-back.
    return df.fillna(value=applicable)

In [740]:
def fill_masvnr(df):
    """
        Fills missing values in the `MasVnrType` and `MasVnrArea` columns of a DataFrame.

        This function performs the following operations:

        1. Where `MasVnrArea` is `0.0`, `1.0`, or `NaN` AND `MasVnrType` is missing,
           `MasVnrType` becomes `'No Vnr'`. Existing type labels are never overwritten.
        2. Fills any missing values in `MasVnrArea` with `0`.
        3. Fills any remaining missing values in `MasVnrType` with `'BrkFace'`.

        The input DataFrame is not modified.

        Parameters:
        -----------
        df : pd.DataFrame
            The input DataFrame containing `MasVnrType` and `MasVnrArea` columns.

        Returns:
        --------
        pd.DataFrame
            A new DataFrame with missing values handled in `MasVnrType` and `MasVnrArea`.
    """
    area = df['MasVnrArea']
    vnr_type = df['MasVnrType']

    # Rows treated as having no veneer: area 0, area 1, or area unknown.
    # NOTE(review): area == 1.0 is presumably a data-entry artefact -- confirm.
    no_veneer = area.isin([0.0, 1.0]) | area.isna()

    # Only missing types are relabelled; a recorded type is kept even if the
    # area says "no veneer".
    vnr_type = vnr_type.mask(no_veneer & vnr_type.isna(), 'No Vnr')

    return df.assign(
        MasVnrArea=area.fillna(0),
        # Anything still missing has a real (non-0/1) area but no recorded type.
        MasVnrType=vnr_type.fillna('BrkFace'),
    )

## Feature Engineering

In [741]:
def merge_bathromms(df):
    """
        Merges the bathroom-related columns into a single column `TotalBathrooms`.

        Full bathrooms count as 1 and half bathrooms as 0.5:
            - 'FullBath': Count of full bathrooms above ground.
            - 'HalfBath': Count of half bathrooms above ground.
            - 'BsmtFullBath': Count of full bathrooms in the basement.
            - 'BsmtHalfBath': Count of half bathrooms in the basement.

        The input DataFrame is not modified.

        Parameters:
        df (pd.DataFrame): The input DataFrame; must contain the four columns above.

        Returns:
        pd.DataFrame: A new DataFrame with a `TotalBathrooms` column appended and the
                      four original bathroom-related columns removed. Any other
                      columns are kept.
        """
    bathroom_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
    total = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (df['BsmtHalfBath'] * 0.5)
    # Drop only the merged inputs. The previous version dropped every column
    # that existed on entry, which wiped unrelated columns from wider frames.
    return df.drop(columns=bathroom_cols).assign(TotalBathrooms=total)

In [742]:
def add_new_features(df):
    """
        Derives age, area and quality features from the raw house columns.

        The derived columns are written onto `df` itself, and a frame holding
        only the seven derived columns is returned.

        Parameters:
        df (pd.DataFrame): Must contain the year, floor/basement area, porch,
            deck, pool, 'SalePrice', 'OverallCond' and 'OverallQual' columns
            read below.

        Returns:
        pd.DataFrame: Columns 'HouseAge', 'RemodeledAge', 'TotalSquareFootage',
                      'TotalPorchArea', 'PricePerSF', 'TotalOutdoorSpace',
                      'QualityScore' (in that order).
    """
    year_built = df['YearBuilt']

    # Age of the house at the time of sale.
    df['HouseAge'] = np.abs(year_built - df['YrSold'])
    # Years between construction and the recorded remodel year.
    df['RemodeledAge'] = np.abs(year_built - df['YearRemodAdd'])

    living_area = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
    df['TotalSquareFootage'] = living_area

    porch_area = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    df['TotalPorchArea'] = porch_area
    df['TotalOutdoorSpace'] = df['TotalPorchArea'] + df['WoodDeckSF'] + df['PoolArea']

    # NOTE(review): PricePerSF is computed from SalePrice; if SalePrice is the
    # prediction target this feature leaks it -- confirm before modelling.
    df['PricePerSF'] = df['SalePrice']/df['TotalSquareFootage']
    df['QualityScore'] = df['OverallCond'] * df['OverallQual']

    derived = ['HouseAge', 'RemodeledAge', 'TotalSquareFootage', 'TotalPorchArea', 'PricePerSF', 'TotalOutdoorSpace' ,'QualityScore']
    df = df[derived]
    return df
    

In [759]:
def add_binary_flag(df):
    """
        Adds 0/1 indicator columns derived from existing numeric columns.

        The flags are written onto `df` itself, and a frame holding only the
        five flag columns is returned.

        Parameters:
        df (pd.DataFrame): Must contain 'TotalPorchArea', 'GarageArea',
            'Fireplaces' and 'HouseAge'.

        Returns:
        pd.DataFrame: Columns 'HasPorch', 'HasGarage', 'HasFirePlace',
                      'NewHouse', 'OldHouse'.
    """
    # flag name -> (source column, rule producing 1 or 0)
    # A 'HasPool' flag was tried and removed for low correlation.
    flag_rules = {
        'HasPorch': ('TotalPorchArea', lambda x: 1 if x > 0 else 0),
        'HasGarage': ('GarageArea', lambda x: 1 if x > 0 else 0),
        'HasFirePlace': ('Fireplaces', lambda x: 1 if x > 0 else 0),
        'NewHouse': ('HouseAge', lambda x: 1 if x < 10 else 0),
        'OldHouse': ('HouseAge', lambda x: 1 if x > 50 else 0),
    }
    for flag_name, (source_col, rule) in flag_rules.items():
        df[flag_name] = df[source_col].apply(rule)

    flags_only = df[list(flag_rules)]
    return flags_only

In [743]:
## Pipelines

In [760]:
def get_bathroom_name(function_transformer, features_names_in):
    """Feature-name callback: always reports the single output column 'TotalBathrooms'.

    Both arguments are accepted for the callback signature and ignored.
    """
    output_names = ['TotalBathrooms']
    return output_names

def get_features_names_out(function_transformer, features_names_in):
    """Feature-name callback: reports the seven engineered feature names.

    Both arguments are accepted for the callback signature and ignored.
    """
    output_names = [
        'HouseAge',
        'RemodeledAge',
        'TotalSquareFootage',
        'TotalPorchArea',
        'PricePerSF',
        'TotalOutdoorSpace',
        'QualityScore',
    ]
    return output_names

def get_binary_flags_cloumn_names(function_transformer, features_names_in):
    """Feature-name callback: reports the five binary flag column names.

    Both arguments are accepted for the callback signature and ignored.
    """
    output_names = [
        'HasPorch',
        'HasGarage',
        'HasFirePlace',
        'NewHouse',
        'OldHouse',
    ]
    return output_names

In [745]:
# Placeholder labels first, then the masonry-veneer specific rules.
fill_missing_pipeline = Pipeline(steps=[
    ('fill_with_placeholder', FunctionTransformer(func=fill_empty, feature_names_out='one-to-one')),
    ('masvnr_transform', FunctionTransformer(func=fill_masvnr, feature_names_out='one-to-one')),
])

# Imputes each column with its most frequent value.
categorical_imputer_pipeline = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
])

# Imputes each column with its median.
numerical_imputer_pipeline = Pipeline(steps=[
    ('numerical_imputer', SimpleImputer(strategy='median')),
])




In [761]:
# Routes column groups to their treatment; columns not listed here are not
# part of the output (no `remainder` is requested).
fillna_preprocessing = ColumnTransformer([
    (
        'fill_missing_pipeline',
        fill_missing_pipeline,
        [
            'Alley', 'Fence', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
            'MasVnrArea', 'MasVnrType',
        ],
    ),
    # NOTE(review): GarageYrBlt is a year but is imputed with the
    # most-frequent strategy alongside Electrical -- confirm this is intended.
    (
        'categorical_imputer_pipeline',
        categorical_imputer_pipeline,
        ['Electrical', 'GarageYrBlt'],
    ),
    (
        'numerical_imputer_pipeline',
        numerical_imputer_pipeline,
        ['LotFrontage'],
    ),
    (
        'merge_bathrooms',
        FunctionTransformer(func=merge_bathromms, feature_names_out=get_bathroom_name),
        ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'],
    ),
    # NOTE(review): SalePrice is fed in to build PricePerSF; if SalePrice is
    # the prediction target this leaks it into the features -- confirm.
    (
        'new_features',
        FunctionTransformer(func=add_new_features, feature_names_out=get_features_names_out),
        [
            'YearBuilt', 'YrSold', 'YearRemodAdd',
            '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'WoodDeckSF', 'PoolArea',
            'SalePrice',
            'OverallCond', 'OverallQual',
        ],
    ),
])

In [747]:
# Fit the column transformer on df and apply it in one step.
processed_df = fillna_preprocessing.fit_transform(df)

In [748]:
# Wrap the transformed output in a DataFrame labelled with the transformer's
# output feature names (these carry a '<transformer>__' prefix).
pdf = pd.DataFrame(processed_df, columns=fillna_preprocessing.get_feature_names_out())

In [749]:
# Keep only the part after the last '__', i.e. strip the transformer prefix.
cols = [prefixed_name.rsplit('__', 1)[-1] for prefixed_name in pdf.columns]

In [750]:
cols

['Alley',
 'Fence',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'BsmtFinType2',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtCond',
 'BsmtQual',
 'MasVnrArea',
 'MasVnrType',
 'Electrical',
 'GarageYrBlt',
 'LotFrontage',
 'TotalBathrooms',
 'HouseAge',
 'RemodeledAge',
 'TotalSquareFootage',
 'TotalPorchArea',
 'PricePerSF',
 'TotalOutdoorSpace',
 'QualityScore']

In [751]:
pdf

Unnamed: 0,fill_missing_pipeline__Alley,fill_missing_pipeline__Fence,fill_missing_pipeline__FireplaceQu,fill_missing_pipeline__GarageType,fill_missing_pipeline__GarageFinish,fill_missing_pipeline__GarageQual,fill_missing_pipeline__GarageCond,fill_missing_pipeline__BsmtFinType2,fill_missing_pipeline__BsmtExposure,fill_missing_pipeline__BsmtFinType1,...,categorical_imputer_pipeline__GarageYrBlt,numerical_imputer_pipeline__LotFrontage,merge_bathrooms__TotalBathrooms,new_features__HouseAge,new_features__RemodeledAge,new_features__TotalSquareFootage,new_features__TotalPorchArea,new_features__PricePerSF,new_features__TotalOutdoorSpace,new_features__QualityScore
0,No Alley,No Fence,No FirePlace,Attchd,RFn,TA,TA,Unf,No,GLQ,...,2003.0,65.0,3.5,5.0,0.0,2566.0,61.0,81.254871,61.0,35.0
1,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Gd,ALQ,...,1976.0,80.0,2.5,31.0,0.0,2524.0,0.0,71.909667,298.0,48.0
2,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Mn,GLQ,...,2001.0,68.0,3.5,7.0,1.0,2706.0,42.0,82.594235,42.0,35.0
3,No Alley,No Fence,Gd,Detchd,Unf,TA,TA,Unf,No,ALQ,...,1998.0,60.0,2.0,91.0,55.0,2473.0,307.0,56.611403,307.0,35.0
4,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Av,GLQ,...,2000.0,84.0,3.5,8.0,0.0,3343.0,84.0,74.783129,276.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,No,Unf,...,1999.0,62.0,2.5,8.0,1.0,2600.0,40.0,67.307692,40.0,30.0
1456,No Alley,MnPrv,TA,Attchd,Unf,TA,TA,Rec,No,ALQ,...,1978.0,85.0,3.0,32.0,10.0,3615.0,0.0,58.091286,349.0,36.0
1457,No Alley,GdPrv,Gd,Attchd,RFn,TA,TA,Unf,No,GLQ,...,1941.0,66.0,2.0,69.0,65.0,3492.0,60.0,76.317297,60.0,63.0
1458,No Alley,No Fence,No FirePlace,Attchd,Unf,TA,TA,Rec,Mn,GLQ,...,1950.0,68.0,2.0,60.0,46.0,2156.0,112.0,65.920686,478.0,30.0


In [752]:
pdf.shape

(1460, 25)

In [753]:
cols

['Alley',
 'Fence',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'BsmtFinType2',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtCond',
 'BsmtQual',
 'MasVnrArea',
 'MasVnrType',
 'Electrical',
 'GarageYrBlt',
 'LotFrontage',
 'TotalBathrooms',
 'HouseAge',
 'RemodeledAge',
 'TotalSquareFootage',
 'TotalPorchArea',
 'PricePerSF',
 'TotalOutdoorSpace',
 'QualityScore']

In [754]:
pdf.columns

Index(['fill_missing_pipeline__Alley', 'fill_missing_pipeline__Fence',
       'fill_missing_pipeline__FireplaceQu',
       'fill_missing_pipeline__GarageType',
       'fill_missing_pipeline__GarageFinish',
       'fill_missing_pipeline__GarageQual',
       'fill_missing_pipeline__GarageCond',
       'fill_missing_pipeline__BsmtFinType2',
       'fill_missing_pipeline__BsmtExposure',
       'fill_missing_pipeline__BsmtFinType1',
       'fill_missing_pipeline__BsmtCond', 'fill_missing_pipeline__BsmtQual',
       'fill_missing_pipeline__MasVnrArea',
       'fill_missing_pipeline__MasVnrType',
       'categorical_imputer_pipeline__Electrical',
       'categorical_imputer_pipeline__GarageYrBlt',
       'numerical_imputer_pipeline__LotFrontage',
       'merge_bathrooms__TotalBathrooms', 'new_features__HouseAge',
       'new_features__RemodeledAge', 'new_features__TotalSquareFootage',
       'new_features__TotalPorchArea', 'new_features__PricePerSF',
       'new_features__TotalOutdoorSpace', 

In [755]:
# Start from an empty frame; it is populated in the next cell.
df_transformed = pd.DataFrame()

In [756]:
# Fill df_transformed from pdf, using the prefix-free names in `cols` as column labels.
df_transformed[cols] = pdf

In [757]:
df_transformed

Unnamed: 0,Alley,Fence,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,BsmtFinType2,BsmtExposure,BsmtFinType1,...,GarageYrBlt,LotFrontage,TotalBathrooms,HouseAge,RemodeledAge,TotalSquareFootage,TotalPorchArea,PricePerSF,TotalOutdoorSpace,QualityScore
0,No Alley,No Fence,No FirePlace,Attchd,RFn,TA,TA,Unf,No,GLQ,...,2003.0,65.0,3.5,5.0,0.0,2566.0,61.0,81.254871,61.0,35.0
1,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Gd,ALQ,...,1976.0,80.0,2.5,31.0,0.0,2524.0,0.0,71.909667,298.0,48.0
2,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Mn,GLQ,...,2001.0,68.0,3.5,7.0,1.0,2706.0,42.0,82.594235,42.0,35.0
3,No Alley,No Fence,Gd,Detchd,Unf,TA,TA,Unf,No,ALQ,...,1998.0,60.0,2.0,91.0,55.0,2473.0,307.0,56.611403,307.0,35.0
4,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,Av,GLQ,...,2000.0,84.0,3.5,8.0,0.0,3343.0,84.0,74.783129,276.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,No Alley,No Fence,TA,Attchd,RFn,TA,TA,Unf,No,Unf,...,1999.0,62.0,2.5,8.0,1.0,2600.0,40.0,67.307692,40.0,30.0
1456,No Alley,MnPrv,TA,Attchd,Unf,TA,TA,Rec,No,ALQ,...,1978.0,85.0,3.0,32.0,10.0,3615.0,0.0,58.091286,349.0,36.0
1457,No Alley,GdPrv,Gd,Attchd,RFn,TA,TA,Unf,No,GLQ,...,1941.0,66.0,2.0,69.0,65.0,3492.0,60.0,76.317297,60.0,63.0
1458,No Alley,No Fence,No FirePlace,Attchd,Unf,TA,TA,Rec,Mn,GLQ,...,1950.0,68.0,2.0,60.0,46.0,2156.0,112.0,65.920686,478.0,30.0


### Feature Engineering

In [633]:
def add_binary_flag(df):
    """
        Adds 0/1 indicator columns derived from existing numeric columns.

        The flags are written onto `df` itself, and a frame holding only the
        five flag columns is returned.

        NOTE(review): this is a second definition of `add_binary_flag`; on a
        top-to-bottom run it replaces the earlier one, so it must return the
        same result. Consider deleting one of the two definitions.

        Parameters:
        df (pd.DataFrame): Must contain 'TotalPorchArea', 'GarageArea',
            'Fireplaces' and 'HouseAge'.

        Returns:
        pd.DataFrame: Columns 'HasPorch', 'HasGarage', 'HasFirePlace',
                      'NewHouse', 'OldHouse'.
    """
    # A 'HasPool' flag was tried and removed for low correlation.
    df['HasPorch'] = df['TotalPorchArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasGarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasFirePlace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
    df['NewHouse'] = df['HouseAge'].apply(lambda x: 1 if x < 10 else 0)
    df['OldHouse'] = df['HouseAge'].apply(lambda x: 1 if x > 50 else 0)

    # Previously this definition fell off the end and returned None.
    return df[['HasPorch', 'HasGarage', 'HasFirePlace', 'NewHouse', 'OldHouse']]

In [40]:
def get_season_(month_number):
    """
        Maps a calendar month number (1-12) to a season label.

        Uses three-month seasons: Dec-Feb 'winter', Mar-May 'spring',
        Jun-Aug 'summer', Sep-Nov 'autumn'.
        NOTE(review): this assumes northern-hemisphere seasons -- confirm for the data.

        Parameters:
        month_number (int): Month of the year, 1 = January ... 12 = December.

        Returns:
        str or None: The season label, or None when `month_number` is not a
                     month in 1-12 (e.g. a missing value).
    """
    season_by_month = {
        12: "winter", 1: "winter", 2: "winter",
        3: 'spring', 4: 'spring', 5: 'spring',
        # June-August is summer and September-November is autumn; the two
        # labels were previously swapped.
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
    }
    return season_by_month.get(month_number)


def months_sold_to_season(df, col='MoSold'):
    """
        Adds a 'SeasonOfSale' column derived from a month-number column.

        The column is written onto `df` itself and the same DataFrame is returned.

        Parameters:
        df (pd.DataFrame): The input DataFrame.
        col (str, optional): Name of the column holding the month number (1-12).
            Defaults to 'MoSold'.

        Returns:
        pd.DataFrame: `df` with the 'SeasonOfSale' column added.
    """
    # Honour `col`; the column name used to be hard-coded to 'MoSold'.
    df['SeasonOfSale'] = df[col].apply(get_season_)
    return df
    

In [41]:
def fill_neighbour_median_price(df, col='Neighborhood', target="SalePrice"):
    """
        Adds a 'NeighborhoodMedianPrices' column holding, for every row, the
        median of `target` within that row's `col` group.

        The column is written onto `df` itself and the same DataFrame is returned.

        NOTE(review): the medians are computed from `target` over every row of
        `df`; if this runs before a train/validation split it leaks target
        information -- confirm where it is called.

        Parameters:
        df (pd.DataFrame): The input DataFrame.
        col (str, optional): Grouping column. Defaults to 'Neighborhood'.
        target (str, optional): Column whose per-group median is computed.
            Defaults to 'SalePrice'.

        Returns:
        pd.DataFrame: `df` with the 'NeighborhoodMedianPrices' column added.
    """
    # Honour `col` and `target`; both used to be ignored in favour of the
    # hard-coded names. Sorting the medians first had no effect on the mapping.
    median_by_group = df.groupby(by=col)[target].median()
    df['NeighborhoodMedianPrices'] = df[col].map(median_by_group)
    return df
    

## Ordinal Category to Numerical Category

In [44]:
def categorical_ordinal_numeric(df, col ,mapping):
    """
        Replaces the values of an ordinal categorical column with numbers.

        The column is overwritten on `df` itself; values that have no entry in
        `mapping` become NaN.

        Parameters:
        df (pd.DataFrame): The DataFrame holding the column to encode.

        col (str): Name of the column to encode.

        mapping (dict): Category label -> numeric value.

        Returns:
        pd.DataFrame: `df`, with `col` replaced by its numeric encoding.
    """
    encoded = df[col].map(mapping)
    df[col] = encoded
    return df
    