In [2501]:
import numpy as np
import pandas as pd
from  scipy.stats import boxcox
import matplotlib.pyplot as plt
import seaborn as sns

In [2502]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer

In [2503]:
from sklearn.metrics import root_mean_squared_error

In [2504]:
from sklearn.linear_model import ridge_regression, Lasso

In [2505]:
df = pd.read_csv('../../../../Datasets/home-data-for-ml-course/train.csv')
df.drop(columns=['Id'], inplace=True)

In [2506]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Null Value Treatment

In [2507]:
def get_null_count(df):
    null_count = {}
    for each in df.columns:
        null_count_each = df[each].isnull().sum()
        if null_count_each != 0:
            null_count[each] = [null_count_each, round((null_count_each/len(df.index))*100,2)]
    
    null_count_df = pd.DataFrame(null_count, index=['Nulls', '% Nulls'])
    null_count_df = null_count_df.T.sort_values(by='Nulls', ascending=False)
    return null_count_df

In [2508]:
def drop_columns(df, cols):
    '''
        Drops specified columns from a DataFrame.
    
        Parameters:
        df (pd.DataFrame): The DataFrame from which columns will be dropped.
        cols (list): A list of column names to drop.
    
        Returns:
        df (pd.DataFrame)
    '''   
    df.drop(columns=cols, inplace=True)
    return df

def get_season_(month_number):
    if month_number in [1,2,12]:
        return "winter"
    elif month_number in [3,4,5]:
        return 'spring'
    elif month_number in [6,7,8]:
        return 'autumn'
    elif month_number in [9,10,11]:
        return 'summer'

In [2509]:
def fill_empty(df):
    """
        Fills empty values with respective placeholder.

        Parameters:
        df (pd.DataFrame): The DataFrame from which columns will be dropped.
        cols (list): A list of column names to drop.
        fill_Values (list): A list of place holders. The fill value should follow the order of the cols list.

        Returns:
        df (pd.DataFrame): The DataFrame with fill values
        
    
    """
    cols = ['Alley','Fence','FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual']
    # appended_cols = [f"preprocessing_pipelines__{val}" for val in cols]
    fill_values = ['No Alley', 'No Fence', 'No FirePlace', 'No', 'No','No','No', 'No', 'No','No','No', 'No' ]
        
    for col, fill_value in zip(cols, fill_values):
        df.loc[:,col] = df.loc[:,col].fillna(fill_value) 

    return df    

In [2510]:
def fill_masvnr(df):
    """
        Fills missing values in the `MasVnrType` and `MasVnrArea` columns of a DataFrame.
        
        This function performs the following operations:
        
        1. If `MasVnrArea` is `0.0`, `1.0`, or `NaN`, it replaces the corresponding `MasVnrType` with `'No Vnr'`.
        2. Fills any remaining missing values in `MasVnrArea` with `0.0`.
        3. Fills any remaining missing values in `MasVnrType` with `'BrkFace'`.
        
        Parameters:
        -----------
        df : pd.DataFrame
            The input DataFrame containing `MasVnrType` and `MasVnrArea` columns.
        col : str, optional
            The column name to fill missing values (default is `'MasVnrType'`).
    
        Returns:
        --------
        pd.DataFrame
            The modified DataFrame with missing values handled in `MasVnrType` and `MasVnrArea`.
    """
    col='MasVnrType'
    
    ## replacing Vnr Type with MasVnrArea 0, with No Vnr
    df.loc[df['MasVnrArea'] == 0.0, 'MasVnrType'] = df.loc[df['MasVnrArea'] == 0.0, 'MasVnrType'].fillna('No Vnr')

    ## replacing Vnr Type with MasVnrArea 1, with No Vnr
    df.loc[df['MasVnrArea'] == 1.0, 'MasVnrType'] = df.loc[df['MasVnrArea'] == 1.0, 'MasVnrType'].fillna('No Vnr')

    ## replacing Vnr Type with MasVnrArea NaN, with No Vnr
    df.loc[df['MasVnrArea'].isna(), 'MasVnrType'] = df.loc[df['MasVnrArea'].isna(), 'MasVnrType'].fillna('No Vnr')
    
    # Replacing remaining MasVnrArea with 0 since, the for type is NaN
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

    # Replacing remaining MasVnrType with the BrkFace
    df['MasVnrType'] = df['MasVnrType'].fillna('BrkFace')

    return df  

In [2511]:
fill_missing_pipeline = Pipeline([
     ('fill_with_placeholder', FunctionTransformer(fill_empty, feature_names_out='one-to-one')),
     ('masvnr_transform', FunctionTransformer(fill_masvnr, feature_names_out='one-to-one')),    
])


categorical_imputer_pipeline = Pipeline([
    ('categorical_imputer', SimpleImputer(strategy="most_frequent"))
])

numerical_imputer_pipeline = Pipeline([
    ('numerical_imputer', SimpleImputer(strategy="median"))
])

In [2512]:
fillna_preprocessing = ColumnTransformer(transformers=[
   ('fill_missing_pipeline',fill_missing_pipeline, ['Alley','Fence','FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual','MasVnrArea', 'MasVnrType']), 
   ('categorical_imputer_pipeline', categorical_imputer_pipeline, ['Electrical', 'GarageYrBlt', 'MSZoning', 'SaleType', 'KitchenQual']),
   ('numerical_imputer_pipeline', numerical_imputer_pipeline, ['LotFrontage', 'GarageArea', 'TotalBsmtSF']),
    
])

In [2513]:
def run_transformer_transform_colums(df):
    df_original = df.copy()
    processed_df = fillna_preprocessing.fit_transform(df)
    pdf = pd.DataFrame(processed_df, columns=fillna_preprocessing.get_feature_names_out())
    cols = [x.split('__')[-1] for x in pdf.columns]
    df_original[cols] = pdf

    return df_original

## Feature Engineering Pipeline

In [2514]:
# def get_bathroom_name(function_transformer, features_names_in):
#     return ['TotalBathrooms']

# def get_features_names_out(function_transformer, features_names_in):
#     print(features_names_in)
#     return list(features_names_in) + ['HouseAge', 'RemodeledAge', 'TotalSquareFootage', 'TotalPorchArea', 'PricePerSF', 'TotalOutdoorSpace' ,'QualityScore']

# def get_binary_flags_cloumn_names(function_transformer, features_names_in):
#     return ['HasPorch','HasGarage','HasFirePlace','NewHouse','OldHouse' ]

In [2515]:
def merge_bathromms(df):
    """
        Merges multiple bathroom-related columns into a single column `TotalBathrooms`.
    
        Parameters:
        df (pd.DataFrame): The input DataFrame.
        
        cols (list, optional): The list of columns to merge. Defaults to:
            - 'FullBath': Count of full bathrooms above ground.
            - 'HalfBath': Count of half bathrooms above ground.
            - 'BsmtFullBath': Count of full bathrooms in the basement.
            - 'BsmtHalfBath': Count of half bathrooms in the basement.
    
        Returns:
        pd.DataFrame: The DataFrame with a new `TotalBathrooms` column and the original
                      bathroom-related columns removed.
        """
    cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
    df['TotalBathrooms'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (df['BsmtHalfBath'] * 0.5)
    df.drop(columns=cols, inplace=True)
    return df

def add_new_features(df):
    
    # total age of the house when it was sold
    df['HouseAge'] = np.abs(df['YearBuilt'] - df['YrSold'])
    
    df['RemodeledAge'] = np.abs(df['YearBuilt'] - df['YearRemodAdd'])
    df['TotalSquareFootage'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
    df['TotalPorchArea'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    df['TotalOutdoorSpace'] = df['TotalPorchArea'] + df['WoodDeckSF'] + df['PoolArea']
    df['PricePerSF'] = df['SalePrice']/df['TotalSquareFootage']
    # df['CarsToGarageRatio'] = df['GarageCars']/df['GarageArea'] low correlation so removed
    df['QualityScore'] = df['OverallCond'] * df['OverallQual']

    return df

def add_binary_flag(df):
    # df['HasPool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0) low correlation so removed
    df['HasPorch'] = df['TotalPorchArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasGarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasFirePlace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
    df['NewHouse'] = df['HouseAge'].apply(lambda x: 1 if x < 10 else 0)
    df['OldHouse'] = df['HouseAge'].apply(lambda x: 1 if x > 50 else 0)
    return df

def months_sold_to_season(df):
    col='MoSold'
    df['SeasonOfSale'] = df['MoSold'].apply(lambda x: get_season_(x)) 
    df.drop(columns=['MoSold'], inplace=True)
    return df

def fill_neighbour_median_price(df):
    col='Neighborhood'
    target="SalePrice"
    neighborhood_median_prices = df.groupby(by=col)[target].median().sort_values(ascending=False).to_dict()
    df['NeighborhoodMedianPrices'] = df[col].map(neighborhood_median_prices)
    df.drop(columns=col, inplace=True)
    return df
    
def categorical_ordinal_numeric(df):
    """
        Converts an ordinal categorical column into numerical values based on a given mapping.
    
        Parameters:
        df (pd.DataFrame): The DataFrame containing the column to be transformed.
        
        col (str): The column name to be transformed.
        
        mapping (dict): A dictionary mapping categorical values to numerical values.
    
        Returns:
        pd.DataFrame: The DataFrame with the transformed column.
    """
    mappings = {
        'lot_shape_mapping': {'Reg':3,'IR3':2,'IR2':1,'IR1':0},
        'land_contor_map': {'Lvl': 3,'Bnk': 2,'HLS': 1,'Low': 0},
        'land_slope_map': {'Gtl':2,'Mod':1,'Sev':0 },
        'ext_quality_mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
        'ext_cond_quality_mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
        'quality_mapping_basement': {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No':1},
        'condition_mapping_basement': {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No':1},
        'fintype1_basement_mapping': {'GLQ': 7,'ALQ': 6,'BLQ': 5,'Rec': 4,'LwQ': 3,'Unf': 2,'No': 1},
        'fintype2_basement_mapping': {'GLQ': 7,'ALQ': 6,'BLQ': 5,'Rec': 4,'LwQ': 3,'Unf': 2,'No': 1},
    
        'h_quality_map': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'k_quality_map': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    
        'g1_quality_map':{'No':1,'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6},
        'g2_quality_map':{'No':1,'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6},
        'f_quality_map':{'No FirePlace':1,'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6}
    }
    cols = ['LotShape', 'LandContour', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual','GarageCond', 'GarageQual', 'FireplaceQu']
    
    for col, each_mapping in zip(cols, mappings):
        df[col] = df[col].map(mappings[each_mapping])
    return df


      

In [2516]:
feature_engineering_pipeline = Pipeline([
    ('drop_initial',FunctionTransformer(drop_columns, kw_args={'cols': ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',  'PoolQC' ,'MiscFeature']})),
    ('initialise_fillna_transformer', FunctionTransformer(run_transformer_transform_colums)),
    ('merge_bathrooms', FunctionTransformer(merge_bathromms)),
    ('new_features', FunctionTransformer(add_new_features)),
    ('binary_features', FunctionTransformer(add_binary_flag)),
    ('add_neighbourhood_median_price', FunctionTransformer(fill_neighbour_median_price)),
    ('months_season', FunctionTransformer(months_sold_to_season)),
    ('ordinal_categorical_to_numerical_mappings', FunctionTransformer(categorical_ordinal_numeric)),
    ('drop_less_collinear',FunctionTransformer(drop_columns, kw_args={'cols':['Utilities', 'GarageYrBlt','WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'YearBuilt', 'YearRemodAdd' ,'YrSold', 'GarageCars', 'Condition2', 'Condition1', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'HouseStyle', 'Heating', 'Electrical', 'Functional']}))
])



### Outlier Elimination

In [2517]:
def remove_outliers(df):
    '''
        Please check housing_price_dataset.ipynb for why I choose to drop this values
    
    
    '''
    df = df.copy()
    df.drop(df[df['LotArea'] > 40000].index, inplace=True)
    df.drop(df[(df['OverallQual'] >= 9) & (df['SalePrice'] > 500000)].index, inplace=True)
    df.drop(df[(df['OverallCond'] == 5) & (df['SalePrice'] > 500000)].index, inplace=True)
    df.drop(index=186, inplace=True)

    return df

In [2518]:
outlier_removal_pipeline = Pipeline([
    ('feature_engineering_pipeline', feature_engineering_pipeline),
    ('remove_outlier', FunctionTransformer(remove_outliers))
])

## Feature Transformation

Here, we transform datatypes to float, int and objects and apply log transformation, boxcox for sparse skewed vals and finally scaling and onehot encoding

df_conti = df[['LotArea', 'PricePerSF', 'TotalSquareFootage','TotalOutdoorSpace','TotalBsmtSF','GarageArea', 'NeighborhoodMedianPrices','1stFlrSF', 'QualityScore' ,'2ndFlrSF',  'GrLivArea', 'SalePrice','HouseAge', 'RemodeledAge' ]]

In [2519]:
def to_float64(df, cols):
    df[cols] = df[cols].astype(np.float64)  
    return df

def log_transformer(df):
    '''
        Log Transformer for skewed features
    
    '''
    df_conti = df.select_dtypes(np.float64) # only selecting continuous cols
    skewness = df_conti.skew().sort_values(ascending=False)

    skewed_features  = skewness > 0.3

    for feature in skewed_features.index:
        df[feature] = np.log1p(df[feature])

    return df

# def sparse_transform(df, cols='MasVnrArea'):
#     df[cols], _ = boxcox(df[cols] + 1) 
#     return df


In [2520]:
log_transformer_pipe =  Pipeline([
    ('outlier_removal_pipeline', outlier_removal_pipeline),
    ('select_dtypes', FunctionTransformer(to_float64, kw_args={'cols':['LotArea','MasVnrArea' ,'PricePerSF', 'TotalSquareFootage','TotalOutdoorSpace','TotalBsmtSF','GarageArea', 'NeighborhoodMedianPrices','1stFlrSF', 'QualityScore' ,'2ndFlrSF', 'GrLivArea', 'SalePrice','HouseAge', 'LotFrontage', 'RemodeledAge' ] })),
    ('log_transform', FunctionTransformer(log_transformer)),
    # ('boxcox_transform', FunctionTransformer(sparse_transform, kw_args={'cols': 'MasVnrArea'})) 
])

In [2521]:
def get_training_ready_features_target(df):
    final_df = log_transformer_pipe.fit_transform(df)
    
    preprocessing_transformer = ColumnTransformer([
        ('scaling', StandardScaler(), final_df.select_dtypes(np.float64).columns),
        ('encoder', OneHotEncoder(handle_unknown='ignore'), final_df.select_dtypes(np.object_).columns),], remainder='passthrough')
    X = preprocessing_transformer.fit_transform(final_df)
    dfx = pd.DataFrame(X, columns=preprocessing_transformer.get_feature_names_out())
    dfx.dropna(inplace=True)
    dfy = dfx[['scaling__SalePrice']]
    dfx.drop(columns=['scaling__SalePrice'], inplace=True)
    print(dfx.shape)

    return dfx, dfx.to_numpy(), dfy.to_numpy()  

In [2522]:
dfx, X, y = get_training_ready_features_target(df)
lasso_ = Lasso(alpha=0.001)
lasso_.fit(X,y)
y_pred = lasso_.predict(X)
root_mean_squared_error(y_pred, y.flatten())

(1437, 118)


0.002692442106211033

### Test Set

In [2544]:
dft = pd.read_csv('../../../../Datasets/home-data-for-ml-course/test.csv')
dft.drop(columns=['Id'], inplace=True)

dfy = pd.read_csv('../../../../Datasets/home-data-for-ml-course/sample_submission.csv')
dfy.drop(columns=['Id'], inplace=True)


dft['SalePrice'] = dfy['SalePrice']  

In [2545]:
dfxt , X_test, y_test = get_training_ready_features_target(dft)

(1450, 118)


In [2546]:
y_test_red = lasso_.predict(X_test)

In [2550]:
y_test_red

array([-0.2379972 ,  0.15125836,  0.08724878, ...,  0.43918885,
       -0.02907704,  0.20344791])

In [2554]:
root_mean_squared_error(y_test_red, y_test)

0.7661361035547897

In [2555]:
np.log1p(y_test_red)

array([-0.27180504,  0.14085557,  0.08365045, ...,  0.36407966,
       -0.02950815,  0.1851907 ])

0

In [2563]:
scaler = StandardScaler()

In [2565]:
y_train_scaled = scaler.fit_transform(np.log1p(df['SalePrice'].to_numpy().reshape(-1,1)))

In [2566]:
y_train_scaled

array([[ 0.56006699],
       [ 0.21276333],
       [ 0.73404616],
       ...,
       [ 1.17470887],
       [-0.39965728],
       [-0.30669507]])

In [2570]:
yy  =scaler.inverse_transform(y_test_red.reshape(-1,1))

In [2571]:
np.exp(yy)

array([[151603.24408318],
       [177097.68290565],
       [172628.45775406],
       ...,
       [198676.06702784],
       [164793.18978324],
       [180827.1133713 ]])

In [2572]:
y_test

array([[-0.59061966],
       [ 0.61101062],
       [ 0.35025664],
       ...,
       [ 2.40768166],
       [ 0.43463157],
       [ 0.60998979]])