# Making data preparation pipeline with custom transforms

In [1]:
# main imports
from datetime import datetime, timedelta, date

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline

from sklearn.base import BaseEstimator, TransformerMixin # for creating custom transformers based on sklearn linrary

In [2]:
# [DOC]
# Настройки numpy
#np.set_printoptions(precision = 4, floatmode='fixed')

# Настройки Pandas
#pd.set_option('display.max_colwidth', None)    # текст в ячейке отражался полностью вне зависимости от длины
#pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Настройки seaborn
#sns.set_style("darkgrid")
#sns.set_context(context='paper', font_scale=1, rc=None)    # 'paper', 'notebook', 'talk', 'poster', None

# Настройки Mathplotlib
#f_size = 8    # пользовательская переменная для задания базового размера шрифта
#plt.rcParams['figure.titlesize'] = f_size + 12    # шрифт заголовка
#plt.rcParams['axes.titlesize'] = f_size + 10      # шрифт заголовка
#plt.rcParams['axes.labelsize'] = f_size + 6       # шрифт подписей осей
#plt.rcParams['xtick.labelsize'] = f_size + 4      # шрифт подписей меток
#plt.rcParams['ytick.labelsize'] = f_size + 4
#plt.rcParams['legend.fontsize'] = f_size + 6      # шрифт легенды

# Пользовательские модули и библиотеки

#Text1 = os.getcwd()    # вывод пути к текущему каталогу
#print(f"Текущий каталог: {Text1}")

#sys.path.insert(1, "D:\REPOSITORY\MyModulePython")

#from my_module__stat import *

In [2]:
# some setups
pd.set_option('display.max_colwidth', None) # text in the cell is fully displayed

Testing playground

In [3]:
df_orig = pd.read_csv('datasets/processed/train_with_cat_dtypes.csv')
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
df = df_orig.copy()

In [7]:
stat_nans = {}
for column in df.columns:
    nans_num = df[column].isna().sum()
    if nans_num > 0:
        stat_nans[column] = nans_num

Pandas DataFrame Transformations

In [8]:
class DFCreateAdditionalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, create_bsmt=True, create_garage=True, create_remodeled=True) -> None:
        self.create_bsmt = create_bsmt
        self.create_garage = create_garage
        self.create_remodeled = create_remodeled


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        X_transformed = X.copy()
        if self.create_bsmt:
            X_transformed['HasBsmt'] = X['BsmtQual'].notna()
        if self.create_garage:
            X_transformed['HasGarage'] = X['GarageType'].notna()
        if self.create_remodeled:
            X_transformed['Remodeled'] = X['YearRemodAdd'] > X['YearBuilt']
        return X_transformed

In [9]:
transformer1 = DFCreateAdditionalFeatures()
df_transformed = transformer1.transform(df)

In [10]:
check_cols_before = ['BsmtQual', 'GarageType', 'YearBuilt', 'YearRemodAdd']
check_cols_after = ['BsmtQual', 'HasBsmt', 'GarageType', 'HasGarage', 'YearBuilt', 'YearRemodAdd', 'Remodeled']

In [11]:
df.loc[:20, check_cols_before]

Unnamed: 0,BsmtQual,GarageType,YearBuilt,YearRemodAdd
0,Gd,Attchd,2003,2003
1,Gd,Attchd,1976,1976
2,Gd,Attchd,2001,2002
3,TA,Detchd,1915,1970
4,Gd,Attchd,2000,2000
5,Gd,Attchd,1993,1995
6,Ex,Attchd,2004,2005
7,Gd,Attchd,1973,1973
8,TA,Detchd,1931,1950
9,TA,Attchd,1939,1950


In [12]:
df_transformed.loc[:20, check_cols_after]

Unnamed: 0,BsmtQual,HasBsmt,GarageType,HasGarage,YearBuilt,YearRemodAdd,Remodeled
0,Gd,True,Attchd,True,2003,2003,False
1,Gd,True,Attchd,True,1976,1976,False
2,Gd,True,Attchd,True,2001,2002,True
3,TA,True,Detchd,True,1915,1970,True
4,Gd,True,Attchd,True,2000,2000,False
5,Gd,True,Attchd,True,1993,1995,True
6,Ex,True,Attchd,True,2004,2005,True
7,Gd,True,Attchd,True,1973,1973,False
8,TA,True,Detchd,True,1931,1950,True
9,TA,True,Attchd,True,1939,1950,True


In [None]:
s = df['Alley'].isna()

In [None]:
df.loc[s, 'Alley']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1455    NaN
1456    NaN
1457    NaN
1458    NaN
1459    NaN
Name: Alley, Length: 1369, dtype: object

In [None]:
df.loc[s, :].fillna({'Alley': 'NoAlley'}, inplace=True)
df.loc[s, 'Alley']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[s, :].fillna({'Alley': 'NoAlley'}, inplace=True)


0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1455    NaN
1456    NaN
1457    NaN
1458    NaN
1459    NaN
Name: Alley, Length: 1369, dtype: object

In [None]:
nan_replacements = {
        'Street': 'Abs',
        'Alley': 'Abs',
        'Fence': 'Abs',
        # Basement section
        'BsmtQual': 'Abs', 'BsmtCond': 'Abs', 'BsmtExposure': 'Abs',
        'BsmtFinType1': 'Abs', 'BsmtFinSF1': 0, 'BsmtFinType2': 'Abs',
        'BsmtFinSF2': 0, 'BsmtUnfSF': 0, 'TotalBsmtSF': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0,
        # Garage section
        'GarageType': 'Abs', 'GarageYrBlt': 0, 'GarageFinish': 'Abs', 'GarageCars': 0,
        'GarageArea': 0, 'GarageQual': 'Abs', 'GarageCond': 'Abs',
        # Other sections
        'FireplaceQu': 'Abs', 'Fireplaces': 0,
        'PoolQC': 'Abs', 'PoolArea': 0,
        'MiscFeature': 'Abs', 'MiscVal': 0,
    }

In [None]:
class DFReplaceMeaningfulNANs(BaseEstimator, TransformerMixin):
    single_features = ['Street', 'Alley', 'Fence']
    basement_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
    garage_features = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']
    feature_pairs = [
        ['FireplaceQu', 'Fireplaces'],
        ['PoolQC', 'PoolArea'],
        ['MiscFeature', 'MiscVal']
    ]
    
    def __init__(self, cols_nans) -> None: # cols_nans is a dictionary containig column names and default values to replace NaNs with
        #super().__init__()
        self.cols_nans = cols_nans


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        X_transformed = X.copy()
        replacements = {x: self.cols_nans[x] for x in self.single_features}
        #print(replacements)
        X_transformed.fillna(replacements, inplace=True)
        
        m = X_transformed['BsmtQual'].isna()
        for column in self.basement_features:
            X_transformed.loc[m, column] = self.cols_nans.get(column, np.nan)

        m = X_transformed['GarageType'].isna()
        for column in self.garage_features:
            X_transformed.loc[m, column] = self.cols_nans.get(column, np.nan)

        
        for pair in self.feature_pairs:
            m = X_transformed[pair[0]].isna()
            for column in pair:
                X_transformed.loc[m, column] = self.cols_nans.get(column, np.nan)
                
        return X_transformed

In [None]:
transformer2 = DFReplaceMeaningfulNANs(nan_replacements)

In [None]:
single_features = ['Street', 'Alley', 'Fence']
df[single_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Street  1460 non-null   object
 1   Alley   91 non-null     object
 2   Fence   281 non-null    object
dtypes: object(3)
memory usage: 34.3+ KB


In [None]:
df[transformer2.basement_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   BsmtQual      1423 non-null   object
 1   BsmtCond      1423 non-null   object
 2   BsmtExposure  1422 non-null   object
 3   BsmtFinType1  1423 non-null   object
 4   BsmtFinSF1    1460 non-null   int64 
 5   BsmtFinType2  1422 non-null   object
 6   BsmtFinSF2    1460 non-null   int64 
 7   BsmtUnfSF     1460 non-null   int64 
 8   TotalBsmtSF   1460 non-null   int64 
 9   BsmtFullBath  1460 non-null   int64 
 10  BsmtHalfBath  1460 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 125.6+ KB


In [None]:
df_transformed = transformer2.transform(df)
df_transformed[single_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Street  1460 non-null   object
 1   Alley   1460 non-null   object
 2   Fence   1460 non-null   object
dtypes: object(3)
memory usage: 34.3+ KB


In [None]:
df_transformed[transformer2.basement_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   BsmtQual      1460 non-null   object
 1   BsmtCond      1460 non-null   object
 2   BsmtExposure  1459 non-null   object
 3   BsmtFinType1  1460 non-null   object
 4   BsmtFinSF1    1460 non-null   int64 
 5   BsmtFinType2  1459 non-null   object
 6   BsmtFinSF2    1460 non-null   int64 
 7   BsmtUnfSF     1460 non-null   int64 
 8   TotalBsmtSF   1460 non-null   int64 
 9   BsmtFullBath  1460 non-null   int64 
 10  BsmtHalfBath  1460 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 125.6+ KB


In [None]:
class DFDropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None) -> None: # cols is a list containig column names to drop
        #super().__init__()
        self.cols = cols


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        X_transformed = X.copy()
        if (self.cols is not None) and (len(self.cols) > 0):
            X_transformed = X_transformed.drop(columns=self.cols)
        return X_transformed

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
transformer3 = DFDropColumns(cols=['SalePrice', 'Id', 'MSSubClass'])
df_transformed = transformer3.transform(df)
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 78 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1460 non-null   object 
 1   LotFrontage    1201 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   Alley          91 non-null     object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [39]:
class DFJoinDates(BaseEstimator, TransformerMixin):
    def __init__(self, day_col=None, month_col=None, year_col=None, calc_period_to=None, new_column_name=None, drop_columns=False) -> None:
        #super().__init__()
        self.day_col = day_col
        self.month_col = month_col
        self.year_col = year_col
        self.calc_period_to = calc_period_to
        if new_column_name is None:
            if calc_period_to is None:
                self.new_column_name = 'DateCreated'
            else:
                self.new_column_name = 'PeriodCreated'
        else:
            self.new_column_name = new_column_name
        self.drop_columns = drop_columns


    def fit(self, X, y=None):
        return self


    def calc_dates_diff(self, date1=None, date2=None):
        if date1 is None or date2 is None:
            return None
        result = date2 - date1
        return result


    def transform(self, X, y=None):
        X_transformed = X.copy()
        if self.year_col is None:
            return X_transformed

        if self.day_col is None:
            days = pd.Series(['01']*len(X))
        else:
            days = X[self.day_col].apply(str)
        
        if self.month_col is None:
            months = pd.Series(['01']*len(X))
        else:
            months = X[self.month_col].apply(str)
        
        combined_date_str = months + '/' + days + '/' + X[self.year_col].apply(str)
        if self.calc_period_to is None:
            X_transformed[self.new_column_name] = pd.to_datetime(combined_date_str)
        else:
            # [TODO] Need code here to calculate difference between dates in days
            X_transformed[self.new_column_name] = (self.calc_period_to - pd.to_datetime(combined_date_str)).apply(lambda x: x.days)
        if self.drop_columns:
            X_transformed.drop(columns=[x for x in [self.day_col, self.month_col, self.year_col] if x is not None], inplace=True)

        return X_transformed

In [41]:
transformer = DFJoinDates(month_col='MoSold', year_col='YrSold', new_column_name='SoldAge', calc_period_to=datetime.today(), drop_columns=True)

In [42]:
X_transformed = transformer.transform(df)
X_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [43]:
#X_transformed[['MoSold', 'YrSold', 'DateSold']].head(10)
X_transformed['SoldAge'].head(10)

0    5373
1    5649
2    5160
3    6103
4    5069
5    4765
6    5557
7    4734
8    5313
9    5404
Name: SoldAge, dtype: int64

In [27]:
days = pd.Series(['1']*len(df))
dates = pd.to_datetime(df['MoSold'].apply(str) + '/' + days + '/' + df['YrSold'].apply(str))
dates

0      2008-02-01
1      2007-05-01
2      2008-09-01
3      2006-02-01
4      2008-12-01
          ...    
1455   2007-08-01
1456   2010-02-01
1457   2010-05-01
1458   2010-04-01
1459   2008-06-01
Length: 1460, dtype: datetime64[ns]

In [102]:
datetime.today().strftime("%m/%d/%Y")

'10/17/2022'

In [28]:
dates = pd.to_datetime(df['MoSold'].apply(str) + '/1/' + df['YrSold'].apply(str))
dates

0      2008-02-01
1      2007-05-01
2      2008-09-01
3      2006-02-01
4      2008-12-01
          ...    
1455   2007-08-01
1456   2010-02-01
1457   2010-05-01
1458   2010-04-01
1459   2008-06-01
Length: 1460, dtype: datetime64[ns]

In [34]:
periods = datetime.today() - dates
periods

0      5373 days 11:49:37.746044
1      5649 days 11:49:37.746044
2      5160 days 11:49:37.746044
3      6103 days 11:49:37.746044
4      5069 days 11:49:37.746044
                  ...           
1455   5557 days 11:49:37.746044
1456   4642 days 11:49:37.746044
1457   4553 days 11:49:37.746044
1458   4583 days 11:49:37.746044
1459   5252 days 11:49:37.746044
Length: 1460, dtype: timedelta64[ns]

In [122]:
(datetime.today() - pd.to_datetime(df['MoSold'].apply(str) + '/1/' + df['YrSold'].apply(str)))[0].days

5372

In [38]:
periods.apply(lambda x: x.days)

0       5373
1       5649
2       5160
3       6103
4       5069
        ... 
1455    5557
1456    4642
1457    4553
1458    4583
1459    5252
Length: 1460, dtype: int64

In [44]:
class DFCalcAge(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, calc_age_to=None) -> None:
        #super().__init__()
        self.age_columns = columns
        self.calc_age_to = calc_age_to


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        X_transformed = X.copy()
        if (self.age_columns is not None) and (self.calc_age_to is not None):
            for column in self.age_columns.keys():
                X_transformed[self.age_columns[column]] = self.calc_age_to - X_transformed[column]
        return X_transformed

In [48]:
transformer = DFCalcAge({'YearBuilt': 'BuiltAge', 'YearRemodAdd': 'RemodAge'}, calc_age_to=2022)

In [49]:
X_transformed = transformer.transform(df)
X_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 83 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [50]:
X_transformed[['YearBuilt', 'BuiltAge', 'YearRemodAdd', 'RemodAge']].head(10)

Unnamed: 0,YearBuilt,BuiltAge,YearRemodAdd,RemodAge
0,2003,19,2003,19
1,1976,46,1976,46
2,2001,21,2002,20
3,1915,107,1970,52
4,2000,22,2000,22
5,1993,29,1995,27
6,2004,18,2005,17
7,1973,49,1973,49
8,1931,91,1950,72
9,1939,83,1950,72


In [52]:
class DFConvertToNumpy(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        #super().__init__()
        pass


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        return X.to_numpy()

In [53]:
converter = DFConvertToNumpy()

In [54]:
converter.transform(df)

array([[1, 60, 'RL', ..., 'WD', 'Normal', 208500],
       [2, 20, 'RL', ..., 'WD', 'Normal', 181500],
       [3, 60, 'RL', ..., 'WD', 'Normal', 223500],
       ...,
       [1458, 70, 'RL', ..., 'WD', 'Normal', 266500],
       [1459, 20, 'RL', ..., 'WD', 'Normal', 142125],
       [1460, 20, 'RL', ..., 'WD', 'Normal', 147500]], dtype=object)

In [51]:
etl_pipeline = Pipeline([
    ('add_has_features', DFCreateAdditionalFeatures()),
    ('fill_meanful_nans', DFReplaceMeaningfulNANs(cols_nans=nan_replacements)),
    ('days_since_sold', DFJoinDates(month_col='MoSold', year_col='YrSold', calc_period_to=datetime.today(), new_column_name='DaysSinceSold', drop_columns=False)),
    ('calc_ages', DFCalcAge({'YearBuilt': 'BuiltAge', 'YearRemodAdd': 'RemodAge'}, calc_age_to=2022)),
    ('drop_columns', DFDropColumns(['Id', 'SalePrice']))
])

NameError: name 'DFReplaceMeaningfulNANs' is not defined

In [91]:
df_transformed = etl_pipeline.fit_transform(df)

In [92]:
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 82 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [94]:
for column in stat_nans.keys():
    print('Column \t{}: \t{} => {}'.format(column, stat_nans[column], df_transformed[column].isna().sum()))

Column 	LotFrontage: 	259 => 259
Column 	Alley: 	1369 => 0
Column 	MasVnrType: 	8 => 8
Column 	MasVnrArea: 	8 => 8
Column 	BsmtQual: 	37 => 0
Column 	BsmtCond: 	37 => 0
Column 	BsmtExposure: 	38 => 1
Column 	BsmtFinType1: 	37 => 0
Column 	BsmtFinType2: 	38 => 1
Column 	Electrical: 	1 => 1
Column 	FireplaceQu: 	690 => 0
Column 	GarageType: 	81 => 0
Column 	GarageYrBlt: 	81 => 0
Column 	GarageFinish: 	81 => 0
Column 	GarageQual: 	81 => 0
Column 	GarageCond: 	81 => 0
Column 	PoolQC: 	1453 => 0
Column 	Fence: 	1179 => 0
Column 	MiscFeature: 	1406 => 0
