In [130]:
import pandas as pd
import numpy as np

In [131]:
from pandas_profiling import ProfileReport

In [132]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [133]:
# Raise pandas' row display limit to 100 so long listings are shown in full.
pd.set_option('display.max_rows', 100)

## Separate features by type

In [134]:
# List every column name so the features can be sorted into groups by hand below.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Select categorical features

In [135]:
# Columns treated as nominal (unordered) categorical features.
cols_cat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageType', 'PavedDrive',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

In [136]:
# Convert the nominal columns to pandas' 'string' dtype and write them back.
cat_as_string = df[cols_cat].astype('string')
df[cols_cat] = cat_as_string

### Select numeric features

In [137]:
# Show the columns that are neither object nor string dtype: the candidates for numeric features.
df.select_dtypes(exclude = ['object', 'string']).columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [138]:
# Columns treated as numeric features (the target SalePrice is included).
cols_num = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice',
]

In [139]:
# Sanity check: confirm cols_num is a plain Python list.
type(cols_num)

list

### Select ordinal categories (remove Id)

In [140]:
# Every column that is neither nominal nor numeric is treated as ordinal.
# The exclusion set is built once, instead of concatenating the two lists
# again for each column inside the comprehension.
_non_ordinal = set(cols_cat) | set(cols_num)
cols_ord = [col for col in df.columns if col not in _non_ordinal]

In [141]:
# Drop the row identifier from the ordinal list. Filtering (rather than
# list.remove) keeps the cell idempotent: re-running it no longer raises
# ValueError once "Id" has already been removed.
cols_ord = [col for col in cols_ord if col != "Id"]

In [142]:
# Sanity check: confirm cols_ord is a plain Python list.
type(cols_ord)

list

## Impute NaNs

In [143]:
# Two-column table: one row per column of df with its number of missing values.
df_nans = (
    df.isna()
    .sum()
    .reset_index()
    .rename(columns={'index': 'col_names', 0: 'num_nans'})
)

In [144]:
# Names of the columns that contain at least one missing value.
has_missing = df_nans['num_nans'] != 0
cols_with_nans = df_nans.loc[has_missing, 'col_names'].to_list()

In [145]:
# Inspect the dtypes of the columns with missing values to choose a fill strategy per group.
df[cols_with_nans].dtypes

LotFrontage     float64
Alley            string
MasVnrType       string
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     string
BsmtFinType2     string
Electrical       string
FireplaceQu      object
GarageType       string
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      string
dtype: object

In [146]:
# Nominal columns that contain missing values. A comprehension keeps the
# DataFrame's column order; a set intersection would return an order that
# can change between kernel sessions (string hash randomisation).
cols_cat_with_nans = [col for col in cols_with_nans if col in cols_cat]

In [147]:
# Ordinal columns that contain missing values. A comprehension keeps the
# DataFrame's column order; a set intersection would return an order that
# can change between kernel sessions (string hash randomisation).
cols_ord_with_nans = [col for col in cols_with_nans if col in cols_ord]

In [148]:
# Numeric columns that contain missing values. A comprehension keeps the
# DataFrame's column order; a set intersection would return an order that
# can change between kernel sessions (string hash randomisation).
cols_num_with_nans = [col for col in cols_with_nans if col in cols_num]

### Fill missing values in categorical features

In [149]:
# Replace missing values in the nominal columns with the placeholder label 'NoInfo'.
cat_filled = df[cols_cat_with_nans].fillna('NoInfo')
df[cols_cat_with_nans] = cat_filled

### Fill missing values in GarageYrBlt with YearBuilt. The missing values correspond to houses with no garage, which is already captured by other features.

In [150]:
# Fill missing garage years with the house's construction year, row by row.
# Passing the YearBuilt Series (aligned on the index) is required here:
# passing the string 'YearBuilt' would write that literal text into the
# column and turn a numeric feature into object dtype.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

### Fill missing values in LotFrontage by using LotArea

In [151]:
# Estimate missing frontage from lot area: sqrt(LotArea) divided by the
# average ratio sqrt(LotArea) / LotFrontage.
lot_side = np.sqrt(df["LotArea"])
mean_side_per_frontage = np.mean(lot_side / (df["LotFrontage"]))
df['LotFrontage'] = df['LotFrontage'].fillna(lot_side / mean_side_per_frontage)

### Fill missing values in ordinal features with NoInfo

In [165]:
# Replace missing values in the ordinal columns with the placeholder label 'NoInfo'.
ord_filled = df[cols_ord_with_nans].fillna('NoInfo')
df[cols_ord_with_nans] = ord_filled

### Eliminate the rows with remaining NaNs

In [167]:
# Drop every row that still holds at least one missing value after the fills above.
df = df.dropna(how='any', axis='index')

In [168]:
# Build the profiling report object from the cleaned DataFrame.
prof = ProfileReport(df)

## Trying out the profile report export

In [170]:
# Write the report as HTML; the path is relative to the current working directory.
prof.to_file(output_file='profilehousetrain.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]