In [27]:
import pandas as pd

In [28]:
from pandas_profiling import ProfileReport

In [29]:
# Load the training data. 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('train.csv')

In [30]:
# Allow up to 100 rows to be rendered when a frame or series is displayed.
pd.options.display.max_rows = 100

## Separate features by type

In [31]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [32]:
# Names of all object-dtype columns, as a plain Python list.
# NOTE(review): this is `colscat`, not `cols_cat`; it is not referenced again
# in the visible cells - confirm whether it is still needed.
colscat = df.select_dtypes(include='object').columns.tolist()

### Select categorical features

In [33]:
# Hand-picked list of the columns treated as (unordered) categorical features.
cols_cat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageType', 'PavedDrive',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

In [34]:
# Cast every categorical column to pandas' 'string' dtype, assigning the
# converted columns back into df. Later cells tell these columns apart from
# the remaining object-dtype columns by this dtype.
df[cols_cat] = df[cols_cat].astype('string')

### Select numeric features

In [35]:
# Display the columns that are neither object nor string dtype; these are the
# candidates for the numeric feature list.
df.select_dtypes(exclude = ['object', 'string']).columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [36]:
# Hand-picked list of the columns treated as numeric features. Compared with
# the non-object/non-string columns displayed above, 'Id', 'OverallQual' and
# 'OverallCond' are left out of this list.
cols_num = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold',
    'YrSold', 'SalePrice',
]

In [37]:
# Displays the type of a list literal wrapping the two column lists, which is
# always `list` regardless of their contents.
type([cols_num, cols_cat])

list

### Select ordinal categories (TODO: decide what to do with Id)

In [38]:
# Every column that is in neither cols_cat nor cols_num is treated as ordinal.
# The result is a pandas Index (not a list) and still contains 'Id'.
cols_ord = df.drop(columns=cols_cat + cols_num).columns

In [39]:
# Display the columns that ended up in the ordinal group.
cols_ord

Index(['Id', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual',
       'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
       'Fence'],
      dtype='object')

## Impute NaNs

In [40]:
# One row per column of df: its name ('col_names') and how many missing
# values it holds ('num_nans').
df_nans = (
    df.isna()
    .sum()
    .rename_axis('col_names')
    .reset_index(name='num_nans')
)

In [41]:
# Names of the columns whose missing-value count is non-zero.
cols_with_nans = df_nans.loc[df_nans['num_nans'].ne(0), 'col_names'].to_list()

In [42]:
# Show the dtype of every column that has at least one missing value.
df[cols_with_nans].dtypes

LotFrontage     float64
Alley            string
MasVnrType       string
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     string
BsmtFinType2     string
Electrical       string
FireplaceQu      object
GarageType       string
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      string
dtype: object

In [43]:
# Categorical columns that contain missing values. Built with an
# order-preserving filter (order of cols_with_nans) instead of a set
# intersection, whose element order can change between kernel runs.
cols_cat_with_nans = [col for col in cols_with_nans if col in cols_cat]

In [44]:
# Ordinal columns that contain missing values. Built with an order-preserving
# filter (order of cols_with_nans) instead of a set intersection, whose
# element order can change between kernel runs.
cols_ord_with_nans = [col for col in cols_with_nans if col in cols_ord]

In [45]:
# Numeric columns that contain missing values. Built with an order-preserving
# filter (order of cols_with_nans) instead of a set intersection, whose
# element order can change between kernel runs.
cols_num_with_nans = [col for col in cols_with_nans if col in cols_num]

### Fill missing values in categorical features

In [46]:
# Replace missing values in the categorical columns with the explicit
# placeholder label 'NoInfo', writing the filled columns back into df.
df[cols_cat_with_nans] = df[cols_cat_with_nans].fillna('NoInfo')

### Fill missing values in GarageYrBlt with YearBuilt. The missing values belong to houses with no garage, which is already captured by another feature.

In [47]:
# Fill missing garage years with the same row's YearBuilt value.
# The fill value must be the YearBuilt Series: passing the string 'YearBuilt'
# would write that literal text into the column and turn it into object dtype.
# fillna aligns the two Series on the index, so each row gets its own year.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

In [49]:
# Create a pandas-profiling report object for the whole DataFrame, using the
# library's default settings.
prof = ProfileReport(df)

## Testing

In [50]:
# Write the report as HTML into the current working directory.
# The recorded progress output for this cell shows about 24 minutes for the
# summarize step plus several more for structuring and rendering, so avoid
# re-running it casually.
prof.to_file(output_file='profilehousetrain.html')

Summarize dataset: 100%|██████████| 932/932 [24:05<00:00,  1.55s/it, Completed]                           
Generate report structure: 100%|██████████| 1/1 [03:35<00:00, 215.47s/it]
Render HTML: 100%|██████████| 1/1 [04:16<00:00, 256.49s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
