In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas_profiling import ProfileReport

In [3]:
# Load the training data; 'train.csv' is resolved relative to the current working directory.
df = pd.read_csv('train.csv')

In [4]:
# Let pandas print up to 100 rows before truncating displayed output.
pd.options.display.max_rows = 100

## Separate features by type

In [5]:
# List all column names of the loaded frame.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Select categorical features

In [6]:
# Nominal (unordered) categorical features, listed by hand.
cols_cat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageType', 'PavedDrive',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

In [7]:
# Cast every nominal column to pandas' 'string' dtype (this also covers the numeric-coded MSSubClass).
df = df.astype({name: 'string' for name in cols_cat})

### Select numeric features

In [8]:
# List the columns whose dtype is neither object nor string (candidates for the numeric group).
df.select_dtypes(exclude = ['object', 'string']).columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [9]:
# Numeric features: the non-string columns listed above without 'Id'.
# Note that the target 'SalePrice' is part of this list.
cols_num = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold', 'SalePrice',
]

In [10]:
# Sanity check: cols_num is a plain Python list.
type(cols_num)

list

### Select ordinal categories (remove Id)

In [11]:
# Every column not already assigned to the nominal or numeric group, in df's column order.
# The lookup set is built once instead of concatenating both lists for each column.
assigned_cols = set(cols_cat) | set(cols_num)
cols_ord = [name for name in df.columns if name not in assigned_cols]

In [12]:
# Drop the 'Id' column from the ordinal group.
# Filtering (rather than list.remove) keeps this cell safe to re-run:
# a second execution is a no-op instead of raising ValueError.
cols_ord = [col for col in cols_ord if col != "Id"]

In [13]:
# Cast every ordinal column to pandas' 'string' dtype.
df = df.astype(dict.fromkeys(cols_ord, 'string'))

In [14]:
# Inspect the columns that ended up in the ordinal group.
cols_ord

['ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence']

## Impute NaNs

In [15]:
# One row per column of df: its name ('col_names') and its count of missing values ('num_nans').
df_nans = (
    df.isna()
    .sum()
    .rename_axis('col_names')
    .reset_index(name='num_nans')
)

In [16]:
# Names of the columns that have at least one missing value.
cols_with_nans = df_nans.loc[df_nans['num_nans'].ne(0), 'col_names'].tolist()

In [17]:
# dtypes of the columns that contain missing values, to decide how each should be imputed.
df[cols_with_nans].dtypes

LotFrontage     float64
Alley            string
MasVnrType       string
MasVnrArea      float64
BsmtQual         string
BsmtCond         string
BsmtExposure     string
BsmtFinType1     string
BsmtFinType2     string
Electrical       string
FireplaceQu      string
GarageType       string
GarageYrBlt     float64
GarageFinish     string
GarageQual       string
GarageCond       string
PoolQC           string
Fence            string
MiscFeature      string
dtype: object

In [18]:
# Nominal columns that contain missing values.
# An order-preserving filter replaces the set intersection, whose list order depends on
# string hashing and can change between kernel restarts.
cols_cat_with_nans = [col for col in cols_with_nans if col in cols_cat]

In [19]:
# Ordinal columns that contain missing values.
# An order-preserving filter replaces the set intersection, whose list order depends on
# string hashing and can change between kernel restarts.
cols_ord_with_nans = [col for col in cols_with_nans if col in cols_ord]

In [20]:
# Numeric columns that contain missing values.
# An order-preserving filter replaces the set intersection, whose list order depends on
# string hashing and can change between kernel restarts.
cols_num_with_nans = [col for col in cols_with_nans if col in cols_num]

### Fill missing values in categorical features

In [21]:
# Replace missing values in the nominal columns with the explicit placeholder level 'NoInfo'.
df = df.fillna({name: 'NoInfo' for name in cols_cat_with_nans})

### Fill missing values in GarageYrBlt with YearBuilt. The missing values belong to houses without a garage, which is already captured by other features.

In [22]:
# Fill missing GarageYrBlt values with the same row's YearBuilt.
# The fill value must be the YearBuilt *column* (aligned on the index); passing the
# string 'YearBuilt' would write that literal text into the cells and turn the
# column into mixed-type object dtype.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

### Fill missing values in LotFrontage by using LotArea

In [23]:
# Estimate missing LotFrontage from LotArea: sqrt(LotArea) divided by the mean of
# sqrt(LotArea) / LotFrontage. Rows with unknown frontage yield NaN ratios; the mean is
# expected to skip them (pandas default) -- confirm if the input type ever changes.
sqrt_lot_area = np.sqrt(df["LotArea"])
mean_area_to_frontage = np.mean(sqrt_lot_area / df["LotFrontage"])
df['LotFrontage'] = df['LotFrontage'].fillna(sqrt_lot_area / mean_area_to_frontage)

### Fill missing values in ordinal features with NoInfo

In [24]:
# Replace missing values in the ordinal columns with the placeholder level 'NoInfo'.
df = df.fillna(dict.fromkeys(cols_ord_with_nans, 'NoInfo'))

### Eliminate the rows with remaining NaNs

In [25]:
# Drop every row that still has a missing value in any column
# (dropna defaults: axis=0, how='any').
df = df.dropna()

## Encode Categorical features with dummy variables

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
# One-hot encode the nominal columns. Each dummy column is prefixed with its source
# column name, and the first level of every feature is dropped.
enc_df = pd.get_dummies(
    data=df,
    columns=cols_cat,
    prefix=cols_cat,
    drop_first=True,
)

## Encode ordinal categories
### Create train and test subsets for Ordinal Encoding

In [59]:
# Inputs (ordinal columns) and target (SalePrice) for the ordinal-encoding step.
x_enc, y_enc = df[cols_ord], df['SalePrice']
# The same columns plus the target as a single raw array.
data = df[[*cols_ord, 'SalePrice']].values
print('Input', x_enc.shape)
print('Output', y_enc.shape)

Input (1452, 13)
Output (1452,)


In [60]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
(x_enc_train, x_enc_test,
 y_enc_train, y_enc_test) = train_test_split(
    x_enc,
    y_enc,
    test_size=0.33,
    random_state=12,
)

### Fit and encode

In [61]:
# Level orderings for the ordinal encoder: one list per column, in the same order as
# cols_ord. Within each list, the position of a level is the code it is encoded to.
quality_levels = ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
exposure_levels = ['NoInfo', 'No', 'Mn', 'Av', 'Gd']
finish_levels = ['NoInfo', 'Unf', 'RFn', 'Fin']
fence_levels = ['NoInfo', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

map_ord = [
    list(levels)  # independent copy per column
    for levels in (
        quality_levels,   # ExterQual
        quality_levels,   # ExterCond
        quality_levels,   # BsmtQual
        quality_levels,   # BsmtCond
        exposure_levels,  # BsmtExposure
        quality_levels,   # HeatingQC
        quality_levels,   # KitchenQual
        quality_levels,   # FireplaceQu
        finish_levels,    # GarageFinish
        quality_levels,   # GarageQual
        quality_levels,   # GarageCond
        quality_levels,   # PoolQC
        fence_levels,     # Fence
    )
]

In [62]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Use the explicit level orderings from map_ord rather than letting the encoder infer them.
# Fitting on x_enc checks the data against those levels.
ordinal_enc = OrdinalEncoder(categories=map_ord)
ordinal_enc.fit(x_enc)

In [63]:
# Confirm that the fitted encoder holds the level orderings supplied via map_ord.
ordinal_enc.categories_

[array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'No', 'Mn', 'Av', 'Gd'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Unf', 'RFn', 'Fin'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], dtype=object)]

In [64]:
# Encode the ordinal columns to their codes.
# Transform df[cols_ord] (the frame x_enc was built from) rather than x_enc itself:
# the result is the same on the first run, and re-running the cell no longer feeds
# already-encoded numbers back into the encoder.
x_enc = ordinal_enc.transform(df[cols_ord])

In [65]:
# Replace the ordinal columns of the one-hot encoded frame with their encoded values.
# Assumes x_enc is the array returned by ordinal_enc.transform, so values are assigned
# by position and must have the same row order as enc_df.
enc_df[cols_ord] = x_enc

## Testing

In [66]:
# Build a profiling report for df (the cleaned frame before encoding; enc_df is not profiled).
prof = ProfileReport(df)

In [None]:
# Write the report as HTML; the relative path resolves against the current working directory.
prof.to_file(output_file='profilehousetrain.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return func(*args, **kwargs)
