In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas_profiling as pp
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

from functions.preprocessing import Imputer

plt.style.use('ggplot')

In [15]:
def remove_empty_features(data, threshold):
    """..."""
    cols_to_drop = [column for column in data.columns 
                    if data[column].isna().mean() > threshold]
    data = data.drop(columns = cols_to_drop)
    return data, cols_to_drop

def mapping_from_list(order):
    return {label: idx for idx, label in enumerate(order)}

def ordinal_feature(data: pd.DataFrame, dictionary: dict):
    """ Transform ordinal features

    Args:
        data (dataframe)
        dictionary (dict)

    Returns:
        data (dataframe): encoded dataframe
    """
    data_copy = data.copy()
    for key,value in dictionary.items():
        data_copy[key] = data_copy[key].map(mapping_from_list(value))

    return data_copy

In [3]:
# Road raw training data
raw_data = pd.read_csv('data/train.csv', index_col=0)
raw_data.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
options = {
    "MSSubClass": {"strategy": "most_frequent"},
    "MSZoning": {"strategy": "most_frequent"},
    "LotFrontage": {"strategy": "mean"},
    "LotArea": {"strategy": "mean"},
    "Street": {"strategy": "most_frequent"},
    "Alley": {"strategy": "constant", "fill_value": "NoAccess"},
    "LotShape": {"strategy": "most_frequent"},
    "LandContour": {"strategy": "most_frequent"},
    "Utilities": {"strategy": "most_frequent"},
    "LotConfig": {"strategy": "most_frequent"},
    "LandSlope": {"strategy": "most_frequent"},
    "Neighborhood": {"strategy": "most_frequent"},
    "Condition1": {"strategy": "most_frequent"},
    "Condition2": {"strategy": "most_frequent"},
    "Electrical": {"strategy": "most_frequent"},
    "1stFlrSF": {"strategy": "mean"},
    "2ndFlrSF": {"strategy": "mean"},
    "LowQualFinSF": {"strategy": "mean"},
    "GrLivArea": {"strategy": "mean"},
    "BsmtFullBath": {"strategy": "median"},
    "BsmtHalfBath": {"strategy": "median"},
    "FullBath": {"strategy": "median"},
    "HalfBath": {"strategy": "median"},
    "BedroomAbvGr": {"strategy": "median"},
    "KitchenAbvGr": {"strategy": "median"},
    "KitchenQual": {"strategy": "most_frequent"},
    "TotRmsAbvGrd": {"strategy": "median"},
    "BldgType": {"strategy": "most_frequent"},
    "HouseStyle": {"strategy": "most_frequent"},
    "OverallQual": {"strategy": "median"},
    "OverallCond": {"strategy": "median"},
    "YearBuilt": {"strategy": "median"},
    "YearRemodAdd": {"strategy": "median"},
    "RoofStyle": {"strategy": "most_frequent"},
    "RoofMatl": {"strategy": "most_frequent"},
    "Exterior1st": {"strategy": "most_frequent"},
    "Exterior2nd": {"strategy": "most_frequent"},
    "MasVnrType": {"strategy": "constant", "fill_value": "None"},
    "MasVnrArea": {"strategy": "mean"},
    "ExterQual": {"strategy": "most_frequent"},
    "ExterCond": {"strategy": "most_frequent"},
    "Foundation": {"strategy": "most_frequent"},
    "BsmtQual": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtCond": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtExposure": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinType1": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinSF1": {"strategy": "mean"},
    "BsmtFinType2": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinSF2": {"strategy": "mean"},
    "BsmtUnfSF": {"strategy": "mean"},
    "TotalBsmtSF": {"strategy": "mean"},
    "Heating": {"strategy": "most_frequent"},
    "HeatingQC": {"strategy": "most_frequent"},
    "CentralAir": {"strategy": "most_frequent"},
    "ScreenPorch": {"strategy": "mean"},
    "PoolArea": {"strategy": "mean"},
    "PoolQC": {"strategy": "constant", "fill_value": "NoPool"},
    "Fence": {"strategy": "constant", "fill_value": "NoFence"},
    "MiscFeature": {"strategy": "constant", "fill_value": "None"},
    "MiscVal": {"strategy": "mean"},
    "MoSold": {"strategy": "median"},
    "YrSold": {"strategy": "median"},
    "SaleType": {"strategy": "most_frequent"},
    "SaleCondition": {"strategy": "most_frequent"},
    "Functional": {"strategy": "most_frequent"},
    "Fireplaces": {"strategy": "most_frequent"},
    "FireplaceQu": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageType": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageYrBlt": {"strategy": "most_frequent"},
    "GarageFinish": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageCars": {"strategy": "most_frequent"},
    "GarageArea": {"strategy": "median"},
    "GarageQual": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageCond": {"strategy": "constant", "fill_value": "NoAccess"},
    "PavedDrive": {"strategy": "most_frequent"},
    "WoodDeckSF": {"strategy": "most_frequent"},
    "OpenPorchSF": {"strategy": "most_frequent"},
    "EnclosedPorch": {"strategy": "mean"},
    "3SsnPorch": {"strategy": "most_frequent"},
}

In [14]:
params = {
    "threshold_empty_features": 0.3,
}

cols_to_drop = {
    "remove_empty_features": []
}

categorical_colums = ['Exterior1st', 'Foundation', 'MasVnrType', 'Neighborhood', 
                      'PavedDrive', 'Electrical', 'MSSubClass', 'SaleCondition',
                      'GarageType', 'Exterior2nd', 'MSZoning', 'CentralAir']

# Ordinal features options
ordinal_columns = ['HeatingQC', 'GarageQual', 'BsmtFinType1', 'ExterQual', 
                   'GarageFinish', 'BsmtExposure', 'LotShape', 'OverallQual',
                   'BsmtQual', 'KitchenQual']

ordinal_mapping = {
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg']
}

ordinal_common = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                  'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC']
for column in ordinal_common:
    ordinal_mapping[column] = ['None', 'Po', 'Fa', 'TA', 'Gd']

In [17]:
# Removing features with a lot of missing values
data, cols_to_drop["remove_empty_features"] = remove_empty_features(
    raw_data, 
    params["threshold_empty_features"]
)

# Impute missing values
imp = Imputer(options=options)
data = imp.fit_transform(raw_data)

# HOTFIX
for key in imp.options:
    if isinstance(imp.options[key]['_fill'], np.integer):
        imp.options[key]['_fill'] = int(imp.options[key]['_fill'])
imp.save_options('imputer_options.json')

# Encoding categorical features
ohe = OneHotEncoder(drop='first', sparse=False)
ohe.fit(data[categorical_colums])
data_category_transformed = pd.DataFrame(
    ohe.transform(data[categorical_colums]),
    columns=ohe.get_feature_names(input_features=categorical_colums),
    index = data.index)
data = pd.concat([data.drop(categorical_colums, axis=1), 
                  data_category_transformed],
                 axis=1)

# Encoding ordinal features
# data = ordinal_feature(data, ordinal_mapping)

# data
data

Unnamed: 0_level_0,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,...,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,CentralAir_Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450.0,Pave,NoAccess,Reg,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,80.0,9600.0,Pave,NoAccess,Reg,Lvl,AllPub,FR2,Gtl,Feedr,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,68.0,11250.0,Pave,NoAccess,IR1,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,60.0,9550.0,Pave,NoAccess,IR1,Lvl,AllPub,Corner,Gtl,Norm,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
5,84.0,14260.0,Pave,NoAccess,IR1,Lvl,AllPub,FR2,Gtl,Norm,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,62.0,7917.0,Pave,NoAccess,Reg,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1457,85.0,13175.0,Pave,NoAccess,Reg,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1458,66.0,9042.0,Pave,NoAccess,Reg,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1459,68.0,9717.0,Pave,NoAccess,Reg,Lvl,AllPub,Inside,Gtl,Norm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


## TODO:

- Krzysiek:
    - funkcje zwracają indeksy i kolumny
    
- kbdev
    - Encoding ordinal features as a class
    - fix np.int64 bug in json serialization
    - 
 
- miri
    - nie będzie jej (na 50%)
    
- Patryk
    - zapis do pliku Encoder, konstruktor z pliku
    - PR 
    
```python
our_encoder = OurOneHotEncoder(columns=...)
data = our_encoder.fit(data)
our_encoder.save(file.json)
 
our_encoder.from_file(file.json)
our_encoder.transform(other_data)
```
