In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas_profiling as pp
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from functions.preprocessing import Imputer, CategoricalEncoder, remove_outliers

from lazypredict.Supervised import LazyRegressor

plt.style.use('ggplot')

In [2]:
def remove_empty_features(data, threshold):
    """..."""
    cols_to_drop = [column for column in data.columns 
                    if data[column].isna().mean() > threshold]
    data = data.drop(columns = cols_to_drop)
    return data, cols_to_drop

def mapping_from_list(order):
    return {label: idx for idx, label in enumerate(order)}

def ordinal_feature(data: pd.DataFrame, dictionary: dict):
    """ Transform ordinal features

    Args:
        data (dataframe)
        dictionary (dict)

    Returns:
        data (dataframe): encoded dataframe
    """
    data_copy = data.copy()
    for key,value in dictionary.items():
        data_copy[key] = data_copy[key].map(mapping_from_list(value))

    return data_copy

In [3]:
# Road raw training data
raw_data = pd.read_csv('train.csv', index_col=0)
raw_data.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
options = {
    "MSSubClass": {"strategy": "most_frequent"},
    "MSZoning": {"strategy": "most_frequent"},
    "LotFrontage": {"strategy": "mean"},
    "LotArea": {"strategy": "mean"},
    "Street": {"strategy": "most_frequent"},
    "Alley": {"strategy": "constant", "fill_value": "NoAccess"},
    "LotShape": {"strategy": "most_frequent"},
    "LandContour": {"strategy": "most_frequent"},
    "Utilities": {"strategy": "most_frequent"},
    "LotConfig": {"strategy": "most_frequent"},
    "LandSlope": {"strategy": "most_frequent"},
    "Neighborhood": {"strategy": "most_frequent"},
    "Condition1": {"strategy": "most_frequent"},
    "Condition2": {"strategy": "most_frequent"},
    "Electrical": {"strategy": "most_frequent"},
    "1stFlrSF": {"strategy": "mean"},
    "2ndFlrSF": {"strategy": "mean"},
    "LowQualFinSF": {"strategy": "mean"},
    "GrLivArea": {"strategy": "mean"},
    "BsmtFullBath": {"strategy": "median"},
    "BsmtHalfBath": {"strategy": "median"},
    "FullBath": {"strategy": "median"},
    "HalfBath": {"strategy": "median"},
    "BedroomAbvGr": {"strategy": "median"},
    "KitchenAbvGr": {"strategy": "median"},
    "KitchenQual": {"strategy": "most_frequent"},
    "TotRmsAbvGrd": {"strategy": "median"},
    "BldgType": {"strategy": "most_frequent"},
    "HouseStyle": {"strategy": "most_frequent"},
    "OverallQual": {"strategy": "median"},
    "OverallCond": {"strategy": "median"},
    "YearBuilt": {"strategy": "median"},
    "YearRemodAdd": {"strategy": "median"},
    "RoofStyle": {"strategy": "most_frequent"},
    "RoofMatl": {"strategy": "most_frequent"},
    "Exterior1st": {"strategy": "most_frequent"},
    "Exterior2nd": {"strategy": "most_frequent"},
    "MasVnrType": {"strategy": "constant", "fill_value": "None"},
    "MasVnrArea": {"strategy": "mean"},
    "ExterQual": {"strategy": "most_frequent"},
    "ExterCond": {"strategy": "most_frequent"},
    "Foundation": {"strategy": "most_frequent"},
    "BsmtQual": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtCond": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtExposure": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinType1": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinSF1": {"strategy": "mean"},
    "BsmtFinType2": {"strategy": "constant", "fill_value": "NoBasement"},
    "BsmtFinSF2": {"strategy": "mean"},
    "BsmtUnfSF": {"strategy": "mean"},
    "TotalBsmtSF": {"strategy": "mean"},
    "Heating": {"strategy": "most_frequent"},
    "HeatingQC": {"strategy": "most_frequent"},
    "CentralAir": {"strategy": "most_frequent"},
    "ScreenPorch": {"strategy": "mean"},
    "PoolArea": {"strategy": "mean"},
    "PoolQC": {"strategy": "constant", "fill_value": "NoPool"},
    "Fence": {"strategy": "constant", "fill_value": "NoFence"},
    "MiscFeature": {"strategy": "constant", "fill_value": "None"},
    "MiscVal": {"strategy": "mean"},
    "MoSold": {"strategy": "median"},
    "YrSold": {"strategy": "median"},
    "SaleType": {"strategy": "most_frequent"},
    "SaleCondition": {"strategy": "most_frequent"},
    "Functional": {"strategy": "most_frequent"},
    "Fireplaces": {"strategy": "most_frequent"},
    "FireplaceQu": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageType": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageYrBlt": {"strategy": "most_frequent"},
    "GarageFinish": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageCars": {"strategy": "most_frequent"},
    "GarageArea": {"strategy": "median"},
    "GarageQual": {"strategy": "constant", "fill_value": "NoAccess"},
    "GarageCond": {"strategy": "constant", "fill_value": "NoAccess"},
    "PavedDrive": {"strategy": "most_frequent"},
    "WoodDeckSF": {"strategy": "most_frequent"},
    "OpenPorchSF": {"strategy": "most_frequent"},
    "EnclosedPorch": {"strategy": "mean"},
    "3SsnPorch": {"strategy": "most_frequent"},
}

In [5]:
params = {
    "threshold_empty_features": 0.3,
}

cols_to_drop = {
    "remove_empty_features": []
}

categorical_colums = ['Exterior1st', 'Foundation', 'MasVnrType', 'Neighborhood', 
                      'PavedDrive', 'Electrical', 'MSSubClass', 'SaleCondition',
                      'GarageType', 'Exterior2nd', 'MSZoning', 'CentralAir', 
                      'Street','Alley','LandContour','Utilities','LotConfig',                          'LandSlope',  'Condition1', 'Condition2',    'BldgType',                         'HouseStyle',    'RoofStyle',    'RoofMatl',                                     'BsmtFinType2',  'Heating',    'Functional',                                     'GarageCond',  'Fence',  'MiscFeature',    'SaleType']

# Ordinal features options
ordinal_columns = ['HeatingQC', 'GarageQual', 'BsmtFinType1', 'ExterQual', 
                   'GarageFinish', 'BsmtExposure', 'LotShape', 'OverallQual',
                   'BsmtQual', 'KitchenQual']

ordinal_mapping = {
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg']
}

ordinal_common = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                  'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC']
for column in ordinal_common:
    ordinal_mapping[column] = ['None', 'Po', 'Fa', 'TA', 'Gd']

In [7]:
# Removing features with a lot of missing values
data, cols_to_drop["remove_empty_features"] = remove_empty_features(
    raw_data, 
    params["threshold_empty_features"]
)

# Impute missing values
imp = Imputer(options=options)
data = imp.fit_transform(raw_data)

# HOTFIX
for key in imp.options:
    if isinstance(imp.options[key]['_fill'], np.integer):
        imp.options[key]['_fill'] = int(imp.options[key]['_fill'])
imp.save_options('imputer_options.json')

# Encoding categorical features
ce = CategoricalEncoder(categorical_colums)
data = ce.fit_transform(data)

# Encoding ordinal features
data = ordinal_feature(data, ordinal_mapping)

# data
data

Unnamed: 0_level_0,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.00,8450.00,3,7,5,2003,2003,196.00,4.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,80.00,9600.00,3,6,8,1976,1976,0.00,3.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,68.00,11250.00,2,7,5,2001,2002,162.00,4.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,60.00,9550.00,2,7,5,1915,1970,0.00,3.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
5,84.00,14260.00,2,8,5,2000,2000,350.00,4.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,62.00,7917.00,3,6,5,1999,2000,0.00,3.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1457,85.00,13175.00,3,6,6,1978,1988,119.00,3.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1458,66.00,9042.00,3,7,9,1941,2006,0.00,,4.00,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1459,68.00,9717.00,3,5,6,1950,1996,0.00,3.00,3.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


## Model metrics before removing outliers

In [8]:
reg = LazyRegressor()
X = data.drop(columns = ["SalePrice"])
y = data["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,  random_state=42)
models, _ = reg.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 43/43 [00:36<00:00,  1.19it/s]


Unnamed: 0_level_0,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PoissonRegressor,0.91,24834.0,0.17
GradientBoostingRegressor,0.91,25337.23,1.42
LGBMRegressor,0.9,26547.02,0.48
RandomForestRegressor,0.9,26583.83,3.03
HistGradientBoostingRegressor,0.89,27409.55,8.79
XGBRegressor,0.89,27455.06,1.25
BaggingRegressor,0.89,27790.44,0.51
ExtraTreesRegressor,0.89,28096.42,3.46
GammaRegressor,0.88,28686.06,0.1
PassiveAggressiveRegressor,0.87,30242.04,0.65


## Removing outliers

In [29]:
nan_columns = {column: data[column].isna().sum() for column in data.columns if data[column].isna().sum() > 0}
nan_columns

{'ExterQual': 52,
 'ExterCond': 3,
 'BsmtQual': 158,
 'BsmtCond': 37,
 'BsmtExposure': 38,
 'BsmtFinType1': 37,
 'HeatingQC': 741,
 'KitchenQual': 100,
 'FireplaceQu': 714,
 'GarageFinish': 81,
 'GarageQual': 84,
 'PoolQC': 1455}

In [31]:
data["PoolQC"].sample(10)

Id
242    nan
609    nan
475    nan
649    nan
769    nan
174    nan
1452   nan
1159   nan
830    nan
18     nan
Name: PoolQC, dtype: float64

In [None]:
ordinal_common = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                  'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC']

In [19]:
outlier_removed_data = remove_outliers(data_no_empty_features, method="IsolationForest", threshold=0.1, model_kwargs = {})

Model to detect outliers is IsolationForest with parameters {}


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
reg = LazyRegressor()
X = outlier_removed_data.drop(columns = ["SalePrice"])
y = outlier_removed_data["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state=42)
models, _ = reg.fit(X_train, X_test, y_train, y_test)
models

## TODO:

- Krzysiek:
    - funkcje zwracają indeksy i kolumny
    
- kbdev
    - Encoding ordinal features as a class
    - fix np.int64 bug in json serialization
    - 
 
- miri
    - nie będzie jej (na 50%)
    
- Patryk
    - zapis do pliku Encoder, konstruktor z pliku
    - PR 
    
```python
our_encoder = OurOneHotEncoder(columns=...)
data = our_encoder.fit(data)
our_encoder.save(file.json)
 
our_encoder.from_file(file.json)
our_encoder.transform(other_data)
```
