In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest, AdaBoostRegressor

import warnings
warnings.filterwarnings('ignore')


pio.templates.default = "seaborn"

In [21]:
# ==== HELPERS ===== #

TARGET = "SalePrice"

def cat_features(data: pd.DataFrame):
    return data.select_dtypes(exclude="number").columns

def num_features(data: pd.DataFrame):
    return data.select_dtypes(include="number").columns

def plot_hist(data, features, rows=3, title=""):
    _n_cols = int(np.ceil(len(features) / rows))
    fig = make_subplots(rows=rows, cols=_n_cols, subplot_titles=features)

    for idx, feature in enumerate(features):
        r = idx // _n_cols + 1
        c = idx % _n_cols + 1
        fig.add_histogram(
            x=data[feature],
            row=r, col=c,
            name=feature,
            histnorm='probability'
        )
        
    fig.update_layout(height=800, width=3200, title_text=title)
    fig.show()
    
def plot_scatter(data, features, y_name, rows=2, title="",
                 width=900, height=600):
    _num_features = len(features)
    _n_cols = int(np.ceil(_num_features / rows))
    fig = make_subplots(rows=rows, cols=_n_cols, subplot_titles=features)
    
    for idx, feature in enumerate(features):
        r = idx // _n_cols + 1
        c = idx % _n_cols + 1
        fig.add_scatter(
            x=data[feature],
            y=data[y_name],
            mode="markers",
            row=r, col=c,
            name=feature,
        )
        fig.update_yaxes(title=y_name)

    fig.update_layout(height=height, width=width, title_text=title)
    fig.show()
    
def plot_boxes(data, features, y_name, rows=2, title="",
                 width=900, height=600):
    _num_features = len(features)
    _n_cols = int(np.ceil(_num_features / rows))
    fig = make_subplots(rows=rows, cols=_n_cols, subplot_titles=features)
    
    for idx, feature in enumerate(features):
        r = idx // _n_cols + 1
        c = idx % _n_cols + 1
        fig.add_box(
            x=data[feature],
            y=data[y_name],
            row=r, col=c,
            name=feature,
        )
        fig.update_yaxes(title=y_name)

    fig.update_layout(height=height, width=width, title_text=title)
    fig.show()
    
def regression_metrics(model_pipe, X_test, y_test):
    y_hat = model_pipe.predict(X_test)
    print("r2_score: ", r2_score(y_test, y_hat))
    print("rmse: ", np.sqrt(mean_squared_error(y_test, y_hat)))
    print("mae: ", mean_absolute_error(y_test, y_hat))

### Тут предварительная обработка

В целом это копипаста из предыдущего анализа

In [7]:
TARGET = "SalePrice"

houses_raw_df = pd.read_csv('../data/houses.csv', sep=',', index_col='Id')
houses_raw_df.sample(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
464,70,RL,74.0,11988,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,8,2008,WD,Normal,188700
6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
56,20,RL,100.0,10175,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,7,2008,WD,Normal,180500
1268,20,RL,89.0,13214,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,5,2010,WD,Normal,378500
503,20,RL,70.0,9170,Pave,,Reg,Lvl,AllPub,Corner,...,0,,GdPrv,Shed,400,4,2007,WD,Normal,140000


In [8]:
# удалим признаки где много пропусков

_to_remove = houses_raw_df.columns[houses_raw_df.count() < 800] # Снизим порог
print(f"removing features {_to_remove}")
houses_raw_df.drop(_to_remove, axis=1, inplace=True)

removing features Index(['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [9]:
# Переведем качественные шкалы в разряд категориальных
_to_cat = []
for feature in num_features(houses_raw_df).drop(TARGET):
    n_unique = houses_raw_df[feature].nunique()
    if n_unique < 20:
        _to_cat.append(feature)
        
print(f"List of features to be converted to categorical: {_to_cat}")
houses_raw_df[_to_cat] = houses_raw_df[_to_cat].astype(object)
houses_raw_df[_to_cat] 

List of features to be converted to categorical: ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'PoolArea', 'MoSold', 'YrSold']


Unnamed: 0_level_0,MSSubClass,OverallQual,OverallCond,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,PoolArea,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,60,7,5,1,0,2,1,3,1,8,0,2,0,2,2008
2,20,6,8,0,1,2,0,3,1,6,1,2,0,5,2007
3,60,7,5,1,0,2,1,3,1,6,1,2,0,9,2008
4,70,7,5,1,0,1,0,3,1,7,1,3,0,2,2006
5,60,8,5,1,0,2,1,4,1,9,1,3,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,6,5,0,0,2,1,3,1,7,1,2,0,8,2007
1457,20,6,6,1,0,2,0,3,1,7,2,2,0,2,2010
1458,70,7,9,0,0,2,0,4,1,9,2,1,0,5,2010
1459,20,5,6,1,0,1,0,2,1,5,0,1,0,4,2010


In [11]:
_num_features = num_features(houses_raw_df)
plot_hist(data=houses_raw_df, features=_num_features, title="Numeric histograms")

# Имеются признаки в которых более 80% наблюдений сосредоточно в одном бине. Хочется посмотреть есть ли в них польза
# В тот раз я их бездумно удалил, сейчас понимаю что выбросы могли просто сильно сжать первый бин, за которым может
# скрываться вполне нормальное распределение 

In [12]:
def pick_suspicious_features(data, features, treshold):
    suspicious = []
    for feature in features:
        feature_samples = data[feature].count()
        most_frequent = data[feature].value_counts().iloc[0]
        if (most_frequent / feature_samples) > treshold:
            suspicious.append(feature)
            
    return suspicious

#### Посмотрим на описанные признаки в разрезе цены.
suspicious_num_featues = pick_suspicious_features(houses_raw_df, num_features(houses_raw_df), treshold=0.9)

plot_scatter(houses_raw_df, suspicious_num_featues, TARGET, title="Different numeric features to Price")
# Собственно видим что у всех этих признаков > 80% значений находится в нуле. и разраброс цен очень высок. 
# Ненулевые значения не имеют ни тренда ни корреляции с ценой.
# Поэтому признаки не смогут объяснить дисперсию следовательно их можно удалить

print("Removing next features: ", suspicious_num_featues)
houses_raw_df.drop(suspicious_num_featues, axis=1, inplace=True)

Removing next features:  ['LowQualFinSF', '3SsnPorch', 'ScreenPorch', 'MiscVal']


#### Теперь взглянем на категориальные признаки

In [13]:
_cat_features = cat_features(houses_raw_df)
plot_hist(houses_raw_df, _cat_features, title="Categorical histograms") 
# Видимо примерно такую же картину, только кол-во бинов значительно меньше
# Попробуем проанализировать признаки где большая часть выборки одно значение

In [14]:
# Сначала объединим непопулярные классы в один аггрегированный класс

suspicious_cat_features = pick_suspicious_features(
    houses_raw_df, cat_features(houses_raw_df), treshold=0.8
)

plot_boxes(
    houses_raw_df,
    suspicious_cat_features,
    TARGET,
    title="Different cat features to Price",
    rows=3,
    height=1000,
    width=2400,
)

# На мой взгляд эти признаки также достаточно бесполезны.
# Межклассовая вариативность среди этих переменных практически ничего не объясняет. 
# Везде первый класс(на графике) - все портит. А учитвая что сами признаки крайне несбалнсированны
# Делаем вывод что все эти переменные можно спокойно удалить и не потерять в качества ничего
# Оставим разве что RoofMatl - он выглядит перспектинвым.
# Там интерквантильные размахи не соприкасаются у двух самых популярных классов
suspicious_cat_features.remove("RoofMatl")
print("Removing next features: ", suspicious_cat_features)
houses_raw_df.drop(suspicious_cat_features, axis=1, inplace=True)

Removing next features:  ['Street', 'LandContour', 'Utilities', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'ExterCond', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'BsmtHalfBath', 'KitchenAbvGr', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolArea', 'SaleType', 'SaleCondition']


In [15]:
print(
    "processed data set has:",
    f"{len(num_features(houses_raw_df))} numeric features",
    f"{len(cat_features(houses_raw_df))} categorical features"
) # Есть заметный перевес в сторону категориальных признаков это не хорошо =(

processed data set has: 18 numeric features 30 categorical features


### Проверим модель без отбора аномалий

Как критерий будем смотреть на метрики регрессии тк, в пространство высокомерное и его не визуализировать

In [89]:
X_train, X_test, y_train, y_test = train_test_split(
    houses_raw_df.drop(TARGET, axis=1),
    houses_raw_df[TARGET],
    test_size=0.2,
    random_state=43
)

In [90]:
num_pipe = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5, weights="uniform")),
    ("scaler", StandardScaler()), # StandardScaler or MinMaxScaler
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, houses_raw_df.select_dtypes(include="number").columns.drop(TARGET)),
    ("cat", cat_pipe, houses_raw_df.select_dtypes(exclude="number").columns)
])

model_pipe = Pipeline([
    ("preprocessing", preprocessors),
    ("model", AdaBoostRegressor(n_estimators=50, loss="square", learning_rate=0.5))
])

model_pipe.fit(X_train, y_train)

In [91]:
print("Train metrics: ")
regression_metrics(model_pipe, X_train, y_train)
print("Test metrics: ")
regression_metrics(model_pipe, X_test, y_test)

Train metrics: 
r2_score:  0.8549732156142675
rmse:  30203.61485487589
mae:  23871.196841285448
Test metrics: 
r2_score:  0.7466966273972463
rmse:  40156.43397598134
mae:  28191.326530612616


In [92]:
# Проверим где модель ошибается больше всего
predictions = model_pipe.predict(X_train)
X_test_with_diff = X_train.copy()
X_test_with_diff["Diff"] = predictions - y_train
X_test_with_diff["Target"] = y_train
X_test_with_diff["Prediction"] = predictions
X_test_with_diff.sort_values("Diff", ascending=False).head(10)

# Видно что сильно завышаем предсказания для домов дешевле медианы
# Эти дома объединяет низкий OverallCond
# У некоторых из них маленькая площадь меньше .25 квантили
# В остальном сложно увидеть какие-то закономерности

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,OverallCond,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,Diff,Target,Prediction
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1325,20,RL,75.0,9986,Reg,Inside,Somerst,1Story,8,5,...,3,895,0,49,0,2,2007,95343.647059,147000,242343.647059
917,20,C (all),50.0,9000,Reg,Inside,IDOTRR,1Story,2,3,...,1,308,0,0,0,10,2006,90533.467262,35311,125844.467262
969,50,RM,50.0,5925,Reg,Inside,OldTown,1.5Fin,3,6,...,0,0,0,0,0,5,2009,87944.467262,37900,125844.467262
534,20,RL,50.0,5000,Reg,Inside,BrkSide,1Story,1,3,...,0,0,0,0,0,1,2007,86985.982927,39300,126285.982927
31,70,C (all),50.0,8500,Reg,Inside,IDOTRR,2Story,4,4,...,1,250,0,54,172,7,2008,85844.467262,40000,125844.467262
1416,120,RL,51.0,3635,Reg,Inside,Blmngtn,1Story,7,5,...,3,660,143,20,0,5,2009,85151.98,175900,261051.98
715,60,RL,,13517,IR1,CulDSac,Sawyer,2Story,6,8,...,2,475,0,44,0,3,2010,81232.534247,130500,211732.534247
589,20,RL,65.0,25095,IR1,Inside,ClearCr,1Story,5,8,...,1,452,0,48,0,6,2009,77956.618214,143000,220956.618214
1454,20,RL,90.0,17217,Reg,Inside,Mitchel,1Story,5,5,...,0,0,36,56,0,7,2006,77326.467005,84500,161826.467005
667,60,RL,,18450,IR1,Inside,NAmes,2Story,6,5,...,2,596,0,265,0,8,2007,76496.194357,129000,205496.194357


#### Применим детектор аномалий

In [93]:
# Начнем с LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=15, contamination=0.15, leaf_size=50)
X = preprocessors.fit_transform(houses_raw_df)
lof_predictions = pd.Series(lof.fit_predict(X))

X = pd.DataFrame(X)[lof_predictions.map({1: True, -1: False})]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,237,238,239,240,241
0,-0.250382,-0.207142,1.050994,0.878668,0.506195,0.575425,-0.288653,-0.944591,-0.459303,-0.793434,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.380325,-0.091886,0.156734,-0.429577,-0.575186,1.171992,-0.288653,-0.641228,0.466465,0.257140,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.124241,0.073480,0.984752,0.830215,0.318608,0.092907,-0.288653,-0.301643,-0.313369,-0.627826,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.460618,-0.096897,-1.863632,-0.720298,-0.575186,-0.499274,-0.288653,-0.061670,-0.687324,-0.521734,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.548513,0.375148,0.951632,0.733308,1.355851,0.463568,-0.288653,-0.174865,0.199680,-0.045611,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.376523,-0.260560,0.918511,0.733308,-0.575186,-0.973018,-0.288653,0.873321,-0.238122,-0.542435,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1456,0.590561,0.266407,0.222975,0.151865,0.081367,0.759659,0.722112,0.049262,1.104925,2.355701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1457,-0.208335,-0.147810,-1.002492,1.024029,-0.575186,-0.369871,-0.288653,0.701265,0.215641,0.065656,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1458,-0.124241,-0.080160,-0.704406,0.539493,-0.575186,-0.865548,6.092188,-1.284176,0.046905,-0.218982,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [94]:
X_train_adjusted, X_test_adjusted, y_test_adjusted, y_test_adjusted = train_test_split(
    X,
    houses_raw_df[TARGET].iloc[X.index],
    test_size=0.25,
    random_state=43
)
abr = AdaBoostRegressor(n_estimators=50, loss="square", learning_rate=0.5)
abr.fit(X_train_adjusted, y_test_adjusted)

In [99]:
print("Train metrics: ")
regression_metrics(abr, X_train_adjusted, y_test_adjusted) # Стало заметно лучше, но я так понимаю это в среднем
print("Test metrics: ")
regression_metrics(abr, X_test_adjusted, y_test_adjusted) # Также стало лучше


Train metrics: 
r2_score:  0.8825757488973306
rmse:  22710.222111804556
mae:  17853.00130919572
Test metrics: 
r2_score:  0.8293783465120924
rmse:  32683.96967262774
mae:  21760.884216200404


In [121]:
predictions = abr.predict(X_train_adjusted)
X_test_with_diff = X_train_adjusted.copy()
X_test_with_diff["Diff"] = predictions - y_test_adjusted
X_test_with_diff["Target"] = y_test_adjusted
X_test_with_diff["Prediction"] = predictions
top_mistakes = X_test_with_diff.sort_values("Diff", ascending=False).head(10)

# top_mistakes
houses_copy = houses_raw_df.copy()
houses_copy["Diff"] = X_test_with_diff["Diff"]
houses_copy["Target"] = X_test_with_diff["Target"]
houses_copy["Prediction"] = X_test_with_diff["Prediction"]
houses_copy.loc[top_mistakes.index] 
# Здесь уже сложнее понять почему для этих примеров такие плохие предсказания
# Судя по всему также плохо предсказываем дешевые дома с низким качеством, возможно надо увеличить contamenation

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,OverallCond,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SalePrice,Diff,Target,Prediction
969,50,RM,50.0,5925,Reg,Inside,OldTown,1.5Fin,3,6,...,0,0,0,0,5,2009,37900,73901.973094,37900.0,117081.06
1325,20,RL,75.0,9986,Reg,Inside,Somerst,1Story,8,5,...,895,0,49,0,2,2007,147000,67651.223022,147000.0,113121.293907
813,20,C (all),66.0,8712,Reg,Inside,IDOTRR,1Story,5,5,...,504,0,0,0,6,2010,55993,60818.361204,55993.0,155956.753623
637,30,RM,51.0,6120,Reg,Inside,BrkSide,1Story,2,3,...,0,0,0,0,1,2009,60000,56811.361204,60000.0,119543.402913
1063,190,RM,85.0,13600,Reg,Inside,OldTown,2Story,5,5,...,560,0,57,0,9,2007,90000,56578.443213,90000.0,123096.21673
252,120,RM,44.0,4750,IR1,Inside,Crawfor,1Story,8,5,...,538,123,0,0,12,2007,235000,54858.538023,235000.0,191562.339564
1433,30,RL,60.0,10800,Reg,Inside,OldTown,1Story,4,6,...,216,0,0,0,8,2007,64500,53414.464,64500.0,192223.179487
265,30,RM,30.0,5232,IR3,Inside,OldTown,1Story,5,5,...,504,0,0,0,6,2008,73000,48018.082988,73000.0,188291.95122
1384,30,RL,,25339,Reg,Inside,Sawyer,1Story,5,7,...,576,0,0,112,8,2007,112000,47661.023364,112000.0,113174.918803
1293,70,RM,60.0,6600,Reg,Corner,OldTown,2Story,5,4,...,432,0,287,0,12,2009,107500,46567.0625,107500.0,165559.255172


#### Попробуем другую модель

In [143]:
isoForest = IsolationForest(contamination=0.1, n_estimators = 50)

X = preprocessors.fit_transform(houses_raw_df)
lof_predictions = pd.Series(lof.fit_predict(X))

X = pd.DataFrame(X)[lof_predictions.map({1: True, -1: False})]

X_train_adjusted, X_test_adjusted, y_test_adjusted, y_test_adjusted = train_test_split(
    X,
    houses_raw_df[TARGET].iloc[X.index],
    test_size=0.25,
    random_state=43
)
abr = AdaBoostRegressor(n_estimators=50, loss="square", learning_rate=0.5)
abr.fit(X_train_adjusted, y_test_adjusted)

In [145]:
print("Train metrics: ")
regression_metrics(abr, X_train_adjusted, y_test_adjusted) # Стало лучше по отношению к LocalOutlierFactor
print("Test metrics: ")
regression_metrics(abr, X_test_adjusted, y_test_adjusted) # Соответственно тоже

Train metrics: 
r2_score:  0.8838913495166086
rmse:  22582.643098551995
mae:  17763.642678168177
Test metrics: 
r2_score:  0.8391909157502897
rmse:  31730.21501607252
mae:  21090.54567523122


In [146]:
predictions = abr.predict(X_train_adjusted)
X_test_with_diff = X_train_adjusted.copy()
X_test_with_diff["Diff"] = predictions - y_test_adjusted
X_test_with_diff["Target"] = y_test_adjusted
X_test_with_diff["Prediction"] = predictions
top_mistakes = X_test_with_diff.sort_values("Diff", ascending=False).head(10)

houses_copy = houses_raw_df.copy()
houses_copy["Diff"] = X_test_with_diff["Diff"]
houses_copy["Target"] = X_test_with_diff["Target"]
houses_copy["Prediction"] = X_test_with_diff["Prediction"]
houses_copy.loc[top_mistakes.index] 

### Очень похоже на худшие предсказания при LocalFitOutlier, первые два семпла вообще повторяются
### Не очень понятно почему второй семпл имет низку оценку по всем параметрам дом неплохой должен быть, но видимо OverallCond - имеет большое влияние

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,OverallCond,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SalePrice,Diff,Target,Prediction
969,50,RM,50.0,5925,Reg,Inside,OldTown,1.5Fin,3,6,...,0,0,0,0,5,2009,37900,69245.666667,37900.0,116777.662757
1325,20,RL,75.0,9986,Reg,Inside,Somerst,1Story,8,5,...,895,0,49,0,2,2007,147000,63418.554252,147000.0,113951.855769
813,20,C (all),66.0,8712,Reg,Inside,IDOTRR,1Story,5,5,...,504,0,0,0,6,2010,55993,57265.401515,55993.0,159477.051081
637,30,RM,51.0,6120,Reg,Inside,BrkSide,1Story,2,3,...,0,0,0,0,1,2009,60000,54315.557491,60000.0,116777.662757
1433,30,RL,60.0,10800,Reg,Inside,OldTown,1Story,4,6,...,216,0,0,0,8,2007,64500,51766.378738,64500.0,193178.443038
1454,20,RL,90.0,17217,Reg,Inside,Mitchel,1Story,5,5,...,0,36,56,0,7,2006,84500,45803.385159,84500.0,186796.512727
544,120,RH,34.0,4058,Reg,Inside,NAmes,SFoyer,7,5,...,367,120,40,0,6,2007,133000,44865.514451,133000.0,200910.726962
1049,20,RL,100.0,21750,Reg,Inside,Mitchel,1Story,5,4,...,336,0,0,0,11,2009,115000,44254.118812,115000.0,116777.662757
265,30,RM,30.0,5232,IR3,Inside,OldTown,1Story,5,5,...,504,0,0,0,6,2008,73000,43698.623967,73000.0,188697.851852
1345,60,RL,85.0,11103,IR1,Corner,CollgCr,2Story,7,5,...,440,0,0,0,7,2007,155835,43348.658824,155835.0,121917.410853


In [151]:
### Глянем тоже самое для тестовой выборки

predictions = abr.predict(X_test_adjusted)
X_test_with_diff = X_test_adjusted.copy()
X_test_with_diff["Diff"] = predictions - y_test_adjusted
X_test_with_diff["Target"] = y_test_adjusted
X_test_with_diff["Prediction"] = predictions
top_mistakes = X_test_with_diff.sort_values("Diff", ascending=False).head(10)

houses_copy = houses_raw_df.copy()
houses_copy["Diff"] = X_test_with_diff["Diff"]
houses_copy["Target"] = X_test_with_diff["Target"]
houses_copy["Prediction"] = X_test_with_diff["Prediction"]
houses_copy.loc[top_mistakes.index] 

## Кажется гипотеза о том что плохим предсказаниям соответствуют дома с низким качеством - подтверждается
## Возможно потому что остальные признаки дома указывают на то что он хорош, но вот эти два признака могут сильно занижать цену дома
## Возможно веса модели этого признака слегка занижены

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,OverallCond,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SalePrice,Diff,Target,Prediction
711,30,RL,56.0,4130,IR1,Inside,BrkSide,1Story,3,6,...,0,0,0,0,7,2008,52000,64101.381119,52000.0,118778.619186
463,20,RL,60.0,8281,IR1,Inside,Sawyer,1Story,5,5,...,360,0,0,236,12,2009,62383,62820.931034,62383.0,167375.383495
851,120,RM,36.0,4435,Reg,Inside,CollgCr,1Story,6,5,...,420,140,0,0,11,2007,131500,52647.041667,131500.0,210461.278626
810,75,RM,90.0,8100,Reg,Corner,OldTown,2.5Unf,5,5,...,360,40,156,0,11,2009,106000,47828.362025,106000.0,148455.421466
621,30,RL,45.0,8248,Reg,Inside,Edwards,1Story,3,3,...,0,0,0,100,9,2008,67000,47475.0,67000.0,215043.080692
1073,50,RL,50.0,7585,Reg,Inside,Edwards,1.5Fin,5,3,...,280,0,0,0,8,2006,91500,42500.091854,91500.0,158912.035928
309,30,RL,,12342,IR1,Inside,Edwards,1Story,4,5,...,539,158,0,0,3,2009,82500,40753.333333,82500.0,355024.169492
1014,30,RM,60.0,7200,Reg,Inside,OldTown,1Story,5,4,...,280,0,30,226,6,2009,85000,40085.070423,85000.0,125203.931034
1365,160,FV,30.0,3180,Reg,Inside,Somerst,2Story,7,5,...,480,0,166,0,4,2006,144152,39995.041667,144152.0,199272.098266
668,20,RL,65.0,8125,Reg,Inside,SawyerW,1Story,6,5,...,575,224,42,0,10,2008,193500,35976.553498,193500.0,169707.0


In [150]:
### Попробуем оптимизировать параметр contamenation

for contamenation in np.arange(0.05, 0.4, 0.05):
    print(f"==== CONTAMENATION {contamenation} ====")
    
    isoForest = IsolationForest(contamination=contamenation, n_estimators = 50)

    X = preprocessors.fit_transform(houses_raw_df)
    lof_predictions = pd.Series(lof.fit_predict(X))

    X = pd.DataFrame(X)[lof_predictions.map({1: True, -1: False})]

    X_train_adjusted, X_test_adjusted, y_test_adjusted, y_test_adjusted = train_test_split(
        X,
        houses_raw_df[TARGET].iloc[X.index],
        test_size=0.25,
        random_state=43
    )
    abr = AdaBoostRegressor(n_estimators=50, loss="square", learning_rate=0.5)
    abr.fit(X_train_adjusted, y_test_adjusted)
    print("Train metrics: ")
    regression_metrics(abr, X_train_adjusted, y_test_adjusted) # Стало лучше по отношению к LocalOutlierFactor
    print("Test metrics: ")
    regression_metrics(abr, X_test_adjusted, y_test_adjusted)
    
    print("\n")
    
# Наиболее качественный результат получился при загрязнении 0.3. Что достаточно много. Подразумеваю что мне не нужно было трогать X_test
# Кажется что правильно очищать только train. Чтобы получать более достоверные метрики

==== CONTAMENATION 0.05 ====
Train metrics: 
r2_score:  0.8763275296494879
rmse:  23306.604160070943
mae:  18257.851821179233
Test metrics: 
r2_score:  0.8363939636419155
rmse:  32004.967260758298
mae:  21623.014840135427


==== CONTAMENATION 0.1 ====
Train metrics: 
r2_score:  0.8784813467093463
rmse:  23102.764768634806
mae:  17966.279506070307
Test metrics: 
r2_score:  0.8324392702872127
rmse:  32389.470418093453
mae:  21606.51685641192


==== CONTAMENATION 0.15000000000000002 ====
Train metrics: 
r2_score:  0.880063643077603
rmse:  22951.861190975276
mae:  17997.112551066526
Test metrics: 
r2_score:  0.8294396349649565
rmse:  32678.098993812036
mae:  22121.24849574286


==== CONTAMENATION 0.2 ====
Train metrics: 
r2_score:  0.8758530845970037
rmse:  23351.266962236485
mae:  18299.658577609087
Test metrics: 
r2_score:  0.8254552421907445
rmse:  33057.5855588988
mae:  22351.675692386045


==== CONTAMENATION 0.25 ====
Train metrics: 
r2_score:  0.8815480033335051
rmse:  22809.39021821

#### Выводы

В целом достаточно хорошие методы которые могут помочь отловить всякие выбросы
Но когда их очень много в выборке это не сильно помогает

Интересно что в совокупности предсказаний модели и избавлении от выбросов - можно проще провести анализ почему модель ошибается,
хоть и гипотетически но все же, ккаая то логика видна =) 