In [681]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, QuantileTransformer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import  KernelPCA
from sklearn.cluster import AgglomerativeClustering

import warnings
warnings.filterwarnings('ignore')

Helpers

In [682]:
def cat_features(df: pd.DataFrame):
    return df.select_dtypes(exclude="number").columns

def number_features(df: pd.DataFrame):
    return df.select_dtypes(include="number").columns

### Кластеризация и визуализация

##### Анализ данных

***Попробуем улучшить предыдущие попытки***

In [683]:
# Продолжу работать с домиками
houses_raw_df = pd.read_csv('houses.csv', sep=',', index_col='Id')
houses_raw_df.sample(5)

# Огромное кол-во признаков: 80 столбцов
# Уже знаем что нет дубликатов и в целом датафрейм хороший, пропусков мало

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
231,20,RL,73.0,8760,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,148000
419,50,RL,60.0,8160,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2007,WD,AdjLand,126000
307,60,RL,116.0,13474,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2007,WD,Normal,225000
312,20,RL,50.0,8000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,5,2009,WD,Normal,132000
309,30,RL,,12342,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,3,2009,WD,Normal,82500


In [684]:
# И все же удалим признаки для которых много пропусков
_to_remove = houses_raw_df.columns[houses_raw_df.count() < 800]
print(f"removing features {_to_remove}")
houses_raw_df.drop(_to_remove, axis=1, inplace=True)

removing features Index(['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [685]:
TARGET = "OverallQual"

In [686]:
houses_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemodAdd 

In [687]:
_num_features = number_features(houses_raw_df)
_n_cols = int(np.ceil(len(_num_features) / 2))
fig = make_subplots(rows=2, cols=_n_cols, subplot_titles=_num_features)

for idx, feature in enumerate(_num_features):
    r = idx // _n_cols + 1
    c = idx % _n_cols + 1
    fig.add_histogram(
        x=houses_raw_df[feature],
        row=r, col=c,
        name=feature,
        histnorm='probability'
    )
    
fig.update_layout(height=700, width=4500, title_text="Houses Histograms")
fig.show()

# Есть признаки которые относятся к категориальным. Переведем их в раздел таковых.
# А есть признаки, в которых > 90% процентов данных это какой-то один класс. Предлагается от таких признаков избавиться
# Также приметим признаки которые стоит логарифмировать или провести иные преобразования

In [688]:
# Рассмотрим категориальные признаки

_cat_features = cat_features(houses_raw_df)
_n_cols = int(np.ceil(len(_num_features) / 2))
fig = make_subplots(rows=2, cols=_n_cols, subplot_titles=_cat_features)

for idx, feature in enumerate(_cat_features):
    r = idx // _n_cols + 1
    c = idx % _n_cols + 1
    fig.add_histogram(
        x=houses_raw_df[feature],
        row=r, col=c,
        name=feature,
    )
    
fig.update_layout(height=700, width=4500, title_text="Houses Histograms")
fig.show()

# Отсюда также предлагается удалить признаки где один единственный класс представляет большую часть признака.
# В целом с категориальными признакми все более менее стандартно.

##### Обработка данных

In [689]:
# Удалим признаки в которых одно значение представляет большую часть признака. Как порог возьмем 0.9
_to_del = []
for feature in houses_raw_df.columns.drop(TARGET):
    feature_samples = houses_raw_df[feature].count()
    most_frequent = houses_raw_df[feature].value_counts().iloc[0]
    if (most_frequent / feature_samples) > 0.9:
        _to_del.append(feature)
        
print(f"Remove highly imbalanced features: {_to_del}")
houses_raw_df.drop(_to_del, axis=1, inplace=True)

houses_raw_df.head(10)
# Окей осталось 54 фичи остальные бесполезные признаки откинем в рамках методов уменьшения размерности

Remove highly imbalanced features: ['Street', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl', 'BsmtCond', 'Heating', 'CentralAir', 'Electrical', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Reg,Lvl,Inside,CollgCr,Norm,1Fam,...,2,548,0,61,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Reg,Lvl,FR2,Veenker,Feedr,1Fam,...,2,460,298,0,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,IR1,Lvl,Inside,CollgCr,Norm,1Fam,...,2,608,0,42,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,IR1,Lvl,Corner,Crawfor,Norm,1Fam,...,3,642,0,35,272,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,IR1,Lvl,FR2,NoRidge,Norm,1Fam,...,3,836,192,84,0,12,2008,WD,Normal,250000
6,50,RL,85.0,14115,IR1,Lvl,Inside,Mitchel,Norm,1Fam,...,2,480,40,30,0,10,2009,WD,Normal,143000
7,20,RL,75.0,10084,Reg,Lvl,Inside,Somerst,Norm,1Fam,...,2,636,255,57,0,8,2007,WD,Normal,307000
8,60,RL,,10382,IR1,Lvl,Corner,NWAmes,PosN,1Fam,...,2,484,235,204,228,11,2009,WD,Normal,200000
9,50,RM,51.0,6120,Reg,Lvl,Inside,OldTown,Artery,1Fam,...,2,468,90,0,205,4,2008,WD,Abnorml,129900
10,190,RL,50.0,7420,Reg,Lvl,Corner,BrkSide,Artery,2fmCon,...,1,205,0,4,0,1,2008,WD,Normal,118000


In [690]:
# Переведем некоторые числовые признаки в разряд категориальных
_to_cat = []
for feature in number_features(houses_raw_df).drop(TARGET): # таргет оставим как есть
    n_unique = houses_raw_df[feature].nunique()
    if n_unique < 20:
        _to_cat.append(feature)
        
print(f"List of features to be converted to categorical: {_to_cat}")
houses_raw_df[_to_cat].astype("category")
# Видно все ок, нужные фичи перевелись в категории

List of features to be converted to categorical: ['MSSubClass', 'OverallCond', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold', 'YrSold']


Unnamed: 0_level_0,MSSubClass,OverallCond,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,60,5,1,2,1,3,8,0,2,2,2008
2,20,8,0,2,0,3,6,1,2,5,2007
3,60,5,1,2,1,3,6,1,2,9,2008
4,70,5,1,1,0,3,7,1,3,2,2006
5,60,5,1,2,1,4,9,1,3,12,2008
...,...,...,...,...,...,...,...,...,...,...,...
1456,60,5,0,2,1,3,7,1,2,8,2007
1457,20,6,1,2,0,3,7,2,2,2,2010
1458,70,9,0,2,0,4,9,2,1,5,2010
1459,20,6,1,1,0,2,5,0,1,4,2010


In [691]:
print(
    "number of numeric features", len(number_features(houses_raw_df).drop(TARGET)),
    "number of categorical features", len(cat_features(houses_raw_df)),
)

number of numeric features 29 number of categorical features 24


#### Визуализируем данные и обработаем данные
2 в 1, так как нужно визуализировать и после уменьшения размерности

In [692]:
# Чтобы заменить выбросы. 
# UPD: Оказывается в sklearn уже есть QuantileTransformer
class OutliersReplacer(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5) -> None:
        self.factor = factor
        super().__init__()
    
    def _replace_outliers(self, X, y=None):
        _X = pd.Series(X.copy())
        q1 = _X.quantile(0.25)
        q3 = _X.quantile(0.75)
        iqr = q3 - q1
        median = _X.mode().values[0]
        _X.apply(lambda e: e if q1 - (self.factor * iqr) <= e <= q3 + (self.factor * iqr) else median)
        return _X
        
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        _X = pd.DataFrame(X.copy())
        for col in range(_X.shape[1]):
            column = _X.iloc[:, col]
            column = self._replace_outliers(column)
            _X.iloc[:, col] = column
        return _X

In [693]:
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(n_quantiles=50, output_distribution="normal")),
    # ("quantile", OutliersReplacer()), # Оказался хуже чем QuantileTransformer
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, number_features(houses_raw_df).drop(TARGET)),
    ("cat", cat_pipe, cat_features(houses_raw_df))
])

preprocessors_pipe = Pipeline([
    ("preprocessing", preprocessors)
])


X = houses_raw_df.drop(TARGET, axis=1)
y = houses_raw_df[TARGET]

X_transformed = preprocessors_pipe.fit_transform(X)

In [694]:
n_iter = 500
perplexity = 15
tsne = TSNE(
    n_components=2, 
    verbose=1, 
    perplexity=perplexity,
    n_iter=n_iter
    )

tsne_res = tsne.fit_transform(X_transformed)

[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Indexed 1460 samples in 0.000s...
[t-SNE] Computed neighbors for 1460 samples in 0.025s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1460
[t-SNE] Computed conditional probabilities for sample 1460 / 1460
[t-SNE] Mean sigma: 1.847979
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.622971
[t-SNE] KL divergence after 500 iterations: 1.286533


In [695]:
def tsne_vis(results):
    fig = px.scatter(
        x=results[:, 0],
        y=results[:, 1],
        color=y.astype(str).values,
        text=y,
    )
    fig.update_layout(
        height=650,
        width=1000,
        title_text="TSNE vizualization `OverallQual`"
    )
    fig.show()
tsne_vis(tsne_res)

# В первых проход tsne - каких то конкретных групп не получилось. Много 5-ок в разных местах. Попробую другие параметры
# Уменьшение и увеличение perplexity особо не помогло
# При 15 на мой взгляд получилось более менее отчетливое разбиение хоть и много примесей.
# Применим уменьшение размерности и проверим результаты позже

##### Кластеризация

In [696]:
# Попробуем EM kластеризацию хотя кажется здесь она плохо себя покажет

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(n_quantiles=300, output_distribution="normal")),
    ("scaler", MinMaxScaler())
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, number_features(houses_raw_df).drop(TARGET)),
    # ("cat", cat_pipe, cat_features(houses_raw_df))
])

model_pipe = Pipeline([
    ("preprocessing", preprocessors),
    ("model", GaussianMixture(n_components=10, covariance_type='spherical'))
])

X_train, X_test, y_train, y_test = train_test_split(
    houses_raw_df.drop(np.append(cat_features(houses_raw_df).values, [TARGET]), axis=1), # Тк EM кластеризация удалим категориальные фичи
    houses_raw_df[TARGET],
    test_size=0.2,
    random_state=43,
    stratify=houses_raw_df[TARGET]
)

model_pipe.fit(X_train)

In [697]:
# Проверим результаты
def collect_metrics(model, X, y):
    y_hat = model.predict(X)
    print("f1 score: ", f1_score(y, y_hat, average="macro"))
    print("precision_score: ", precision_score(y, y_hat, average="macro"))
    print("accuracy_score: ", accuracy_score(y, y_hat))
    print("recall: ", recall_score(y, y_hat, average="macro"))

collect_metrics(model_pipe, X_train, y_train)

def highlight(df):
    attrs = pd.DataFrame('', index=df.index, columns=df.columns)
    
    for index in df.index:
        for col in df.columns:
            if index == col:
                attrs.loc[index, col] = "background: green"
            elif df.loc[index, col] > 10:
                attrs.loc[index, col] = "background: red"
    return attrs

cross_tab = pd.crosstab(
    index=y_train,
    columns=model_pipe.predict(X_train),
    rownames=["Классы"],
    colnames=["Предсказания"]
)
cross_tab.style.apply(highlight, axis=None)
# Оч плохие результаты, возможно без преобразования пространства тут ни один алгоритм себя хорошо не покажет
# На тестовой выборке даже смотреть нет смысла =)

f1 score:  0.09809664192504286
precision_score:  0.13065090951283687
accuracy_score:  0.1601027397260274
recall:  0.13673194450935952


Предсказания,0,1,2,3,4,5,6,7,8,9
Классы,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,2,0,0,0,0,0,0
2,0,0,0,2,0,0,0,0,0,0
3,1,0,0,13,0,1,0,0,0,1
4,2,22,0,38,2,8,0,9,0,12
5,7,70,2,50,9,74,7,56,0,43
6,26,15,21,26,12,50,41,55,9,44
7,36,8,37,2,44,16,48,35,18,11
8,6,0,24,2,25,7,32,6,22,11
9,0,0,8,0,6,4,2,0,14,0
10,0,0,6,0,1,1,1,2,3,0


##### Применим Kernel PCA

###### Но сначала визуализируем их через tsne

In [720]:
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(n_quantiles=50, output_distribution="normal")),
    ("scaler", StandardScaler()),
    ("kpca", KernelPCA(n_components=8, kernel="sigmoid", degree=5, coef0=2.5, gamma=10))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, number_features(houses_raw_df).drop(TARGET)),
    ("cat", cat_pipe, cat_features(houses_raw_df))
])

preprocessors_pipe = Pipeline([
    ("preprocessing", preprocessors)
])


X = houses_raw_df.drop(TARGET, axis=1)
y = houses_raw_df[TARGET]

X_transformed = preprocessors_pipe.fit_transform(X)
X_transformed.shape

(1460, 178)

In [721]:
n_iter = 500
perplexity = 5
tsne = TSNE(
    n_components=2, 
    verbose=1, 
    perplexity=perplexity,
    n_iter=n_iter
    )

tsne_res = tsne.fit_transform(X_transformed)

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 1460 samples in 0.002s...
[t-SNE] Computed neighbors for 1460 samples in 0.119s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1460
[t-SNE] Computed conditional probabilities for sample 1460 / 1460
[t-SNE] Mean sigma: 0.894834
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.405609
[t-SNE] KL divergence after 500 iterations: 1.548095


In [722]:
# Попробовав различные ничего линейно или криволинейно разделимого не нашел(
tsne_vis(tsne_res)

In [701]:
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(n_quantiles=50, output_distribution="normal")),
    ("scaler", StandardScaler()),
    ("kpca", KernelPCA(n_components=8, kernel="sigmoid", degree=5, coef0=2.5, gamma=10))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, number_features(houses_raw_df).drop(TARGET)),
])

model_pipe = Pipeline([
    ("preprocessing", preprocessors),
    ("model", GaussianMixture(n_components=10, covariance_type='full'))
])

X_train, X_test, y_train, y_test = train_test_split(
    houses_raw_df.drop(np.append(cat_features(houses_raw_df).values, [TARGET]), axis=1),
    houses_raw_df[TARGET],
    test_size=0.3,
    random_state=43,
    stratify=houses_raw_df[TARGET]
)

model_pipe.fit(X_train)

collect_metrics(model_pipe, X_train, y_train)

cross_tab = pd.crosstab(
    index=y_train,
    columns=model_pipe.predict(X_train),
    rownames=["Классы"],
    colnames=["Предсказания"]
)
cross_tab.style.apply(highlight, axis=None)
# Стало незначительно лучше, без прохода по сетке тут не обойтись

f1 score:  0.07114879900669405
precision_score:  0.08837923224585838
accuracy_score:  0.11937377690802348
recall:  0.07690101647926612


Предсказания,0,1,2,3,4,5,6,7,8,9
Классы,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0
3,1,1,0,0,9,0,2,0,1,0
4,4,29,1,0,31,0,12,2,2,0
5,33,81,2,1,59,3,78,13,5,3
6,38,24,5,12,32,9,67,14,24,37
7,25,1,36,25,5,19,17,14,17,64
8,4,0,21,19,1,19,4,14,4,32
9,0,0,5,5,0,15,0,2,0,3
10,0,0,0,3,0,4,0,0,0,6


In [702]:
param_grid = {
    "preprocessing__num__kpca__n_components": np.arange(8, 12, 1),
    "preprocessing__num__kpca__degree": np.arange(3, 5, 1),
    "preprocessing__num__kpca__gamma": np.arange(5, 15, 5),
}
gs = GridSearchCV(estimator=model_pipe,
                  param_grid=param_grid,
                  cv=KFold(n_splits=5))
gs.fit(X_train, y_train)

In [703]:
gs.best_estimator_.get_params()["preprocessing__num__kpca"]

#### Проверим другую модель кластеризации

In [704]:
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(n_quantiles=30, output_distribution="normal")),
    ("scaler", StandardScaler()),
    ("kpca", KernelPCA(n_components=15, kernel="sigmoid", degree=2, coef0=2.5, gamma=10))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipe, number_features(houses_raw_df).drop(TARGET)),
    ("cat", cat_pipe, cat_features(houses_raw_df))
])

model_pipe = Pipeline([
    ("preprocessing", preprocessors),
    ("model", AgglomerativeClustering(n_clusters=10, linkage="complete"))
])

X_train, X_test, y_train, y_test = train_test_split(
    houses_raw_df.drop(TARGET, axis=1),
    houses_raw_df[TARGET],
    test_size=0.3,
    random_state=43,
    stratify=houses_raw_df[TARGET]
)

model_pipe.fit(X_train)
y_hat = model_pipe.fit_predict(X_train)

In [705]:
def print_metrics(y_hat, y_true):
    print("f1 score", f1_score(y_true, y_hat, average="micro"))
    print("accuracy", accuracy_score(y_true, y_hat))
    print("precision", precision_score(y_true, y_hat, average="micro"))
    print("recall", recall_score(y_true, y_hat, average="micro"))
    
print("Метрики на обучающей выборке")
print_metrics(y_hat, y_train)

Метрики на обучающей выборке
f1 score 0.18003913894324852
accuracy 0.18003913894324852
precision 0.18003913894324852
recall 0.18003913894324852


In [706]:
y_hat = model_pipe.fit_predict(X_test)
print("Метрики на тестовой выборке")
print_metrics(y_hat, y_test)

Метрики на тестовой выборке
f1 score 0.14840182648401826
accuracy 0.14840182648401826
precision 0.14840182648401826
recall 0.14840182648401826


#### Выводы

1. Как-то грустно все получилось. Похоже данные слишком сложные для кластеризации методов основанных на близости или распределении, либо я где-то допуусстил критическую ошибку
2. Тем не менее KPCA показался достаточно сложным в настройке, и подобрать его параметры на сетке не вышло даже за час
3. tsne - при помощи tsne не получилось визуализировать данные в группы