In [112]:
from altair import Column
from category_encoders import OrdinalEncoder
import shap
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
pd.set_option('display.max_columns', None)

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression, chi2
from scipy.stats import ttest_ind

sklearn.set_config(transform_output="pandas")
# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays.


import warnings
warnings.filterwarnings('ignore')

import catboost as cat

# Load the two raw CSV files from the working directory.
# Note that `val` is read from 'test.csv'.
train = pd.read_csv('train.csv')
val = pd.read_csv('test.csv')


def preprocess(df):
    """Decode ``MSSubClass`` codes and fill "feature absent" NaNs.

    Two steps are applied to a copy of ``df``:

    1. Numeric ``MSSubClass`` codes are replaced by their text labels.
       Values that are not known codes (including labels produced by an
       earlier call) are kept unchanged, so the function is idempotent and
       safe to re-run in a notebook cell.
    2. In the columns where a missing value is filled with a category of its
       own (alley, basement, fireplace, garage, pool, fence, misc feature,
       masonry veneer type), NaN is replaced by the string ``'Empty'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MSSubClass`` and every column listed in
        ``absent_is_category`` below; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is not modified.
    """
    subclass_column = 'MSSubClass'
    subclass_labels = {
        20: '1-STORY 1946 & NEWER ALL STYLES',
        30: '1-STORY 1945 & OLDER',
        40: '1-STORY W/FINISHED ATTIC ALL AGES',
        45: '1-1/2 STORY - UNFINISHED ALL AGES',
        50: '1-1/2 STORY FINISHED ALL AGES',
        60: '2-STORY 1946 & NEWER',
        70: '2-STORY 1945 & OLDER',
        75: '2-1/2 STORY ALL AGES',
        80: 'SPLIT OR MULTI-LEVEL',
        85: 'SPLIT FOYER',
        90: 'DUPLEX - ALL STYLES AND AGES',
        120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
        150: '1-1/2 STORY PUD - ALL AGES',
        160: '2-STORY PUD - 1946 & NEWER',
        180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
        190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
    }

    # Columns whose NaN is replaced by the explicit 'Empty' category.
    absent_is_category = [
        'Alley',
        'BsmtQual',
        'BsmtCond',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'FireplaceQu',
        'GarageType',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'PoolQC',
        'Fence',
        'MiscFeature',
        'MasVnrType',
    ]

    # Work on a copy so re-running the cell never corrupts the raw frame.
    df = df.copy()

    # `.get(code, code)` keeps unknown values instead of silently turning
    # them into NaN, which is what `Series.map(dict)` would do.
    df[subclass_column] = df[subclass_column].map(
        lambda code: subclass_labels.get(code, code)
    )

    df[absent_is_category] = df[absent_is_category].fillna('Empty')
    return df

def show_nan_count(train):
    """Summarise the columns of ``train`` that contain missing values.

    Returns a frame with one column per affected input column and two rows:
    ``NaN_count`` (number of missing entries) and ``data_type`` (the column's
    dtype). Columns without missing values are left out.
    """
    nan_totals = train.isna().sum()
    summary = pd.DataFrame(
        data={'NaN_count': nan_totals, 'data_type': train.dtypes}
    ).transpose()
    has_missing = summary.loc['NaN_count'] > 0
    return summary[summary.columns[has_missing]]

# Apply the same preprocessing to both frames.
train, val = preprocess(train), preprocess(val)

# Stack the predictors of both frames (SalePrice is removed from train first).
# Each frame keeps its own row labels here, so index values repeat in `fullset`.
fullset = pd.concat([train.drop(columns=['SalePrice']), val])

def get_col_cat(data,):
    """Split the columns of ``data`` by dtype and count their unique values.

    Returns a pair of dicts ``(object_columns, other_columns)``; each maps a
    column name to ``Series.nunique()`` for that column. A column lands in the
    first dict when its dtype compares equal to ``'object'``, otherwise in the
    second. Column order is preserved within each dict.
    """
    object_columns, other_columns = {}, {}
    for name in data.columns:
        series = data[name]
        bucket = object_columns if series.dtype == 'object' else other_columns
        bucket[name] = series.nunique()
    return object_columns, other_columns

# Unique-value counts per column of `fullset`: object-dtype columns go to
# `catdic`, all remaining columns to `noncatdic`.
catdic,noncatdic=get_col_cat(fullset)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors
import numpy as np

class AllImputer(BaseEstimator, TransformerMixin):
    """Fill missing numeric values with the mean of the k nearest complete rows.

    ``fit`` records which columns contain NaN (``columns_to_impute``), keeps
    the fully observed rows as the donor pool (``fullset``) and fits a
    ``NearestNeighbors`` index on the donors, using only the columns that had
    no missing values (``feature_columns``). A row with a gap can therefore
    be used as a query, which is impossible when the query still contains NaN.

    ``transform`` returns a copy of ``X`` in which every NaN of a column in
    ``columns_to_impute`` is replaced by the (optionally weighted) mean of
    that column over the row's nearest donors.

    Parameters
    ----------
    k : int, default=5
        Number of donor rows averaged per missing value (capped at the
        number of available donors).
    column_weights : dict or None, default=None
        Optional ``{column: weight}``. For every listed column in which a
        donor's value differs from the query row's value, ``weight`` is added
        to that donor's weight in the average. If all donor weights of a row
        sum to 0 (or less), the plain mean is used.
        NOTE(review): this gives *more* influence to donors that differ from
        the query row - the original behaviour is kept, confirm it is intended.

    Notes
    -----
    * All columns must be numeric (encode categoricals first).
    * Columns that were complete at ``fit`` time must also be complete in the
      frame passed to ``transform``; otherwise a ``ValueError`` is raised.
    """

    def __init__(self, k=5, column_weights=None):
        self.k = k
        self.column_weights = column_weights

    def fit(self, X, y=None):
        """Learn the imputation targets, the donor rows and the neighbour index."""
        # Columns with at least one missing value are the imputation targets;
        # every other column is usable as a distance feature.
        self.columns_to_impute = X.columns[X.isna().any()].tolist()
        targets = set(self.columns_to_impute)
        self.feature_columns = [col for col in X.columns if col not in targets]

        # Donor pool: rows without any missing value.
        self.fullset = X.dropna()

        self.nn_model = None
        if not self.columns_to_impute:
            # Nothing to impute, so no neighbour index is needed.
            return self
        if self.fullset.empty:
            raise ValueError(
                "AllImputer needs at least one row without missing values."
            )
        if not self.feature_columns:
            raise ValueError(
                "AllImputer needs at least one column without missing values "
                "to measure distances."
            )

        # The query rows are never part of the donor pool (they contain NaN),
        # so exactly k neighbours are requested and none has to be discarded.
        n_neighbors = min(self.k, len(self.fullset))
        self.nn_model = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn_model.fit(self.fullset[self.feature_columns])

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the learned target columns imputed."""
        # Never modify the caller's frame.
        X = X.copy()

        for column in self.columns_to_impute:
            # Positional boolean mask: robust to duplicated index labels.
            missing = X[column].isna().to_numpy()
            if not missing.any():
                continue

            queries = X.loc[missing, self.feature_columns]
            if queries.isna().any().any():
                raise ValueError(
                    "Columns that were complete during fit contain missing "
                    "values in transform; cannot compute neighbours."
                )

            # kneighbors returns *positions* in the fitted donor array,
            # shape (n_missing_rows, n_neighbors) - index with them positionally.
            _, neighbor_positions = self.nn_model.kneighbors(queries)
            donor_values = self.fullset[column].to_numpy(dtype=float)[neighbor_positions]
            plain_mean = donor_values.mean(axis=1)

            if self.column_weights:
                weights = np.zeros(donor_values.shape, dtype=np.float64)
                for col, weight in self.column_weights.items():
                    donor_col = self.fullset[col].to_numpy()[neighbor_positions]
                    query_col = X.loc[missing, col].to_numpy()[:, None]
                    # Add the column's weight wherever donor and query differ.
                    weights += weight * (donor_col != query_col)

                totals = weights.sum(axis=1)
                use_weights = totals > 0
                # Guard the division; rows with a zero total use the plain mean.
                weighted_mean = (donor_values * weights).sum(axis=1) / np.where(
                    use_weights, totals, 1.0
                )
                imputed = np.where(use_weights, weighted_mean, plain_mean)
            else:
                imputed = plain_mean

            # Fill the gaps of this column.
            X.loc[missing, column] = imputed

        return X

# Ordinal-encode every object-dtype column listed in `catdic`; all remaining
# columns are passed through unchanged.
firtrans= ColumnTransformer (
    transformers=[
        ('Cat_implace', OrdinalEncoder(), list(catdic.keys()))
    ],
    remainder='passthrough'
)
# altrans = ColumnTransformer (
#     transformers=[
#         ('transformator', AllImputer(), list(noncatdic.keys())+list(catdic.keys()))
#     ],
#     remainder='passthrough'
# )
# pipeline = Pipeline([
#     ('imputer', firtrans),
#     ('transformer', altrans)
# ])



# Rows containing any NaN are dropped before encoding, and `fullset` is
# overwritten with the encoded frame.
fullset = firtrans.fit_transform(fullset.dropna())

In [113]:
fullset

Unnamed: 0,Cat_implace__MSSubClass,Cat_implace__MSZoning,Cat_implace__Street,Cat_implace__Alley,Cat_implace__LotShape,Cat_implace__LandContour,Cat_implace__Utilities,Cat_implace__LotConfig,Cat_implace__LandSlope,Cat_implace__Neighborhood,Cat_implace__Condition1,Cat_implace__Condition2,Cat_implace__BldgType,Cat_implace__HouseStyle,Cat_implace__RoofStyle,Cat_implace__RoofMatl,Cat_implace__Exterior1st,Cat_implace__Exterior2nd,Cat_implace__MasVnrType,Cat_implace__ExterQual,Cat_implace__ExterCond,Cat_implace__Foundation,Cat_implace__BsmtQual,Cat_implace__BsmtCond,Cat_implace__BsmtExposure,Cat_implace__BsmtFinType1,Cat_implace__BsmtFinType2,Cat_implace__Heating,Cat_implace__HeatingQC,Cat_implace__CentralAir,Cat_implace__Electrical,Cat_implace__KitchenQual,Cat_implace__Functional,Cat_implace__FireplaceQu,Cat_implace__GarageType,Cat_implace__GarageFinish,Cat_implace__GarageQual,Cat_implace__GarageCond,Cat_implace__PavedDrive,Cat_implace__PoolQC,Cat_implace__Fence,Cat_implace__MiscFeature,Cat_implace__SaleType,Cat_implace__SaleCondition,remainder__Id,remainder__LotFrontage,remainder__LotArea,remainder__OverallQual,remainder__OverallCond,remainder__YearBuilt,remainder__YearRemodAdd,remainder__MasVnrArea,remainder__BsmtFinSF1,remainder__BsmtFinSF2,remainder__BsmtUnfSF,remainder__TotalBsmtSF,remainder__1stFlrSF,remainder__2ndFlrSF,remainder__LowQualFinSF,remainder__GrLivArea,remainder__BsmtFullBath,remainder__BsmtHalfBath,remainder__FullBath,remainder__HalfBath,remainder__BedroomAbvGr,remainder__KitchenAbvGr,remainder__TotRmsAbvGrd,remainder__Fireplaces,remainder__GarageYrBlt,remainder__GarageCars,remainder__GarageArea,remainder__WoodDeckSF,remainder__OpenPorchSF,remainder__EnclosedPorch,remainder__3SsnPorch,remainder__ScreenPorch,remainder__PoolArea,remainder__MiscVal,remainder__MoSold,remainder__YrSold
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,150.0,856.0,856,854,0,1710,1.0,0.0,2,1,3,1,8,0,2003.0,2.0,548.0,0,61,0,0,0,0,0,2,2008
1,2,1,1,1,1,1,1,2,1,2,2,1,1,2,1,1,2,2,2,2,1,2,1,1,2,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,2,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,0,0,1262,0.0,1.0,2,0,3,1,6,1,1976.0,2.0,460.0,298,0,0,0,0,0,0,5,2007
2,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,3,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,434.0,920.0,920,866,0,1786,1.0,0.0,2,1,3,1,6,1,2001.0,2.0,608.0,0,42,0,0,0,0,0,9,2008
3,3,1,1,1,2,1,1,3,1,3,1,1,1,1,1,1,3,3,2,2,1,3,2,2,1,2,1,1,2,1,1,1,1,3,2,2,1,1,1,1,1,1,1,2,4,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,540.0,756.0,961,756,0,1717,1.0,0.0,1,0,3,1,7,1,1998.0,3.0,642.0,0,35,272,0,0,0,0,2,2006
4,1,1,1,1,2,1,1,2,1,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,5,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,1053,0,2198,1.0,0.0,2,1,4,1,9,1,2000.0,3.0,836.0,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,2,1,1,1,1,1,1,1,2,5,1,1,1,2,1,1,9,6,1,2,1,1,2,1,4,4,4,1,4,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,2912,80.0,13384,5,5,1969,1979,194.0,119.0,344.0,641.0,1104.0,1360,0,0,1360,1.0,0.0,1,0,3,1,8,1,1969.0,1.0,336.0,160,0,0,0,0,0,0,5,2006
1452,11,2,1,1,1,1,1,1,1,14,1,1,5,1,1,1,7,7,2,2,1,2,2,1,1,4,1,1,3,1,1,2,1,1,4,2,1,1,1,1,1,1,1,2,2913,21.0,1533,4,5,1970,1970,0.0,408.0,0.0,138.0,546.0,546,546,0,1092,0.0,0.0,1,1,3,1,5,0,1970.0,1.0,286.0,0,0,0,0,0,0,0,12,2006
1455,11,2,1,1,1,1,1,1,1,14,1,1,4,1,1,1,7,7,2,2,1,2,2,1,1,4,1,1,3,1,1,2,1,1,4,2,1,1,1,1,1,1,1,2,2916,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,294.0,546.0,546,546,0,1092,0.0,0.0,1,1,3,1,6,0,1970.0,1.0,286.0,0,24,0,0,0,0,0,4,2006
1456,2,1,1,1,1,1,1,1,1,5,1,1,1,2,1,1,1,1,2,2,1,2,2,1,1,2,1,1,1,1,1,2,1,2,2,2,1,1,1,1,1,1,1,2,2917,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,0.0,1224.0,1224,0,0,1224,1.0,0.0,1,0,4,1,7,1,1960.0,2.0,576.0,474,0,0,0,0,0,0,9,2006


In [87]:
show_nan_count(val)

Unnamed: 0,MSZoning,LotFrontage,Utilities,Exterior1st,Exterior2nd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,KitchenQual,Functional,GarageYrBlt,GarageCars,GarageArea,SaleType
NaN_count,4,227,2,1,1,15,1,1,1,1,2,2,1,2,78,1,1,1
data_type,object,float64,object,object,object,float64,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,object


In [88]:
show_nan_count(train)

Unnamed: 0,LotFrontage,MasVnrArea,Electrical,GarageYrBlt
NaN_count,259,8,1,81
data_type,float64,float64,object,float64
