In [172]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from altair import Column
from category_encoders import OrdinalEncoder
from scipy.stats import ttest_ind
import shap
import catboost as cat

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_regression, chi2
from sklearn.neighbors import NearestNeighbors

# --- Display / library configuration -------------------------------------
# Show every column when pandas renders a DataFrame.
pd.set_option('display.max_columns', None)

# Ask scikit-learn transformers to return pandas DataFrames.
import sklearn
sklearn.set_config(transform_output="pandas")

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# --- Data loading ---------------------------------------------------------
# 'train.csv' carries the SalePrice target; 'test.csv' is read as features only.
train = pd.read_csv('train.csv')
val_X = pd.read_csv('test.csv')

# Split the training frame into features and target.
train_y = train['SalePrice']
train_X = train.drop(columns=['SalePrice'])



class mypreprocess(BaseEstimator, TransformerMixin):
    """Stateless cleaning step for the housing feature frame.

    ``transform`` does two things:

    * replaces the numeric ``MSSubClass`` codes with their text labels, and
    * fills missing values in the columns listed in ``nanistype`` with the
      literal category ``'Empty'``.

    The input frame is never modified; a transformed copy is returned.
    """

    def __init__(self):
        # Columns in which NaN is replaced by the explicit 'Empty' category.
        # NOTE(review): presumably NaN in these columns means "feature absent"
        # rather than "value unknown" (cf. the attribute name) - confirm
        # against the data description.
        self.nanistype=[
            'Alley',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'MasVnrType'
        ]
        # 'name' holds the column to relabel; every other key is a
        # code -> label pair for that column.
        self.rendict={
            'name':'MSSubClass',
            20: '1-STORY 1946 & NEWER ALL STYLES',
            30: '1-STORY 1945 & OLDER',
            40: '1-STORY W/FINISHED ATTIC ALL AGES',
            45: '1-1/2 STORY - UNFINISHED ALL AGES',
            50: '1-1/2 STORY FINISHED ALL AGES',
            60: '2-STORY 1946 & NEWER',
            70: '2-STORY 1945 & OLDER',
            75: '2-1/2 STORY ALL AGES',
            80: 'SPLIT OR MULTI-LEVEL',
            85: 'SPLIT FOYER',
            90: 'DUPLEX - ALL STYLES AND AGES',
            120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
            150: '1-1/2 STORY PUD - ALL AGES',
            160: '2-STORY PUD - 1946 & NEWER',
            180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
            190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'
        }

    def preprocess(self, df):
        """Return a cleaned copy of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the column named by ``self.rendict['name']`` and
            every column listed in ``self.nanistype``; a missing column
            raises ``KeyError``.

        Returns
        -------
        pandas.DataFrame
            A new frame. Known codes in the relabelled column become their
            text labels, values that already are labels are kept, and any
            other value becomes NaN. NaN in the ``nanistype`` columns becomes
            ``'Empty'``.
        """
        # Work on a copy so that transform() has no side effects on the
        # caller's frame (re-running a cell must not corrupt the raw data).
        out = df.copy()

        target = self.rendict['name']
        code_to_label = {
            code: label for code, label in self.rendict.items() if code != 'name'
        }
        column = out[target]
        relabelled = column.map(code_to_label)
        # Keep values that are already labels, so applying the step to an
        # already-processed frame does not wipe the column to NaN.
        already_labelled = column.isin(list(code_to_label.values()))
        out[target] = relabelled.where(~already_labelled, column)

        out[self.nanistype] = out[self.nanistype].fillna('Empty')
        return out

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`preprocess` to ``X`` and return the cleaned copy."""
        return self.preprocess(X)



def show_nan_count(train):
    """Summarise the columns of ``train`` that contain missing values.

    Returns a frame with two rows, ``NaN_count`` and ``data_type``, and one
    column per column of ``train`` that holds at least one NaN. Columns
    without missing values are left out.
    """
    missing_per_column = train.isna().sum()
    flagged = missing_per_column.index[missing_per_column > 0]
    summary = pd.DataFrame(
        data={'NaN_count': missing_per_column, 'data_type': train.dtypes}
    )
    # Select the affected features first, then flip so that each statistic
    # is a row and each feature is a column.
    return summary.loc[flagged].T



def get_col_cat(data):
    """Split the columns of ``data`` by dtype and count their distinct values.

    Returns a pair of dicts mapping column name to ``nunique()``: the first
    for columns whose dtype equals ``'object'``, the second for every other
    column. Both keep the column order of ``data``.
    """
    is_object = {name: data[name].dtype == 'object' for name in data.columns}
    object_counts = {
        name: data[name].nunique() for name, flag in is_object.items() if flag
    }
    other_counts = {
        name: data[name].nunique() for name, flag in is_object.items() if not flag
    }
    return object_counts, other_counts




# Stack the train and test features so every stage below is fitted on both.
fullset_X = pd.concat([train_X, val_X])

catdic, noncatdic = get_col_cat(fullset_X)

# Move MSSubClass from the non-object group to the categorical group;
# its entry in catdic is the placeholder value 1, not a real unique count.
noncatdic.pop('MSSubClass')
catdic['MSSubClass'] = 1

# Column set of the raw stacked frame, captured before fullset_X is reassigned.
all_columns = fullset_X.columns
# Options shared by all three ColumnTransformers.
passthrough_opts = dict(remainder='passthrough', verbose_feature_names_out=False)

# Stage 1: relabel MSSubClass and fill the "NaN is a category" columns.
fprep = ColumnTransformer(
    transformers=[('preprocess', mypreprocess(), all_columns)],
    **passthrough_opts
)

# Stage 2: ordinal-encode the categorical columns.
firtrans = ColumnTransformer(
    transformers=[('Cat_implace', OrdinalEncoder(), list(catdic))],
    **passthrough_opts
)

# Stage 3: KNN-impute over all columns.
altrans = ColumnTransformer(
    transformers=[('transformator', KNNImputer(n_neighbors=10), all_columns)],
    **passthrough_opts
)

# Fit the stages one after another, each on the output of the previous one.
fullset_X = fprep.fit_transform(fullset_X)
# The encoder is fitted only on rows that contain no NaN at all,
# but is then applied to every row.
firtrans.fit(fullset_X.dropna())
fullset_X = altrans.fit_transform(firtrans.transform(fullset_X))

# Bundle the already-fitted stages so they can be applied in one call.
pipeline = Pipeline(steps=[
    ('first', fprep),
    ('imputer', firtrans),
    ('transformer', altrans),
])



NaN_count
data_type


In [173]:
# Run the training features through the pipeline and list the columns that still contain NaN.
show_nan_count(pipeline.transform(train_X))

NaN_count
data_type


In [154]:
# NaN counts and dtypes of the raw training frame, before any preprocessing.
show_nan_count(train)

Unnamed: 0,LotFrontage,Alley,MasVnrType,MasVnrArea,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
NaN_count,259,1369,872,8,37,37,38,37,38,1,690,81,81,81,81,81,1453,1179,1406
data_type,float64,object,object,float64,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,object
