In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, QuantileTransformer, PowerTransformer, MaxAbsScaler
from sklearn.model_selection import GridSearchCV, KFold
from category_encoders import TargetEncoder
from sklearn.compose import make_column_selector, TransformedTargetRegressor

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.svm import SVC
from catboost import CatBoostRegressor
# import xgboost as xgb
# from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

import warnings
# Игнорировать все предупреждения
warnings.simplefilter(action='ignore', category=Warning)


# tunning hyperparamters model
import optuna
from optuna.samplers import TPESampler

In [2]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')


In [3]:
numerical_columns = data_train.select_dtypes(include=['int64', 'float64']).columns

# Вычисление корреляций с целевой переменной
correlations = data_train[numerical_columns].apply(lambda x: x.corr(data_train['SalePrice']), axis=0).sort_values(ascending=False)

# Вывод списка корреляций
print(correlations)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
dtype: float

In [4]:
data_train['MiscVal'].unique()

array([    0,   700,   350,   500,   400,   480,   450, 15500,  1200,
         800,  2000,   600,  3500,  1300,    54,   620,   560,  1400,
        8300,  1150,  2500])

In [5]:
filtered_correlations = correlations[(correlations >= -0.1) & (correlations <= 0.1)]

# Вывод списка имен столбцов
selected_columns = filtered_correlations.index.tolist()
print(selected_columns)

['PoolArea', 'MoSold', '3SsnPorch', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'Id', 'LowQualFinSF', 'YrSold', 'OverallCond', 'MSSubClass']


In [6]:
X, y = data_train.drop('SalePrice', axis=1), data_train['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

In [7]:
cat_cols = data_train.select_dtypes(include='object').columns.to_list()
num_cols = data_train.select_dtypes(include=['float64', 'int64']).columns.to_list()

In [8]:
drop_features = ['Id','Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature','LandSlope','GarageQual','GarageCond','MiscVal','Utilities','YrSold', 'MSSubClass','OverallCond', 'LowQualFinSF', 'MiscVal', 'BsmtHalfBath', '3SsnPorch', 'MoSold', 'PoolArea']

In [9]:
my_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features),
        ('num_imputer', SimpleImputer(strategy='mean'), make_column_selector(dtype_include=['float64', 'int64'])),
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), make_column_selector(dtype_include='object'))
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
) 

filled_data = my_imputer.fit_transform(X_train)

In [10]:
pd.DataFrame(data={'NaN_count': filled_data.isna().sum(), 'data_type':filled_data.dtypes})

Unnamed: 0,NaN_count,data_type
Id,0,float64
MSSubClass,0,float64
LotFrontage,0,float64
LotArea,0,float64
OverallQual,0,float64
...,...,...
PoolQC,0,object
Fence,0,object
MiscFeature,0,object
SaleType,0,object


In [11]:
my_imputer.transform(X_train).isna().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
                ..
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

In [12]:
scaler_and_encoder = ColumnTransformer(
    [
        ('target_encoding', TargetEncoder(), make_column_selector(dtype_include='object')),
        ('scaling_num_columns', StandardScaler(), make_column_selector(dtype_include=['float64', 'int64']))
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

In [13]:
scaler_for_obj = ColumnTransformer(
    [
        ('scaling2', StandardScaler(), make_column_selector(dtype_include=['float64', 'int64']))
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

In [14]:
from sklearn.feature_selection import SelectKBest, f_regression
preprocessor = Pipeline(
    [
        ('imputer', my_imputer),
        ('scaler_and_encoder', scaler_and_encoder),
        ('scaling2', scaler_for_obj)
    ]
)

In [15]:
new_df = preprocessor.fit_transform(X_train, y_train)
new_df

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
397,0.423421,0.062684,0.176866,-0.770813,0.056616,0.031296,-0.365252,-0.172473,-0.723815,0.598954,...,-0.995829,-0.757412,1.890607,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,0.251991,-0.617357
593,-2.321569,0.062684,0.176866,-0.770813,0.056616,0.031296,-0.365252,-0.172473,0.469327,0.322606,...,-0.256081,0.336709,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,-0.484876,0.888463
361,0.423421,0.062684,-5.654002,-0.770813,0.056616,0.031296,-0.365252,-0.172473,-1.076576,0.322606,...,-1.109636,-0.757412,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,0.251991,0.135553
1318,0.423421,0.062684,0.176866,1.958855,0.056616,0.031296,3.523470,-0.172473,0.469327,0.322606,...,1.299288,0.789988,1.611741,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,0.620425,-1.370266
916,-1.045823,0.062684,0.176866,-0.770813,0.056616,0.031296,-0.365252,-0.172473,-1.173379,0.322606,...,-0.787182,-0.757412,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,1.357292,-1.370266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0.423421,0.062684,0.176866,-0.770813,0.056616,0.031296,0.441674,-0.172473,-0.921239,-3.131216,...,-0.654407,-0.757412,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,-0.484876,1.641373
255,0.423421,0.062684,0.176866,1.226862,0.056616,0.031296,-0.365252,-0.172473,0.265383,0.322606,...,-0.213403,0.743097,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,-1.590177,-1.370266
356,0.423421,0.062684,0.176866,1.226862,0.056616,0.031296,-0.365252,-0.172473,0.265383,0.322606,...,-0.350920,0.180406,-0.309336,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,0.251991,0.888463
574,0.423421,0.062684,0.176866,-0.770813,0.056616,0.031296,0.441674,-0.172473,-0.723815,0.322606,...,-0.882022,-0.483882,-0.712142,-0.360412,-0.109151,-0.269692,-0.076003,-0.093955,2.094159,-0.617357


In [31]:
from cgi import test
from math import exp
import numpy as np
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from altair import Column
from category_encoders import OneHotEncoder, OrdinalEncoder,TargetEncoder
from scipy.stats import ttest_ind
import shap
import catboost as cat

from sklearn import pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor, make_column_selector
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.feature_selection import f_regression, chi2
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neighbors import KNeighborsRegressor
# init js
shap.initjs()
from sklearn.metrics import make_scorer

# Устанавливаем вывод pandas в удобочитаемый формат
pd.set_option('display.max_columns', None)

# Настройка вывода pandas через sklearn
import sklearn
sklearn.set_config(transform_output="pandas")

# Игнорируем предупреждения
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
val = pd.read_csv('test.csv')

x_train_X=train.drop(['SalePrice'],axis=1)
y_train_y=train['SalePrice']
y_train_ylog=np.log1p (train['SalePrice'])


class mypreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.nanistype=[
            'Alley',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'MasVnrType'
        ]
        self.rendict={
            'name':'MSSubClass',
            20: '1-STORY 1946 & NEWER ALL STYLES',
            30: '1-STORY 1945 & OLDER',
            40: '1-STORY W/FINISHED ATTIC ALL AGES',
            45: '1-1/2 STORY - UNFINISHED ALL AGES',
            50: '1-1/2 STORY FINISHED ALL AGES',
            60: '2-STORY 1946 & NEWER',
            70: '2-STORY 1945 & OLDER',
            75: '2-1/2 STORY ALL AGES',
            80: 'SPLIT OR MULTI-LEVEL',
            85: 'SPLIT FOYER',
            90: 'DUPLEX - ALL STYLES AND AGES',
            120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
            150: '1-1/2 STORY PUD - ALL AGES',
            160: '2-STORY PUD - 1946 & NEWER',
            180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
            190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'
        }

    def preprocess( self, df):
       
        df[self.rendict['name']]=df[self.rendict['name']].map( self.rendict)    
        df[ self.nanistype]=df[self.nanistype].fillna('Empty')
        return df
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.preprocess(X)

def show_nan_count(train):
    d=pd.DataFrame(data={'NaN_count': train.isna().sum(), 'data_type': train.dtypes}).T
    filtered_columns = d.columns[d.loc['NaN_count'] > 0]
    filtered_df = d[filtered_columns]
    return filtered_df

#drop_features = ['Id','Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature','LandSlope','GarageQual','GarageCond','MiscVal','Utilities','YrSold', 'MSSubClass','OverallCond', 'LowQualFinSF', 'MiscVal', 'BsmtHalfBath', '3SsnPorch', 'MoSold', 'PoolArea']
drop_features=['Id']
firimputer= ColumnTransformer(
    transformers=[
        ('mypreprocess', mypreprocess(), x_train_X.columns)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)
my_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features),
        ('num_imputer', KNNImputer(n_neighbors=5), make_column_selector(dtype_include=['float64', 'int64'])),
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), make_column_selector(dtype_include='object'))
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
) 
scaler_and_encoder = ColumnTransformer(
    [
        ('target_encoding', TargetEncoder(), make_column_selector(dtype_include='object')),
        ('scaling_num_columns', StandardScaler(), make_column_selector(dtype_include=['float64', 'int64']))
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

prepreprocessor = Pipeline(
    [
        ('firimputer', firimputer),
        ('imputer', my_imputer),
        ('scaler_and_encoder', scaler_and_encoder),
    ]
)

pretrain_X = prepreprocessor.fit_transform(x_train_X,y_train_ylog)
pretrain_y = y_train_ylog

randomseed = 42
def rmse(y_true, y_pred):
    return -np.sqrt(np.mean((y_true - y_pred) ** 2))

# Создание кастомной метрики для использования в cross_val_score
rmse_scorer = make_scorer(rmse, greater_is_better=False)

def objectiveknn(trial):
    params = {'n_neighbors': trial.suggest_int('n_neighbors', 1, 10)}
    model = KNeighborsRegressor(**params)
    scores =  cross_val_score(model, pretrain_X, pretrain_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)

def objectivelasso(trial):
    params = {'alpha': trial.suggest_float('alpha', 0.0, 1.0)}
    model = Lasso(**params)
    scores =  cross_val_score(model, pretrain_X, pretrain_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)

def objectiveRidge(trial):
    params = {'alpha': trial.suggest_float('alpha', 0.0, 1.0)}
    model = Ridge(**params)
    scores =  cross_val_score(model, pretrain_X, pretrain_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)

def ojectiveRandomForest(trial):
    params = {'n_estimators': trial.suggest_int('n_estimators', 3, 10),
              'max_depth': trial.suggest_int('max_depth', 3, 8),
              'min_samples_split': trial.suggest_int('min_samples_split', 2, 7)}
    model = RandomForestRegressor(**params)
    scores =  cross_val_score(model, pretrain_X, pretrain_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)

# studyknn = optuna.create_study(direction='minimize')
# studylasso = optuna.create_study(direction='minimize')
# studyRidge = optuna.create_study(direction='minimize')
# studyRandomForest = optuna.create_study(direction='minimize')
# studylasso.optimize(objectivelasso, n_trials=10)
# studyknn.optimize(objectiveknn, n_trials=10)
# studyRidge.optimize(objectiveRidge, n_trials=10)
# studyRandomForest.optimize(ojectiveRandomForest, n_trials=1)

preprocessor = Pipeline(
    [
        ('firimputer', firimputer),
        ('imputer', my_imputer),
        ('scaler_and_encoder', scaler_and_encoder),
    ]
)
def gettt(model):
    return TransformedTargetRegressor(regressor=model, func=np.expm1, inverse_func=np.log1p)
cb =gettt(cat.CatBoostRegressor(random_seed=randomseed,iterations=30))
rf = gettt(RandomForestRegressor(**studyRandomForest.best_params))
las=gettt(Lasso(**studylasso.best_params))
ridge=gettt(Ridge(**studyRidge.best_params))
knn = gettt(KNeighborsRegressor( **studyknn.best_params))

train_X = preprocessor.fit_transform(x_train_X,y_train_ylog)
train_y = y_train_ylog
train_X,test_X,train_y,test_y = train_test_split(train_X, train_y, test_size=0.3,random_state=42)

predcb=cb.fit(train_X,train_y).predict(test_X)
predrf=rf.fit(train_X,train_y).predict(test_X)
predlas=las.fit(train_X,train_y).predict(test_X)
predridge=ridge.fit(train_X,train_y).predict(test_X)
predknn=knn.fit(train_X,train_y).predict(test_X)

def ojectiveweights(trial):
    cbweight = trial.suggest_float('cbweight', 0.0, 10.0)
    rfweight = trial.suggest_float('rfweight', 0.0, 10.0)
    lasweight = trial.suggest_float('lasweight', 0.0, 10.0)
    ridgeweight = trial.suggest_float('ridgeweight', 0.0, 10.0)
    knnweight = trial.suggest_float('knnweight', 0.0, 10.0)
    pred= (cbweight*predcb+rfweight*predrf+lasweight*predlas+ridgeweight*predridge+knnweight*predknn)/(cbweight+rfweight+lasweight+ridgeweight+knnweight)
    return rmse(test_y, pred)

studyweights = optuna.create_study(direction='minimize')
studyweights.optimize(ojectiveweights, n_trials=10)
weights=[studyweights.best_params['cbweight'],studyweights.best_params['rfweight'],studyweights.best_params['lasweight'],studyweights.best_params['ridgeweight'],studyweights.best_params['knnweight']]
v = VotingRegressor(estimators=[('cb', cb), ('rf', rf), ('las', las), ('ridge', ridge), ('knn', knn)], weights=weights)
# Создание и обучение модели с логарифмическим преобразованием целевой переменной
ml_pipeline_log = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model_log', v)
    ]
)

ml_pipeline_log.fit(train_X,train_y)

pred = ml_pipeline_log.predict(test_X) 
-rmse(test_y, pred)




Learning rate set to 0.115677
0:	learn: 73223.8093097	total: 4.3ms	remaining: 1.29s
1:	learn: 67614.9809608	total: 10ms	remaining: 1.49s
2:	learn: 62926.4171711	total: 16.5ms	remaining: 1.64s
3:	learn: 58991.5537579	total: 24ms	remaining: 1.77s
4:	learn: 55054.2695503	total: 32.2ms	remaining: 1.9s
5:	learn: 51849.9339992	total: 38.9ms	remaining: 1.91s
6:	learn: 48971.5061083	total: 45.4ms	remaining: 1.9s
7:	learn: 46194.4401313	total: 50.9ms	remaining: 1.86s
8:	learn: 44031.7946279	total: 57.6ms	remaining: 1.86s
9:	learn: 41901.2586725	total: 62.3ms	remaining: 1.8s
10:	learn: 40180.0679383	total: 66.6ms	remaining: 1.75s
11:	learn: 38672.8799523	total: 76ms	remaining: 1.82s
12:	learn: 36867.6167753	total: 80.1ms	remaining: 1.77s
13:	learn: 35419.5207987	total: 86ms	remaining: 1.76s
14:	learn: 34063.9452960	total: 91.4ms	remaining: 1.74s
15:	learn: 32768.5408398	total: 95.6ms	remaining: 1.7s
16:	learn: 31577.8939693	total: 104ms	remaining: 1.73s
17:	learn: 30638.6506914	total: 111ms	rema

In [29]:
y_valid

181     200100
1184    186700
1059    220000
350     318061
911     143500
         ...  
236     185500
115     176000
87      164500
323     126175
709     109900
Name: SalePrice, Length: 438, dtype: int64

In [28]:
y_pred

array([190422.01647212, 192521.04152563, 213182.47312097, 315280.2611552 ,
       150848.60026469, 178587.31116175, 282420.85089104, 153077.19096346,
       149402.6409096 , 123856.60641709, 144303.78534595, 315839.17249978,
       135303.62845019, 107698.39312883, 175444.85134685, 192218.66701229,
       221577.28811889, 307539.49474055, 129257.20958524, 120972.65728537,
       265758.91670666, 125054.93495486, 146365.78086947, 136421.0032453 ,
       211210.9543149 , 132834.68721107, 147251.69431061,  84148.6948791 ,
       165520.19241961, 145671.04414003, 126739.91686454, 157527.12433083,
       121305.0336352 , 243138.59573055, 132947.21981532, 175718.95786953,
       194011.03361803, 164211.52596297,  86349.52539406, 574802.08933995,
       138923.7112835 , 219407.43596529, 136837.71496232, 108530.79929219,
       244660.8314915 , 257135.02556715, 191740.93842574, 436528.53077526,
       261963.14693246, 185996.00521973, 158749.0355518 , 183287.68432418,
       138498.46568712,  

In [18]:
submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
print(submission.head())
submission.to_csv('submission9.csv', index=False)

     Id      SalePrice
0  1461  125901.076242
1  1462  166688.911191
2  1463  187303.574282
3  1464  196341.358726
4  1465  187206.275733


In [19]:
print(f"Mean squared log error: {err}")

Mean squared log error: 0.04456827227348986


In [998]:
# ml_pipeline = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model', CatBoostRegressor())
#     ]
# )

In [999]:
# ml_pipeline = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model', XGBRegressor())
#     ]
# )

In [1000]:
# model = ml_pipeline.fit(X_train, y_train)

In [1001]:
# from math import sqrt
# y_pred = ml_pipeline.predict(X_valid)

# # Вычисление средней квадратичной ошибки
# mse = mean_squared_error(y_valid, y_pred)
# print('Mean Squared Error:', mse)

# # Вычисление коэффициента детерминации (R^2)
# r2 = r2_score(y_valid, y_pred)
# print('R^2 Score:', r2)

# rmse = sqrt(mse)
# print('Root Mean Squared Error:', rmse)



In [1002]:
# from sklearn.metrics import mean_squared_log_error
# err = np.sqrt(mean_squared_log_error(y_valid, y_pred))
# err

In [1003]:
# model = ml_pipeline.fit(X, y)
# predictions = model.predict(data_test)

In [1004]:
# submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
# print(submission.head())
# submission.to_csv('submission8.csv', index=False)

In [None]:
from cgi import test
from math import exp
import numpy as np
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from altair import Column
from category_encoders import OneHotEncoder, OrdinalEncoder,TargetEncoder
from scipy.stats import ttest_ind
import shap
import catboost as cat

from sklearn import pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor, make_column_selector
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.feature_selection import f_regression, chi2
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neighbors import KNeighborsRegressor
# init js
shap.initjs()
from sklearn.metrics import make_scorer

# Устанавливаем вывод pandas в удобочитаемый формат
pd.set_option('display.max_columns', None)

# Настройка вывода pandas через sklearn
import sklearn
sklearn.set_config(transform_output="pandas")

# Игнорируем предупреждения
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
val = pd.read_csv('test.csv')

x_train_X=train.drop(['SalePrice'],axis=1)
y_train_y=np.log1p (train['SalePrice'])



class mypreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.nanistype=[
            'Alley',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'MasVnrType'
        ]
        self.rendict={
            'name':'MSSubClass',
            20: '1-STORY 1946 & NEWER ALL STYLES',
            30: '1-STORY 1945 & OLDER',
            40: '1-STORY W/FINISHED ATTIC ALL AGES',
            45: '1-1/2 STORY - UNFINISHED ALL AGES',
            50: '1-1/2 STORY FINISHED ALL AGES',
            60: '2-STORY 1946 & NEWER',
            70: '2-STORY 1945 & OLDER',
            75: '2-1/2 STORY ALL AGES',
            80: 'SPLIT OR MULTI-LEVEL',
            85: 'SPLIT FOYER',
            90: 'DUPLEX - ALL STYLES AND AGES',
            120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
            150: '1-1/2 STORY PUD - ALL AGES',
            160: '2-STORY PUD - 1946 & NEWER',
            180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
            190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'
        }

    def preprocess( self, df):
       
        df[self.rendict['name']]=df[self.rendict['name']].map( self.rendict)    
        df[ self.nanistype]=df[self.nanistype].fillna('Empty')
        return df
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.preprocess(X)

def show_nan_count(train):
    d=pd.DataFrame(data={'NaN_count': train.isna().sum(), 'data_type': train.dtypes}).T
    filtered_columns = d.columns[d.loc['NaN_count'] > 0]
    filtered_df = d[filtered_columns]
    return filtered_df

#drop_features = ['Id','Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature','LandSlope','GarageQual','GarageCond','MiscVal','Utilities','YrSold', 'MSSubClass','OverallCond', 'LowQualFinSF', 'MiscVal', 'BsmtHalfBath', '3SsnPorch', 'MoSold', 'PoolArea']
drop_features=['Id']
firimputer= ColumnTransformer(
    transformers=[
        ('mypreprocess', mypreprocess(), x_train_X.columns)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)
my_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features),
        ('num_imputer', KNNImputer(n_neighbors=5), make_column_selector(dtype_include=['float64', 'int64'])),
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), make_column_selector(dtype_include='object'))
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
) 
scaler_and_encoder = ColumnTransformer(
    [
        ('target_encoding', TargetEncoder(), make_column_selector(dtype_include='object')),
        ('scaling_num_columns', StandardScaler(), make_column_selector(dtype_include=['float64', 'int64']))
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

preprocessor = Pipeline(
    [
        ('firimputer', firimputer),
        ('imputer', my_imputer),
        ('scaler_and_encoder', scaler_and_encoder),
    ]
)

train_X = preprocessor.fit_transform(x_train_X,y_train_y)
train_y = y_train_y


randomseed = 42
def rmse(y_true, y_pred):
    return -np.sqrt(np.mean((y_true - y_pred) ** 2))

# Создание кастомной метрики для использования в cross_val_score
rmse_scorer = make_scorer(rmse, greater_is_better=False)



def objectiveknn(trial):
    params = {'n_neighbors': trial.suggest_int('n_neighbors', 1, 10)}
    model = KNeighborsRegressor(**params)
    scores =  cross_val_score(model, train_X, train_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)
studyknn = optuna.create_study(direction='minimize')
def objectivelasso(trial):
    params = {'alpha': trial.suggest_float('alpha', 0.0, 1.0)}
    model = Lasso(**params)
    scores =  cross_val_score(model, train_X, train_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)
studylasso = optuna.create_study(direction='minimize')
def objectiveRidge(trial):
    params = {'alpha': trial.suggest_float('alpha', 0.0, 1.0)}
    model = Ridge(**params)
    scores =  cross_val_score(model, train_X, train_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)
studyRidge = optuna.create_study(direction='minimize')
def ojectiveRandomForest(trial):
    params = {'n_estimators': trial.suggest_int('n_estimators', 3, 10),
              'max_depth': trial.suggest_int('max_depth', 3, 8),
              'min_samples_split': trial.suggest_int('min_samples_split', 2, 7)}
    model = RandomForestRegressor(**params)
    scores =  cross_val_score(model, train_X, train_y, cv=5, scoring=rmse_scorer)
    return np.mean(scores)
studyRandomForest = optuna.create_study(direction='minimize')


studylasso.optimize(objectivelasso, n_trials=100)
studyknn.optimize(objectiveknn, n_trials=100)
studyRidge.optimize(objectiveRidge, n_trials=100)
studyRandomForest.optimize(ojectiveRandomForest, n_trials=10)

def gettt(model):
    return TransformedTargetRegressor(regressor=model, func=np.expm1, inverse_func=np.log1p)
cb =gettt(cat.CatBoostRegressor(random_seed=randomseed,iterations=300))
rf = gettt(RandomForestRegressor(n_estimators=112, max_depth=12, min_samples_split=6))
las=gettt(Lasso(**studylasso.best_params))
ridge=gettt(Ridge())
knn = gettt(KNeighborsRegressor( **studyknn.best_params))

train_X,test_X,train_y,test_y = train_test_split(train_X, train_y, test_size=0.3,random_state=42)

predcb=cb.fit(train_X,train_y).predict(test_X)
predrf=rf.fit(train_X,train_y).predict(test_X)
predlas=las.fit(train_X,train_y).predict(test_X)
predridge=ridge.fit(train_X,train_y).predict(test_X)
predknn=knn.fit(train_X,train_y).predict(test_X)

def ojectiveweights(trial):
    cbweight = trial.suggest_float('cbweight', 0.0, 1.0)
    rfweight = trial.suggest_float('rfweight', 0.0, 1.0)
    lasweight = trial.suggest_float('lasweight', 0.0, 1.0)
    ridgeweight = trial.suggest_float('ridgeweight', 0.0, 1.0)
    knnweight = trial.suggest_float('knnweight', 0.0, 1.0)
    pred= (cbweight*predcb+rfweight*predrf+lasweight*predlas+ridgeweight*predridge+knnweight*predknn)/(cbweight+rfweight+lasweight+ridgeweight+knnweight)
    return rmse(test_y, pred)

studyweights = optuna.create_study(direction='minimize')
studyweights.optimize(ojectiveweights, n_trials=100)
weights=[studyweights.best_params['cbweight'],studyweights.best_params['rfweight'],studyweights.best_params['lasweight'],studyweights.best_params['ridgeweight'],studyweights.best_params['knnweight']]
v = VotingRegressor(estimators=[('cb', cb), ('rf', rf), ('las', las), ('ridge', ridge), ('knn', knn)], weights=weights)
# Создание и обучение модели с логарифмическим преобразованием целевой переменной
ml_pipeline_log = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model_log', v)
    ]
)

ml_pipeline_log.fit(train_X,train_y)

pred = ml_pipeline_log.predict(test_X) 
RMSLE(test_y, pred)




In [1005]:
#Пробуем модель randomforest результат 0.14

In [1006]:
# ml_pipeline = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model', RandomForestRegressor(n_estimators=112, max_depth=12, min_samples_split=6))
#     ]
# )

In [1007]:
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20)
#     }

#     model = RandomForestRegressor(**params)

#     # Создаем Pipeline с предобработкой и моделью
#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('model', model)
#     ])

#     # Оцениваем модель с помощью кросс-валидации
#     scores = -cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    
#     return scores.mean()

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# print('Лучшие гиперпараметры:', study.best_params)
# print('Минимальное значение MSE:', study.best_value)

In [1008]:
#Пробуем модель xgboost 0.15764922452888183
#n_estimators=129, max_depth=6, learning_rate=0.04291606041049112, subsample=0.6739178958863075, colsample_bytree=0.7508549367177377, gamma=0.5050428979601813

In [1009]:
#Пробуем модель CatBoostRegressor 0.10821851593205858
#пока что лучшие показатели, попробуем прогнать через оптюну
#Лучшие гиперпараметры: {'n_estimators': 179, 'max_depth': 7, 'learning_rate': 0.06210223900648785, 'subsample': 0.7224302068857389, 'colsample_bylevel': 0.8319331845603969, 'reg_lambda': 0.9923582305360312}
#Минимальное значение MSE: 5653456242.055735

In [1010]:
# ml_pipeline = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model', CatBoostRegressor())
#     ]
# )

In [1011]:
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
#     }

#     model = CatBoostRegressor(**params, verbose=0)

#     # Создаем Pipeline с предобработкой и моделью
#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('model', model)
#     ])

#     # Оцениваем модель с помощью кросс-валидации
#     scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    
#     return scores.mean()

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# print('Лучшие гиперпараметры:', study.best_params)
# print('Минимальное значение MSE:', study.best_value)

In [1012]:
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'gamma': trial.suggest_float('gamma', 0.0, 1.0)
#     }

#     model = XGBRegressor(**params)

#     # Создаем Pipeline с предобработкой и моделью
#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('model', model)
#     ])

#     # Оцениваем модель с помощью кросс-валидации
#     scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    
#     return scores.mean()

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# print('Лучшие гиперпараметры:', study.best_params)
# print('Минимальное значение MSE:', study.best_value)

In [1013]:
# ml_pipeline.fit(X_train, y_train)

In [1014]:
# from sklearn.metrics import mean_squared_log_error
# y_train_log = np.log1p(y_train)
# y_valid_log = np.log1p(y_valid)

# # Создание модели с логарифмическим преобразованием целевой переменной
# ml_pipeline_log = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model_log', TransformedTargetRegressor(regressor=CatBoostRegressor(), func=np.expm1, inverse_func=np.log1p))
#     ]
# )

# # Обучение модели
# ml_pipeline_log.fit(X_train, y_train_log)

# # Предсказания на валидационном наборе данных
# y_pred_log = ml_pipeline_log.predict(X_valid)

# # Оценка модели по mean_squared_log_error
# err = np.sqrt(mean_squared_log_error(y_valid_log, y_pred_log))
# print("RMSE на логарифмированной целевой переменной: ", err)


# # y_log = np.log1p(y)

# # # Создание и обучение модели с логарифмическим преобразованием целевой переменной
# # ml_pipeline_log = Pipeline(
# #     [
# #         ('preprocessor', preprocessor),
# #         ('model_log', TransformedTargetRegressor(regressor=CatBoostRegressor(), func=np.expm1, inverse_func=np.log1p))
# #     ]
# # )

# model = ml_pipeline_log.fit(X, y)

# # Предсказания на тестовом наборе данных
# predictions = model.predict(data_test)

In [1015]:
# from sklearn.metrics import mean_squared_log_error
# y_train_log = np.log1p(y_train)
# y_valid_log = np.log1p(y_valid)

# ml_pipeline_log = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model_log', TransformedTargetRegressor(regressor=CatBoostRegressor(), func=np.expm1, inverse_func=np.log1p))
#     ]
# )

# ml_pipeline_log.fit(X_train, y_train_log)

# # Скопировать обработку данных из ml_pipeline_log
# preprocessor = ml_pipeline_log.named_steps['preprocessor']

# # Создание и обучение модели без логарифмического преобразования целевой переменной
# ml_pipeline = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model', CatBoostRegressor())
#     ]
# )

# # Обучение модели без логарифмического преобразования целевой переменной
# model = ml_pipeline.fit(X, y)

# # Предсказания на тестовом наборе данных
# predictions = model.predict(data_test)

# # Оценка модели по mean_squared_log_error
# y_pred_log = ml_pipeline_log.predict(X_valid)
# err = np.sqrt(mean_squared_log_error(np.expm1(y_valid_log), np.expm1(y_pred_log)))
# print(f"Mean squared log error: {err}")

In [1016]:
# submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
# print(submission.head())
# submission.to_csv('submission3.csv', index=False)

In [1017]:
# from sklearn.metrics import mean_squared_log_error

# y_log = np.log1p(y)

# # Создание и обучение модели с логарифмическим преобразованием целевой переменной
# ml_pipeline_log = Pipeline(
#     [
#         ('preprocessor', preprocessor),
#         ('model_log', TransformedTargetRegressor(regressor=CatBoostRegressor(), func=np.expm1, inverse_func=np.log1p))
#     ]
# )

# model = ml_pipeline_log.fit(X, y_log)
# y_pred_l = ml_pipeline_log.predict(X_valid)
# y_pred = np.expm1(y_pred_l)

# err = np.sqrt(mean_squared_log_error(y_valid, y_pred))
# print(f"Mean squared log error: {err}")

# model = ml_pipeline_log.fit(X, y_log)
# predictions_log = model.predict(data_test)
# predictions = np.expm1(predictions_log)

In [1018]:
# submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
# print(submission.head())
# submission.to_csv('submission9.csv', index=False)

In [1019]:
# submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
# print(submission.head())
# submission.to_csv('submission4.csv', index=False)

In [1020]:
# from sklearn.metrics import mean_squared_log_error
# msle = mean_squared_log_error(np.expm1(y_log), np.expm1(predictions))
# print("Mean Squared Log Error:", msle)

In [1021]:
# from math import sqrt
# y_pred = ml_pipeline.predict(X_valid)

# # Вычисление средней квадратичной ошибки
# mse = mean_squared_error(y_valid, y_pred)
# print('Mean Squared Error:', mse)

# # Вычисление коэффициента детерминации (R^2)
# r2 = r2_score(y_valid, y_pred)
# print('R^2 Score:', r2)

# rmse = sqrt(mse)
# print('Root Mean Squared Error:', rmse)

#Mean Squared Error: 1505777279.764108
# R^2 Score: 0.7450421931135391
# Root Mean Squared Error: 38804.346145298055

In [1022]:
# from sklearn.metrics import mean_squared_log_error
# err = np.sqrt(mean_squared_log_error(y_valid, y_pred))
# err

In [1023]:
# model = ml_pipeline.fit(X, y)
# predictions = model.predict(data_test)

In [1024]:
# submission = pd.DataFrame({'Id': data_test.Id, 'SalePrice': predictions})
# print(submission.head())
# submission.to_csv('submission2.csv', index=False)

In [1025]:
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#     }

#     rf = RandomForestRegressor(**params)
#     scores = -1 * cross_val_score(rf, X, y, cv=KFold(n_splits=5, shuffle=True), scoring='neg_mean_squared_error')

#     return np.mean(scores)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# # Получаем лучшие параметры модели
# best_params = study.best_params
# best_score = study.best_value

# print("Лучшие параметры модели, полученные с кросс-валидацией:")
# print(best_params)
# print("Средний score с кросс-валидацией:")
# print(best_score)

In [1026]:
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20)
#     }

#     model = RandomForestRegressor(**params)
#     ml_pipeline = Pipeline(
#     [
#             ('preprocessor', preprocessor),
#             ('model', model)
#     ]
#     )

#     # Оцениваем модель с помощью кросс-валидации
#     scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    
#     return scores.mean()

# # Создаем и запускаем Optuna Study
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)

# # Получаем лучшие гиперпараметры
# best_params = study.best_params
# print("Лучшие гиперпараметры:", best_params)