In [14]:
import pandas as pd
import geopandas as gpd

data = gpd.read_file('data.geojson', driver='GeoJSON')

y = data["price"]
X = data.drop(columns=["price"])

In [81]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from krigeExtrenstions import ErrorKrigeRegressionAdapter
from sklearn.feature_selection import SelectPercentile
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_features = [
    "floor", "floors_count", "rooms_count", "total_meters", "living_meters", "kitchen_meters"
]
categorical_features = [
    'year_of_construction', 'object_type', 'house_material_type', 'heating_type', 'finish_type'
]

basic_regressor = Pipeline(
    memory="test",
    steps=[
        (
            "preprocessor", 
            ColumnTransformer(
                transformers=[
                    (
                        "nums", 
                        Pipeline(steps=[
                            ("imputer", SimpleImputer(strategy="median", add_indicator=True)), 
                            ("scaler", StandardScaler()),
                        ]), 
                        numeric_features
                    ),
                    (
                        "cat_one_hot", 
                        Pipeline(steps=[
                            ("encoder", OneHotEncoder(handle_unknown="ignore")),
                        ]), 
                        categorical_features
                    ),
                    (
                        "cat_mean", 
                        Pipeline(steps=[
                            ("encoder", TargetEncoder(target_type="continuous")),
                        ]), 
                        categorical_features
                    )
                ]
            )
        ),
        ("selector", SelectPercentile(mutual_info_regression, percentile=90)),
        ('reg', RandomForestRegressor(random_state=0))
    ]
)
krigeAdapter = ErrorKrigeRegressionAdapter(basic_regressor)

basic_param_grid = [{
    "basic_regressor__reg__n_estimators": [150, 300, 450],
#     "basic_regressor__selector__percentile": [92, 94, 96, 98, 100],
    'moranThreshold': [0.1, 0.5],
    'krige__n_closest_points': [1, 2, 3, 5],
    'weightsParams': [{'type': 'knn', 'k': 5}, {'type': 'knn', 'k': 4}]
}]

estimator = GridSearchCV(krigeAdapter, basic_param_grid, cv=5, n_jobs=-1)
estimator.fit(X, y)
estimator.best_score_

 0.25332626 0.25332626 0.43771448 0.43771448 0.25332626 0.25332626
 0.40562219 0.40562219 0.25332626 0.25332626        nan        nan
 0.27805219 0.27805219 0.4178725  0.4178725  0.27805219 0.27805219
 0.46013047 0.46013047 0.27805219 0.27805219 0.43015204 0.43015204
 0.27805219 0.27805219        nan        nan 0.2834643  0.2834643
 0.41785561 0.41785561 0.2834643  0.2834643  0.45986066 0.45986066
 0.2834643  0.2834643  0.42941985 0.42941985 0.2834643  0.2834643 ]
 There are 6 disconnected components.


0.4601304708174558

In [82]:
estimator.best_params_

{'basic_regressor__reg__n_estimators': 300,
 'krige__n_closest_points': 3,
 'moranThreshold': 0.1,
 'weightsParams': {'type': 'knn', 'k': 5}}