In [64]:
import os

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from autofeat import AutoFeatClassifier

In [69]:
# Location of the realty workbook. Set the REALTY_DATA_PATH environment
# variable to point at a local copy; the original author's path remains the
# fallback so existing setups keep working.
data_path = os.environ.get(
    "REALTY_DATA_PATH",
    "/Users/nikolaistepanov/YandexPracticum/data/realty.xlsx",
)
df = pd.read_excel(data_path)

In [70]:
# Preview the first two rows of the freshly loaded sheet.
df.head(2)

Unnamed: 0,CityHash,area,balcony,build_year,building_series_id,building_type_int,ceiling_height,first_day_exposition,first_month_exposition,flats_count,...,longitude,offer_id_hash,parking,price_sqm,renovation,rooms,rooms_offered,studio,total_area,unified_address
0,uint64,double,int32,int32,int64,int32,double,uint64,string,int32,...,double,uint64,int32,double,int32,int32,int32,boolean,double,string
1,9846558176160253361,35.099998474121094,1,1965,663294,6,2.640000104904175,1647032400000000,2022-03,84,...,37.78112030029297,4710561669182977508,8,270655.2824212874,10,1,1,False,35.099998474121094,"Россия, Москва, 1-я Новокузьминская улица, 6"


In [71]:
# Row 0 of the sheet holds source dtype names ("uint64", "double", ...) rather
# than an observation, so it is removed. Dropping by index label (not by
# position) keeps the cell idempotent: re-running it is a no-op instead of
# silently discarding one more real row each time.
df = df.drop(index=0, errors="ignore")

In [72]:
# List the column labels available for feature engineering.
df.columns

Index(['CityHash', 'area', 'balcony', 'build_year', 'building_series_id',
       'building_type_int', 'ceiling_height', 'first_day_exposition',
       'first_month_exposition', 'flats_count', 'floor', 'floors_total',
       'has_elevator', 'is_apartment', 'kitchen_area', 'last_day_exposition',
       'last_price', 'latitude', 'living_area', 'locality_name', 'longitude',
       'offer_id_hash', 'parking', 'price_sqm', 'renovation', 'rooms',
       'rooms_offered', 'studio', 'total_area', 'unified_address'],
      dtype='object')

In [73]:
# Re-check the first two rows after the descriptor row has been dropped.
df.head(2)

Unnamed: 0,CityHash,area,balcony,build_year,building_series_id,building_type_int,ceiling_height,first_day_exposition,first_month_exposition,flats_count,...,longitude,offer_id_hash,parking,price_sqm,renovation,rooms,rooms_offered,studio,total_area,unified_address
1,9846558176160253361,35.099998474121094,1,1965,663294,6,2.640000104904175,1647032400000000,2022-03,84,...,37.78112030029297,4710561669182977508,8,270655.2824212874,10,1,1,False,35.099998474121094,"Россия, Москва, 1-я Новокузьминская улица, 6"
2,7756789824974483327,31.5,0,1964,663294,6,2.640000104904175,1606165200000000,2020-11,0,...,37.59383010864258,11210712509691726138,8,276190.4761904762,1,1,1,False,31.5,"Россия, Москва, Нагорный бульвар, 5к1"


In [74]:
# Count how many columns carry each dtype.
df.dtypes.value_counts()

object    30
Name: count, dtype: int64

In [76]:
cat_columns = ["building_type_int"]  # more categorical columns may be added here

# One-hot encode the categorical columns: at most 10 categories per column,
# the first category dropped, unseen categories ignored at transform time.
encoder_oh = OneHotEncoder(
    categories="auto",
    handle_unknown='ignore',
    max_categories=10,
    sparse_output=False,
    drop="first"
)
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# The encoder returns a bare array, so the frame must be built on df's own
# index. With a default 0-based index, concat(axis=1) would align on labels,
# shift every encoded row against its source row and append NaN-filled rows,
# because df no longer starts at label 0.
encoded_df = pd.DataFrame(
    encoded_features,
    index=df.index,
    columns=encoder_oh.get_feature_names_out(cat_columns)
)
df = pd.concat([df, encoded_df], axis=1)

In [80]:
# Numeric columns fed to the polynomial and binning encoders.
num_columns = ["ceiling_height", "area"]

In [81]:
# `num_columns` comes from the preceding cell; it is not redefined here.

# Replace missing values with the 9999 sentinel and cast to float so the
# numeric encoders receive a purely numeric array.
for column in num_columns:
    df[column] = df[column].fillna(9999).astype(float)

degree = 3

# Expand the numeric columns into all polynomial terms up to `degree`.
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())

# Build the frame on df's index so concat(axis=1) pairs each expanded row with
# its source row instead of aligning against a fresh 0-based index.
encoded_df = pd.DataFrame(
    encoded_features,
    index=df.index,
    columns=encoder_pol.get_feature_names_out(num_columns)
)

# The degree-1 terms are named exactly like the source columns. Appending them
# would leave df with duplicate labels, so only columns not yet present in df
# are added (this also makes a re-run of the cell harmless).
is_new_column = ~encoded_df.columns.isin(df.columns)
df = pd.concat([df, encoded_df.loc[:, is_new_column]], axis=1)

In [53]:
n_bins = 5

# Discretise each numeric column into `n_bins` equal-width bins, encoded as
# ordinal bin indices (one output column per input column).
encoder_kbd = KBinsDiscretizer(
    n_bins=n_bins,
    encode='ordinal',
    strategy='uniform',
    subsample=None
)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())

# Name the outputs after their source columns (suffix "_bin") rather than the
# anonymous x0/x1 the encoder generates for a bare array, and reuse df's index
# so concat(axis=1) keeps every bin on the row it was computed from.
encoded_df = pd.DataFrame(
    encoded_features,
    index=df.index,
    columns=[
        f"{name}_bin"
        for name in encoder_kbd.get_feature_names_out(num_columns)
    ]
)
df = pd.concat([df, encoded_df], axis=1)

In [61]:
# Numeric branch: polynomial expansion and ordinal binning, both applied to
# the same numeric columns.
numeric_steps = [
    ("pol", encoder_pol, num_columns),
    ("kbd", encoder_kbd, num_columns),
]
numeric_transformer = ColumnTransformer(transformers=numeric_steps)

# Categorical branch: a one-step pipeline around the one-hot encoder.
categorical_transformer = Pipeline(steps=[("encoder", encoder_oh)])

# Full preprocessor: routes numeric and categorical columns to their branch
# and runs the branches on all available cores.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_columns),
        ("cat", categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)

In [None]:
# Reload the raw sheet so the preprocessor is fitted on untouched columns.
# REALTY_DATA_PATH overrides the original author's machine-specific default.
data_path = os.environ.get(
    "REALTY_DATA_PATH",
    "/Users/nikolaistepanov/YandexPracticum/data/realty.xlsx",
)

# Row 0 of the sheet holds dtype names ("double", "int32", ...) rather than
# data; it must go before the float cast below, which would otherwise fail on
# those strings. The index is reset so rows are labelled 0..n-1 again.
df = pd.read_excel(data_path).iloc[1:].reset_index(drop=True)

# Replace missing values with the 9999 sentinel and cast to float.
for column in num_columns:
    df[column] = df[column].fillna(9999).astype(float)

In [82]:
# Fit the composed preprocessor on the frame and wrap the resulting matrix in
# a DataFrame labelled with the generated feature names.
encoded_features = preprocessor.fit_transform(df)
feature_names = preprocessor.get_feature_names_out()
transformed_df = pd.DataFrame(encoded_features, columns=feature_names)

In [83]:
# Display the composed preprocessor.
preprocessor

### AutoFeat

In [None]:
# Columns of `transformed_df` that the student wants to transform with autofeat.
features = []

# Categorical subset of `features`. The original cell referenced an undefined
# `cat_features`; it is derived here from the "cat" branch of the preprocessor.
# NOTE(review): assumes the preprocessor emits names prefixed with
# "<transformer>__" (scikit-learn's default verbose naming) - confirm.
cat_features = [name for name in features if name.startswith("cat__")]

# Transformations autofeat may apply; the list can be changed.
transformations = (
    "1/", "1+", "1-",
    'exp', 'log', 'abs', 'sqrt',
    "2^"
)

afc = AutoFeatClassifier(
    categorical_cols=cat_features,
    transformations=transformations,
    feateng_steps=1,
    n_jobs=-1
)
# NOTE(review): autofeat documents fit_transform as taking a target as well
# (fit_transform(X, y)); no target is defined in the visible cells - confirm
# and pass it here once it exists.
transformed_df_auto = afc.fit_transform(transformed_df[features])

In [None]:
# NOTE(review): mlflow, EXPERIMENT_NAME, RUN_NAME, REGISTRY_MODEL_NAME,
# metrics and model are not defined in the visible cells - presumably they
# come from cells or imports not shown here; confirm before running.
pip_requirements = "../../requirements.txt"

# get_experiment_by_name returns None for an unknown experiment, which would
# make the `.experiment_id` access fail; create the experiment on first use.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Log the metrics, both fitted feature transformers and the CatBoost
    # model within the same run.
    mlflow.log_metrics(metrics)
    preprocessor_info = mlflow.sklearn.log_model(preprocessor, artifact_path="preprocessor")
    afc_info = mlflow.sklearn.log_model(afc, artifact_path="afc")
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60, # optional
        pip_requirements=pip_requirements,
        # signature=signature,
        # input_example=input_example,
        # metadata=metadata,
    )