# Импорт библиотек

In [2]:
!pip install catboost -q

In [26]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import mlflow
from mlflow.tracking import MlflowClient

from sklearn.pipeline import Pipeline

from catboost import CatBoostClassifier

# Импорт данных

### Для бэйслайна возмем основной датасет

In [6]:
df = pd.read_csv('../data/application_train.csv')
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df.shape

(307511, 122)

# Разведочный анализ

In [8]:
df.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


### Уникальные значения и константные значения

In [10]:
(df.nunique() == 1).sum()

0

### Пропуски

In [11]:
df.isna().sum()

SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 122, dtype: int64

Пропусков не сильно много, при предобработке заменим их на медиану

# Предобработка данных

Строим пайплайн предобработки с помощью ColumnTransformer

In [12]:
X = df.drop(['TARGET'], axis=1)
y = df['TARGET']

cat_features = X.describe(include='object').columns
num_features = X.describe().columns

In [13]:
cat_tranformer = Pipeline([
    ('inputer', SimpleImputer(strategy='most_frequent')),
])

num_transformer = Pipeline([
    ('inputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False))
])


transformer = ColumnTransformer([
    ('cat', cat_tranformer, cat_features),
    ('num', num_transformer, num_features)
])


transformer.fit(X)

X = pd.DataFrame(transformer.transform(X), columns=transformer.get_feature_names_out())
# X.head()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
cat_features = [f'cat__{col}' for col in cat_features]
model = CatBoostClassifier(verbose=1000, cat_features=cat_features,task_type="GPU")

Learning rate set to 0.0246
0:	learn: 0.6658910	total: 60.6ms	remaining: 1m
999:	learn: 0.2401249	total: 1m 9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a099e08390>

# MLFLOW

In [15]:
%%bash
export MLFLOW_REGISTRY_URI=../mlflow/

In [24]:
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment('Credit')
with mlflow.start_run():
    model.fit(X_train, y_train)
    print(model.predict_proba(X_test))

    # Логирование модели и параметров
    mlflow.log_param(
        'f1', f1_score(y_test, model.predict(X_test)))

    mlflow.sklearn.log_model(
    model,
    artifact_path="catbost_credit",
    registered_model_name=f"catbost_credit")

Learning rate set to 0.024904
0:	learn: 0.6656862	total: 45.4ms	remaining: 45.4s
999:	learn: 0.2392829	total: 45.2s	remaining: 0us
[[0.9641758  0.0358242 ]
 [0.94347872 0.05652128]
 [0.96229508 0.03770492]
 ...
 [0.92326359 0.07673641]
 [0.92832033 0.07167967]
 [0.81421547 0.18578453]]


Successfully registered model 'Catbost Credit'.
2025/02/22 19:40:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Catbost Credit, version 1
Created version '1' of model 'Catbost Credit'.


🏃 View run ambitious-cod-537 at: http://localhost:5001/#/experiments/1/runs/37459f7441a146a5819d4c9377e8f1e5
🧪 View experiment at: http://localhost:5001/#/experiments/1


In [25]:
mlflow.get_artifact_uri()

'file:///C:/Users/Руслан/Desktop/jupyter/auto-pipeline-airflow-mlflow-master/mlflow/1/887dfd0dea474d96b85bb67be60fe856/artifacts'

In [30]:
def get_version_model(config_name, client):
    """
    Получение последней версии модели из MLFlow
    """
    dict_push = {}
    for count, value in enumerate(
        client.search_model_versions(f"name='{config_name}'")):
        # client.list_registered_models()):
        # Все версии модели
        dict_push[count] = value
    return dict_push

In [32]:
client = MlflowClient()
last_version = get_version_model('Catbost Credit', client)
last_version

{0: <ModelVersion: aliases=[], creation_timestamp=1740235201142, current_stage='None', description='', last_updated_timestamp=1740235201142, name='Catbost Credit', run_id='37459f7441a146a5819d4c9377e8f1e5', run_link='', source='file:///C:/Users/Руслан/Desktop/jupyter/auto-pipeline-airflow-mlflow-master/mlflow/1/37459f7441a146a5819d4c9377e8f1e5/artifacts/catbost_credit', status='READY', status_message=None, tags={}, user_id='', version='1'>}