In [1]:
# импортируем необходимые библиотеки, функции и классы
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the semicolon-separated, cp1251-encoded CSV file into a DataFrame
# and preview its first five rows.
data = pd.read_csv('Data/Response.csv', sep=';', encoding='cp1251')
data.head()

Unnamed: 0,mortgage,life_ins,cre_card,deb_card,mob_bank,curr_acc,internet,perloan,savings,atm_user,markpl,age,cus_leng,response
0,No,No,No,No,No,No,No,No,No,No,No,18.0,less than 3 years,No
1,Yes,Yes,,,Yes,No,,,,Yes,No,18.0,,Yes
2,Yes,Yes,,Yes,No,No,No,No,No,No,Yes,,from 3 to 7 years,Yes
3,Yes,Yes,Yes,Yes,,Yes,No,No,No,,Yes,18.0,from 3 to 7 years,Yes
4,Yes,Yes,No,Yes,No,No,No,Yes,No,Yes,No,,,No


In [3]:
# Hold out 30% of the rows as a test set. The split is stratified on the
# 'response' column and seeded, so it is reproducible.
# Unpacked in order: train features, test features, train labels, test labels.
train, test, y_train, y_test = train_test_split(
    data.drop(columns='response'),
    data['response'],
    test_size=0.3,
    stratify=data['response'],
    random_state=100,
)

In [4]:
# Partition the training columns by dtype: object-typed columns are treated
# as categorical features, every other column as a numeric feature.
column_dtypes = train.dtypes
is_object = column_dtypes == 'object'
categorical_features = column_dtypes[is_object].index
numeric_features = column_dtypes[~is_object].index

# Numeric branch: impute missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing'
# category, then one-hot encode into a dense array; categories unseen during
# fitting are ignored at transform time.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` -- confirm against the installed version before upgrading.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Route each group of columns to its own preprocessing branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a classifier. The classifier set
# here is the default one for the 'classifier' step.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs', max_iter=400)),
])

In [5]:
# Candidate configurations for the grid search. Each dict is an independent
# grid; keys address pipeline steps ('classifier', 'preprocessor') and their
# nested parameters via the double-underscore syntax.
param_grid = [
    # Logistic regression with C=0.01 and the numeric scaler step disabled
    # (the 'scaler' step of the numeric branch is set to None).
    {'classifier': [LogisticRegression(solver='lbfgs', max_iter=400)],
     'classifier__C': [.01],
     'preprocessor__num__scaler': [None]},
    # Logistic regression with C=0.05 and the pipeline's own preprocessor.
    {'classifier': [LogisticRegression(solver='lbfgs', max_iter=400)],
     'classifier__C': [.05]},
    # Same model, with the preprocessor step supplied explicitly through the
    # grid; this passes the very same preprocessor object the pipeline
    # already holds.
    {'classifier': [LogisticRegression(solver='lbfgs', max_iter=400)],
     'classifier__C': [.05], 'preprocessor': [preprocessor]},
    # Gradient boosting with 50 trees and the numeric scaler disabled.
    # random_state is fixed so the cross-validation score of this candidate
    # is reproducible across runs, like the seeded split and KFold.
    {'classifier': [GradientBoostingClassifier(n_estimators=50,
                                               random_state=123)],
     'classifier__learning_rate': [.025],
     'preprocessor__num__scaler': [None]}]

In [6]:
# 5-fold cross-validation splitter with shuffling and a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Grid search over the pipeline, scored by ROC AUC on the folds above.
# Scores on the training folds are not recorded in cv_results_.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=kf,
    return_train_score=False,
)
gs.fit(train, y_train)

# Report the winning hyperparameters and their mean cross-validated AUC.
print('Наилучшие значения гиперпараметров: {}'.format(gs.best_params_))
print('Наилучшее значение AUC: {:.3f}'.format(gs.best_score_))

# AUC on the held-out test set, using the second column of predict_proba
# as the score.
test_scores = gs.predict_proba(test)[:, 1]
print("AUC на тестовом наборе: {:.3f}".format(
    roc_auc_score(y_test, test_scores)))

Наилучшие значения гиперпараметров: {'classifier': LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=400, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), 'classifier__C': 0.05}
Наилучшее значение AUC: 0.905
AUC на тестовом наборе: 0.904


In [7]:
# Tabulate the grid-search results, best mean test score first.
results = (
    pd.DataFrame(gs.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_preprocessor__num__scaler,param_preprocessor,param_classifier__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.129748,0.006157,0.017563,0.00062,"LogisticRegression(C=0.05, class_weight=None, ...",0.05,,,,"{'classifier': LogisticRegression(C=0.05, clas...",0.902135,0.907747,0.903857,0.910517,0.903164,0.905484,0.003153,1
2,0.125108,0.004985,0.017053,0.000151,"LogisticRegression(C=0.05, class_weight=None, ...",0.05,,"ColumnTransformer(n_jobs=None, remainder='drop...",,"{'classifier': LogisticRegression(C=0.05, clas...",0.902135,0.907747,0.903857,0.910517,0.903164,0.905484,0.003153,1
0,0.285697,0.094654,0.017278,0.000483,"LogisticRegression(C=0.01, class_weight=None, ...",0.01,,,,"{'classifier': LogisticRegression(C=0.01, clas...",0.90121,0.906877,0.903497,0.90897,0.901728,0.904456,0.003004,3
3,0.862429,0.013701,0.020729,0.001024,([DecisionTreeRegressor(criterion='friedman_ms...,,,,0.025,{'classifier': ([DecisionTreeRegressor(criteri...,0.896991,0.904202,0.898026,0.909195,0.899996,0.901682,0.004494,4
