# Построение baseline моделей

## 1. Подготовка

Импорт необходимых библиотек

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, Imputer, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report, roc_curve

Загрузка данных:

In [22]:
X = pd.read_csv('F:\\Work\\My\\Python\\courses\\courserra\\ml-specialization\\materials\\course6\\churn\\orange_small_churn_data.txt')
y = pd.read_csv('F:\\Work\\My\\Python\\courses\\courserra\\ml-specialization\\materials\\course6\\churn\\orange_small_churn_labels.txt', header=None, names=['target'])
y['target'] = y['target'].apply(lambda x: 1 if x==1 else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=9,
                                                    shuffle=True,
                                                    stratify=y)

X_train = X_train.iloc[:10000, :]
y_train = y_train.iloc[:10000, :].values.ravel()

В выборке всего 32000 параметров, что приводит к значительному времени обучения моделей. Ограничим выборку до 10000. Также временно будем кросс-валидироваться на 5 фолдах, а не на 7. Это снизит время обучения и не слишком скажется на качестве baseline моделей.

Для начала реализуем препроцессинг данных.

Данные содержат много пропусков и выбросов. С выбросами пока ничего делать не будем, а пропуски заполним средними значениями для случая числовых признаков и отдельным признаком для случая категориальных признаков. Кроме того, у некоторых категориальных признаков большое число возможных значений. Ограничим его, взяв токо первые 200 наиболее часто встречаемых значения для каждого категориального признака.

Нам потребуется отдельный класс, который будет делать dummy-encoding категориальных переменных с указанными выше особенностями

In [23]:
class DummyEncoder(BaseEstimator, TransformerMixin):
    '''
    Encodes categorical features as one-hot variables with max_categories restriction
    '''
    def __init__(self, columns=None, max_categories=None):
        self.columns = columns
        self.dummy_columns = None
        self.max_categories = max_categories


    def fit(self, X, y=None, **kwargs):
        self.dummy_columns = None
        return self


    def transform(self, X, y=None, **kwargs):
        if self.max_categories is not None:
            X = X[self.columns] if self.columns is not None else X.copy()
            for col in X.columns:
                top_cats = X[col].value_counts()[:self.max_categories].index.values
                X[col] = X[col].apply(lambda x: x if (x in top_cats or x is None) else 'aggr')

        dummy_df = pd.get_dummies(X, columns=self.columns, sparse=True, dummy_na=True)
        new_cols = dummy_df.columns.values
        if self.dummy_columns is None:
            self.dummy_columns = new_cols
            return dummy_df
        else:
            res_df = pd.DataFrame()
            for col in self.dummy_columns:
                res_df[col] = dummy_df[col] if col in new_cols else np.zeros((len(X),), dtype=int)
        return res_df

Так как будет использоваться кросс-валидация, даже baseline модели нужно тренировать в пайплайне, чтобы информация о holdout-выборках не использовалась при обучении. Пайплайн будет:
- Удалять полностью NaN признаки
- Запонять средними значенийми пропуски числовых переменных
- Масштабировать числовые переменные
- Dummy-кодировать категориальные переменные по описанному выше алгоритму

Реализуем функцию, конструирующую пайплайн:

In [24]:
def get_baseline_pipeline(X, alg):

    # get non-NaN columns
    vc = X.apply(lambda col: len(col.value_counts()))
    vc = vc[vc > 0]
    all_cols = X.columns.values
    num_cols = set(all_cols[:190])
    cat_cols = set(all_cols[190:])
    non_nan_all_cols = set(vc.index.values)
    non_nan_num_cols = sorted(list(non_nan_all_cols.intersection(num_cols)))
    non_nan_cat_cols = sorted(list(non_nan_all_cols.intersection(cat_cols)))


    pipeline = Pipeline(steps=[
            # get rid of fully NaN columns
            ('filter_out_useless_columns', FunctionTransformer(lambda data: data.loc[:, non_nan_all_cols], validate=False)),

            # processing
            ('processing', FeatureUnion([

                # numeric features
                ('numeric', Pipeline(steps=[
                    ('selecting', FunctionTransformer(lambda data: data.loc[:, non_nan_num_cols], validate=False)),
                    ('float_nan_mean', Imputer(strategy='mean')),
                    ('scaling', StandardScaler())
                ])),

                # categorical features
                ('categorical', Pipeline(steps=[
                    ('selecting', FunctionTransformer(lambda data: data.loc[:, non_nan_cat_cols], validate=False)),
                    ('encoding', DummyEncoder(max_categories=200))
                ]))
            ])),

            #model
            ('model', alg)
        ])

    return pipeline

В качестве основной метрики выберем ROC-AUC метрику. В качестве дополнительных: F1, Precision и Recall.

In [25]:
metrics = [ 
    ('roc_auc',   roc_auc_score), 
    ('precision', precision_score), 
    ('recall',    recall_score), 
    ('f1',        f1_score) 
]

In [45]:
model_results = pd.DataFrame(data=np.zeros((3, 5)), columns=['model'] + [m[0] for m in metrics])


def run_model(model, params, n_folds):
    print('*'*64)
    print(' Running model: {0} '.format(type(model).__name__).center(64, '*'))
    print('*'*64)

    pipeline = get_baseline_pipeline(X_train, model)

    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=9)

    results = []
    
    for metric, score_func in metrics:
        print('Metric: {0}'.format(metric).center(64))

        grid = GridSearchCV(pipeline, params, scoring=metric, cv=cv, verbose=10)
        grid.fit(X_train, y_train)

        cv_score = grid.best_score_
        
        y_pred = grid.predict_proba(X_test)[:, 1]
        
        fpr, tpr, thr = roc_curve(y_test, y_pred)
        threshold = get_optimal_threshold(fpr, tpr, thr)
        print('optimal threshold:', threshold)
        if metric != 'roc_auc':
            y_pred = y_pred > threshold

        ho_score = score_func(y_test, y_pred)

        results.append(ho_score)
        
        print('Metric {0} results:'.format(metric))
        print('CV score:', cv_score)
        print('Holdout score:', ho_score)
        print('Report:')
        print('Best params:', grid.best_params_)
        print(classification_report(y_test, y_pred>threshold))

    print('\n-------------------------------------\n')

    return results


def get_optimal_threshold(fprs, tprs, thrs):
    n = len(fprs)
    dist = 10
    thr_opt = None
    for i in range(n):
        fpr, tpr, thr = fprs[i], tprs[i], thrs[i]
        d = fpr*fpr + (tpr - 1)*(tpr - 1)
        if d < dist:
            thr_opt = thr
            dist = d
    return thr_opt

Можно приступать к построению и обучению моделей. В качестве baseline моделей будем использовать
- Случайный лес
- Логистическую регрессию
- Градиентный бустинг

Наилучшие параметры алгоритмов выбираются GridSearch-ем

## 2. Случайный лес

In [46]:
model = RandomForestClassifier(random_state=9, n_jobs=-1)
params = { 'model__n_estimators': [100, 150, 200] }
results = run_model(model, params, 5)

model_results.loc[0] = np.array(['RandomForestClassifier', *results])

****************************************************************
************ Running model: RandomForestClassifier *************
****************************************************************
                        Metric: roc_auc                         
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__n_estimators=100 .........................................
[CV]  model__n_estimators=100, score=0.6049992352188666, total=   8.5s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.6132345375159909, total=   8.4s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.0s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.6266460175760611, total=   8.5s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   40.6s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.6300492936203348, total=   8.4s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   54.2s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.6221894293342233, total=   8.4s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV]  model__n_estimators=150, score=0.6228099449357584, total=   9.3s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.4min remaining:    0.0s


[CV]  model__n_estimators=150, score=0.6246853968518828, total=   9.3s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.6min remaining:    0.0s


[CV]  model__n_estimators=150, score=0.6322410868235163, total=   9.3s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s


[CV]  model__n_estimators=150, score=0.6287995717225652, total=   9.4s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s


[CV] . model__n_estimators=150, score=0.631483258245731, total=   9.1s
[CV] model__n_estimators=200 .........................................
[CV]  model__n_estimators=200, score=0.6330597502641971, total=  10.4s
[CV] model__n_estimators=200 .........................................
[CV]  model__n_estimators=200, score=0.6340661327103844, total=  10.3s
[CV] model__n_estimators=200 .........................................
[CV]  model__n_estimators=200, score=0.6308870765893543, total=   9.9s
[CV] model__n_estimators=200 .........................................
[CV]  model__n_estimators=200, score=0.6377249151788197, total=  10.8s
[CV] model__n_estimators=200 .........................................
[CV]  model__n_estimators=200, score=0.6423779131208632, total=  10.4s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.6min finished


optimal threshold: 0.075
Metric roc_auc results:
CV score: 0.635623157573
Holdout score: 0.655579865977
Report:
Best params: {'model__n_estimators': 200}
             precision    recall  f1-score   support

          0       0.95      0.69      0.80      7405
          1       0.12      0.53      0.20       595

avg / total       0.89      0.68      0.76      8000

                       Metric: precision                        
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__n_estimators=100 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.8s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.8s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.9s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.7s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   41.6s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.3s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   55.1s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   9.1s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.9s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.4min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=  10.0s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.7min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=  10.4s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.9min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.7s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.2min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.1s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.5s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.3s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.3s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.9s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  11.0s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.7min finished


optimal threshold: 0.08
Metric precision results:
CV score: 0.0
Holdout score: 0.125274725275
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.95      0.73      0.82      7405
          1       0.13      0.48      0.20       595

avg / total       0.88      0.71      0.78      8000

                         Metric: recall                         
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__n_estimators=100 .........................................
[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.9s remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   26.0s remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   39.1s remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   52.3s remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV] ............... model__n_estimators=150, score=0.0, total=   9.2s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV] ............... model__n_estimators=150, score=0.0, total=   9.0s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.6min remaining:    0.0s


[CV] ............... model__n_estimators=150, score=0.0, total=   9.3s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s


[CV] ............... model__n_estimators=150, score=0.0, total=   9.9s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s


[CV] ............... model__n_estimators=150, score=0.0, total=   9.9s
[CV] model__n_estimators=200 .........................................
[CV] ............... model__n_estimators=200, score=0.0, total=  10.2s
[CV] model__n_estimators=200 .........................................
[CV] ............... model__n_estimators=200, score=0.0, total=  10.4s
[CV] model__n_estimators=200 .........................................
[CV] ............... model__n_estimators=200, score=0.0, total=  10.7s
[CV] model__n_estimators=200 .........................................
[CV] ............... model__n_estimators=200, score=0.0, total=  10.6s
[CV] model__n_estimators=200 .........................................
[CV] ............... model__n_estimators=200, score=0.0, total=  10.4s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.6min finished


optimal threshold: 0.08
Metric recall results:
CV score: 0.0
Holdout score: 0.478991596639
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.95      0.73      0.82      7405
          1       0.13      0.48      0.20       595

avg / total       0.88      0.71      0.78      8000

                           Metric: f1                           
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__n_estimators=100 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.8s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.7s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.3s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.1s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   40.4s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.3s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   53.5s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=100, score=0.0, total=   8.5s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.8s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.4min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.9s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.6min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.1s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.9min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.3s
[CV] model__n_estimators=150 .........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=150, score=0.0, total=   9.6s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.4s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=   9.9s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=   9.9s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.2s
[CV] model__n_estimators=200 .........................................


  'precision', 'predicted', average, warn_for)


[CV] ............... model__n_estimators=200, score=0.0, total=  10.0s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.6min finished


optimal threshold: 0.08
Metric f1 results:
CV score: 0.0
Holdout score: 0.198606271777
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.95      0.73      0.82      7405
          1       0.13      0.48      0.20       595

avg / total       0.88      0.71      0.78      8000


-------------------------------------



Результаты:

In [51]:
model_results.iloc[0, :]

model        RandomForestClassifier
roc_auc              0.655579865977
precision            0.125274725275
recall               0.478991596639
f1                   0.198606271777
Name: 0, dtype: object

## 3. Логистическая регрессия


In [52]:
model = LogisticRegression(random_state=9, n_jobs=-1)
params = { 'model__C': [0.1, 1, 10] }
results = run_model(model, params, 5)

model_results.loc[1] = np.array(['LogisticRegression', *results])

****************************************************************
************** Running model: LogisticRegression ***************
****************************************************************
                        Metric: roc_auc                         
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__C=0.1 ....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.6201262584125925, total=   7.5s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.3s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.6528693197619444, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.5s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.6680884643194838, total=   7.4s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   37.0s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.6806378274653764, total=   8.1s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   50.1s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.6743805272818288, total=   7.4s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.5943357528227377, total=   9.3s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.6083208187329663, total=   9.7s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.6351229211858278, total=   9.1s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.6420650481116859, total=   8.6s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.6521914455753937, total=   8.6s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.5795407141665276, total=  11.2s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.5816334334501363, total=  10.6s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.5970889649034985, total=  11.5s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.6062767673396741, total=  11.9s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.6433964625396296, total=   9.0s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.5min finished
  " = {}.".format(self.n_jobs))


optimal threshold: 0.0737237091099
Metric roc_auc results:
CV score: 0.659220479448
Holdout score: 0.650846180471
Report:
Best params: {'model__C': 0.1}
             precision    recall  f1-score   support

          0       0.95      0.61      0.74      7405
          1       0.11      0.61      0.19       595

avg / total       0.89      0.61      0.70      8000

                       Metric: precision                        
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__C=0.1 ....................................................


  " = {}.".format(self.n_jobs))


[CV] .......... model__C=0.1, score=0.42857142857142855, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.1s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.3s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.4s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.5s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   48.8s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=0.1, score=0.2857142857142857, total=   7.5s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........................... model__C=1, score=0.25, total=   9.1s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............................ model__C=1, score=0.3, total=   8.9s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.2222222222222222, total=   9.2s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.09090909090909091, total=   8.9s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.2413793103448276, total=   8.6s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.14084507042253522, total=  11.2s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.25316455696202533, total=  10.3s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.1891891891891892, total=  11.2s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.10294117647058823, total=  11.3s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.20512820512820512, total=   9.0s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.5min finished
  " = {}.".format(self.n_jobs))


optimal threshold: 0.0500099781408
Metric precision results:
CV score: 0.220902124695
Holdout score: 0.101038450744
Report:
Best params: {'model__C': 1}
             precision    recall  f1-score   support

          0       0.95      0.57      0.71      7405
          1       0.10      0.61      0.17       595

avg / total       0.88      0.57      0.67      8000

                         Metric: recall                         
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__C=0.1 ....................................................


  " = {}.".format(self.n_jobs))


[CV] ......... model__C=0.1, score=0.019230769230769232, total=   7.2s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.9s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.2s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.4s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.6s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.6s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   49.0s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......... model__C=0.1, score=0.01282051282051282, total=   7.5s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=1, score=0.038461538461538464, total=   8.8s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=1, score=0.057692307692307696, total=   8.8s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ........... model__C=1, score=0.038461538461538464, total=   9.1s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.01282051282051282, total=   9.1s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.04487179487179487, total=   8.8s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.0641025641025641, total=  11.7s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.1282051282051282, total=  10.9s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.08974358974358974, total=  11.3s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.04487179487179487, total=  11.8s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.05128205128205128, total=   9.0s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.5min finished
  " = {}.".format(self.n_jobs))


optimal threshold: 0.05084752634
Metric recall results:
CV score: 0.075641025641
Holdout score: 0.621848739496
Report:
Best params: {'model__C': 10}
             precision    recall  f1-score   support

          0       0.95      0.55      0.70      7405
          1       0.10      0.62      0.17       595

avg / total       0.89      0.56      0.66      8000

                           Metric: f1                           
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] model__C=0.1 ....................................................


  " = {}.".format(self.n_jobs))


[CV] .......... model__C=0.1, score=0.03680981595092025, total=   7.3s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.0s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.4s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.3s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.4s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.5s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] .......................... model__C=0.1, score=0.0, total=   7.4s
[CV] model__C=0.1 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   48.9s remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ......... model__C=0.1, score=0.024539877300613494, total=   7.4s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.06666666666666668, total=   9.0s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.0967741935483871, total=   9.0s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............. model__C=1, score=0.0655737704918033, total=   9.6s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.02247191011235955, total=   9.0s
[CV] model__C=1 ......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s
  " = {}.".format(self.n_jobs))


[CV] ............ model__C=1, score=0.07567567567567568, total=   9.0s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.08810572687224669, total=  11.8s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.1702127659574468, total=  10.6s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ............ model__C=10, score=0.1217391304347826, total=  11.4s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........................ model__C=10, score=0.0625, total=  11.7s
[CV] model__C=10 .....................................................


  " = {}.".format(self.n_jobs))


[CV] ........... model__C=10, score=0.08205128205128205, total=   9.7s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.6min finished
  " = {}.".format(self.n_jobs))


optimal threshold: 0.05084752634
Metric f1 results:
CV score: 0.104921781063
Holdout score: 0.173464603844
Report:
Best params: {'model__C': 10}
             precision    recall  f1-score   support

          0       0.95      0.55      0.70      7405
          1       0.10      0.62      0.17       595

avg / total       0.89      0.56      0.66      8000


-------------------------------------



Результаты

In [55]:
model_results.iloc[1, :]

model        LogisticRegression
roc_auc          0.650846180471
precision        0.101038450744
recall           0.621848739496
f1               0.173464603844
Name: 1, dtype: object

## 4. Градиентный бустинг

In [56]:
model = GradientBoostingClassifier(learning_rate=0.1, random_state=9)
params = { 'model__n_estimators': [100] }
results = run_model(model, params, 5)

model_results.loc[2] = np.array(['GradientBoostingClassifier', *results])

****************************************************************
********** Running model: GradientBoostingClassifier ***********
****************************************************************
                        Metric: roc_auc                         
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] model__n_estimators=100 .........................................
[CV]  model__n_estimators=100, score=0.7090233744924634, total=  34.7s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.8s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.7050986567662273, total=  34.0s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.7270913287724569, total=  33.8s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.7623181906668891, total=  34.6s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.7052637799655154, total=  34.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.3min finished


optimal threshold: 0.0845452355231
Metric roc_auc results:
CV score: 0.721759066133
Holdout score: 0.723944870318
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.96      0.72      0.82      7405
          1       0.15      0.61      0.24       595

avg / total       0.90      0.71      0.78      8000

                       Metric: precision                        
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] model__n_estimators=100 .........................................
[CV] ............... model__n_estimators=100, score=0.4, total=  33.7s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.6s remaining:    0.0s


[CV] .............. model__n_estimators=100, score=0.25, total=  33.8s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.5714285714285714, total=  32.7s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=  33.3s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.3, total=  34.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min finished


optimal threshold: 0.0845452355231
Metric precision results:
CV score: 0.304285714286
Holdout score: 0.14977244518
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.96      0.72      0.82      7405
          1       0.15      0.61      0.24       595

avg / total       0.90      0.71      0.78      8000

                         Metric: recall                         
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] model__n_estimators=100 .........................................
[CV]  model__n_estimators=100, score=0.01282051282051282, total=  33.2s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.1s remaining:    0.0s


[CV]  model__n_estimators=100, score=0.00641025641025641, total=  34.6s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.02564102564102564, total=  34.1s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=  33.5s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.019230769230769232, total=  34.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min finished


optimal threshold: 0.0845452355231
Metric recall results:
CV score: 0.0128205128205
Holdout score: 0.608403361345
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.96      0.72      0.82      7405
          1       0.15      0.61      0.24       595

avg / total       0.90      0.71      0.78      8000

                           Metric: f1                           
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] model__n_estimators=100 .........................................
[CV]  model__n_estimators=100, score=0.024844720496894408, total=  32.9s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.3s remaining:    0.0s


[CV] ............ model__n_estimators=100, score=0.0125, total=  34.9s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.04907975460122699, total=  32.9s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV] ............... model__n_estimators=100, score=0.0, total=  33.9s
[CV] model__n_estimators=100 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s


[CV]  model__n_estimators=100, score=0.03614457831325302, total=  33.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.2min finished


optimal threshold: 0.0845452355231
Metric f1 results:
CV score: 0.0245138106823
Holdout score: 0.24037184595
Report:
Best params: {'model__n_estimators': 100}
             precision    recall  f1-score   support

          0       0.96      0.72      0.82      7405
          1       0.15      0.61      0.24       595

avg / total       0.90      0.71      0.78      8000


-------------------------------------



Результаты:

In [57]:
model_results.iloc[2, :]

model        GradientBoostingClassifier
roc_auc                  0.723944870318
precision                 0.14977244518
recall                   0.608403361345
f1                        0.24037184595
Name: 2, dtype: object

## 5. Результаты

In [58]:
model_results

Unnamed: 0,model,roc_auc,precision,recall,f1
0,RandomForestClassifier,0.655579865977,0.125274725275,0.478991596639,0.198606271777
1,LogisticRegression,0.650846180471,0.101038450744,0.621848739496,0.173464603844
2,GradientBoostingClassifier,0.723944870318,0.14977244518,0.608403361345,0.24037184595
