# Задание
1. обучить несколько разных моделей на наборе данных ССЗ (train_case2.csv): логистическая регрессия, бустинг, случайный лес и т.д. — на ваш выбор 2-3 варианта; при обучении моделей обязательно использовать кросс-валидацию
1. вывести сравнение полученных моделей по основным метрикам классификации: pr/rec/auc/f_score (можно в виде таблицы, где строки - модели, а столбцы - метрики)
1. сделать выводы о том, какая модель справилась с задачей лучше других
1. (опциональный вопрос) какая кривая (precision_recall_curve или roc_curve) больше подходит в случае сильного дисбаланса классов (когда объектов одного из классов намного больше, чем другого)?

In [2]:
# pip install lazypredict

In [82]:
# imports
import numpy as np
import pandas as pd

from lazypredict.Supervised import LazyClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, log_loss
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
import xgboost as xgb


In [2]:
# Read the semicolon-separated training set and preview the first rows
df = pd.read_csv('train_case2.csv', sep=';')
df.head(n=3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [3]:
# Split the data into train/test parts; 'cardio' is the target column.
X_train, X_test, y_train, y_test = train_test_split(
    # Drop the target by keyword: the positional `axis` argument of
    # DataFrame.drop was deprecated in pandas 1.3 and removed in 2.0.
    df.drop(columns='cardio'),
    df['cardio'],
    random_state=0,
)

In [4]:
# Benchmark the classifiers bundled with lazypredict on the train/test split
clf = LazyClassifier(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Summary table: one row per model, one column per metric
print(models)

100%|██████████| 29/29 [1:07:28<00:00, 139.60s/it]  

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.74               0.74     0.74      0.74   
XGBClassifier                      0.73               0.73     0.73      0.73   
SVC                                0.73               0.73     0.73      0.73   
AdaBoostClassifier                 0.73               0.73     0.73      0.73   
RandomForestClassifier             0.72               0.72     0.72      0.72   
LogisticRegression                 0.72               0.72     0.72      0.72   
SGDClassifier                      0.72               0.72     0.72      0.72   
BernoulliNB                        0.71               0.71     0.71      0.71   
ExtraTreesClassifier               0.71               0.71     0.71      0.71   
BaggingClassifier                  0.70               0.70     0.70      0.70   
LinearSVC                   




In [6]:
# Persist the comparison table as a tab-separated file
models.to_csv(path_or_buf='models.csv', sep='\t', encoding='utf-8')

По результатам сравнения возьмём 5 лучших моделей.

In [12]:
# Names of the first five models in the comparison table
models.head(5).index

Index(['LGBMClassifier', 'XGBClassifier', 'SVC', 'AdaBoostClassifier',
       'RandomForestClassifier'],
      dtype='object', name='Model')

К полям:
- gender, cholesterol применим OHE-кодирование
- age, height, weight, ap_hi, ap_lo - StandardScaler
- gluc, smoke, alco, active - оставим пока как есть

In [15]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``X[key]`` out of its input.

    Intended as the first step of a per-column pipeline: it narrows the
    input down to the entry stored under ``key`` and passes it on unchanged.
    """

    def __init__(self, key):
        # Label used to index the input in `transform`.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]`` (single-bracket indexing)."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``X[[key]]`` out of its input.

    Unlike single-bracket selection, the double-bracket form keeps the
    result two-dimensional, which is what numeric steps such as scalers
    expect as input.
    """

    def __init__(self, key):
        # Label of the numeric column to select in `transform`.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]`` (list indexing with a single label)."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder whose output columns are frozen at fit time.

    `fit` records the dummy columns produced by ``pd.get_dummies`` on the
    training data. `transform` always returns exactly those columns, in the
    same order, so the output layout does not depend on which categories
    happen to be present in the data being transformed.
    """

    def __init__(self, key):
        # Used as the prefix of the generated dummy column names.
        self.key = key
        # Dummy column names seen during `fit`; empty until fitted.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy columns generated for the training data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the fitted column layout.

        Categories not seen during `fit` are dropped. Fitted categories
        missing from ``X`` are added as all-zero columns; the previous
        implementation raised ``KeyError`` in that case because it only
        zeroed the unseen columns and never added the missing ones.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        return encoded.reindex(columns=self.columns, fill_value=0)


from sklearn.preprocessing import StandardScaler


# Feature groups, each with its own preprocessing:
#   continuous  -> select the column, then standardize it
#   categorical -> select the column, then one-hot encode it
#   base        -> select the column and pass it through untouched
continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
cat_cols = ['gender', 'cholesterol']
base_cols = ['gluc', 'smoke', 'alco', 'active']

# One (name, pipeline) pair per column, in the format FeatureUnion expects.
continuos_transformers = [
    (
        col,
        Pipeline([
            ('selector', NumberSelector(key=col)),
            ('standard', StandardScaler()),
        ]),
    )
    for col in continuos_cols
]

cat_transformers = [
    (
        col,
        Pipeline([
            ('selector', ColumnSelector(key=col)),
            ('ohe', OHEEncoder(key=col)),
        ]),
    )
    for col in cat_cols
]

base_transformers = [
    (col, Pipeline([('selector', NumberSelector(key=col))]))
    for col in base_cols
]

Объединим все наши трансформеры с помощью FeatureUnion.

In [17]:
from sklearn.pipeline import FeatureUnion

# Concatenate the outputs of all per-column pipelines side by side
all_transformers = [
    *continuos_transformers,
    *cat_transformers,
    *base_transformers,
]
feats = FeatureUnion(all_transformers)
feature_processing = Pipeline(steps=[('feats', feats)])

# Smoke-run of the preprocessing; the trailing ';' hides the cell output
feature_processing.fit_transform(X_train);

Добавим в пайплайн классификатор и запустим кросс-валидацию для каждой из выбранных моделей.

In [29]:
# Names of the first five models in the comparison table
models.head(5).index

Index(['LGBMClassifier', 'XGBClassifier', 'SVC', 'AdaBoostClassifier',
       'RandomForestClassifier'],
      dtype='object', name='Model')

In [83]:
# Candidate classifiers, all seeded for reproducibility.
# SVC(random_state=42, gamma='auto') is deliberately left out: far too slow.
cls_objects = [
    lgb.LGBMClassifier(random_state=42),
    xgb.XGBClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    LogisticRegression(random_state=42),
]

In [84]:
# Final scores table: one row per evaluated model
res = pd.DataFrame(columns=['Model', 'CV score', 'Best Threshold', 'F-Score', 'Precision', 'Recall', 'roc auc score', 'log loss score'])

for cls in cls_objects:
    # Preprocessing and the model are wrapped together so that
    # cross-validation refits the feature transformers on every fold.
    classifier = Pipeline([
        ('features', feats),
        ('classifier', cls),
    ])

    print(f'\n{type(cls)}')

    # Cross-validate on the training part only
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=16, scoring='roc_auc')
    cv_score = np.mean(cv_scores)
    cv_score_std = np.std(cv_scores)
    print('CV score is {}+-{}'.format(cv_score, cv_score_std))

    # Fit the pipeline on the whole training set and score the test set once;
    # the same probabilities feed every metric below.
    classifier.fit(X_train, y_train)
    y_score = classifier.predict_proba(X_test)[:, 1]

    b = 1
    precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
    # F-beta per threshold. Where precision and recall are both zero the
    # ratio is 0/0; define it as 0 there instead of NaN, because np.argmax
    # returns the position of a NaN and would pick that point as "best".
    numerator = (1 + b**2) * (precision * recall)
    denominator = b**2 * precision + recall
    fscore = np.divide(numerator, denominator,
                       out=np.zeros_like(numerator), where=denominator > 0)
    # Locate the index of the largest F-score. The last curve point has no
    # matching threshold (the arrays are one longer than `thresholds`), so it
    # is excluded to keep `thresholds[ix]` in bounds.
    ix = np.argmax(fscore[:-1])
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix]))

    roc_score = roc_auc_score(y_true=y_test, y_score=y_score)
    log_score = log_loss(y_true=y_test, y_pred=y_score)
    print("roc auc score: {}".format(roc_score))
    print("log loss score: {}".format(log_score))
    print('* ' * 20)
    # Add this model's scores to the final table
    res.loc[len(res)] = [type(cls), cv_score, thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_score, log_score]
res


<class 'lightgbm.sklearn.LGBMClassifier'>
CV score is 0.8024619512142214+-0.007068221093288977
Best Threshold=0.355647, F-Score=0.740, Precision=0.667, Recall=0.830
roc auc score: 0.8011820838166295
log loss score: 0.5409625775691054
* * * * * * * * * * * * * * * * * * * * 

<class 'xgboost.sklearn.XGBClassifier'>
CV score is 0.7974879407860951+-0.006447877492510205
Best Threshold=0.347103, F-Score=0.738, Precision=0.665, Recall=0.828
roc auc score: 0.797227760535858
log loss score: 0.5471106075466078
* * * * * * * * * * * * * * * * * * * * 

<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
CV score is 0.7952419477681905+-0.0071441451920620375
Best Threshold=0.497430, F-Score=0.738, Precision=0.692, Recall=0.789
roc auc score: 0.7945722371129712
log loss score: 0.6869766413646277
* * * * * * * * * * * * * * * * * * * * 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
CV score is 0.7734501681056019+-0.007171140345435727
Best Threshold=0.350000, F-Score=0.719, P

Unnamed: 0,Model,CV score,Best Threshold,F-Score,Precision,Recall,roc auc score,log loss score
0,<class 'lightgbm.sklearn.LGBMClassifier'>,0.8,0.36,0.74,0.67,0.83,0.8,0.54
1,<class 'xgboost.sklearn.XGBClassifier'>,0.8,0.35,0.74,0.67,0.83,0.8,0.55
2,<class 'sklearn.ensemble._weight_boosting.AdaB...,0.8,0.5,0.74,0.69,0.79,0.79,0.69
3,<class 'sklearn.ensemble._forest.RandomForestC...,0.77,0.35,0.72,0.64,0.82,0.77,0.6
4,<class 'sklearn.linear_model._logistic.Logisti...,0.79,0.39,0.73,0.65,0.84,0.78,0.58
