# Построение и оптимизация модели

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from mlxtend.evaluate import lift_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [14]:
df = pd.read_csv('orange_small_churn_train_data.csv')
df_test = pd.read_csv('orange_small_churn_test_data.csv')
target = 'labels'
IDcol = 'ID'

In [15]:
cat_columns = df.select_dtypes(include='object').columns
df[cat_columns] = df[cat_columns].astype(np.str)
df_test[cat_columns] = df_test[cat_columns].astype(np.str)
df.loc[df[target]==-1, target] = 0

In [16]:
# вспомогательная функция для кодирования всех категориальных признаков
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        res = X.copy()
        if self.columns is not None:
            for col in self.columns:
                res[col] = LabelEncoder().fit_transform(res[col])
        else:
            for colname, col in res.iteritems():
                res[colname] = LabelEncoder().fit_transform(col)
        return res

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

In [17]:
# закодируем категориальные признакии с помощью Label Encoder
encoder = MultiColumnLabelEncoder(columns=cat_columns)
df = encoder.fit_transform(df)
df_test = encoder.transform(df_test)

In [7]:
# разобъем выборку на обучающую и валидационную
train, test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target],
                                                stratify=df[target], test_size=0.3, random_state=42)
train[target] = y_train.values
test[target] = y_test.values

In [8]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, stratified=True,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False, seed=27)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    alg.fit(dtrain[predictors], dtrain[target], early_stopping_rounds=early_stopping_rounds, verbose=False,
            eval_metric='auc', eval_set=[(dtrain[predictors], dtrain[target]), (test[predictors], test[target])])
        
    test_predictions = alg.predict(test[predictors])
    test_predprob = alg.predict_proba(test[predictors])[:,1]
        
    print("Accuracy : %.4g" % metrics.accuracy_score(test[target].values, test_predictions))
    print("PR-AUC: %.4g" % metrics.average_precision_score(test[target], test_predictions))
    print("ROC-AUC: %.4g" % metrics.roc_auc_score(test[target], test_predprob))
    print("F1: %.4g" % metrics.f1_score(test[target], test_predictions))
    print("Presision: %.4g" % metrics.precision_score(test[target], test_predictions))
    print("Recall: %.4g" % metrics.recall_score(test[target], test_predictions))
    print("Lift : %.4g" % lift_score(test[target], test_predictions))

In [9]:
# вручную подберем гиперпараметры модели на кросс-валидации
predictors = [x for x in train.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=2, min_child_weight=1,
                     gamma=0, subsample=0.9, colsample_bytree=0.68, reg_alpha=0,
                     objective= 'binary:logistic', nthread=4, scale_pos_weight=4, seed=27)
modelfit(xgb1, train, predictors)

Accuracy : 0.9156
PR-AUC: 0.1035
ROC-AUC: 0.7346
F1: 0.1731
Presision: 0.3193
Recall: 0.1187
Lift : 4.29


In [10]:
# исключим из выборки признаки, имеющие низкую корреляцию с целевой переменной (из задания первой недели)
predictors = [x for x in train.columns if x not in [target, IDcol, 'Var198', 'Var220', 'Var133', 'Var140', 'Var32',
                                                    'Var39', 'Var15', 'Var8', 'Var48', 'Var141', 'Var20', 'Var31',
                                                    'Var42', 'Var52', 'Var55', 'Var79', 'Var167']]

xgb2 = XGBClassifier(learning_rate =0.01, n_estimators=2000, max_depth=2, min_child_weight=1,
                     gamma=0, subsample=0.9, colsample_bytree=0.68, reg_alpha=0,
                     objective= 'binary:logistic', nthread=4, scale_pos_weight=4, seed=27)
modelfit(xgb2, train, predictors)

Accuracy : 0.9167
PR-AUC: 0.1019
ROC-AUC: 0.7347
F1: 0.164
Presision: 0.3245
Recall: 0.1097
Lift : 4.361


In [18]:
# сформируем submission на kaggle
preds = xgb2.predict_proba(df_test[predictors])
preds = pd.DataFrame(preds[:,1], columns=['result'])
preds.to_csv("sub2.csv", index=True, index_label='Id')

![](kaggle.png)