In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import cross_validation, metrics
from sklearn.cluster import KMeans
from sklearn import preprocessing
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import time
import numpy as np
import pandas as pd

## Первичная обработка данных

In [123]:
def add_features(data):
    # Не очень информативные признаки с примерно одинаковым распределением. Возьмем их сумму, а сами признаки удалим.
    prefix = "FEATURE_"
    feature_nums = [189, 187, 190, 191, 192, 193, 194]
    features = [prefix + str(feature) for feature in feature_nums]
    data['DROPPED_SUM'] = data[features[1:]].sum(axis=1)
    data = data.drop(features, axis=1)


    # Интересный признак, значения которого разбиваются на несколько групп. Сделаем бинарные признаки для каждой группы. 
    data['FEATURE_148_1'] = data.FEATURE_148 < 0.2
    data['FEATURE_148_2'] = ((data.FEATURE_148 < 0.4) & (data.FEATURE_148 > 0.2)).astype(float)
    data['FEATURE_148_3'] = ((data.FEATURE_148 < 0.6) & (data.FEATURE_148 > 0.4)).astype(float)
    data['FEATURE_148_4'] = ((data.FEATURE_148 < 0.8) & (data.FEATURE_148 > 0.6)).astype(float)
    data['FEATURE_148_5'] = ((data.FEATURE_148 < 1.0) & (data.FEATURE_148 > 0.8)).astype(float)

    # Признаки с явно выраженными модами. Построим бинарные признаки, указывающие, к какому пику относится значение.
    to_categories_nums = [(127, 95), (135, 95), (183, 1400), (77, 20), (87, 4), (88, 5), (186, 2500), (244, 40)]
    for index, value in to_categories_nums:
        data[prefix + str(index) + '_1'] = (data[prefix + str(index)] > value).astype(float) 

    # Признаки, которые суммируются в другой признак, но не всегда, поэтому добавим отличаются ли суммы как отдельный признак.
    column_names = []
    prefix = 'FEATURE_'
    for i in range(149, 154):
        column_names.append(prefix + str(i))

    df149_first = data[column_names[:-1]]
    df153 = data[column_names[-1:]]
    data['SUM_FEATURE'] = ((pd.Series(df153.values.flatten()) - df149_first.sum(axis=1)) > 0).astype('float')
    return data

def preprocess_train_data(data):
    target = data['TARGET']
    data = data.drop(labels=['TARGET', 'ID'], axis=1)

    data = add_features(data)

    # Заполним пропуски, отшкалируем значения и разобьем на обучающую и валидационную выборки.
    clean_data = data.fillna(data.median(axis=0))
    scaler = preprocessing.MinMaxScaler()
    clean_data[clean_data.columns] = scaler.fit_transform(clean_data[clean_data.columns])
    return clean_data, target, scaler

def preprocess_test_data(data, scaler):
    ID = data['ID']
    data = data.drop(labels=['ID'], axis=1)
    data = add_features(data)
    clean_data = data.fillna(data.median(axis=0))
    clean_data[clean_data.columns] = scaler.fit_transform(clean_data[clean_data.columns])
    return clean_data, ID

Считываем данные

In [14]:
data = pd.read_csv('data/contest_train.csv')
clean_data, target, scaler = preprocess_train_data(data)

In [15]:
clean_data.head()

Unnamed: 0,FEATURE_0,FEATURE_1,FEATURE_2,FEATURE_3,FEATURE_4,FEATURE_5,FEATURE_6,FEATURE_7,FEATURE_8,FEATURE_9,...,FEATURE_148_5,FEATURE_127_1,FEATURE_135_1,FEATURE_183_1,FEATURE_77_1,FEATURE_87_1,FEATURE_88_1,FEATURE_186_1,FEATURE_244_1,SUM_FEATURE
0,0.005195,0.051282,0.0,0.0,0.0,0.0,0.0,0.017228,0.027275,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.002597,0.025641,0.0,0.0,1.0,0.0,0.0,0.151109,0.029359,0.272727,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056985,0.028302,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.002597,0.025641,0.0,0.0,0.0,0.0,0.0,0.058852,0.032526,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.01039,0.076923,1.0,0.0,1.0,0.0,0.0,0.255933,0.029359,0.272727,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0


In [125]:
test_data = pd.read_csv('data/contest_test.csv')
clean_test_data, test_ID = preprocess_test_data(test_data, scaler)

In [126]:
clean_test_data.head()

Unnamed: 0,FEATURE_0,FEATURE_1,FEATURE_2,FEATURE_3,FEATURE_4,FEATURE_5,FEATURE_6,FEATURE_7,FEATURE_8,FEATURE_9,...,FEATURE_148_5,FEATURE_127_1,FEATURE_135_1,FEATURE_183_1,FEATURE_77_1,FEATURE_87_1,FEATURE_88_1,FEATURE_186_1,FEATURE_244_1,SUM_FEATURE
0,0.000543,0.012821,0.0,0.0,0.0,0.0,0.0,0.037474,0.024319,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.158135,0.029084,0.2,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.33285,0.344223,0.2,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.003261,0.025641,0.0,0.0,0.0,0.0,0.0,0.050589,0.019639,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.000543,0.012821,0.0,0.0,0.0,0.0,0.0,0.044815,0.02521,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0


In [128]:
test_ID.head()

0    1154270440
1    1147987574
2    1129622364
3     619797496
4    7391484886
Name: ID, dtype: int64

## Основной Pipeline

Некоторые шаги ниже характерны либо только для этапа тестирования финальной модели, либо для этапа обучения. В этом случае будет явно указано, для какого этапа приведен шаг.

### 1) Добавляем в чистые данные регрессионный признак, а также кластеризацию за 6 и 12 кластеров.

Вообще говоря, задача напоминает регрессию, посколько данные классы имеют довольно строгое отношение порядка, например, по времени использования, количеству траффика и т.д. Поэтому возникла идея обучить на тренировочной выборке регрессор и добавить его предсказания в данные. Среди нескольких регрессоров (SGD, Linear, Ridge и пр.), лучший результат показал GradientBoostingRegressor.

**Для обучения:**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(clean_data, target, train_size=0.8)

In [19]:
reg = GradientBoostingRegressor()
reg.fit(X_train, y_train)
regression_train = reg.predict(X_train.copy())
regression_test = reg.predict(X_test.copy())
X_train['REGRESSION'] = regression_train
X_test['REGRESSION'] = regression_test
six_clusters = KMeans(n_clusters=6, precompute_distances = True, n_jobs=-1)
six_clusters.fit(X_train)
twelve_clusters = KMeans(n_clusters=12, precompute_distances = True, n_jobs=-1)
twelve_clusters.fit(X_train)
cluster_twelve_train = twelve_clusters.predict(X_train.copy())
cluster_twelve_test = twelve_clusters.predict(X_test.copy())
cluster_six_train = six_clusters.predict(X_train.copy())
cluster_six_test = six_clusters.predict(X_test.copy())
X_train['CLUSTER_6'] = cluster_six_train
X_test['CLUSTER_6'] = cluster_six_test
X_train['CLUSTER_12'] = cluster_twelve_train
X_test['CLUSTER_12'] = cluster_twelve_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


**Для теста:**

In [None]:
train_data = clean_data.copy()
test_data = clean_test_data.copy()

In [73]:
reg = GradientBoostingRegressor()
reg.fit(train_data, target)
regression_train = reg.predict(train_data.copy())
regression_test = reg.predict(test_data.copy())
train_data['REGRESSION'] = regression_train
test_data['REGRESSION'] = regression_test

Появилась идея добавить признак кластеризации, т.к. возможно с его помощью получится найти какую-то структуру в данных, если она есть. Kmeans -- unsupervised алгоритм, поэтому речи о каком-либо лике ответа быть не может. Экспериментально установлено, что кластеризация на большее число кластеров для алгоритма оказывается полезнее. 

In [74]:
six_clusters = KMeans(n_clusters=6, precompute_distances = True, n_jobs=-1)
six_clusters.fit(train_data)
twelve_clusters = KMeans(n_clusters=12, precompute_distances = True, n_jobs=-1)
twelve_clusters.fit(train_data)
cluster_twelve_train = twelve_clusters.predict(train_data.copy())
cluster_twelve_test = twelve_clusters.predict(test_data.copy())
cluster_six_train = six_clusters.predict(train_data.copy())
cluster_six_test = six_clusters.predict(test_data.copy())
train_data['CLUSTER_6'] = cluster_six_train
test_data['CLUSTER_6'] = cluster_six_test
train_data['CLUSTER_12'] = cluster_twelve_train
test_data['CLUSTER_12'] = cluster_twelve_test

### 2) Сэмплируем данные и обучаем xgboost с подобранными ранее параметрами.

In [75]:
from imblearn.over_sampling import SMOTE
from collections import Counter

**Для обучения:**

In [76]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.3,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=6, missing=None, n_estimators=800, n_jobs=1,
       nthread=8, objective='multi:softprob', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

In [None]:
X_resampled, y_resampled = SMOTE(kind='svm', m_neighbors=15, n_jobs=8).fit_sample(X_train, y_train)
res_X = pd.DataFrame(X_resampled, columns=X_train.columns)

**Для теста:**

In [78]:
final_clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.3,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=6, missing=None, n_estimators=800, n_jobs=1,
       nthread=8, objective='multi:softprob', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

In [77]:
target.value_counts()

0    17372
1     5650
2     1499
Name: TARGET, dtype: int64

Т.к. в данных классы распределены очень неравномерно, есть смысл просэмплировать минорные классы. Резать мажоритарный класс не хочется, т.к. есть желание по-максимуму сохранить данные.

In [107]:
X_resampled, y_resampled = SMOTE(kind='svm', m_neighbors=15, n_jobs=8).fit_sample(train_data, target)

In [108]:
res_X = pd.DataFrame(X_resampled, columns=train_data.columns)

### 3) Отбрасываем менее важные признаки.

Получаем лучшие признаки на отложенной выборке.

In [30]:
def best_train_test_split(clf, X_train, X_test, y_train, y_test, best_features_num, metric='macro'):
    best_train = X_train.copy()
    best_test = X_test.copy()
    
    start = time.time()
    clf.fit(best_train, y_train)
    finish = time.time() - start
    print(finish, 'seconds needed for boosting on whole train')
    
    imps = np.array(clf.feature_importances_)
    features = np.arange(len(imps))
    sorted_features = [x for _, x in sorted(zip(imps, features), key=lambda pair: pair[0])]
    sorted_features = sorted_features[::-1]
    
    best_num = len(imps)
    preds = clf.predict(best_test)
    if metric == 'macro':
        fscore = metrics.f1_score(y_test, preds, average='macro')
    if metric == 'binary':
        fscore = metrics.f1_score(y_test, preds)
    best_fscore = fscore
    print('MACRO F_1: {} \nACCURACY: {}'.format(fscore,  metrics.accuracy_score(y_test, preds)))
                                   
    for features_num in best_features_num:
        best_features = sorted_features[:features_num]
        train = X_train.copy()
        test = X_test.copy()
        best_columns = np.array(X_train.columns)[best_features]
        train = train.loc[:, best_columns]
        test = test.loc[:, best_columns]

        start = time.time()
        clf.fit(train, y_train)
        finish = time.time() - start

        print('{} seconds needed for {} best features'.format(finish, features_num))
        preds = clf.predict(test)
        if metric == 'macro':
            fscore = metrics.f1_score(y_test, preds, average='macro')
        if metric == 'binary':
            fscore = metrics.f1_score(y_test, preds)
        if fscore > best_fscore:
            best_fscore = fscore
            best_train, best_test = train.copy(), test.copy()
            best_num = features_num
        print('MACRO F_1: {} \nACCURACY: {}'.format(fscore,  metrics.accuracy_score(y_test, preds)))
    return best_fscore, best_train, best_test, best_num

**Для обучения:**

In [31]:
fscore, res_train, res_test, num = best_train_test_split(clf, res_X, X_test, y_resampled, y_test, [250, 210, 190, 160, 100, 80])

368.2341229915619 seconds needed for boosting on whole train


  if diff:


MACRO F_1: 0.5281686664183706 
ACCURACY: 0.6621814475025484
343.66720390319824 seconds needed for 250 best features


  if diff:


MACRO F_1: 0.5277086476194753 
ACCURACY: 0.6617737003058104
310.5842673778534 seconds needed for 210 best features


  if diff:


MACRO F_1: 0.5265878269475303 
ACCURACY: 0.6601427115188583
285.47608947753906 seconds needed for 190 best features


  if diff:


MACRO F_1: 0.5273630206888623 
ACCURACY: 0.6607543323139653
245.9658784866333 seconds needed for 160 best features


  if diff:


MACRO F_1: 0.5269029597873013 
ACCURACY: 0.6621814475025484
167.02087688446045 seconds needed for 100 best features


  if diff:


MACRO F_1: 0.5293574365014486 
ACCURACY: 0.6619775739041794
117.54115748405457 seconds needed for 80 best features
MACRO F_1: 0.5288425350658285 
ACCURACY: 0.6623853211009174


  if diff:


In [82]:
res_train.to_csv('best_columns.csv')

In [34]:
print('it is better to use', num, 'best features')

it is better to use 100 best features


**Для теста:**

In [230]:
# columns = pd.read_csv('data/best_columns.csv')
# columns.drop(columns.columns[0], axis=1, inplace=True)
# columns = columns.drop('Unnamed 0', axis=1)

In [232]:
cols = ['REGRESSION', 'FEATURE_196', 'FEATURE_142', 'FEATURE_238',
       'FEATURE_174', 'FEATURE_185', 'FEATURE_180', 'FEATURE_248',
       'FEATURE_74', 'FEATURE_153', 'FEATURE_203', 'FEATURE_219',
       'FEATURE_226', 'FEATURE_1', 'FEATURE_55', 'FEATURE_110', 'FEATURE_250',
       'FEATURE_258', 'FEATURE_170', 'FEATURE_197', 'DROPPED_SUM',
       'FEATURE_227', 'FEATURE_220', 'FEATURE_24', 'FEATURE_121',
       'FEATURE_171', 'FEATURE_120', 'FEATURE_115', 'FEATURE_122', 'FEATURE_0',
       'FEATURE_111', 'FEATURE_172', 'FEATURE_169', 'FEATURE_165',
       'FEATURE_63', 'FEATURE_58', 'FEATURE_186_1', 'FEATURE_198',
       'FEATURE_135', 'FEATURE_86', 'FEATURE_77', 'FEATURE_232', 'FEATURE_69',
       'FEATURE_199', 'FEATURE_85', 'FEATURE_34', 'FEATURE_81', 'FEATURE_82',
       'FEATURE_178', 'FEATURE_118', 'FEATURE_91', 'FEATURE_114', 'FEATURE_99',
       'FEATURE_228', 'FEATURE_26', 'FEATURE_89', 'FEATURE_177', 'FEATURE_155',
       'FEATURE_76', 'FEATURE_179', 'FEATURE_243', 'FEATURE_224',
       'FEATURE_154', 'FEATURE_213', 'FEATURE_148', 'FEATURE_57',
       'FEATURE_107', 'FEATURE_215', 'FEATURE_211', 'SUM_FEATURE',
       'FEATURE_46', 'FEATURE_108', 'FEATURE_151', 'FEATURE_244',
       'FEATURE_254', 'FEATURE_8', 'FEATURE_116', 'FEATURE_257', 'FEATURE_167',
       'FEATURE_127', 'FEATURE_96', 'FEATURE_88', 'FEATURE_45', 'FEATURE_21',
       'FEATURE_218', 'FEATURE_133', 'FEATURE_130', 'FEATURE_59',
       'FEATURE_252', 'FEATURE_112', 'FEATURE_182', 'FEATURE_47', 'FEATURE_71',
       'FEATURE_221', 'FEATURE_38', 'FEATURE_48', 'FEATURE_186', 'FEATURE_7',
       'FEATURE_204', 'FEATURE_126']

In [234]:
train_data = train_data.loc[:, cols]
test_data = test_data.loc[:, cols]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [235]:
res_X = res_X.loc[:, cols]

### 4) I'm baggin', baggin' yoooouuuuu...

*...put your lovin' hand out, baby*

Напишем вспомогательную функцию для создания ансамбля

In [110]:
def generate_ensemble(clf, X_train, y_train, X_test, clf_num=10):
    predictions = None
    for i in range(clf_num):
        clf.set_params(seed=i)
        clf.fit(X_train, y_train)
        if predictions is None:
            predictions = clf.predict_proba(X_test)
        else:
            predictions += clf.predict_proba(X_test)
        print('Done with {} iteration'.format(i))
        
    predictions /= clf_num
    answers = np.argmax(predictions, axis=1)
    return answers, predictions

**Для обучения:**

In [None]:
answers, predictions = generate_ensemble(clf, y_resampled, res_test, clf_num=4)

**Для теста:**

In [111]:
final_clf.fit(res_X, y_resampled)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.3, learning_rate=0.002,
       max_delta_step=0, max_depth=6, min_child_weight=6, missing=None,
       n_estimators=800, n_jobs=1, nthread=8, objective='multi:softprob',
       random_state=0, reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.5)

In [118]:
final_answers, final_predictions = generate_ensemble(final_clf, res_X, y_resampled, test_data, clf_num=6)

Done with 0 iteration
Done with 1 iteration
Done with 2 iteration
Done with 3 iteration
Done with 4 iteration
Done with 5 iteration


### 5) Оценка результатов (для этапа обучения).

In [39]:
def get_details(preds, y_test):
    equals = preds == y_test.values
    TP = np.zeros(3)
    TN = np.zeros(3)
    FP = np.zeros(3)
    FN = np.zeros(3)
    Pr = np.zeros(3)
    Rec = np.zeros(3)
    Lift = np.zeros(3)
    
    for i in range(3):
        TP[i] = np.sum(equals & (preds == i))
        TN[i] = np.sum(equals & (preds != i))
        FN[i] = np.sum(~equals & (y_test.values == i))
        FP[i] = np.sum(~equals & (preds == i))
        Pr[i] = float(TP[i]) / (TP[i] + FP[i])
        Rec[i] = float(TP[i]) / (TP[i] + FN[i])
        Lift[i] = Pr[i] * len(y_test) / (TP[i] + FN[i])
        print('Precision for {} class: {} '.format(i, Pr[i]))
        print('Recall for {} class: {} '.format(i, Rec[i]))
        print('Lift for {} class: {}\n'.format(i, Lift[i]))
        
    Micro_pr = float(np.sum(TP)) / (np.sum(TP) + np.sum(FP))
    Micro_rec = float(np.sum(TP)) / (np.sum(TP) + np.sum(FN))
    Micro_f_score = 2 * Micro_pr * Micro_rec / (Micro_pr + Micro_rec)
    print('Micro precision: {}'.format(Micro_pr))
    print('Micro recall: {}'.format(Micro_rec))
    print('Micro_f_score: {}\n'.format(Micro_f_score))
    
    
    Macro_pr = np.sum(Pr) / float(len(Pr))
    Macro_rec = np.sum(Rec) / float(len(Rec))
    Macro_f_score = 2 * Macro_pr * Macro_rec / (Macro_pr + Macro_rec)
    print('Macro precision: {}'.format(Macro_pr))
    print('Macro recall: {}'.format(Macro_rec))
    print('Macro_f_score: {}'.format(Macro_f_score))
    Macro_f_score_LIB = metrics.f1_score(y_test, preds, average='macro')
    print('Macro_f_score_LIB: {}'.format(Macro_f_score_LIB))
    
    return Macro_f_score_LIB, Lift

In [42]:
preds = clf.fit(res_train, y_resampled).predict(res_test)
get_details(preds, y_test)

Precision for 0 class: 0.8056463595839525 
Recall for 0 class: 0.7721446881230418 
Lift for 0 class: 1.1255184829847016

Precision for 1 class: 0.3461928934010152 
Recall for 1 class: 0.3119853613906679 
Lift for 1 class: 1.553592078803275

Precision for 2 class: 0.36396396396396397 
Recall for 2 class: 0.6710963455149501 
Lift for 2 class: 5.931040675226722

Micro precision: 0.6634046890927625
Micro recall: 0.6634046890927625
Micro_f_score: 0.6634046890927625

Macro precision: 0.5052677389829773
Macro recall: 0.5850754650095532
Macro_f_score: 0.5422508367224458
Macro_f_score_LIB: 0.5295675526881614


  if diff:


(0.5295675526881614, array([1.12551848, 1.55359208, 5.93104068]))

In [50]:
final_macro_f_score, final_lift = get_details(answers, y_test)

Precision for 0 class: 0.8052412150089339 
Recall for 0 class: 0.7701509541441185 
Lift for 0 class: 1.1249524806661408

Precision for 1 class: 0.34510595358224017 
Recall for 1 class: 0.312900274473925 
Lift for 1 class: 1.5487142747675098

Precision for 2 class: 0.36330935251798563 
Recall for 2 class: 0.6710963455149501 
Lift for 2 class: 5.92037333588279

Micro precision: 0.6621814475025484
Micro recall: 0.6621814475025484
Micro_f_score: 0.6621814475025484

Macro precision: 0.5045521737030532
Macro recall: 0.5847158580443311
Macro_f_score: 0.5416842293657472
Macro_f_score_LIB: 0.5289773859349673


Стоит отметить, что если обучаться, например, без сэмплирования данных, то мы получим accuracy около 0.73, но в этом будет довольно мало смысла. Ввиду несбалансированности данных, такой подход будет вести к очень высокому recall и precision для 0-го (доминирующего) класса, и к recall < 0.1 для первого, например. Просэмплировав **тренировочную** выборку, мы смогли в три раза поднять recall первого класса и сделать модель более информативной.

Интересно то, что второй, самый малый, класс определяется достаточно неплохо и без сэмплинга (recall и precision около 0.5). Возможно, это связано с тем, что отличия между абонентами этой категории от нулевого класса гораздо выше, чем у первой.  

### 6) Расчет результатов (для этапа тестирования).

In [178]:
final_answers

array([1, 0, 2, ..., 0, 1, 0])

In [179]:
final_predictions

array([[0.32819775, 0.5375373 , 0.13426499],
       [0.49851096, 0.39148906, 0.11000001],
       [0.10202257, 0.17186111, 0.72611636],
       ...,
       [0.5963299 , 0.27531815, 0.12835196],
       [0.19286217, 0.5279997 , 0.27913815],
       [0.5153042 , 0.3715168 , 0.11317898]], dtype=float32)

In [180]:
sorted_ids_one_show = [(i_d, _) for i_d, _ in sorted(zip(test_ID, final_predictions), key=lambda probs: probs[1][1])]
sorted_ids_one = [i_d for i_d, _ in sorted(zip(test_ID, final_predictions), key=lambda probs: probs[1][1])]
sorted_ids_one_show = sorted_ids_one_show[::-1]
sorted_ids_one = sorted_ids_one[::-1]

In [181]:
best_ones = sorted_ids_one[:500]

In [182]:
sorted_ids_two_show = [(i_d, _) for i_d, _ in sorted(zip(test_ID, final_predictions), key=lambda probs: probs[1][2])]
sorted_ids_two = [i_d for i_d, _ in sorted(zip(test_ID, final_predictions), key=lambda probs: probs[1][2])]
sorted_ids_two_show = sorted_ids_two_show[::-1]
sorted_ids_two = sorted_ids_two[::-1]

In [183]:
best_twos = sorted_ids_two[:200]

In [206]:
indx = best_ones + best_twos
values = [1] * 500 + [2] * 200
vals = [(i_d, value) for i_d, value in zip(indx, values)]

In [224]:
contest_segments = pd.DataFrame(data=vals, columns=['ID', 'TARGET'])
contest_segments.to_csv('contest_segments.csv', index=False)

In [225]:
ans = [(i_d, label) for i_d, label in zip(test_ID, final_answers)]

In [226]:
contest_answer = pd.DataFrame(data=ans, columns=['ID', 'TARGET'])
contest_answer.to_csv('contest_answer.csv', index=False)

## Alternative: 2 classifiers.

В презентации мы упоминали про вторую модель, построенную на основе градиентного бустинга над решающими деревьями. Ниже приведена ее архитектура.

Идея в том, чтобы построить два классификатора по принципу ``i-ый классификатор показывает, что класс > i``, а потом сложить их ответы.

In [51]:
clf_more_than_zero = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8999999999999999, gamma=0.30000000000000004,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=700, n_jobs=1,
       nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

clf_more_than_one = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8999999999999999, gamma=0.30000000000000004,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=700, n_jobs=1,
       nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

In [52]:
zero_res_y_train = y_resampled.copy()
zero_res_y_train[zero_res_y_train > 1] = 1
zero_res_y_test = y_test.copy()
zero_res_y_test[zero_res_y_test > 1] = 1

one_res_y_train = y_resampled.copy()
one_res_y_train[one_res_y_train == 1] = 0
one_res_y_train[one_res_y_train == 2] = 1
one_res_y_test = y_test.copy()
one_res_y_test[one_res_y_test == 1] = 0
one_res_y_test[one_res_y_test == 2] = 1

In [55]:
best_features_num = [120]

In [56]:
fscore, best_zero_train, best_zero_test, num = best_train_test_split(clf_more_than_zero, res_X, X_test, zero_res_y_train, zero_res_y_test, best_features_num)

154.57032418251038 seconds needed for boosting on whole train


  if diff:


MACRO F_1: 0.6493227163642572 
ACCURACY: 0.7096839959225281
82.49921655654907 seconds needed for 120 best features
MACRO F_1: 0.6489218975882055 
ACCURACY: 0.709887869520897


  if diff:


In [57]:
fscore, best_one_train, best_one_test, num = best_train_test_split(clf_more_than_one, res_X, X_test, one_res_y_train, one_res_y_test, best_features_num)

163.08265805244446 seconds needed for boosting on whole train


  if diff:


MACRO F_1: 0.7114947562138312 
ACCURACY: 0.9221202854230377
77.74241137504578 seconds needed for 120 best features
MACRO F_1: 0.7103332753969873 
ACCURACY: 0.9211009174311927


  if diff:


In [None]:
preds_0, probs_0 = generate_ensemble(clf_more_than_zero, best_zero_train, zero_res_y_train, best_zero_test, clf_num=4)
preds_1, probs_1 = generate_ensemble(clf_more_than_one, best_one_train, one_res_y_train, best_one_test, clf_num=4)

In [62]:
clf_more_than_zero.fit(best_zero_train, zero_res_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8999999999999999, gamma=0.30000000000000004,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=700, n_jobs=1,
       nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

In [63]:
clf_more_than_one.fit(best_one_train, one_res_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8999999999999999, gamma=0.30000000000000004,
       learning_rate=0.002, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=700, n_jobs=1,
       nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0.3, reg_lambda=0.5, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.5)

In [64]:
self_preds = clf_more_than_zero.predict(best_zero_test) + clf_more_than_one.predict(best_one_test)

  if diff:
  if diff:


In [65]:
get_details(self_preds, y_test)

Precision for 0 class: 0.8043161271507728 
Recall for 0 class: 0.7855311876958132 
Lift for 0 class: 1.123660097885087

Precision for 1 class: 0.33113828786453436 
Recall for 1 class: 0.3220494053064959 
Lift for 1 class: 1.4860322982392873

Precision for 2 class: 0.4019370460048426 
Recall for 2 class: 0.5514950166112956 
Lift for 2 class: 6.549837909148681

Micro precision: 0.6678899082568808
Micro recall: 0.6678899082568808
Micro_f_score: 0.6678899082568808

Macro precision: 0.5124638203400499
Macro recall: 0.5530252032045349
Macro_f_score: 0.5319724598113974
Macro_f_score_LIB: 0.5287764289193103


(0.5287764289193103, array([1.1236601 , 1.4860323 , 6.54983791]))

In [None]:
preds = preds_0 + preds_1
get_details(preds, y_test)