In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import gc
import sys
import os

from collections import Counter, defaultdict
from itertools import combinations, product, chain
from copy import deepcopy
from scipy import sparse
import scipy

import sklearn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score
import random

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier

DATASET_DIR = '/home/alexander/git/ml/datasets/SHAD/2017_3/'
_add_to_path = True

In [2]:
path = os.path.abspath('../../../')
if path not in sys.path:
    sys.path.append(path)
    
from ml.feature import *

print(FeatureKernel.__doc__)


    FeatureKernel - класс, реализующий общий функционал класса FeatureBase. 
    Является одним из аттрибутов экземпляров класса FeatureBase (SparseFeatureBase и DenseFeatureBase).
    Реализуемые операции включают в себя: 
        1) проверку корректности значений признака 
        2) вывод сообщений о некорректности значений
        3) предобработку и постобработку признаков
        4) получение характеристик признаков (размера, формата и т.п.)
    


<a id='toc'></a>
# Содержание
* [1. Загрузка данных](#loading)
* [2. Машинное обучение](#machine_learning)
    * [2.1 Создание пар и триплетов](#logistic_regression)
    * [2.2 Отбор признаков](#feature_selection)
        * [2.2.1 Logistic Regression](#lr)
            * [Single](#lr_single)
            * [Bagging](#lr_bagging)
            * [Binary Selection](#lr_binary)
        * [Отчет](#report)
        * [2.2.2 Bernoulli NB](#nb)
            * [Single](#nb_single)
            * [Bagging](#nb_bagging)
            * [Binary Selection](#nb_binary)
    * [2.2 Деревья](#trees)

* f1 — ресурс к которому происходит обращение
* f2 — идентификатор группы машин в котором находится источник
* f3 — тип источника обращения (1)
* f4 — тип источника обращения (2)
* f5 — класс устройств в котором находится источник
* f6 — сокращенное название источника обращения
* f7 — расширенный тип источника обращения (1)
* f8 — расширенный тип источника обращения (2)
* f9 — код модели источника

<a id='loading'></a>
# 1. Загрузка данных [[toc](#toc)]

In [3]:
X_train = pd.read_csv(os.path.join(DATASET_DIR, 'X_train.csv'))
X_test  = pd.read_csv(os.path.join(DATASET_DIR, 'X_test.csv'))
Y_train = pd.read_csv(os.path.join(DATASET_DIR, 'y_train.csv'))
print(X_train.shape, Y_train.shape)
print(X_test.shape)

Y_train = Y_train['label'].values
feature_names = f_columns = list(X_train.columns.difference(['id']))
X_test  = X_test.values[:, 1:10]
X_train = X_train.values[:, 1:10]
X_all   = np.concatenate([X_train, X_test], axis=0)

(19661, 10) (19661, 2)
(13108, 10)


# 2. Создание признаков [[toc](#toc)]

In [4]:
feature_groups = {}
features = OrderedDict()
for n_feature, fname in enumerate(f_columns):
    features[fname] = CategoricalFeature(values=X_train[:, n_feature], name=fname)
    print(fname, features[fname].get_name(), features[fname].is_label_encoded())
cat_manager = CategoricalFeaturesManager(features, treat_const='delete')

f1 f1 True
f2 f2 True
f3 f3 True
f4 f4 True
f5 f5 True
f6 f6 True
f7 f7 True
f8 f8 True
f9 f9 True


In [5]:
degree = 2
cat_manager.add_all_combinations(feature_names, degree=degree, replace=True)
print(cat_manager.get_list_of_features())

['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f1+f2', 'f1+f3', 'f1+f4', 'f1+f5', 'f1+f6', 'f1+f7', 'f1+f8', 'f1+f9', 'f2+f3', 'f2+f4', 'f2+f5', 'f2+f6', 'f2+f7', 'f2+f8', 'f2+f9', 'f3+f4', 'f3+f5', 'f3+f6', 'f3+f7', 'f3+f8', 'f3+f9', 'f4+f5', 'f4+f6', 'f4+f7', 'f4+f8', 'f4+f9', 'f5+f6', 'f5+f7', 'f5+f8', 'f5+f9', 'f6+f7', 'f6+f8', 'f6+f9', 'f7+f8', 'f7+f9', 'f8+f9']


In [23]:
data_sparse = cat_manager.assemble(feature_names=['f1'], ohe_names=['f1'], sparse=True)
print(data_sparse.shape, type(data_sparse))

data_dense = cat_manager.assemble(feature_names=['f1'], ohe_names=[], sparse=False)
print(data_dense.shape, type(data_dense))

(19661, 5740) <class 'scipy.sparse.csc.csc_matrix'>
(19661, 1) <class 'numpy.ndarray'>


<a id='machine_learning'></a>
# 2. Машинное обучение [[toc](#toc)]

## 2.1 Создание пар и триплетов<sup>[содержание](#toc)</sup>

In [3]:
class FeaturesCreator(object):     
    def group_features(self, X, degree, hash=hash):
        if degree == 1:
            return deepcopy(X), list(range(X.shape[1]))
        
        new_X = []
        combs = []
        n_samples, n_features = X.shape
        for indices in combinations(range(n_features), degree):
            new_X.append([hash(tuple(sequence)) for sequence in X[:, indices]])
            combs.append(tuple(indices))
        new_X = np.array(new_X).T
        return new_X, combs
          
    def label_encode_features(self, X, group_uniques=True, threshold_uniques=1):
        new_X = []
        unique_labels = []
        for n_feature in range(X.shape[1]):
            f_column = LabelEncoder().fit_transform(X[:, n_feature])
            add_column = True
            if group_uniques:
                counts = Counter(f_column)
                n_uniques = len(counts)
                for n, i in enumerate(f_column):
                    for thr in range(1, threshold_uniques + 1):
                        if counts[i] == thr:
                            f_column[n] = n_uniques + thr - 1

                if len(Counter(f_column)) == 1:
                    print('All values of feature {} are different'.format(n_feature))
                    add_column = False
                    
                encoder = LabelEncoder()
                f_column = encoder.fit_transform(f_column)
                temp = np.where(encoder.classes_ == n_uniques)[0]
                if len(temp) > 0:
                    index = temp[0]
                else:
                    index = -1
            if add_column:
                new_X.append(f_column[:, np.newaxis])
                unique_labels.append(index)
                
        X = np.concatenate(new_X, axis=1)
        if group_uniques:
            return X, unique_labels
        else:
            return X
    

    def encode_ohe(self, X, labels_to_omit=None):
        n_samples, n_features = X.shape
        if labels_to_omit is None:
            return OneHotEncoder().fit_transform(X)
        
        new_X = []
        for n_feature in range(n_features):
            f_column = X[:, [n_feature]]
            new_X.append(self.encode_ohe_feature(f_column, labels_to_omit[n_feature]))
        new_X = sparse.hstack(new_X).tocsr()
        return new_X
        
    def encode_ohe_feature(self, f_column, label_to_omit):
        if label_to_omit == -1:
            new_X = OneHotEncoder().fit_transform(f_column)
        else:
            mask = (f_column == label_to_omit)
            new_X = OneHotEncoder().fit_transform(f_column).tocsc()
            if not np.all(new_X[:, -1].todense() == mask):
                assert False
            new_X = new_X[:, :-1].tocsr()
        return new_X
    

In [103]:
features_creator = FeaturesCreator()

X_all_gr = []
X_all_gr_ohe = []
unique_labels = []
thresholds_uniques = [1, 2, 2, 3, 3, 2, 2]
for n, degree in enumerate(range(1, 8)):
    print('Generating for degree {}'.format(degree))
    X_new, _ = features_creator.group_features(X_all, degree)
    X_new, unique_labels_new = features_creator.label_encode_features(X_new, threshold_uniques=thresholds_uniques[n])
    X_new_ohe = features_creator.encode_ohe(X_new, labels_to_omit=None)
    print(X_new.shape, X_new_ohe.shape)
    X_all_gr.append(X_new)
    X_all_gr_ohe.append(X_new_ohe)
    unique_labels += unique_labels_new
    
X_all_comb = np.concatenate(X_all_gr, axis=1)
print(X_all_comb.shape)

Generating for degree 1
(32769, 8) (32769, 9953)
Generating for degree 2
(32769, 28) (32769, 54267)
Generating for degree 3
(32769, 56) (32769, 126956)
Generating for degree 4
(32769, 70) (32769, 105071)
Generating for degree 5
(32769, 56) (32769, 66795)
Generating for degree 6
(32769, 28) (32769, 33230)
Generating for degree 7
All values of feature 0 are different
(32769, 7) (32769, 4799)
(32769, 253)


<a id='feature_selection'></a>
## 2.2 Отбор признаков<sup>[содержание](#toc)</sup>

In [5]:
class MyXYTrainTestBuilder(object):
    def __init__(self, ohe_features_dict, Y_train, copy_input=False):
        if copy_input:
            self.ohe_features_dict = deepcopy(ohe_features_dict)
            self.Y_train = deepcopy(Y_train)
        else:
            self.ohe_features_dict = ohe_features_dict
            self.Y_train = Y_train
        
    def __call__(self, feature_names):
        return self.get_X_train(feature_names), self.get_Y_train()
    
    def get_X_train(self, feature_names):
        train_size = len(Y_train)
        X_train = sparse.hstack([self.ohe_features_dict[name][:train_size] for name in feature_names]).tocsr()
        return X_train
    
    def get_Y_train(self):
        return self.Y_train
            
    def get_X_test(self, feature_names):
        train_size = len(Y_train)
        X = sparse.hstack([self.ohe_features_dict[name][train_size:] for name in feature_names]).tocsr()
        return X

In [6]:
class GreedFeaturesSelector(object):
    def __init__(self):
        self.scores_history = {}
        self.best_features_history = {}
        
        self.n_iters = 0
        self.best_score = None
        self.best_features = []
        self.omit_feature_names = []
        self.start_backward = False
    
    def update_history(self):
        self.scores_history[self.n_iters] = self.best_score
        self.best_features_history[self.n_iters] = list(self.best_features)
        self.n_iters += 1
        
    def initialize(self, best_score, best_features, omit_feature_names=None):
        self.best_score = best_score
        self.best_features = list(best_features)
        if omit_feature_names is not None:
            self.omit_feature_names = list(omit_feature_names)
    
    def select_features(self, clf, feature_names, builder, kfold_cv, max_features=-1):
        self.clf = deepcopy(clf)
        self.feature_names = deepcopy(feature_names)
        self.builder = builder
        self.kfold_cv = kfold_cv
        
        if self.best_score is None:
            print('Adding first feature')
            best_feature, best_score = self.move_forward()
            self.best_score = best_score
            self.best_features.append(best_feature)
            self.update_history()
            if max_features == len(self.best_features):
                return self.best_features, self.best_score
        
        while 1:
            print('Entering infinite loop')
            forward = False
            backward = False
            
            if not self.start_backward:
                new_feature, new_score = self.move_forward()
                while new_score > self.best_score:
                    print('+ feature = {}, score = {} -> {}'.format(new_feature, self.best_score, new_score))
                    self.best_score = new_score
                    self.best_features.append(new_feature)
                    self.update_history()
                    forward = True
                    print(self.best_score, self.best_features)
                    if max_features == len(self.best_features):
                        break
                    new_feature, new_score = self.move_forward()
                
            if self.start_backward:
                self.start_backward = False
                
            old_feature, new_score = self.move_backward()
            while new_score > self.best_score:
                print('- feature = {}, score = {} -> {}'.format(old_feature, self.best_score, new_score))
                self.best_score = new_score
                self.best_features.remove(old_feature)
                self.update_history()
                backward = True
                print(self.best_score, self.best_features)
                old_feature, new_score = self.move_backward()
                
            if max_features == len(self.best_features):
                return self.best_features, self.best_score
                
            if (not forward) & (not backward):
                break
            print(self.best_score, self.best_features)
        return self.best_features, self.best_score
        
    def move_forward(self):
        scores = []
        for feature_name in self.feature_names:
            if feature_name in self.omit_feature_names:
                continue
            if feature_name not in self.best_features:
                current_features = list(self.best_features) + [feature_name]
                X, Y = self.builder(current_features)
                score = np.mean(cross_val_score(self.clf, X, y=Y, scoring='roc_auc', 
                                        cv=self.kfold_cv, n_jobs=-1))
                scores.append((score, feature_name))
                print("+ feature = {},  score = {}".format(feature_name, score))
        scores = sorted(scores)
        best_feature = scores[-1][1] 
        best_score = scores[-1][0]
        return best_feature, best_score

    def move_backward(self):
        if len(self.best_features) == 1:
            assert False
            
        scores = []    
        for feature_name in self.best_features:
            current_features = list(self.best_features)
            current_features.remove(feature_name)
            X, Y = self.builder(current_features)
            score = np.mean(cross_val_score(self.clf, X, y=Y, scoring='roc_auc', 
                                        cv=self.kfold_cv, n_jobs=-1))
            scores.append((score, feature_name))
            print("- feature = {},  score = {}".format(feature_name, score))
        scores = sorted(scores)
        best_feature = scores[-1][1] 
        best_score = scores[-1][0]
        return best_feature, best_score  

In [7]:
class SuperGreedyFeatureSelector(object):
    def __init__(self, clf, X_train, Y_train, X_test, cv, policy='sequential', steps =[1, -1], 
                 Nadd=512, Ndel=128, min_samples_add=64, min_samples_del=16, max_subsets=16, seed=3468):
        self.X_train = X_train
        self.Y_train = Y_train
        
        self.Nadd = Nadd
        self.Ndel = Ndel
        self.min_samples_add = min_samples_add
        self.min_samples_del = min_samples_del
        self.max_subsets = max_subsets
        self.steps = list(steps)
        
        self.cv = cv
        self.policy = policy
        if self.policy == 'tree':
            self.get_best_score = self.get_best_score_tree
        elif self.policy == 'sequential':
            self.get_best_score = self.get_best_score_seq
        random.seed(seed)
        
        self.current_score = -1
        self.selected_features = set()
        self.available_features = set(range(X_train.shape[1]))
        self.visited = np.ones(X_train.shape[1])
        self.clf = deepcopy(clf)
        self.feature_columns = X_train.tocsc()
        self.test_feature_columns = X_test.tocsc()
        
    def get_X_train(self, features):
        features = sorted(list(features))
        X = scipy.sparse.hstack([self.feature_columns[:, f] for f in features])
        X = X.tocsr()
        return X
    
    def get_X_test(self, features):
        features = sorted(list(features))
        X = scipy.sparse.hstack([self.test_feature_columns[:, f] for f in features])
        X = X.tocsr()
        return X        
        
    def get_features(self, features, step):
        if step == +1:
            return sorted(list(self.selected_features.union(set(features))))
        else:
            return sorted(list(self.selected_features - set(features)))
            
    def update_features(self, features, step):
        if step == +1:
            self.selected_features = self.selected_features.union(set(features))
            self.available_features -= set(features)
        else:
            self.selected_features -= set(features)
            self.available_features = self.available_features.union(set(features))
       
    def get_score(self, features):
        X = self.get_X_train(features)
        score = np.mean(cross_val_score(self.clf, X, y=self.Y_train, scoring='roc_auc', 
                                        cv=self.cv, n_jobs=-1))
        del X
        return score
    
    def get_best_score_seq(self, subsets, step):
        n_subsets = len(subsets)
        score_hist = []
        for i in range(n_subsets):
            features = subsets[i]
            all_features = self.get_features(features, step)
            score = self.get_score(all_features)
            score_hist.append((score, i))
            print('{}: (seq) score for subset {} = {}'.format(step, i, score))
            
        score_hist = sorted(score_hist)
        best_score = score_hist[-1][0]
        best_features = subsets[score_hist[-1][1]] 
        return best_score, best_features
        
    def get_best_score_tree(self, subsets, step):
        features = subsets[0]
        all_features = self.get_features(features, step)
        best_score, best_features = self.descend(all_features)
        return best_score, best_features     
    
    def select_features(self):
        self.min_samples_add = self.Nadd
        self.min_samples_del = self.Ndel
        
        while 1:
            results = defaultdict(bool)
            for step in self.steps:
                subsets_to_consider = []
                if step == +1:
                    features = list(self.available_features)
                    N = self.Nadd
                else:
                    features = list(self.selected_features)
                    N = self.Ndel
                    
                random.shuffle(features)
                n_subsets = int(len(features) / N)
                n_subsets = min(n_subsets, self.max_subsets)
                for i in range(n_subsets):
                    subsets_to_consider.append(features[i * N: i * N + N])
                
                best_score, best_features = self.get_best_score(subsets_to_consider, step)
                if best_score > self.current_score:
                    print('{}: score = {} -> {}, n_features = {} -> {}'.format(step,
                        self.current_score, best_score, len(self.selected_features), 
                        len(self.selected_features) + step * len(best_features)))
                    self.update_features(best_features, step)
                    self.current_score = best_score
                    results[step] = True
  
            if not np.any(results.values()):
                break
                
        return self.current_score, self.selected_features

    def descend(self, parent_features, step):
        current_features = self.get_features(parent_features, step)
        parent_score = self.get_score(current_features) 
        print('{}: parent score = {} for {}'.format(step, parent_score, len(parent_features)))
        if step == +1:
            min_samples = self.min_samples_add
        else:
            min_samples = self.min_samples_del
            
        if len(parent_features) < 2 * min_samples:
            return parent_score, parent_features
        
        random.shuffle(parent_features)
        mid = int(len(parent_features) / 2)
        left_features = parent_features[:mid]
        right_features = parent_features[mid:]
        left_score, left_features = self.descend(left_features, step)
        right_score, right_features = self.descend(right_features, step)
   
        if (parent_score > left_score) & (parent_score > right_score):
            return parent_score, parent_features
        if (left_score > parent_score) & (left_score > right_score):
            return left_score, left_features
        return right_score, right_features

In [7]:
class WrappedSVC(BaseEstimator, ClassifierMixin):
    def __init__(self, C, random_state, penalty, loss, **kwargs):
        self.C = C
        self.random_state = random_state
        self.loss = loss
        self.penalty = penalty
        
    def fit(self, X, y):
        self.lsvc_clf = LinearSVC(C=self.C, loss=self.loss, penalty=self.penalty, random_state=self.random_state)
        self.const = False
        if len(Counter(y)) < 2:
            self.const = True
            if y[0] == 0:
                self.margin = -10
            else:
                self.margin = 10
        else:
            self.lsvc_clf.fit(X, y)

    def predict_proba(self, X):
        if self.const == True:
            probas = np.array([self.margin] * X.shape[0])
        else:
            probas = self.lsvc_clf.decision_function(X)
        probas = probas[:, np.newaxis]
        probas = np.concatenate((probas, probas), axis=1)
        return probas
    
    def decision_function(self, X):
        return self.predict_proba(X)[:, 1]

<a id='individual'></a>
## 2.2 Индивидуальный подбор признаков<sup>[содержание](#toc)</sup>

In [52]:
CLF = 'LR'

CLASSIFIERS = {
    'SVC': WrappedSVC(C=0.063095734448019331, penalty='l2', loss='squared_hinge', random_state=36),
    'LR': LogisticRegression(C=1.0, random_state=5),
    'BNB': BernoulliNB(alpha=0.03)
}

### 2.2.1 Подбор по первой тройке

In [154]:
features = list(range(0, 3))
X_all_comb = np.concatenate(X_all_gr[:3], axis=1)

ohe_features_dict = [OneHotEncoder().fit_transform(X_all_comb[:, [i]]) for i in range(X_all_comb.shape[1])]
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)
feature_names = list(range(X_all_comb.shape[1]))
clf = deepcopy(CLASSIFIERS[CLF])

In [54]:
best_features = None
best_score = None

In [55]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)
selector = GreedFeaturesSelector()
if best_features is not None:
    selector.best_score = best_score
    selector.best_features = best_features
best_features, best_score = selector.select_features(clf, feature_names, builder, kfold_cv)

Adding first feature
+ feature = 0,  score = 0.6058000093051319
+ feature = 1,  score = 0.7995148047935221
+ feature = 2,  score = 0.5881961555892712
+ feature = 3,  score = 0.6412634540651512
+ feature = 4,  score = 0.7184583223458082
+ feature = 5,  score = 0.6690476223150149
+ feature = 6,  score = 0.7197050390643212
+ feature = 7,  score = 0.6229622350025911
+ feature = 8,  score = 0.5919309637723529
+ feature = 9,  score = 0.6010325166413409
+ feature = 10,  score = 0.6569364915279591
+ feature = 11,  score = 0.6807698600350316
+ feature = 12,  score = 0.5730007734765767
+ feature = 13,  score = 0.545723552547261
+ feature = 14,  score = 0.5900022097583301
+ feature = 15,  score = 0.7993225843810395
+ feature = 16,  score = 0.8012315949061952
+ feature = 17,  score = 0.8008501605134786
+ feature = 18,  score = 0.8173565715521496
+ feature = 19,  score = 0.8149148508603897
+ feature = 20,  score = 0.8141656819151717
+ feature = 21,  score = 0.6409552106669252
+ feature = 22,  score

KeyboardInterrupt: 

In [56]:
best_features = deepcopy(selector.best_features)
best_score = deepcopy(selector.best_score)
print(best_score,  sorted(best_features))

0.883665809431 [0, 1, 7, 9, 10, 11, 18, 20, 36, 38, 39, 49, 59, 63, 64, 73, 83, 88, 91]


In [58]:
pkl.dump(best_features, open('{}/best_features_{}.pkl'.format(CLF, features), 'wb'))
pkl.dump(best_score, open('{}/best_score_{}.pkl'.format(CLF, features), 'wb'))

#### Кросс-валидация

In [59]:
X_part_train = builder.get_X_train(best_features)
X_part_test = builder.get_X_test(best_features)

In [60]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

if CLF == 'BNB':
    parameters = {'alpha': np.linspace(0.01, 0.1, 21)}
elif CLF == 'LR':
    parameters = {'C': np.logspace(-1, 0, 11), 'penalty': ['l2']}
elif CLF == 'SVC': 
    parameters = {'C': np.logspace(-2, -1, 11), 'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge']}
else:
    assert False
    
clf = deepcopy(CLASSIFIERS[CLF])

grid_searcher = GridSearchCV(clf, parameters, scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)
grid_searcher.fit(X_part_train, Y_train)
print(grid_searcher.best_score_, grid_searcher.best_estimator_)
best_clf = grid_searcher.best_estimator_

0.882622134223 LogisticRegression(C=0.79432823472428149, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=5,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


#### Финальное обучение

In [None]:
Y_pred = pd.DataFrame(best_clf.predict_proba(X_part_test)[:, 1], index=range(X_part_test.shape[0]), 
                      columns=['prediction'])
Y_pred.to_csv('{}/{}.csv'.format(CLF, CLF), index_label='id')

#### Слияние решений

In [152]:
Y_TRAIN_PRED = {} # Для обучения метаклассификатора
Y_TEST_PRED = {} # Для предсказания ответов

In [106]:
kfold_cv1 = StratifiedKFold(10, shuffle=True, random_state=525)
kfold_cv2 = StratifiedKFold(10, shuffle=True, random_state=526)
kfold_cv3 = StratifiedKFold(10, shuffle=True, random_state=527)
kfold_cv4 = StratifiedKFold(10, shuffle=True, random_state=528)
kfolds = [kfold_cv1] #[kfold_cv1, kfold_cv2, kfold_cv3, kfold_cv4]

def RKFolds(X, Y):
    for kfold in kfolds:
        for train_indices, test_indices in kfold.split(X, Y):
            yield train_indices, test_indices

Здесь всего три классификатора. Далее их будет дополнительно еще два десятка.

In [155]:
best_features = {}
for n_clf, cc in enumerate(['BNB', 'LR']):
    indices = list(range(3))
    features = pkl.load(open('{}/best_features_{}.pkl'.format(cc, indices), 'rb'))
    score = pkl.load(open('{}/best_score_{}.pkl'.format(cc, indices), 'rb'))
    best_features = features
    
    X_tr = builder.get_X_train(best_features)
    X_ts = builder.get_X_test(best_features)
    
    clf = deepcopy(CLASSIFIERS[cc])
    clf.fit(X_tr, Y_train)
    Y_TEST_PRED[n_clf] = clf.predict_proba(X_ts)[:, 1]
    
    Y_pred = np.zeros(len(Y_train))
    for n_fold, (train_indices, test_indices) in enumerate(RKFolds(X_tr, Y_train)):
        X_tr_tr, X_tr_ts = X_tr[train_indices], X_tr[test_indices]
        Y_tr_tr, Y_tr_ts = Y_train[train_indices], Y_train[test_indices]
        clf = deepcopy(CLASSIFIERS[cc])
        clf.fit(X_tr_tr, Y_tr_tr)
        Y_pred[test_indices] = clf.predict_proba(X_tr_ts)[:, 1]
    Y_TRAIN_PRED[n_clf] = Y_pred    
    score = roc_auc_score(Y_train, Y_pred)
    print(cc, score)

BNB 0.861443642765
LR 0.875766346046


<a id='report'></a>
### Отчет<sup>[содержание](#toc)</sup>

#### SVC
Без f9. Первые три группы признаков. Следующие номера признаков:
* [0, 7, 11, 36, 38, 49, 70, 79, 83, 88, 89]
Отобраны с помощью WrappedSVC(C=1.0, penalty='l2', loss='squared_hinge', random_state=36)

Точность:
* на CV 0.871773063256 WrappedSVC(C=0.063095734448019331, loss='squared_hinge', penalty='l2', random_state=36) при kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

Потом произведен повторный отбор с новым параметром C = 0.063095734448019331. В результате получен новый набор признаков:
* [0, 7, 11, 20, 36, 38, 47, 49, 64, 69, 70, 79, 88, 89] на котором 0.876302286022 WrappedSVC(C=0.050118723362727248, loss='squared_hinge', penalty='l2', random_state=36) при kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)
 

#### LogisticRegression

Наилучшие решение. Первые три группы признаков без удаления f9. Следующие номера признаков:
* [0, 7, 10, 11, 12, 22, 27, 34, 45, 47, 48, 51, 60, 75, 76, 77, 78, 81, 82, 92, 107, 119]

Один из следующих классификаторов:
* LogisticRegression(C=0.50118723362727235, penalty='l2', random_state=17)
* LogisticRegression(C=0.63095734448019336, penalty='l2', random_state=23958)

Точность:
* На public-е дало 0.90085
* На CV 0.883246881869 при LogisticRegression(C=0.50118723362727235, penalty='l2', random_state=17) и StratifiedKFold(10, shuffle=True, random_state=965)

Первые три группы признаков. f9 Удален. Следующие номера признаков:
0.883665809431 [0, 1, 7, 9, 10, 11, 18, 20, 36, 38, 39, 49, 59, 63, 64, 73, 83, 88, 91]

0.882622134223 LogisticRegression(C=0.79432823472428149, penalty='l2', random_state=5) kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)

#### BernoulliNB

Первые три группы признаков без удаления f9. Следующие номера признаков:
* [0, 3, 4, 9, 11, 12, 42, 45, 46, 58, 65, 71, 81, 82, 83, 91, 93, 108, 117, 122]

Первые три группы признаков. f9 удален. Следующие номера признаков:
* [5, 11, 20, 37, 40, 42, 70, 85, 90]

* 0.865063382838 на CV при BernoulliNB(alpha=0.032500000000000001, binarize=0.0, class_prior=None, fit_prior=True) и kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

#### Различные комбинации степени от 1 до 7

In [156]:
CLASSIFIER = 'LR'
degree = 3
max_features = 10
selected_features = {}
selected_scores = {}

In [157]:
def get_X_subset(X_groups, indices):
    new_X = []
    for i in indices:
        new_X.append(X_groups[i])
    return np.concatenate(new_X, axis=1)

Загрузка промежуточных результатов

In [158]:
selected_features = pkl.load(open('{}/selected_features{}.pkl'.format(CLF, max_features), 'rb'))
selected_scores = pkl.load(open('{}/selected_scores{}.pkl'.format(CLF, max_features), 'rb'))

In [63]:
for n, indices in enumerate(combinations(range(7), degree)):
    if indices in selected_features:
        print('SUBSET {} is already considered', indices)
        continue
    print('Considering SUBSET:', indices)
    kfold_cv = StratifiedKFold(8, shuffle=True, random_state=n)
    
    clf = LogisticRegression(C=1.0, penalty='l2', random_state=n * 11)
    clf = WrappedSVC(C=1.0, penalty='l2', loss='squared_hinge', random_state=n*11)
    
    X_all_comb = get_X_subset(X_all_gr, indices)
    
    feature_names = list(range(X_all_comb.shape[1]))
    ohe_features_dict = {}
    features_creator = FeaturesCreator()
    for n_feature in range(X_all_comb.shape[1]):
        ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], 
                                                                           label_to_omit=-1)    
        
    builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)
    selector = GreedFeaturesSelector()
    best_features, best_score = selector.select_features(clf, feature_names, builder, kfold_cv, max_features=10)
    selected_features[indices] = best_features
    selected_scores[indices] = best_score

Considering SUBSET: (0, 1, 2)


KeyboardInterrupt: 

Сохранение промежуточных результатов

In [14]:
pkl.dump(selected_features, open('{}/selected_features{}.pkl'.format(CLF, max_features), 'wb'))
pkl.dump(selected_scores, open('{}/selected_scores{}.pkl'.format(CLF, max_features), 'wb'))

Embedding

In [159]:
X_all_comb = np.concatenate(X_all_gr, axis=1)
index_groups = []
first = 0
for i in range(len(X_all_gr)):
    length = X_all_gr[i].shape[1]
    index_groups.append(list(range(first, first + length)))
    first += length
print(X_all_comb.shape)
print(index_groups)

(32769, 253)
[[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161], [162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 21

In [160]:
ohe_features_dict = {}
features_creator = FeaturesCreator()
for n_feature in range(X_all_comb.shape[1]):
    ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], label_to_omit=-1)    
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)

In [161]:
for n_subset, indices in enumerate(sorted(selected_features.keys())):
    print('Considering SUBSET {}'.format(indices))
    new_feature_names = np.array(sorted(selected_features[indices]))
    old_feature_names = np.array(list(chain(*[index_groups[i] for i in indices])))
    feature_names = old_feature_names[new_feature_names]
    
    X = builder.get_X_train(feature_names)
    Y_pred = np.zeros(len(Y_train))
    
    X_ts = builder.get_X_test(feature_names)
    clf = LogisticRegression(C=1.0, penalty='l2', random_state=11*n_subset)
    clf.fit(X, Y_train)
    Y_TEST_PRED[len(Y_TEST_PRED)] = clf.predict_proba(X_ts)[:, 1]
    
    print('Size of training set = {}'.format(X.shape))
    for n_fold, (train_indices, test_indices) in enumerate(RKFolds(X, Y_train)):
        X_tr, Y_tr = X[train_indices], Y_train[train_indices]
        X_ts, Y_ts = X[test_indices], Y_train[test_indices]
        clf = LogisticRegression(C=1.0, penalty='l2', random_state=11*n_subset)
        clf.fit(X_tr, Y_tr)
        Y_pred[test_indices] = clf.predict_proba(X_ts)[:, 1]
    Y_TRAIN_PRED[len(Y_TRAIN_PRED)] = Y_pred
    score = roc_auc_score(Y_train, Y_pred)
    print('Score = {}'.format(score))

Considering SUBSET (0, 1, 2)
Size of training set = (19661, 24871)
Score = 0.8744522504280695
Considering SUBSET (0, 1, 3)
Size of training set = (19661, 24402)
Score = 0.8721810057209212
Considering SUBSET (0, 1, 4)
Size of training set = (19661, 23630)
Score = 0.8718324807110936
Considering SUBSET (0, 1, 5)
Size of training set = (19661, 26754)
Score = 0.8737854501141696
Considering SUBSET (0, 1, 6)
Size of training set = (19661, 26886)
Score = 0.8715930482401012
Considering SUBSET (0, 2, 3)
Size of training set = (19661, 23691)
Score = 0.8709576592206572
Considering SUBSET (0, 2, 4)
Size of training set = (19661, 26392)
Score = 0.8723257687127826
Considering SUBSET (1, 2, 3)
Size of training set = (19661, 26234)
Score = 0.8696472661671125
Considering SUBSET (1, 2, 4)
Size of training set = (19661, 25290)
Score = 0.8695009059907872
Considering SUBSET (1, 2, 5)
Size of training set = (19661, 26197)
Score = 0.8683580479075964
Considering SUBSET (1, 2, 6)
Size of training set = (19661, 

In [120]:
pkl.dump(Y_pred_dict, open('LR/Y_pred_dict10.pkl', 'wb'))

In [52]:
Y_pred_dict = pkl.load(open('LR/Y_pred_dict10.pkl', 'rb'))

In [163]:
Y_train_pred = []
Y_test_pred = []
for n_clf in sorted(Y_TRAIN_PRED.keys()):
    Y_train_pred.append(Y_TRAIN_PRED[n_clf][:, np.newaxis])
    Y_test_pred.append(Y_TEST_PRED[n_clf][:, np.newaxis])
Y_train_pred = np.concatenate(Y_train_pred, axis=1)
Y_test_pred = np.concatenate(Y_test_pred, axis=1)
print(Y_train_pred.shape, Y_test_pred.shape)

(19661, 21) (13108, 21)


In [31]:
#Y_pred = -np.log((1 - Y_pred) / Y_pred)
#Y_pred = np.log(Y_pred)
ens_clf = LogisticRegression(C=1.0, random_state=347)
ens_clf.fit(Y_pred, Y_train)
print(ens_clf.coef_)

[[ 0.53043585 -0.02984388 -0.14230978  0.30787581  0.28184429 -0.02596989
   0.10557139]]


In [132]:
kfold_cv1 = StratifiedKFold(10, shuffle=True, random_state=525)
kfold_cv2 = StratifiedKFold(10, shuffle=True, random_state=526)
kfold_cv3 = StratifiedKFold(10, shuffle=True, random_state=527)
kfold_cv4 = StratifiedKFold(10, shuffle=True, random_state=528)
kfolds = [kfold_cv1, kfold_cv2, kfold_cv3, kfold_cv4]

def RKFolds(X, Y):
    for kfold in kfolds:
        for train_indices, test_indices in kfold.split(X, Y):
            yield train_indices, test_indices

In [169]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=225)
parameters = {'C': np.logspace(-1, 1, 21)}
clf = GridSearchCV(LogisticRegression(random_state=8, max_iter=1000, tol=1e-4), parameters, 
                   scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)
clf.fit(Y_train_pred[:, 1:], Y_train)
print(clf.best_score_, clf.best_estimator_)
ens_clf = clf.best_estimator_

0.877291134589 LogisticRegression(C=0.25118864315095801, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=8,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


In [170]:
clf.grid_scores_



[mean: 0.87719, std: 0.01873, params: {'C': 0.10000000000000001},
 mean: 0.87720, std: 0.01877, params: {'C': 0.12589254117941673},
 mean: 0.87724, std: 0.01881, params: {'C': 0.15848931924611134},
 mean: 0.87728, std: 0.01885, params: {'C': 0.19952623149688797},
 mean: 0.87729, std: 0.01892, params: {'C': 0.25118864315095801},
 mean: 0.87725, std: 0.01900, params: {'C': 0.31622776601683794},
 mean: 0.87721, std: 0.01908, params: {'C': 0.39810717055349731},
 mean: 0.87718, std: 0.01922, params: {'C': 0.50118723362727235},
 mean: 0.87710, std: 0.01933, params: {'C': 0.63095734448019336},
 mean: 0.87703, std: 0.01945, params: {'C': 0.79432823472428149},
 mean: 0.87694, std: 0.01955, params: {'C': 1.0},
 mean: 0.87679, std: 0.01974, params: {'C': 1.2589254117941675},
 mean: 0.87663, std: 0.01992, params: {'C': 1.584893192461114},
 mean: 0.87641, std: 0.02005, params: {'C': 1.9952623149688797},
 mean: 0.87619, std: 0.02021, params: {'C': 2.511886431509581},
 mean: 0.87595, std: 0.02039, pa

In [173]:
ens_clf.fit(Y_train_pred, Y_train)
Y_test = pd.DataFrame(ens_clf.predict_proba(Y_test_pred)[:, 1], index=range(Y_test_pred.shape[0]), 
                      columns=['prediction'])
Y_test.to_csv('result.csv', index_label='id')

In [None]:
Y_ans = ens_clf.predict_proba(Y_pred)[:, 1]
roc_auc_score(Y_train, Y_ans)

In [None]:
ohe_features_dict = {}
features_creator = FeaturesCreator()
for n_feature in range(X_all_comb.shape[1]):
    label_to_omit = unique_labels[n_feature]
    ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], label_to_omit)    
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)

In [93]:
ohe_features_dict = {}
features_creator = FeaturesCreator()
for n_feature in range(X_all_comb.shape[1]):
    ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], label_to_omit=-1)
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)

feature_names = list(range(X_all_comb.shape[1]))
lr_clf = LogisticRegression(C=1.0, penalty='l2', random_state=26958)
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)

In [94]:
selector = GreedFeaturesSelector()
best_features, best_score = selector.select_features(lr_clf, feature_names, builder, kfold_cv)

Adding first feature
+ feature = 0,  score = 0.6005505670675881
+ feature = 1,  score = 0.7942035665244352
+ feature = 2,  score = 0.5890837976953259
+ feature = 3,  score = 0.6434072879277847
+ feature = 4,  score = 0.7171899921012496
+ feature = 5,  score = 0.6693860910464869
+ feature = 6,  score = 0.7170034549237565
+ feature = 7,  score = 0.6218355502300369
+ feature = 8,  score = 0.5913973576609204
+ feature = 9,  score = 0.6004204885791858
+ feature = 10,  score = 0.6521102593083931
+ feature = 11,  score = 0.675144584201497
+ feature = 12,  score = 0.5732027820927024
+ feature = 13,  score = 0.5450234886824584
+ feature = 14,  score = 0.5840417152788498
+ feature = 15,  score = 0.7935559770164357
+ feature = 16,  score = 0.7955232883844393
+ feature = 17,  score = 0.7974991525831293
+ feature = 18,  score = 0.8115001341537119
+ feature = 19,  score = 0.8126530668284809
+ feature = 20,  score = 0.80818741725418
+ feature = 21,  score = 0.6431057549328861
+ feature = 22,  score =

KeyboardInterrupt: 

In [95]:
best_score = selector.best_score
best_features = selector.best_features

In [99]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)
selector = GreedFeaturesSelector()
selector.best_features = best_features
selector.best_score = best_score
best_features, best_score = selector.select_features(lr_clf, feature_names, builder, kfold_cv)

Entering infinite loop
+ feature = 2,  score = 0.8804850914777674
+ feature = 3,  score = 0.8796110556486442
+ feature = 4,  score = 0.8798576780135576
+ feature = 5,  score = 0.8787146711823359
+ feature = 6,  score = 0.8815870270722513
+ feature = 8,  score = 0.8808449415022276
+ feature = 9,  score = 0.8806254742892963
+ feature = 10,  score = 0.8809736044382795
+ feature = 12,  score = 0.8797501707526602
+ feature = 13,  score = 0.8807377593530703
+ feature = 14,  score = 0.8796481861065413
+ feature = 15,  score = 0.8802951100687963
+ feature = 16,  score = 0.8803582731118353
+ feature = 17,  score = 0.8804063487192101
+ feature = 18,  score = 0.8812635402921266
+ feature = 19,  score = 0.8813356999846709
+ feature = 21,  score = 0.8796160775309069
+ feature = 22,  score = 0.8784659475661065
+ feature = 23,  score = 0.8798346774969751
+ feature = 24,  score = 0.8817087875888381
+ feature = 25,  score = 0.8805937559955662
+ feature = 26,  score = 0.879050584203559
+ feature = 27,  

KeyboardInterrupt: 

In [100]:
best_score = selector.best_score
best_features = selector.best_features

In [101]:
X_partial_train = builder.get_X_train(best_features)
X_partial_test = builder.get_X_test(best_features)

In [102]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=955)
parameters = {'C': np.logspace(0, 1, 11),
              'penalty': ['l2']}
clf = GridSearchCV(LogisticRegression(random_state=17), parameters, scoring='roc_auc', refit=True, cv=kfold_cv, 
                   n_jobs=-1)
clf.fit(X_partial_train, Y_train)
print(clf.best_score_, clf.best_estimator_)

0.880318836272 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [103]:
clf = clf.best_estimator_

In [104]:
Y_test = pd.DataFrame(clf.predict_proba(X_partial_test)[:, 1], index=range(X_partial_test.shape[0]), 
                      columns=['prediction'])
Y_test.to_csv('lr1234567.csv', index_label='id')

<a id='lr_binary'></a>
#### 3) Binary Selection<sup>[содержание](#toc)</sup>

In [None]:
X_all_comb = np.concatenate((X_all_singles, X_all_pairs, X_all_triplets), axis=1)
X_all_ohe = OneHotEncoder().fit_transform(X_all_comb)
X_all_ohe_train = X_all_ohe[:len(X_train)]
X_all_ohe_test = X_all_ohe[len(X_train):]
del X_all_ohe

In [40]:
X_all_comb = np.concatenate((X_all_singles, X_all_pairs, X_all_triplets), axis=1)
ohe_features_dict = [OneHotEncoder().fit_transform(X_all_comb[:, [i]]) for i in range(X_all_comb.shape[1])]
builder = MyXYTrainTestBuilder(ohe_features_dict, len(X_train), Y_train)

best_features = [0, 7, 9, 11, 17, 20, 29, 35, 36, 38, 39, 42, 49, 59, 63, 64, 66, 69, 79, 88]
X_all_ohe_train, Y_train = builder(best_features)
X_all_ohe_test = builder.get_X_test(best_features)

In [21]:
clf = LogisticRegression(C=1.0, random_state=189)
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=965)
Nadd = 2048; Ndel = 512; steps = [1, 1, -1]

In [38]:
X_all_ohe.shape

(19661, 47143)

In [42]:
super_selector = SuperGreedyFeatureSelector(clf, X_all_ohe_train, Y_train, 
                                            X_all_ohe_test, kfold_cv,
                                            Nadd=Nadd, Ndel=Ndel, steps=steps)
best_score, best_features = super_selector.select_features()

In [25]:
current_score = super_selector.current_score
selected_features = deepcopy(super_selector.selected_features)
available_features = deepcopy(super_selector.available_features)

params = {'current_score': current_score, 
          'selected_features': selected_features,
          'available_features': available_features}
pkl.dump(params, open('lr_features_without_f9.pkl', 'wb'))

In [None]:
super_selector = SuperGreedyFeatureSelector(clf, X_all_ohe_train, Y_train, 
                                            X_all_ohe_test, kfold_cv,
                                            Nadd=Nadd, Ndel=Ndel, steps=steps)
super_selector.current_score = current_score
super_selector.selected_features = deepcopy(selected_features)
super_selector.available_features = deepcopy(available_features)
best_score, best_features = super_selector.select_features()

#### Финальное обучение

In [43]:
X_train_part = super_selector.get_X_train(selected_features)
X_test_part = super_selector.get_X_test(selected_features)

In [45]:
cv = StratifiedKFold(10, shuffle=True, random_state=965)
parameters = {'C': np.logspace(-1, 0, 11), 
              'penalty': ['l2']}
grid_search = GridSearchCV(LogisticRegression(random_state=17), parameters, scoring='roc_auc', refit=True, cv=cv, 
                   n_jobs=-1)
grid_search.fit(X_train_part, Y_train)
print(grid_search.best_score_, grid_search.best_estimator_)

0.891616193648 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [46]:
clf = grid_search.best_estimator_
Y_test = pd.DataFrame(clf.predict_proba(X_test_part)[:, 1], index=range(X_test.shape[0]), 
                      columns=['prediction'])
Y_test.to_csv('lr_binary_selection.csv', index_label='id')

<a id='nb'></a>
### 2.2.2 BernoulliNB<sup>[содержание](#toc)</sup>

<a id='nb_bagging'></a>
#### 1) Bagging<sup>[содержание](#toc)</sup>

In [205]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)
best_features = [0, 3, 4, 9, 11, 12, 42, 45, 46, 58, 65, 71, 81, 82, 83, 91, 93, 108, 117, 122]
builder = MyXYTrainTestBuilder(ohe_features_dict, len(X_train), Y_train)
X_tr, Y_tr = builder(best_features)

gb_clf = BaggingClassifier(base_estimator=BernoulliNB(alpha=0.03), n_estimators=100, max_samples=0.8, 
                  max_features=0.5, bootstrap=True, n_jobs=-1, random_state=1294)
score = np.mean(cross_val_score(gb_clf, X=X_tr, y=Y_tr, scoring='roc_auc', cv=kfold_cv))
print(score)

0.850308275827


<a id='nb_binary'></a>
#### 3) Binary Selection<sup>[содержание](#toc)</sup>

### LogisticRegression + BernoulliNB

In [149]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=674)
builder = MyXYTrainTestBuilder(ohe_features_dict, len(X_train), Y_train)
nb_features = [0, 4, 7, 9, 11, 12, 42, 45, 46, 51, 58, 71, 81, 82, 83, 91, 108, 117, 122]
lr_features = [0, 7, 10, 11, 12, 22, 27, 34, 45, 47, 48, 51, 60, 75, 76, 77, 78, 81, 82, 92, 107, 119]

Y_pred = np.zeros((len(X_train), 2))
for train_indices, test_indices in kfold_cv.split(X_train, Y_train):
    lr_clf = LogisticRegression(C=0.50118723362727235, penalty='l2', random_state=23958)
    nb_clf = BernoulliNB(alpha=0.03)

    X_nb, Y_nb = builder(nb_features)
    X_tr, X_ts = X_nb[train_indices], X_nb[test_indices]
    Y_tr, Y_ts = Y_nb[train_indices], Y_nb[test_indices]
    nb_clf.fit(X_tr, Y_tr)
    Y_pred[test_indices, 1] = nb_clf.predict_proba(X_ts)[:, 1]
    
    X_lr, Y_lr = builder(lr_features)
    X_tr, X_ts = X_lr[train_indices], X_lr[test_indices]
    Y_tr, Y_ts = Y_lr[train_indices], Y_lr[test_indices]
    lr_clf.fit(X_tr, Y_tr)
    Y_pred[test_indices, 0] = lr_clf.predict_proba(X_ts)[:, 1]
    
#Y_pred[Y_pred > 0.9999] = 0.9999
#Y_pred[Y_pred < 0.0001] = 0.0001
#Y_pred = -np.log((1 - Y_pred) / Y_pred)
#Y_pred = np.log(Y_pred)
#Y_ans = ens_clf.predict_proba(Y_pred)[:, 1]
#roc_auc_score(Y_train, Y_ans)

In [156]:
roc_auc_score(Y_train, Y_pred[:, 0])

0.8794198813069305

In [158]:
best_a, best_b = -1, -1
best_result = -1
for a, b in product(np.linspace(0, 1, 101), np.linspace(0, 1, 101)):
    Y_ans = deepcopy(Y_pred)
    #Y_ans[:, 0] = Y_pred[:, 0] ** a
    #Y_ans[:, 1] = Y_pred[:, 1] ** b
    Y_ans = a * Y_ans[:, 0] + b * Y_ans[:, 1]
    result = roc_auc_score(Y_train, Y_ans)
    if result > best_result:
        best_a, best_b = a, b
        best_result = result
print(best_a, best_b, best_result)

0.5 0.01 0.880327662877


In [None]:
1.0 0.01 0.879579502954 # y1^a * y2^b
0.5 0.01 0.880327662877 # a * y1 + b * y2

In [135]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=233)
parameters = {'C': np.logspace(-1, 1, 21)}
clf = GridSearchCV(LogisticRegression(random_state=456), parameters, 
                   scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)
"""parameters = {}
clf = GridSearchCV(LinearRegression(), parameters, 
                   scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)"""
clf.fit(Y_pred, Y_train)
print(clf.best_score_, clf.best_estimator_, clf.best_estimator_.coef_)
ens_clf = clf.best_estimator_

0.879106052542 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) [ 0.43660004  0.00588597]


In [100]:
ens_clf = clf.best_estimator_
Y_pred = np.zeros((len(X_test), 2))
builder = MyXYTrainTestBuilder(ohe_features_dict, len(X_train), Y_train)

nb_clf = BernoulliNB(alpha=0.03)
X_tr, Y_tr = builder(nb_features)
nb_clf.fit(X_tr, Y_tr)
Y_pred[:, 1] = nb_clf.predict_proba(builder.get_X_test(nb_features))[:, 1]

lr_clf = LogisticRegression(C=0.63095734448019336, penalty='l2', random_state=23958)
X_tr, Y_tr = builder(lr_features)
lr_clf.fit(X_tr, Y_tr)
Y_pred[:, 0] = lr_clf.predict_proba(builder.get_X_test(lr_features))[:, 0]

Y_pred[Y_pred > 0.9999] = 0.9999
Y_pred[Y_pred < 0.0001] = 0.0001
Y_pred = -np.log((1 - Y_pred) / Y_pred)
Y_test = pd.DataFrame(ens_clf.predict_proba(Y_pred)[:, 1], 
                      index=range(Y_pred.shape[0]), columns=['prediction'])
Y_test.to_csv('lr_nb.csv', index_label='id')

<a id='trees'></a>
## 2.2 Деревья<sup>[содержание](#toc)</sup>

In [93]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('y_train.csv')
names = ['id', 'RES', 'MGR', 'SRC_TYPE1', 'SRC_TYPE2', 'SRC_CLASS', 'SRC_SHORTNAME', 
         'SRC_EXTNAME1', 'SRC_EXTNAME2', 'SRC_CODE']

X_train.columns = names
X_train = X_train.drop(['id', 'SRC_CODE'], axis=1)
Y_train = Y_train['label'].values

X_test.columns = names
X_test = X_test.drop(['id', 'SRC_CODE'], axis=1)
X_all = pd.concat([X_train, X_test], ignore_index=True)

feature_names = list(X_train.columns.difference(['id']))

* f1, RES — ресурс к которому происходит обращение
* f2, MGR — идентификатор группы машин в котором находится источник
* f3, SRC_TYPE1 — тип источника обращения (1)
* f4, SRC_TYPE2 — тип источника обращения (2)
* f5, SRC_CLASS — класс устройств в котором находится источник
* f6, SRC_SHORTNAME — сокращенное название источника обращения
* f7, SRC_EXTNAME1 — расширенный тип источника обращения (1)
* f8, SRC_EXTNAME2 — расширенный тип источника обращения (2)
* f9, SRC_CODE — код модели источника

* RESOURCE 	An ID for each resource
* MGR_ID 	The EMPLOYEE ID of the manager of the current EMPLOYEE ID record; an employee may have only one manager at a time
* ROLE_ROLLUP_1 	Company role grouping category id 1 (e.g. US Engineering)
* ROLE_ROLLUP_2 	Company role grouping category id 2 (e.g. US Retail)
* ROLE_DEPTNAME 	Company role department description (e.g. Retail)
* ROLE_TITLE 	Company role business title description (e.g. Senior Engineering Retail Manager)
* ROLE_FAMILY_DESC 	Company role family extended description (e.g. Retail Manager, Software Engineering)
* ROLE_FAMILY 	Company role family description (e.g. Retail Manager)
* ROLE_CODE 	Company role code; this code is unique to each role (e.g. Manager)

In [94]:
#X_all['SRC_SHORTNAME'] = X_all['SRC_SHORTNAME'] + 1000 * X_all['SRC_EXTNAME2']
X_all['SRC_TYPE'] = X_all['SRC_TYPE1'] + 10000 * X_all['SRC_TYPE2']
#X_all = X_all.drop(['SRC_TYPE1','SRC_TYPE2','SRC_EXTNAME2'], axis=1)

(32769, 18)


In [99]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('y_train.csv')
names = ['id', 'RES', 'MGR', 'SRC_TYPE1', 'SRC_TYPE2', 'SRC_CLASS', 'SRC_SHORTNAME', 
         'SRC_EXTNAME1', 'SRC_EXTNAME2', 'SRC_CODE']

X_train.columns = names
X_train = X_train.drop(['id', 'SRC_CODE'], axis=1)
Y_train = Y_train['label'].values

X_test.columns = names
X_test = X_test.drop(['id', 'SRC_CODE'], axis=1)
X_all = pd.concat([X_train, X_test], ignore_index=True)
feature_names = list(X_train.columns.difference(['id']))

X_all = X_all.values
print('Generating singles')
X_all_singles = deepcopy(X_all)
for n_feature in range(X_all_singles.shape[1]):
    X_all_singles[:, n_feature] = LabelEncoder().fit_transform(X_all_singles[:, n_feature])

print('Generating pairs')
X_all_pairs = group_features(X_all_singles, degree=2)

print('Generating triplets')
X_all_triplets = group_features(X_all_singles, degree=3)
print(X_all_singles.shape, X_all_pairs.shape, X_all_triplets.shape)

print('Filtering uniques')
X_all_singles = get_rid_of_uniques(X_all_singles)
X_all_pairs = get_rid_of_uniques(X_all_pairs)
X_all_triplets = get_rid_of_uniques(X_all_triplets)

Generating singles
Generating pairs
Generating triplets
(32769, 8) (32769, 28) (32769, 56)
Filtering uniques


In [100]:
def add_counts(X):
    n_samples, n_features = X.shape
    new_X = []
    for n_feature in range(n_features):
        counts = Counter(X[:, n_feature])
        new_features = []
        for i in range(n_samples):
            new_features.append(counts[X[i, n_feature]])
        new_X.append(np.array(new_features)[:, np.newaxis])
    new_X = np.concatenate(new_X, axis=1)
    X = np.concatenate((X, new_X), axis=1)
    return X
    
def encode_labels_by_freq(X):
    n_samples, n_features = X.shape
    for n_col in range(n_features):
        items = sorted(Counter(X[:, n_col]).items(), key=lambda x: x[1])
        val2order = {}
        for order, (value, count) in enumerate(items):
            val2order[value] = order
        for i in range(n_samples):
            X[i, n_col] = val2order[X[i, n_col]]
    return X

X_all_comb = np.concatenate((X_all_singles, X_all_pairs, X_all_triplets), axis=1)

In [101]:
X_all_comb = encode_labels_by_freq(X_all_comb)
print(X_all_comb.shape)

(32769, 92)


In [None]:
X_all_comb = add_counts(X_all_comb)

In [98]:
bg_clf = BaggingClassifier(n_estimators=2000, max_samples=0.8, max_features=0.6)
scores = cross_val_score(bg_clf, X=X_all[:len(X_train)], y=Y_train,
                         scoring='roc_auc', cv=5, n_jobs=-1)
print(np.mean(scores), np.std(scores))

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0.836932939485 0.024854882347


In [102]:
rf_clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=9, random_state=4568)
scores = cross_val_score(rf_clf, X=X_all_comb[:len(X_train)], y=Y_train,
                         scoring='roc_auc', cv=5, n_jobs=-1)
print(np.mean(scores), np.std(scores))

0.817059917027 0.0202004518458


In [None]:
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_split=9, random_state=4568)
scores = cross_val_score(rf_clf, X=X_all[:len(X_train)], y=Y_train,
                         scoring='roc_auc', cv=5, n_jobs=-1)
print(np.mean(scores))
#0.830538520676

In [72]:
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.10, max_depth=10, 
                           min_samples_split=4, random_state=4568)
scores = cross_val_score(gb_clf, X=X_all_comb[:len(X_train)], y=Y_train,
                         scoring='roc_auc', cv=5, n_jobs=-1)
print(np.mean(scores))

0.803292024913


Перебираем по сетке параметры для логистической регрессии

In [None]:
parameters = {'C': np.logspace(-1, 1, 21),
              'penalty': ['l1', 'l2']}
clf = GridSearchCV(LogisticRegression(random_state=17), parameters, scoring='roc_auc', refit=True, cv=10, n_jobs=-1)
clf.fit(X_train_ohe, Y_train['label'])
print(clf.best_score_)

In [None]:
clf.best_estimator_

In [None]:
clf = GridSearchCV(RandomForset(random_state=17), parameters, scoring='roc_auc', refit=True, cv=5, n_jobs=-1)
clf.fit(X_train_ohe, Y_train['label'])

Записываем в файл результаты предсказания

In [None]:
X_test['prediction'] = clf.predict_proba(X_test_ohe)[:, 1]
X_test[['id', 'prediction']].to_csv('baseline.csv', index=None)

# 3. Склад

In [None]:
########################## 3rd order features ################################
for n_feature in range(dt.shape[1]):
    print('n_feature', n_feature)
    dt[:, n_feature] = LabelEncoder().fit_transform(dt[:, n_feature])
    uniques = len(set(dt[:, n_feature]))
    maximum = max(dt[:, n_feature])
    if maximum < 65534:
        count_map = np.bincount((dt[:, n_feature]).astype('uint16'))
        for n, i in enumerate(dt[:, n_feature]):
            if count_map[i] <= 1:
                dt[n, n_feature] = uniques
            elif count_map[i] == 2:
                dt[n, n_feature] = uniques + 1
    else:
        for n, i in enumerate(dt[:, n_feature]):
            if (dt[:, n_feature] == i).sum() <= 1:
                dt[n, n_feature] = uniques
            elif (dt[:, n_feature] == i).sum() == 2:
                dt[n, n_feature] = uniques + 1
    uniques = len(set(dt[:, n_feature]))
    dt[:, n_feature] = LabelEncoder().fit_transform(dt[:, n_feature])
    
########################## 1st order features ################################
for n_feature in range(ds.shape[1]):
    ds[:, n_feature] = LabelEncoder().fit_transform(ds[:, n_feature])
    uniques = len(set(ds[:, n_feature]))
    maximum = max(ds[:, n_feature])
    if maximum < 65534:
        count_map = np.bincount((ds[:, n_feature]).astype('uint16'))
        for n, i in enumerate(ds[:, n_feature]):
            if count_map[i] <= 1:
                ds[n, n_feature] = uniques
            elif count_map[i] == 2:
                ds[n, n_feature] = uniques+1
    else:
        for n, i in enumerate(ds[:, n_feature]):
            if (ds[:, n_feature] == i).sum() <= 1:
                ds[n, n_feature] = uniques
            elif (ds[:, n_feature] == i).sum() == 2:
                ds[n, col] = uniques + 1
    ds[:, n_feature] = LabelEncoder().fit_transform(ds[:, n_feature])