# Иванов Александр, CS

In [6]:
import pandas as pd
import numpy as np
import gc
import pickle as pkl

from collections import Counter, defaultdict
from itertools import combinations, product, chain
from copy import deepcopy
from scipy import sparse
import scipy

import sklearn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score, StratifiedKFold, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score
import random

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier

<a id='toc'></a>
# Содержание
* [1) Создание различных комбинаций признаков](#combinations)
* [2) Отбор признаков](#feature_selection)
* [3) Индивидуальный подбор признаков для LR, SVC и BNB](#individual)
* [4) Подбор признаков для LR из различных комбинаций степени от 1 до 7](#lr17)
* [5) Embedding (итоговый ответ)](#embedding)
* [6) Склад](#store)

Для получения итогового результата грузить только его пункты 1, 2 и 5.

* f1 — ресурс к которому происходит обращение
* f2 — идентификатор группы машин в котором находится источник
* f3 — тип источника обращения (1)
* f4 — тип источника обращения (2)
* f5 — класс устройств в котором находится источник
* f6 — сокращенное название источника обращения
* f7 — расширенный тип источника обращения (1)
* f8 — расширенный тип источника обращения (2)
* f9 — код модели источника

<a id='combinations'></a>
## 1) Создание различных комбинаций признаков<sup>[содержание](#toc)</sup>

In [3]:
class FeaturesCreator(object):     
    def group_features(self, X, degree, hash=hash):
        if degree == 1:
            return deepcopy(X), list(range(X.shape[1]))
        
        new_X = []
        combs = []
        n_samples, n_features = X.shape
        for indices in combinations(range(n_features), degree):
            new_X.append([hash(tuple(sequence)) for sequence in X[:, indices]])
            combs.append(tuple(indices))
        new_X = np.array(new_X).T
        return new_X, combs
          
    def label_encode_features(self, X, group_uniques=True, threshold_uniques=1):
        new_X = []
        unique_labels = []
        for n_feature in range(X.shape[1]):
            f_column = LabelEncoder().fit_transform(X[:, n_feature])
            add_column = True
            if group_uniques:
                counts = Counter(f_column)
                n_uniques = len(counts)
                for n, i in enumerate(f_column):
                    for thr in range(1, threshold_uniques + 1):
                        if counts[i] == thr:
                            f_column[n] = n_uniques + thr - 1

                if len(Counter(f_column)) == 1:
                    print('All values of feature {} are different'.format(n_feature))
                    add_column = False
                    
                encoder = LabelEncoder()
                f_column = encoder.fit_transform(f_column)
                temp = np.where(encoder.classes_ == n_uniques)[0]
                if len(temp) > 0:
                    index = temp[0]
                else:
                    index = -1
            if add_column:
                new_X.append(f_column[:, np.newaxis])
                unique_labels.append(index)
                
        X = np.concatenate(new_X, axis=1)
        if group_uniques:
            return X, unique_labels
        else:
            return X
    

    def encode_ohe(self, X, labels_to_omit=None):
        n_samples, n_features = X.shape
        if labels_to_omit is None:
            return OneHotEncoder().fit_transform(X)
        
        new_X = []
        for n_feature in range(n_features):
            f_column = X[:, [n_feature]]
            new_X.append(self.encode_ohe_feature(f_column, labels_to_omit[n_feature]))
        new_X = sparse.hstack(new_X).tocsr()
        return new_X
        
    def encode_ohe_feature(self, f_column, label_to_omit):
        if label_to_omit == -1:
            new_X = OneHotEncoder().fit_transform(f_column)
        else:
            mask = (f_column == label_to_omit)
            new_X = OneHotEncoder().fit_transform(f_column).tocsc()
            if not np.all(new_X[:, -1].todense() == mask):
                assert False
            new_X = new_X[:, :-1].tocsr()
        return new_X

#### Загрузка данных

In [7]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('y_train.csv')
Y_train = Y_train['label'].values
f_columns = list(X_train.columns.difference(['id']))
X_test = X_test.values[:, 1:9]
X_train = X_train.values[:, 1:9]
X_all = np.concatenate([X_train, X_test], axis=0)

#### Создание комбинаций

In [8]:
features_creator = FeaturesCreator()

X_all_gr = []
X_all_gr_ohe = []
unique_labels = []
thresholds_uniques = [1, 2, 2, 3, 3, 2, 2]
for n, degree in enumerate(range(1, 8)):
    print('Generating for degree {}'.format(degree))
    X_new, _ = features_creator.group_features(X_all, degree)
    X_new, unique_labels_new = features_creator.label_encode_features(X_new, threshold_uniques=thresholds_uniques[n])
    X_new_ohe = features_creator.encode_ohe(X_new, labels_to_omit=None)
    print(X_new.shape, X_new_ohe.shape)
    X_all_gr.append(X_new)
    X_all_gr_ohe.append(X_new_ohe)
    unique_labels += unique_labels_new
    
X_all_comb = np.concatenate(X_all_gr, axis=1)
print(X_all_comb.shape)

Generating for degree 1
(32769, 8) (32769, 9953)
Generating for degree 2
(32769, 28) (32769, 54267)
Generating for degree 3
(32769, 56) (32769, 126956)
Generating for degree 4
(32769, 70) (32769, 105071)
Generating for degree 5
(32769, 56) (32769, 66795)
Generating for degree 6
(32769, 28) (32769, 33230)
Generating for degree 7
All values of feature 0 are different
(32769, 7) (32769, 4799)
(32769, 253)


<a id='feature_selection'></a>
## 2) Отбор признаков<sup>[содержание](#toc)</sup>

In [9]:
class MyXYTrainTestBuilder(object):
    def __init__(self, ohe_features_dict, Y_train, copy_input=False):
        if copy_input:
            self.ohe_features_dict = deepcopy(ohe_features_dict)
            self.Y_train = deepcopy(Y_train)
        else:
            self.ohe_features_dict = ohe_features_dict
            self.Y_train = Y_train
        
    def __call__(self, feature_names):
        return self.get_X_train(feature_names), self.get_Y_train()
    
    def get_X_train(self, feature_names):
        train_size = len(Y_train)
        X_train = sparse.hstack([self.ohe_features_dict[name][:train_size] for name in feature_names]).tocsr()
        return X_train
    
    def get_Y_train(self):
        return self.Y_train
            
    def get_X_test(self, feature_names):
        train_size = len(Y_train)
        X = sparse.hstack([self.ohe_features_dict[name][train_size:] for name in feature_names]).tocsr()
        return X

In [10]:
class GreedFeaturesSelector(object):
    def __init__(self):
        self.scores_history = {}
        self.best_features_history = {}
        
        self.n_iters = 0
        self.best_score = None
        self.best_features = []
        self.omit_feature_names = []
        self.start_backward = False
    
    def update_history(self):
        self.scores_history[self.n_iters] = self.best_score
        self.best_features_history[self.n_iters] = list(self.best_features)
        self.n_iters += 1
        
    def initialize(self, best_score, best_features, omit_feature_names=None):
        self.best_score = best_score
        self.best_features = list(best_features)
        if omit_feature_names is not None:
            self.omit_feature_names = list(omit_feature_names)
    
    def select_features(self, clf, feature_names, builder, kfold_cv, max_features=-1):
        self.clf = deepcopy(clf)
        self.feature_names = deepcopy(feature_names)
        self.builder = builder
        self.kfold_cv = kfold_cv
        
        if self.best_score is None:
            print('Adding first feature')
            best_feature, best_score = self.move_forward()
            self.best_score = best_score
            self.best_features.append(best_feature)
            self.update_history()
            if max_features == len(self.best_features):
                return self.best_features, self.best_score
        
        while 1:
            print('Entering infinite loop')
            forward = False
            backward = False
            
            if not self.start_backward:
                new_feature, new_score = self.move_forward()
                while new_score > self.best_score:
                    print('+ feature = {}, score = {} -> {}'.format(new_feature, self.best_score, new_score))
                    self.best_score = new_score
                    self.best_features.append(new_feature)
                    self.update_history()
                    forward = True
                    print(self.best_score, self.best_features)
                    if max_features == len(self.best_features):
                        break
                    new_feature, new_score = self.move_forward()
                
            if self.start_backward:
                self.start_backward = False
                
            old_feature, new_score = self.move_backward()
            while new_score > self.best_score:
                print('- feature = {}, score = {} -> {}'.format(old_feature, self.best_score, new_score))
                self.best_score = new_score
                self.best_features.remove(old_feature)
                self.update_history()
                backward = True
                print(self.best_score, self.best_features)
                old_feature, new_score = self.move_backward()
                
            if max_features == len(self.best_features):
                return self.best_features, self.best_score
                
            if (not forward) & (not backward):
                break
            print(self.best_score, self.best_features)
        return self.best_features, self.best_score
        
    def move_forward(self):
        scores = []
        for feature_name in self.feature_names:
            if feature_name in self.omit_feature_names:
                continue
            if feature_name not in self.best_features:
                current_features = list(self.best_features) + [feature_name]
                X, Y = self.builder(current_features)
                score = np.mean(cross_val_score(self.clf, X, y=Y, scoring='roc_auc', 
                                        cv=self.kfold_cv, n_jobs=-1))
                scores.append((score, feature_name))
                print("+ feature = {},  score = {}".format(feature_name, score))
        scores = sorted(scores)
        best_feature = scores[-1][1] 
        best_score = scores[-1][0]
        return best_feature, best_score

    def move_backward(self):
        if len(self.best_features) == 1:
            assert False
            
        scores = []    
        for feature_name in self.best_features:
            current_features = list(self.best_features)
            current_features.remove(feature_name)
            X, Y = self.builder(current_features)
            score = np.mean(cross_val_score(self.clf, X, y=Y, scoring='roc_auc', 
                                        cv=self.kfold_cv, n_jobs=-1))
            scores.append((score, feature_name))
            print("- feature = {},  score = {}".format(feature_name, score))
        scores = sorted(scores)
        best_feature = scores[-1][1] 
        best_score = scores[-1][0]
        return best_feature, best_score  

In [11]:
class WrappedSVC(BaseEstimator, ClassifierMixin):
    def __init__(self, C, random_state, penalty, loss, **kwargs):
        self.C = C
        self.random_state = random_state
        self.loss = loss
        self.penalty = penalty
        
    def fit(self, X, y):
        self.lsvc_clf = LinearSVC(C=self.C, loss=self.loss, penalty=self.penalty, random_state=self.random_state)
        self.const = False
        if len(Counter(y)) < 2:
            self.const = True
            if y[0] == 0:
                self.margin = -10
            else:
                self.margin = 10
        else:
            self.lsvc_clf.fit(X, y)

    def predict_proba(self, X):
        if self.const == True:
            probas = np.array([self.margin] * X.shape[0])
        else:
            probas = self.lsvc_clf.decision_function(X)
        probas = probas[:, np.newaxis]
        probas = np.concatenate((probas, probas), axis=1)
        return probas
    
    def decision_function(self, X):
        return self.predict_proba(X)[:, 1]

<a id='individual'></a>
## 3) Индивидуальный подбор признаков для LR, SVC и BNB<sup>[содержание](#toc)</sup>

In [13]:
CLF = 'LR'

CLASSIFIERS = {
    'SVC': WrappedSVC(C=0.063095734448019331, penalty='l2', loss='squared_hinge', random_state=36),
    'LR': LogisticRegression(C=1.0, random_state=5),
    'BNB': BernoulliNB(alpha=0.03)
}

#### Подбор признаков для выбранного выше классификатора

In [14]:
features = list(range(0, 3))
X_all_comb = np.concatenate(X_all_gr[:3], axis=1)

ohe_features_dict = [OneHotEncoder().fit_transform(X_all_comb[:, [i]]) for i in range(X_all_comb.shape[1])]
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)
feature_names = list(range(X_all_comb.shape[1]))
clf = deepcopy(CLASSIFIERS[CLF])

In [54]:
best_features = None
best_score = None

In [55]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)
selector = GreedFeaturesSelector()
if best_features is not None:
    selector.best_score = best_score
    selector.best_features = best_features
best_features, best_score = selector.select_features(clf, feature_names, builder, kfold_cv)

Adding first feature
+ feature = 0,  score = 0.6058000093051319
+ feature = 1,  score = 0.7995148047935221
+ feature = 2,  score = 0.5881961555892712
+ feature = 3,  score = 0.6412634540651512
+ feature = 4,  score = 0.7184583223458082
+ feature = 5,  score = 0.6690476223150149
+ feature = 6,  score = 0.7197050390643212
+ feature = 7,  score = 0.6229622350025911
+ feature = 8,  score = 0.5919309637723529
+ feature = 9,  score = 0.6010325166413409
+ feature = 10,  score = 0.6569364915279591
+ feature = 11,  score = 0.6807698600350316
+ feature = 12,  score = 0.5730007734765767
+ feature = 13,  score = 0.545723552547261
+ feature = 14,  score = 0.5900022097583301
+ feature = 15,  score = 0.7993225843810395
+ feature = 16,  score = 0.8012315949061952
+ feature = 17,  score = 0.8008501605134786
+ feature = 18,  score = 0.8173565715521496
+ feature = 19,  score = 0.8149148508603897
+ feature = 20,  score = 0.8141656819151717
+ feature = 21,  score = 0.6409552106669252
+ feature = 22,  score

KeyboardInterrupt: 

In [56]:
best_features = deepcopy(selector.best_features)
best_score = deepcopy(selector.best_score)
print(best_score,  sorted(best_features))

0.883665809431 [0, 1, 7, 9, 10, 11, 18, 20, 36, 38, 39, 49, 59, 63, 64, 73, 83, 88, 91]


In [58]:
pkl.dump(best_features, open('{}/best_features_{}.pkl'.format(CLF, features), 'wb'))
pkl.dump(best_score, open('{}/best_score_{}.pkl'.format(CLF, features), 'wb'))

#### Кросс-валидация

In [59]:
X_part_train = builder.get_X_train(best_features)
X_part_test = builder.get_X_test(best_features)

In [60]:
kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

if CLF == 'BNB':
    parameters = {'alpha': np.linspace(0.01, 0.1, 21)}
elif CLF == 'LR':
    parameters = {'C': np.logspace(-1, 0, 11), 'penalty': ['l2']}
elif CLF == 'SVC': 
    parameters = {'C': np.logspace(-2, -1, 11), 'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge']}
else:
    assert False
    
clf = deepcopy(CLASSIFIERS[CLF])

grid_searcher = GridSearchCV(clf, parameters, scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)
grid_searcher.fit(X_part_train, Y_train)
print(grid_searcher.best_score_, grid_searcher.best_estimator_)
best_clf = grid_searcher.best_estimator_

0.882622134223 LogisticRegression(C=0.79432823472428149, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=5,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


#### Финальное обучение

In [None]:
Y_pred = pd.DataFrame(best_clf.predict_proba(X_part_test)[:, 1], index=range(X_part_test.shape[0]), 
                      columns=['prediction'])
Y_pred.to_csv('{}/{}.csv'.format(CLF, CLF), index_label='id')

<a id='report'></a>
### Отчет<sup>[содержание](#toc)</sup>

#### SVC
Без f9. Первые три группы признаков. Следующие номера признаков:
* [0, 7, 11, 36, 38, 49, 70, 79, 83, 88, 89]
Отобраны с помощью WrappedSVC(C=1.0, penalty='l2', loss='squared_hinge', random_state=36)

Точность:
* на CV 0.871773063256 WrappedSVC(C=0.063095734448019331, loss='squared_hinge', penalty='l2', random_state=36) при kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

Потом произведен повторный отбор с новым параметром C = 0.063095734448019331. В результате получен новый набор признаков:
* [0, 7, 11, 20, 36, 38, 47, 49, 64, 69, 70, 79, 88, 89] на котором 0.876302286022 WrappedSVC(C=0.050118723362727248, loss='squared_hinge', penalty='l2', random_state=36) при kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)
 

#### LogisticRegression

Наилучшие решение. Первые три группы признаков без удаления f9. Следующие номера признаков:
* [0, 7, 10, 11, 12, 22, 27, 34, 45, 47, 48, 51, 60, 75, 76, 77, 78, 81, 82, 92, 107, 119]

Один из следующих классификаторов:
* LogisticRegression(C=0.50118723362727235, penalty='l2', random_state=17)
* LogisticRegression(C=0.63095734448019336, penalty='l2', random_state=23958)

Точность:
* На public-е дало 0.90085
* На CV 0.883246881869 при LogisticRegression(C=0.50118723362727235, penalty='l2', random_state=17) и StratifiedKFold(10, shuffle=True, random_state=965)

Первые три группы признаков. f9 Удален. Следующие номера признаков:
0.883665809431 [0, 1, 7, 9, 10, 11, 18, 20, 36, 38, 39, 49, 59, 63, 64, 73, 83, 88, 91]

0.882622134223 LogisticRegression(C=0.79432823472428149, penalty='l2', random_state=5) kfold_cv = StratifiedKFold(10, shuffle=True, random_state=515)

#### BernoulliNB

Первые три группы признаков без удаления f9. Следующие номера признаков:
* [0, 3, 4, 9, 11, 12, 42, 45, 46, 58, 65, 71, 81, 82, 83, 91, 93, 108, 117, 122]

Первые три группы признаков. f9 удален. Следующие номера признаков:
* [5, 11, 20, 37, 40, 42, 70, 85, 90]

* 0.865063382838 на CV при BernoulliNB(alpha=0.032500000000000001, binarize=0.0, class_prior=None, fit_prior=True) и kfold_cv = StratifiedKFold(10, shuffle=True, random_state=925)

<a id='lr17'></a>
## 4) Подбор признаков для LR из различных комбинаций степени от 1 до 7<sup>[содержание](#toc)</sup>

In [40]:
def get_X_subset(X_groups, indices):
    new_X = []
    for i in indices:
        new_X.append(X_groups[i])
    return np.concatenate(new_X, axis=1)

#### Загрузка уже полученных результатов

In [25]:
selected_features = pkl.load(open('{}/selected_features{}.pkl'.format(CLF, max_features), 'rb'))
selected_scores = pkl.load(open('{}/selected_scores{}.pkl'.format(CLF, max_features), 'rb'))

In [26]:
for n, indices in enumerate(combinations(range(7), degree)):
    if indices in selected_features:
        print('SUBSET {} is already considered', indices)
        continue
    print('Considering SUBSET:', indices)
    kfold_cv = StratifiedKFold(8, shuffle=True, random_state=n)
    
    clf = LogisticRegression(C=1.0, penalty='l2', random_state=n * 11)
    clf = WrappedSVC(C=1.0, penalty='l2', loss='squared_hinge', random_state=n*11)
    
    X_all_comb = get_X_subset(X_all_gr, indices)
    
    feature_names = list(range(X_all_comb.shape[1]))
    ohe_features_dict = {}
    features_creator = FeaturesCreator()
    for n_feature in range(X_all_comb.shape[1]):
        ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], 
                                                                           label_to_omit=-1)    
        
    builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)
    selector = GreedFeaturesSelector()
    best_features, best_score = selector.select_features(clf, feature_names, builder, kfold_cv, max_features=10)
    selected_features[indices] = best_features
    selected_scores[indices] = best_score

SUBSET {} is already considered (0, 1, 2)
SUBSET {} is already considered (0, 1, 3)
SUBSET {} is already considered (0, 1, 4)
SUBSET {} is already considered (0, 1, 5)
SUBSET {} is already considered (0, 1, 6)
SUBSET {} is already considered (0, 2, 3)
SUBSET {} is already considered (0, 2, 4)
Considering SUBSET: (0, 2, 5)


TypeError: object of type 'int' has no len()

#### Сохранение промежуточных результатов

In [14]:
pkl.dump(selected_features, open('{}/selected_features{}.pkl'.format(CLF, max_features), 'wb'))
pkl.dump(selected_scores, open('{}/selected_scores{}.pkl'.format(CLF, max_features), 'wb'))

<a id='embedding'></a>
## 5) Embedding (итоговый ответ)<sup>[содержание](#toc)</sup>

### 2.0 Формирование признаков

In [41]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('y_train.csv')
Y_train = Y_train['label'].values
f_columns = list(X_train.columns.difference(['id']))
X_test = X_test.values[:, 1:9]
X_train = X_train.values[:, 1:9]
X_all = np.concatenate([X_train, X_test], axis=0)

features_creator = FeaturesCreator()

X_all_gr = []
X_all_gr_ohe = []
unique_labels = []
thresholds_uniques = [1, 2, 2, 3, 3, 2, 2]
for n, degree in enumerate(range(1, 8)):
    print('Generating for degree {}'.format(degree))
    X_new, _ = features_creator.group_features(X_all, degree)
    X_new, unique_labels_new = features_creator.label_encode_features(X_new, threshold_uniques=thresholds_uniques[n])
    X_new_ohe = features_creator.encode_ohe(X_new, labels_to_omit=None)
    print(X_new.shape, X_new_ohe.shape)
    X_all_gr.append(X_new)
    X_all_gr_ohe.append(X_new_ohe)
    unique_labels += unique_labels_new
    
X_all_comb = np.concatenate(X_all_gr, axis=1)
print(X_all_comb.shape)

Generating for degree 1
(32769, 8) (32769, 9953)
Generating for degree 2
(32769, 28) (32769, 54267)
Generating for degree 3
(32769, 56) (32769, 126956)
Generating for degree 4
(32769, 70) (32769, 105071)
Generating for degree 5
(32769, 56) (32769, 66795)
Generating for degree 6
(32769, 28) (32769, 33230)
Generating for degree 7
All values of feature 0 are different
(32769, 7) (32769, 4799)
(32769, 253)


In [93]:
Y_TRAIN_PRED = {} # Для обучения метаклассификатора
Y_TEST_PRED = {} # Для предсказания ответов

In [94]:
kfold_cv1 = StratifiedKFold(Y_train, 10, shuffle=True, random_state=525)
kfold_cv2 = StratifiedKFold(Y_train, 10, shuffle=True, random_state=526)
kfold_cv3 = StratifiedKFold(Y_train, 10, shuffle=True, random_state=527)
kfold_cv4 = StratifiedKFold(Y_train ,10, shuffle=True, random_state=528)
kfolds = [kfold_cv1] #[kfold_cv1, kfold_cv2, kfold_cv3, kfold_cv4]

def RKFolds(X, Y):
    for kfold in kfolds:
        for train_indices, test_indices in kfold:
            yield train_indices, test_indices

### 2.1 Загрузка результатов для SVC, BNB и LR 

In [95]:
CLF = 'LR'

CLASSIFIERS = {
    'SVC': WrappedSVC(C=0.063095734448019331, penalty='l2', loss='squared_hinge', random_state=36),
    'LR': LogisticRegression(C=1.0, random_state=5),
    'BNB': BernoulliNB(alpha=0.03)
}

In [96]:
best_features = {}
for n_clf, cc in enumerate(['BNB', 'LR']):
    indices = list(range(3))
    features = pkl.load(open('{}/best_features_{}.pkl'.format(cc, indices), 'rb'))
    score = pkl.load(open('{}/best_score_{}.pkl'.format(cc, indices), 'rb'))
    best_features = features
    
    X_tr = builder.get_X_train(best_features)
    X_ts = builder.get_X_test(best_features)
    
    clf = deepcopy(CLASSIFIERS[cc])
    clf.fit(X_tr, Y_train)
    Y_TEST_PRED[n_clf] = clf.predict_proba(X_ts)[:, 1]
    
    Y_pred = np.zeros(len(Y_train))
    for n_fold, (train_indices, test_indices) in enumerate(RKFolds(X_tr, Y_train)):
        X_tr_tr, X_tr_ts = X_tr[train_indices], X_tr[test_indices]
        Y_tr_tr, Y_tr_ts = Y_train[train_indices], Y_train[test_indices]
        clf = deepcopy(CLASSIFIERS[cc])
        clf.fit(X_tr_tr, Y_tr_tr)
        Y_pred[test_indices] = clf.predict_proba(X_tr_ts)[:, 1]
    Y_TRAIN_PRED[n_clf] = Y_pred    
    score = roc_auc_score(Y_train, Y_pred)
    print(cc, score)

BNB 0.854912852292
LR 0.876947633355


### 2.2 Загрузка данных для кучи логистических регрессий

In [97]:
CLF = 'LR'
degree = 3
max_features = 10
selected_features = {}
selected_scores = {}

In [98]:
selected_features = pkl.load(open('{}/selected_features{}.pkl'.format(CLF, max_features), 'rb'))
selected_scores = pkl.load(open('{}/selected_scores{}.pkl'.format(CLF, max_features), 'rb'))

In [99]:
X_all_comb = np.concatenate(X_all_gr, axis=1)
index_groups = []
first = 0
for i in range(len(X_all_gr)):
    length = X_all_gr[i].shape[1]
    index_groups.append(list(range(first, first + length)))
    first += length

In [100]:
ohe_features_dict = {}
features_creator = FeaturesCreator()
for n_feature in range(X_all_comb.shape[1]):
    ohe_features_dict[n_feature] = features_creator.encode_ohe_feature(X_all_comb[:, [n_feature]], label_to_omit=-1)    
builder = MyXYTrainTestBuilder(ohe_features_dict, Y_train)

In [101]:
for n_subset, indices in enumerate(sorted(selected_features.keys())):
    print('Considering SUBSET {}'.format(indices))
    new_feature_names = np.array(sorted(selected_features[indices]))
    old_feature_names = np.array(list(chain(*[index_groups[i] for i in indices])))
    feature_names = old_feature_names[new_feature_names]
    
    X = builder.get_X_train(feature_names)
    Y_pred = np.zeros(len(Y_train))
    
    X_ts = builder.get_X_test(feature_names)
    clf = LogisticRegression(C=1.0, penalty='l2', random_state=11*n_subset)
    clf.fit(X, Y_train)
    Y_TEST_PRED[len(Y_TEST_PRED)] = clf.predict_proba(X_ts)[:, 1]
    
    print('Size of training set = {}'.format(X.shape))
    for n_fold, (train_indices, test_indices) in enumerate(RKFolds(X, Y_train)):
        X_tr, Y_tr = X[train_indices], Y_train[train_indices]
        X_ts, Y_ts = X[test_indices], Y_train[test_indices]
        clf = LogisticRegression(C=1.0, penalty='l2', random_state=11*n_subset)
        clf.fit(X_tr, Y_tr)
        Y_pred[test_indices] = clf.predict_proba(X_ts)[:, 1]
    Y_TRAIN_PRED[len(Y_TRAIN_PRED)] = Y_pred
    score = roc_auc_score(Y_train, Y_pred)
    print('Score = {}'.format(score))

Considering SUBSET (0, 1, 2)
Size of training set = (19661, 24871)
Score = 0.8747937817059266
Considering SUBSET (0, 1, 3)
Size of training set = (19661, 24402)
Score = 0.8738901866953759
Considering SUBSET (0, 1, 4)
Size of training set = (19661, 23630)
Score = 0.8742141005446011
Considering SUBSET (0, 1, 5)
Size of training set = (19661, 26754)
Score = 0.8743675270521909
Considering SUBSET (0, 1, 6)
Size of training set = (19661, 26886)
Score = 0.8725502699290143
Considering SUBSET (0, 2, 3)
Size of training set = (19661, 23691)
Score = 0.8726856676119741
Considering SUBSET (0, 2, 4)
Size of training set = (19661, 26392)
Score = 0.8726374858806473
Considering SUBSET (1, 2, 3)
Size of training set = (19661, 26234)
Score = 0.8698941037660738
Considering SUBSET (1, 2, 4)
Size of training set = (19661, 25290)
Score = 0.8696337142989343
Considering SUBSET (1, 2, 5)
Size of training set = (19661, 26197)
Score = 0.8681519385124639
Considering SUBSET (1, 2, 6)
Size of training set = (19661, 

In [62]:
pkl.dump(Y_pred_dict, open('LR/Y_pred_dict10.pkl', 'wb'))

In [33]:
Y_pred_dict = pkl.load(open('LR/Y_pred_dict10.pkl', 'rb'))

In [102]:
Y_train_pred = []
Y_test_pred = []
for n_clf in sorted(Y_TRAIN_PRED.keys()):
    Y_train_pred.append(Y_TRAIN_PRED[n_clf][:, np.newaxis])
    Y_test_pred.append(Y_TEST_PRED[n_clf][:, np.newaxis])
Y_train_pred = np.concatenate(Y_train_pred, axis=1)
Y_test_pred = np.concatenate(Y_test_pred, axis=1)
print(Y_train_pred.shape, Y_test_pred.shape)

(19661, 29) (13108, 29)


In [104]:
ens_clf = LogisticRegression(C=1.0, random_state=347)
ens_clf.fit(Y_train_pred[:, :28], Y_train)
print(ens_clf.coef_)

[[ 1.47550969  0.6368205   1.5348157   0.33394894 -0.20308353  0.94506701
   0.81158306  0.3354811   0.1479721  -1.38504352  0.02092172  0.23693711
   1.25162297 -1.07590155  0.38772734 -0.33511459  1.70290607  0.465081
   0.26421439  0.77738986 -0.67539405 -0.3853827  -0.01747549 -0.79090023
  -0.6786086   0.68717252  0.02846795  0.41538281]]


In [105]:
kfold_cv = StratifiedKFold(Y_train, 10, shuffle=True, random_state=225)
parameters = {'C': np.logspace(-1, 1, 21)}
clf = GridSearchCV(LogisticRegression(random_state=8, max_iter=1000, tol=1e-4), parameters, 
                   scoring='roc_auc', refit=True, cv=kfold_cv, n_jobs=-1)
clf.fit(Y_train_pred[:, 1:], Y_train)
print(clf.best_score_, clf.best_estimator_)
ens_clf = clf.best_estimator_

0.878318676217 LogisticRegression(C=0.19952623149688797, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=8,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


In [106]:
ens_clf.fit(Y_train_pred, Y_train)
Y_test = pd.DataFrame(ens_clf.predict_proba(Y_test_pred)[:, 1], index=range(Y_test_pred.shape[0]), 
                      columns=['prediction'])
Y_test.to_csv('result.csv', index_label='id')

<a id='store'></a>
## 6) Склад<sup>[содержание](#toc)</sup>

In [76]:
import keras
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop

In [70]:
X_nn_train = deepcopy(Y_train_pred)
X_nn_test = deepcopy(Y_test_pred)

def preprocess(X, low=0.0001, upp=0.9999):
    X[X < low] = low
    X[X > upp] = upp
    X = -np.log((1 - X) / X)
    return X

X_nn_train[:, 1:] = preprocess(X_nn_train[:, 1:])
X_nn_test[:, 1:] = preprocess(X_nn_test[:, 1:])

In [77]:
model = Sequential()
model.add(Dense(100, activation='sigmoid', input_dim=30))
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

In [80]:
opt = RMSprop()
model.compile(loss='binary_crossentropy', optimizer=opt)
model.fit(X_nn_train, Y_train, batch_size=200, nb_epoch=500, verbose=1)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x7f30a25e54a8>

In [81]:
Y_test = model.predict(X_nn_train)

In [84]:
roc_auc_score(Y_train, Y_test)

0.96824093072681872

In [85]:
Y_test = model.predict(X_nn_test)

In [86]:
Y_test = pd.DataFrame(Y_test, index=range(Y_test_pred.shape[0]), columns=['prediction'])
Y_test.to_csv('result.csv', index_label='id')