Для различных методов классификации выполняем отбор признаков. Из всех признаков выбираем те, которые дают максимальную точность классификации.

Загрузка данных и подготовка обучающей и тестовой выборок.

In [1]:
import numpy as np

In [2]:
# Load the train and test arrays stored as .npy files next to the notebook.
train_set, test_set = (
    np.load(path) for path in ('train_set.npy', 'test_set.npy')
)

In [3]:
# Every column except the last is a feature; the last column is the target,
# cast to int.
X_train, y_train = train_set[:, :-1], train_set[:, -1].astype(int)
X_test, y_test = test_set[:, :-1], test_set[:, -1].astype(int)

In [4]:
# np.nan_to_num replaces NaN with 0 and +/-inf with large finite values, so
# the scaler and the estimators below never see non-finite inputs.
X_train, X_test = np.nan_to_num(X_train), np.nan_to_num(X_test)

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Для ускорения расчётов обучаться будем только на 100 первых пользователях (первые 3 × 100 строк обучающей выборки и первые 100 строк тестовой):

In [6]:
# Keep the first 3 * 100 training rows and the first 100 test rows.
# NOTE(review): this presumably relies on rows being ordered by user, with
# 3 training rows and 1 test row per user - confirm against the data files.
X_train, y_train = X_train[:3 * 100, :], y_train[:3 * 100]
X_test, y_test = X_test[:100, :], y_test[:100]

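Функция, которая позволяет находить признаки, дающие наибольшую точность классификации. Она выполняет жадный последовательный отбор: на каждом шаге к текущему набору добавляется признак, дающий наибольшую точность, а возвращается лучшая из встретившихся комбинаций.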

In [7]:
def select_features(X_train, y_train, X_test, y_test, model):
    """Greedy forward feature selection scored on a hold-out set.

    Starting from an empty combination, every round tentatively adds each
    still-unused column to the current combination, fits `model` on
    `X_train` restricted to those columns and scores it on `X_test`.  The
    column with the highest score joins the combination permanently.  Rounds
    continue until every column has been added; the combination that reached
    the highest score along the way is returned.

    Args:
        X_train: 2-D array, samples in rows and features in columns.
        y_train: targets for `X_train`.
        X_test: 2-D array used for scoring, with the same columns.
        y_test: targets for `X_test`.
        model: object exposing `fit(X, y)` and `score(X, y)`.  It is
            refitted in place many times and is left fitted on the last
            combination that was tried.

    Returns:
        Tuple `(best_score, best_comb)`: the highest score seen and the
        column indexes (plain Python ints, in the order they were added)
        that produced it.  `(0, [])` is returned when `y_train` holds a
        single class, when there are no columns, or when no combination
        scores above 0.
    """
    best_score = 0
    best_comb = []

    # Bail out before any fitting when the targets hold a single class
    # (presumably because some estimators refuse such data).
    if len(np.unique(y_train)) == 1:
        return best_score, best_comb

    combination = []
    # Plain ints (not NumPy scalars) kept in ascending order: the returned
    # indexes print cleanly, and ties between equally good candidates are
    # always resolved in favour of the lowest column index instead of
    # depending on set iteration order.
    unused_indexes = list(range(X_train.shape[1]))

    while unused_indexes:
        scores = []
        for index in unused_indexes:
            comb = combination + [index]
            model.fit(X_train[:, comb], y_train)
            scores.append(model.score(X_test[:, comb], y_test))

        # np.argmax returns the first maximum, i.e. the lowest index on ties.
        max_index = int(np.argmax(scores))
        max_score = scores[max_index]
        combination.append(unused_indexes.pop(max_index))

        # Strict comparison: among equally good prefixes the shortest wins.
        if max_score > best_score:
            best_score, best_comb = max_score, list(combination)

    return best_score, best_comb

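Выполним поиск признаков, максимизирующих точность, для нескольких моделей классификации (KNeighborsClassifier, LogisticRegression, GaussianNB, SVC, MLPClassifier, DecisionTreeClassifier, RandomForestClassifier):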

In [8]:
from sklearn.model_selection import StratifiedKFold

# the training set has 3 samples of each class, so 3 unique splits can be
# formed (presumably "samples used for fitting - sample held out"):
# 0, 1 - 2
# 0, 2 - 1
# 1, 2 - 0
skf = StratifiedKFold(n_splits=3)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Candidate classifiers; the evaluation loop below runs the greedy feature
# search for each of them and refits them in place.
#
# The estimators that use randomness (MLP, decision tree, random forest) get
# a fixed `random_state`: the feature search compares scores of many separate
# fits, and without a seed those fits would differ by random initialisation
# as well as by feature set, and reruns would not reproduce.
#
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases;
# revisit this argument when upgrading.
models = [KNeighborsClassifier(n_neighbors=3),
          LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1_000),
          GaussianNB(),
          SVC(gamma='auto'),
          MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=10_000, random_state=0),
          DecisionTreeClassifier(min_samples_leaf=2, random_state=0),
          RandomForestClassifier(min_samples_leaf=2, n_estimators=20, random_state=0)
         ]

In [11]:
from collections import Counter

# For every candidate model: run the greedy feature search on each CV fold,
# refit on the whole training subset with the chosen features, and score the
# result on the test subset.
for model in models:

    print(type(model).__name__)

    train_scores, test_scores = [], []
    model_features = Counter()  # number of folds in which a feature was picked

    for train_index, test_index in skf.split(X_train, y_train):
        # Features are searched on the fold's fitting part and scored on the
        # part of the training subset held out by this fold.
        best_score, best_comb = select_features(
            X_train[train_index], y_train[train_index],
            X_train[test_index], y_train[test_index],
            model,
        )
        train_scores.append(best_score)
        model_features.update(best_comb)

        print('Точность', best_score, 'на признаках', best_comb)

        # select_features leaves the model fitted on its last trial
        # combination, so refit on the chosen columns before test scoring.
        model.fit(X_train[:, best_comb], y_train)
        test_scores.append(model.score(X_test[:, best_comb], y_test))

    print('Средняя точность на контрольной выборке', np.mean(train_scores))
    print('Лучшая точность на тестовой выборке', np.max(test_scores))
    print('Признаки:', model_features.most_common())

KNeighborsClassifier
Точность 0.91 на признаках [3, 0, 4, 6, 11, 2, 1, 13, 10, 5, 12, 7, 27, 28, 19, 9]
Точность 0.96 на признаках [2, 10, 5, 1, 7, 0, 12, 14, 3, 8, 9, 11, 6, 4, 13, 27, 19, 29, 15, 18, 23, 28, 33]
Точность 0.93 на признаках [28, 5, 2, 12, 10, 13, 9, 3, 7, 6, 36, 37, 41, 44, 45, 46, 47, 48, 4, 8, 34, 11, 15, 1, 20, 14]
Средняя точность на контрольной выборке 0.9333333333333335
Лучшая точность на тестовой выборке 0.97
Признаки: [(3, 3), (4, 3), (6, 3), (11, 3), (2, 3), (1, 3), (13, 3), (10, 3), (5, 3), (12, 3), (7, 3), (28, 3), (9, 3), (0, 2), (27, 2), (19, 2), (14, 2), (8, 2), (15, 2), (29, 1), (18, 1), (23, 1), (33, 1), (36, 1), (37, 1), (41, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (34, 1), (20, 1)]
LogisticRegression
Точность 0.99 на признаках [27, 7, 4, 2, 3, 0, 8, 9, 6, 19, 11, 10, 32, 13, 5, 28, 25, 21, 36, 12, 1]
Точность 1.0 на признаках [5, 13, 27, 7, 2, 9, 6, 12, 10, 15, 4, 28, 8, 1, 0, 3, 20]
Точность 0.99 на признаках [27, 35, 3, 2, 9, 12, 8, 0, 6, 4

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


Точность 0.42 на признаках [3, 28, 9, 0, 2, 5, 20, 37, 44, 6, 4, 25, 33, 30, 48, 10, 15, 22, 21, 11, 17, 43, 41]


  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


Точность 0.43 на признаках [8, 13, 11, 4, 6, 7, 1, 32, 37, 44, 48, 3, 36, 26, 40, 43, 5, 28, 31, 15, 30, 42, 47, 39]


  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


Точность 0.44 на признаках [28, 5, 2, 34, 11, 6, 30, 37, 44, 18, 10, 13, 40, 16, 20, 47, 26, 33, 22, 31, 1, 4, 43, 27, 19, 36, 42]
Средняя точность на контрольной выборке 0.43
Лучшая точность на тестовой выборке 0.65
Признаки: [(28, 3), (5, 3), (37, 3), (44, 3), (6, 3), (4, 3), (30, 3), (11, 3), (43, 3), (3, 2), (2, 2), (20, 2), (33, 2), (48, 2), (10, 2), (15, 2), (22, 2), (13, 2), (1, 2), (36, 2), (26, 2), (40, 2), (31, 2), (42, 2), (47, 2), (9, 1), (0, 1), (25, 1), (21, 1), (17, 1), (41, 1), (8, 1), (7, 1), (32, 1), (39, 1), (34, 1), (18, 1), (16, 1), (27, 1), (19, 1)]
SVC
Точность 0.98 на признаках [28, 2, 5, 9, 0, 11, 27, 8, 10, 19, 6, 29, 41, 7, 37, 44, 45, 46, 47, 48, 1, 3, 34, 32, 4, 12, 13, 36]
Точность 1.0 на признаках [27, 1, 4, 11, 2, 10, 9, 13, 7, 6, 34, 5, 12, 28, 8, 15, 0, 3, 20]
Точность 0.99 на признаках [27, 8, 34, 1, 13, 3, 10, 4, 7, 2, 6, 9, 15, 5, 11, 25, 18]
Средняя точность на контрольной выборке 0.9899999999999999
Лучшая точность на тестовой выборке 0.99
Признаки