In [43]:
import numpy as np
import pandas as pd

import eli5

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier

import warnings
warnings.filterwarnings('ignore')

<div style="text-align:justify">Возьмите четыре любые темы из корпуса 20newsgroups (постарайтесь брать не слишком похожие и не слишком разные темы).</div>

In [2]:
subj = ['rec.sport.baseball', 'talk.politics.guns', 'talk.politics.misc', 'rec.autos']

In [3]:
news_train = fetch_20newsgroups(subset='train', categories=subj)

In [4]:
news_test = fetch_20newsgroups(subset='test', categories=subj)

In [5]:
# the order of the subjects is not kept
names = news_test.target_names
names

['rec.autos', 'rec.sport.baseball', 'talk.politics.guns', 'talk.politics.misc']

In [6]:
y_train = news_train.target
y_test = news_test.target

In [7]:
X_train = news_train.data

In [8]:
X_test = news_test.data

<div style="text-align:justify">Векторизуйте датасет с помощью CountVectorizer.</div>

In [9]:
c_vect = CountVectorizer()

In [10]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
X_train_vec = c_vect.transform(X_train)

In [12]:
X_test_vec = c_vect.transform(X_test)

<div style="text-align:justify">
Выберите три любых классификатора. <br>
Используя кроссвалидацию (любой вариант из KFold, StratifiedKFold, RepeatedStratifiedKFold), подберите оптимальные параметры моделей с помощью grid_search. Обучите классификаторы с оптимальными параметрами.
Оцените полученные классификаторы на тесте, мера качества - macro_f1.
Посмотрите, насколько полученные результаты на тесте отличаются от результатов предсказания на трейне?<br> 
<strong>3 балла</strong>
</div>

In [13]:
# Base (untuned) estimators; each one is handed to the grid search below.
# NOTE(review): no random_state is passed, so the random forest and SGD
# results are presumably not reproducible from run to run — consider fixing a seed.
log_reg = LogisticRegression()
rf = RandomForestClassifier()
sgd = SGDClassifier()

In [14]:
def eval_model_gscv(model, X, y, gs_param_grid, folds=5, verbose=True):
    """Run an exhaustive grid search over ``gs_param_grid`` scored by macro F1.

    Parameters
    ----------
    model : estimator
        The estimator whose hyper-parameters are tuned.
    X, y : training features and labels passed to ``GridSearchCV.fit``.
    gs_param_grid : dict
        Parameter grid forwarded as ``param_grid``.
    folds : int, default 5
        Value forwarded as ``cv``.
    verbose : bool, default True
        When true, print the best score and the best parameter combination.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # With an integer ``cv`` scikit-learn picks StratifiedKFold for
    # multiclass classifiers, so the folds keep the class balance.
    search = GridSearchCV(
        model,
        param_grid=gs_param_grid,
        n_jobs=-1,
        scoring='f1_macro',
        cv=folds,
    )
    fitted_search = search.fit(X, y)

    if not verbose:
        return fitted_search

    print(f'Best score: {fitted_search.best_score_}')
    print(f'Best parameters: {fitted_search.best_params_}')
    return fitted_search

In [15]:
def validate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its macro F1 on the test split.

    The model is fitted in place; nothing is returned, the score is only
    printed with five significant digits.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    macro_f1 = f1_score(y_test, test_predictions, average='macro')
    print(f"Macro f-score on test data is {macro_f1:.5}")

In [16]:
%%time

# liblinear is pinned because it accepts both the l1 and l2 penalties searched
# here (saga would as well) and it is the solver `intercept_scaling` applies to.
lr_gridsearch = eval_model_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})


Best score: 0.9544185713553807
Best parameters: {'C': 2, 'intercept_scaling': 5, 'penalty': 'l2', 'solver': 'liblinear'}
Wall time: 45.4 s


In [17]:
best_log_reg = LogisticRegression(**lr_gridsearch.best_params_)

In [18]:
print('Logistic regression')
validate_model(best_log_reg, X_train_vec, y_train, X_test_vec, y_test)

Logistic regression
Macro f-score on test data is 0.86372


In [19]:
%%time

rf_gridsearch = eval_model_gscv(rf, X_train_vec, y_train,
                               gs_param_grid={'n_estimators':[100, 150, 200],
                                              'max_depth': [10, 15, 20],
                                              'criterion':['gini', 'entropy']})

Best score: 0.9210640721979676
Best parameters: {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 150}
Wall time: 23.3 s


In [20]:
best_rf = RandomForestClassifier(**rf_gridsearch.best_params_)
print('Random forest')
validate_model(best_rf, X_train_vec, y_train, X_test_vec, y_test)

Random forest
Macro f-score on test data is 0.85031


In [21]:
%%time

sgd_gridsearch = eval_model_gscv(sgd, X_train_vec, y_train,
                                gs_param_grid={'penalty': ['l2', 'l1', 'elasticnet'],
                                               'loss': ['log', 'hinge', 'perceptron'],
                                               'alpha': [0.01, 0.001, 0.0001]})

Best score: 0.9528648261865843
Best parameters: {'alpha': 0.01, 'loss': 'log', 'penalty': 'l2'}
Wall time: 39.4 s


In [22]:
best_sgd = SGDClassifier(**sgd_gridsearch.best_params_)
print('SGD classifier')
validate_model(best_sgd, X_train_vec, y_train, X_test_vec, y_test)

SGD classifier
Macro f-score on test data is 0.85904


*В среднем результаты моделей на тесте хуже результатов кросс-валидации на трейне примерно на 8-9 процентов, что довольно ощутимо.*

<div style="text-align:justify">
Постройте функцию analyze_features(model, n), которая бы для каждой модели выводила самые значимые признаки по каждому из четырех классов. 
Для этого вам понадобится словарь, связывающий номер признака с самим признаком, он может быть таким:
index_to_word = {v:k for k,v in count_vect.vocabulary_.items()}
обращаться к весам модели можно либо через eli5, как мы делали на лекции, либо напрямую: через clf.coef_ - матрица размера (n_classes, n_features), то есть для получения признаков с наибольшим весом для класса n вам нужно сортировать веса внутри n-ной строки.
Если вы используете деревья решений, то можно воспользоваться атрибутом model.feature_importances_
<br>
<strong>4 балла</strong>
</div>

In [23]:
index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}

In [24]:
def get_word_by_index(feature):
    """Map a feature label of the form ``'x<index>'`` to its vocabulary word.

    The ``x`` characters are stripped from both ends of ``feature``, the rest
    is parsed as an integer and looked up in the module-level
    ``index_to_word`` dictionary.
    """
    return index_to_word[int(feature.strip('x'))]

In [25]:
def analyze_features(model, n):
    """Return the ``n`` highest-weighted features of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Passed to eli5's ``explain_weights_df``.
    n : int
        Number of features to keep (per class when the explanation has a
        ``target`` column, overall otherwise).

    Returns
    -------
    pandas.DataFrame
        * Explanations without a ``target`` column (e.g. the random forest's
          feature importances): columns ``feature``, ``weight``, ``word``.
        * Explanations with a ``target`` column: columns ``target``,
          ``feature``, ``weight``, ``word``, ``subject``; up to ``n`` rows per
          class, classes stacked in order of appearance.

    Relies on the module-level ``names`` (class index -> subject name) and
    ``get_word_by_index`` (feature label -> vocabulary word).
    """
    # top=None lifts eli5's default limit of 20 features per target, which
    # otherwise silently caps the result whenever more rows are needed.
    explanation = eli5.formatters.as_dataframe.explain_weights_df(model, top=None)

    if 'target' not in explanation.columns:
        # Single global ranking: no per-class weights are available.
        ranked = explanation.nlargest(n, 'weight')
        # The spread over trees is not needed; tolerate explanations that
        # do not carry a `std` column at all.
        ranked = ranked.drop(columns='std', errors='ignore')
        ranked = ranked.assign(word=ranked.feature.apply(get_word_by_index))
        return ranked.reset_index(drop=True)

    per_class = []
    for target in explanation.target.unique():
        # Drop the intercept BEFORE taking the top n: it has no vocabulary
        # entry, and filtering afterwards would leave only n-1 rows whenever
        # the bias is among the largest weights.
        is_candidate = (explanation.target == target) & (explanation.feature != '<BIAS>')
        top_rows = explanation.loc[is_candidate].nlargest(n, 'weight')
        # assign() builds a new frame instead of writing into a slice.
        top_rows = top_rows.assign(
            word=top_rows.feature.apply(get_word_by_index),
            subject=names[target],
        )
        per_class.append(top_rows)

    return pd.concat(per_class, axis=0).reset_index(drop=True)

<div style="text-align:justify">
Примените функцию к вашим классификаторам, видны ли по отобранным словам очевидные ошибки?
Используйте параметры CountVectorizer, для того, чтобы уменьшить количество признаков и убрать нерелевантные (например числа, токены слишком низкой или высокой документной частотой, и т.д.), постарайтесь добиться улучшения результатов работы моделей (снова выводите результаты модели на трейне и на тесте, чтобы видеть, уменьшается ли переобучение)
<strong>3 балла </strong>
</div>

In [26]:
analyze_features(best_log_reg, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x8549,1.783461,car,rec.autos
1,0,x8620,1.00814,cars,rec.autos
2,0,x33012,0.68661,warning,rec.autos
3,0,x14538,0.674743,ford,rec.autos
4,0,x32004,0.660695,unisql,rec.autos
5,0,x7416,0.642764,bmw,rec.autos
6,0,x21954,0.636252,my,rec.autos
7,0,x5797,0.607378,any,rec.autos
8,0,x12352,0.58928,drive,rec.autos
9,0,x6437,0.575745,automotive,rec.autos


In [27]:
analyze_features(best_sgd, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x8549,0.947075,car,rec.autos
1,0,x8620,0.577921,cars,rec.autos
2,0,x21954,0.348873,my,rec.autos
3,0,x13079,0.289875,engine,rec.autos
4,0,x25942,0.283385,read,rec.autos
5,0,x14538,0.278228,ford,rec.autos
6,0,x24942,0.256336,price,rec.autos
7,0,x33012,0.255041,warning,rec.autos
8,0,x5797,0.240954,any,rec.autos
9,0,x6416,0.239201,auto,rec.autos


In [28]:
analyze_features(best_rf, 40)

Unnamed: 0,feature,weight,word
0,x8549,0.023245,car
1,x15787,0.017555,gun
2,x6774,0.013867,baseball
3,x15800,0.011954,guns
4,x8620,0.011412,cars
5,x32909,0.008754,waco
6,x30514,0.008656,team
7,x15000,0.008105,game
8,x33123,0.007521,weapons
9,x15002,0.007488,games


*В основном можно отметить стоп-слова как неправильные. Остальное – аббревиатуры и имена собственные, которые можно расценивать как разделяющие темы.*

In [29]:
c_vect = CountVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=5, stop_words='english')

In [30]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=5,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [31]:
X_train_vec = c_vect.transform(X_train)

In [32]:
X_test_vec = c_vect.transform(X_test)

In [33]:
index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}

In [34]:
%%time

# liblinear is pinned because it accepts both the l1 and l2 penalties searched
# here (saga would as well) and it is the solver `intercept_scaling` applies to.
lr_gridsearch = eval_model_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})

Best score: 0.96054776592035
Best parameters: {'C': 1, 'intercept_scaling': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Wall time: 21.3 s


In [35]:
best_log_reg = LogisticRegression(**lr_gridsearch.best_params_)
print('Logistic regression')
validate_model(best_log_reg, X_train_vec, y_train, X_test_vec, y_test)

Logistic regression
Macro f-score on test data is 0.86938


In [36]:
%%time

rf_gridsearch = eval_model_gscv(rf, X_train_vec, y_train,
                               gs_param_grid={'n_estimators':[100, 150, 200],
                                              'max_depth': [10, 15, 20],
                                              'criterion':['gini', 'entropy']})

Best score: 0.9204860075161108
Best parameters: {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200}
Wall time: 12.9 s


In [37]:
best_rf = RandomForestClassifier(**rf_gridsearch.best_params_)
print('Random forest')
validate_model(best_rf, X_train_vec, y_train, X_test_vec, y_test)

Random forest
Macro f-score on test data is 0.84276


In [38]:
%%time

sgd_gridsearch = eval_model_gscv(sgd, X_train_vec, y_train,
                                gs_param_grid={'penalty': ['l2', 'l1', 'elasticnet'],
                                               'loss': ['log', 'hinge', 'perceptron'],
                                               'alpha': [0.01, 0.001, 0.0001]})

Best score: 0.961563613035959
Best parameters: {'alpha': 0.01, 'loss': 'log', 'penalty': 'l2'}
Wall time: 8.17 s


In [39]:
best_sgd = SGDClassifier(**sgd_gridsearch.best_params_)
print('SGD classifier')
validate_model(best_sgd, X_train_vec, y_train, X_test_vec, y_test)

SGD classifier
Macro f-score on test data is 0.87724


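*Видим, что использование параметров CountVectorizer повысило результаты логистической регрессии и SGD на тесте примерно на 1–2%, тогда как у случайного леса результат немного снизился (0.850 → 0.843). При этом разница между результатом кросс-валидации на трейне и тестом по-прежнему составляет порядка 8-9 процентов.*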

In [40]:
analyze_features(best_log_reg, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x2771,1.542404,car,rec.autos
1,0,x2847,0.837227,cars,rec.autos
2,0,x6031,0.603442,ford,rec.autos
3,0,x2310,0.576001,bmw,rec.autos
4,0,x4920,0.547478,drive,rec.autos
5,0,x7669,0.517325,interested,rec.autos
6,0,x7113,0.496665,honda,rec.autos
7,0,x14570,0.479048,toyota,rec.autos
8,0,x15432,0.47754,warning,rec.autos
9,0,x15433,0.465963,warning read,rec.autos


In [41]:
analyze_features(best_sgd, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x2771,0.816749,car,rec.autos
1,0,x2847,0.484481,cars,rec.autos
2,0,x6031,0.298153,ford,rec.autos
3,0,x12064,0.255958,requests,rec.autos
4,0,x5400,0.253165,engine,rec.autos
5,0,x2310,0.242062,bmw,rec.autos
6,0,x12061,0.229471,request,rec.autos
7,0,x14570,0.227662,toyota,rec.autos
8,0,x10233,0.225937,oil,rec.autos
9,0,x4920,0.217415,drive,rec.autos


In [42]:
analyze_features(best_rf, 40)

Unnamed: 0,feature,weight,word
0,x2771,0.032033,car
1,x6700,0.021181,guns
2,x1974,0.019442,baseball
3,x6670,0.014696,gun
4,x14140,0.013792,team
5,x6516,0.013317,government
6,x2847,0.012164,cars
7,x11017,0.010804,players
8,x6221,0.010621,games
9,x15521,0.010146,weapons


*В целом фичи выглядят достаточно вменяемо. Например, для бейсбола наблюдаем счет игры (видимо) как фичу.*