<div style="text-align:justify">Возьмите четыре любые темы из корпуса 20newsgroups (постарайтесть брать не слишком похожие, и не слишком разные темы).</div>

In [44]:
import numpy as np
import pandas as pd

import eli5

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
subj = ['rec.sport.baseball', 'talk.politics.guns', 'talk.politics.misc', 'rec.autos']

In [3]:
news_train = fetch_20newsgroups(subset='train', categories=subj)

In [4]:
news_test = fetch_20newsgroups(subset='test', categories=subj)

In [5]:
# the order of the subjects is not kept
names = news_test.target_names
names

['rec.autos', 'rec.sport.baseball', 'talk.politics.guns', 'talk.politics.misc']

In [6]:
y_train = news_train.target
y_test = news_test.target

In [7]:
X_train = news_train.data

In [8]:
X_test = news_test.data

<div style="text-align:justify">Векторизуйте датасет с помощью CountVectorizer.</div>

In [9]:
#убрать параметры
c_vect = CountVectorizer()

In [10]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
X_train_vec = c_vect.transform(X_train)

In [12]:
X_test_vec = c_vect.transform(X_test)

<div style="text-align:justify">
Выберете три любых классификатора. <br>
Используя кроссвалидацию (любой вариант из KFold, StratifiedKFold, RepeatedStratifiedKFold), подберите оптимальные параметры моделей с помощью grid_search. Обучите классификаторы с оптимальными параметрами.
Оцените полученные классификаторы на тесте, мера качества - macro_f1.
Посмотрите, насколько полученные результаты на тесте отличаются от результатов предсказания на трейне?<br> 
<strong>3 балла</strong>
</div>

In [13]:
log_reg = LogisticRegression()
rf = RandomForestClassifier()
sgd = SGDClassifier()

In [14]:
def eval_model_gscv(model, X, y, gs_param_grid, folds=5, verbose=True):
    # utilizes StratifiedKFold by default for multiclass labels when `cv` is specified
    gridsearch = GridSearchCV(model, param_grid=gs_param_grid,
                             n_jobs=-1,  scoring='f1_macro', cv=folds).fit(X, y)
    
    if verbose:
        print(f'Best score: {gridsearch.best_score_}')
        print(f'Best parameters: {gridsearch.best_params_}')
        
    return gridsearch

In [15]:
def validate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f = f1_score(y_test, y_pred, average='macro')
    print(f"Macro f-score on test data is {f:.5}")
    return model

In [16]:
%%time

# only liblinear solver supports both l1 and l2 regularization
lr_gridsearch = eval_model_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})


Best score: 0.9544185713553807
Best parameters: {'C': 2, 'intercept_scaling': 5, 'penalty': 'l2', 'solver': 'liblinear'}
Wall time: 44.4 s


In [17]:
best_log_reg = LogisticRegression(**lr_gridsearch.best_params_)

In [18]:
print('Logistic regression')
validate_model(best_log_reg, X_train_vec, y_train, X_test_vec, y_test)

Logistic regression
Macro f-score on test data is 0.86372


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=5, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
%%time

rf_gridsearch = eval_model_gscv(rf, X_train_vec, y_train,
                               gs_param_grid={'n_estimators':[100, 150, 200],
                                              'max_depth': [10, 15, 20],
                                              'criterion':['gini', 'entropy']})

Best score: 0.9168589699200554
Best parameters: {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200}
Wall time: 23 s


In [20]:
best_rf = RandomForestClassifier(**rf_gridsearch.best_params_)
print('Random forest')
validate_model(best_rf, X_train_vec, y_train, X_test_vec, y_test)

Random forest
Macro f-score on test data is 0.83895


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
%%time

sgd_gridsearch = eval_model_gscv(sgd, X_train_vec, y_train,
                                gs_param_grid={'penalty': ['l2', 'l1', 'elasticnet'],
                                               'loss': ['log', 'hinge', 'perceptron'],
                                               'alpha': [0.01, 0.001, 0.0001]})

Best score: 0.953519476767875
Best parameters: {'alpha': 0.01, 'loss': 'hinge', 'penalty': 'l2'}
Wall time: 36.9 s


In [22]:
best_sgd = SGDClassifier(**sgd_gridsearch.best_params_)
print('SGD classifier')
validate_model(best_sgd, X_train_vec, y_train, X_test_vec, y_test)

SGD classifier
Macro f-score on test data is 0.87526


SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

<div style="text-align:justify">
Постройте функцию analyze_features(model, n), которая бы для каждой модели выводила самые значимые признаки по каждому из четырех классов. 
Для этого вам понадобится словарь, связывающий номер признака с самим признаком, он может быть таким:
index_to_word = {v:k for k,v in count_vect.vocabulary_.items()}
обращаться к весам модели можно либо через eli5, как мы делали на лекции, либо напрямую: через clf.coef_ - матрица размера (n_classes, n_features), то есть для получения признаков с наибольшим весом для класса n вам нужно сортировать веса внутри n-ной строки.
Eсли вы используете деревья решений, то можно воспользоваться методом  model.feature_importances
<br>
<strong>4 балла</strong>
<div>

In [23]:
index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}

In [24]:
def get_word_by_index(feature, d=index_to_word):
    feature = int(feature.strip('x'))
    return d[feature]

In [25]:
def analyze_features(model, n):
    explanation = eli5.formatters.as_dataframe.explain_weights_df(model)
    if 'target' not in explanation.columns:
        explanation['word'] = explanation.feature.apply(get_word_by_index)
        explanation = explanation.nlargest(n, 'weight')
        explanation.drop('std', axis=1, inplace=True)
        return explanation
    target_feats = []
    for target in explanation.target.unique():
        subset = explanation.loc[explanation.target == target]
        subset = subset.nlargest(n, 'weight')
        subset = subset.loc[subset.feature != '<BIAS>']
        subset['word'] = subset.feature.apply(get_word_by_index)
        subset['subject'] = names[target]
        target_feats.append(subset)
    result = pd.concat(target_feats, axis=0)
    result.reset_index(drop=True, inplace=True)
    return result

<div style="text-align:justify">
Примените функцию к вашим классификаторам, видны ли по отобранным словам очевидные ошибки?
Используйте параметры CountVectorizer, для того, чтобы уменьшить количество признаков и убрать нерелевантные (например числа, токены слишком низкой или высокой документной частотой, и т.д.), постарайтесь добиться улучшения результатов работы моделей (снова выводите результаты модели на трейне и на тесте, чтобы видеть, уменьшается ли переобучение)
<strong>3 балла </strong>

In [29]:
analyze_features(best_log_reg, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x8549,1.783461,car,rec.autos
1,0,x8620,1.00814,cars,rec.autos
2,0,x33012,0.68661,warning,rec.autos
3,0,x14538,0.674743,ford,rec.autos
4,0,x32004,0.660695,unisql,rec.autos
5,0,x7416,0.642764,bmw,rec.autos
6,0,x21954,0.636252,my,rec.autos
7,0,x5797,0.607378,any,rec.autos
8,0,x12352,0.58928,drive,rec.autos
9,0,x6437,0.575745,automotive,rec.autos


In [27]:
analyze_features(best_sgd, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x8549,0.62192,car,rec.autos
1,0,x8620,0.352332,cars,rec.autos
2,0,x33012,0.205527,warning,rec.autos
3,0,x21954,0.19752,my,rec.autos
4,0,x14538,0.19752,ford,rec.autos
5,0,x12352,0.192181,drive,rec.autos
6,0,x25942,0.189512,read,rec.autos
7,0,x5797,0.189512,any,rec.autos
8,0,x32004,0.186843,unisql,rec.autos
9,0,x24942,0.178835,price,rec.autos


In [30]:
analyze_features(best_rf, 40)

Unnamed: 0,feature,weight,word
0,x8549,0.027019,car
1,x15787,0.01726,gun
2,x8620,0.012959,cars
3,x6774,0.011931,baseball
4,x15468,0.009113,government
5,x15800,0.007311,guns
6,x30514,0.007266,team
7,x6059,0.007091,arms
8,x32909,0.006592,waco
9,x9368,0.006436,clinton


В основном можно отметь стоп-слова как неправильные. Остальное – аббревиатуры и имена собствнные, которые можно расценивать, как разделяющие темы.

In [31]:
c_vect = CountVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=3, stop_words='english')

In [34]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=3,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:
X_train_vec = c_vect.transform(X_train)

In [36]:
X_test_vec = c_vect.transform(X_test)

In [50]:
index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}

In [38]:
%%time

# only liblinear solver supports both l1 and l2 regularization
lr_gridsearch = eval_model_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})

Best score: 0.9616504581066844
Best parameters: {'C': 1, 'intercept_scaling': 2, 'penalty': 'l2', 'solver': 'liblinear'}
Wall time: 30.3 s


In [41]:
best_log_reg = LogisticRegression(**lr_gridsearch.best_params_)
print('Logistic regression')
validate_model(best_log_reg, X_train_vec, y_train, X_test_vec, y_test)

Logistic regression
Macro f-score on test data is 0.87104


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=2, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
%%time

rf_gridsearch = eval_model_gscv(rf, X_train_vec, y_train,
                               gs_param_grid={'n_estimators':[100, 150, 200],
                                              'max_depth': [10, 15, 20],
                                              'criterion':['gini', 'entropy']})

Best score: 0.9167424532722646
Best parameters: {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200}
Wall time: 16.6 s


In [43]:
best_rf = RandomForestClassifier(**rf_gridsearch.best_params_)
print('Random forest')
validate_model(best_rf, X_train_vec, y_train, X_test_vec, y_test)

Random forest
Macro f-score on test data is 0.84408


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [45]:
%%time

sgd_gridsearch = eval_model_gscv(sgd, X_train_vec, y_train,
                                gs_param_grid={'penalty': ['l2', 'l1', 'elasticnet'],
                                               'loss': ['log', 'hinge', 'perceptron'],
                                               'alpha': [0.01, 0.001, 0.0001]})

Best score: 0.962212443505928
Best parameters: {'alpha': 0.01, 'loss': 'hinge', 'penalty': 'l2'}
Wall time: 11 s


In [46]:
best_sgd = SGDClassifier(**sgd_gridsearch.best_params_)
print('SGD classifier')
validate_model(best_sgd, X_train_vec, y_train, X_test_vec, y_test)

SGD classifier
Macro f-score on test data is 0.87768


SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

Видим, что использование параметров CountVectorizer повысили результаты модели на тесте примерно на 1%.

In [47]:
analyze_features(best_log_reg, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x6196,1.500435,assenmacher,rec.autos
1,0,x6381,0.818883,aug,rec.autos
2,0,x12974,0.540159,empire,rec.autos
3,0,x5224,0.529866,affiliation,rec.autos
4,0,x10663,0.502515,csc,rec.autos
5,0,x16543,0.476275,hoffs,rec.autos
6,0,x31311,0.463805,transporter,rec.autos
7,0,x15403,0.461948,goncz,rec.autos
8,0,x33137,0.437863,webber,rec.autos
9,0,x29743,0.427708,strong,rec.autos


In [48]:
analyze_features(best_sgd, 10)

Unnamed: 0,target,feature,weight,word,subject
0,0,x6196,0.381118,assenmacher,rec.autos
1,0,x6381,0.211732,aug,rec.autos
2,0,x5224,0.175435,affiliation,rec.autos
3,0,x15403,0.154262,goncz,rec.autos
4,0,x16543,0.154262,hoffs,rec.autos
5,0,x10663,0.154262,csc,rec.autos
6,0,x12974,0.148213,empire,rec.autos
7,0,x31311,0.139138,transporter,rec.autos
8,0,x24440,0.136114,plow,rec.autos
9,0,x29743,0.133089,strong,rec.autos


In [51]:
analyze_features(best_rf, 40)

KeyError: 34355

In [53]:
index_to_word[34355]

'yesterday'