In [20]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.model_selection

In [21]:
# Load the two-category slice of 20 Newsgroups three times, in this order:
# the combined corpus, then the predefined train and test subsets.
newsgroups_all, newsgroups_train, newsgroups_test = (
    sklearn.datasets.fetch_20newsgroups(
        subset=subset_name,
        categories=['alt.atheism', 'sci.space'],
    )
    for subset_name in ('all', 'train', 'test')
)

In [23]:
# Shape of the filenames array for the combined ('all') subset.
newsgroups_all.filenames.shape

(1786,)

In [25]:
# Shape of the filenames array for the 'train' subset.
newsgroups_train.filenames.shape

(1073,)

In [26]:
# Shape of the filenames array for the 'test' subset.
newsgroups_test.filenames.shape

(713,)

In [178]:
# TF-IDF text vectorizer with all default settings; it is fitted in a later cell.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()

In [180]:
# Fit the vectorizer on the combined corpus and vectorize it in one step,
# then map the train and test subsets into that same feature space.
X_all = vectorizer.fit_transform(newsgroups_all.data)
X_train, X_test = (
    vectorizer.transform(split.data)
    for split in (newsgroups_train, newsgroups_test)
)

In [46]:
# Column index -> token lookup for the fitted vectorizer (this cell must run
# after the vectorizer has been fitted).
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new method and fall back to
# the old one on releases that predate it. The result is kept as a list so
# indexing behaves the same on either path.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = list(vectorizer.get_feature_names_out())
else:
    feature_mapping = list(vectorizer.get_feature_names())

In [181]:
# Cross-validated search over the SVM parameter C on the combined corpus:
# candidate values are the powers of ten 10**-5 .. 10**5, scored by accuracy
# over 5 shuffled folds.
clf = sklearn.svm.SVC(kernel='linear', random_state=241)
cv = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': 10.0 ** np.arange(-5, 6)}
gs = sklearn.model_selection.GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
)
gs.fit(X_all, newsgroups_all.target)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [52]:
# Number of target labels in the 'train' subset.
len(newsgroups_train.target)

1073

In [54]:
# Shape of the vectorized 'train' subset.
X_train.shape

(1073, 28382)

In [182]:
# Print the mean cross-validated score followed by the parameter setting for
# every candidate tried by the grid search.
# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` holds the
# same information as parallel sequences keyed by column name.
for mean_score, params in zip(gs.cv_results_['mean_test_score'],
                              gs.cv_results_['params']):
    print(mean_score)  # mean cross-validation score for this candidate
    print(params)      # parameter values of this candidate

0.5526315789473685
{'C': 1e-05}
0.5526315789473685
{'C': 0.0001}
0.5526315789473685
{'C': 0.001}
0.5526315789473685
{'C': 0.01}
0.9501679731243001
{'C': 0.1}
0.9932810750279956
{'C': 1.0}
0.9932810750279956
{'C': 10.0}
0.9932810750279956
{'C': 100.0}
0.9932810750279956
{'C': 1000.0}
0.9932810750279956
{'C': 10000.0}
0.9932810750279956
{'C': 100000.0}




In [183]:
# Train the final classifier on the combined corpus with the parameters the
# grid search selected, instead of silently relying on SVC's default C.
clf.set_params(**gs.best_params_)
clf.fit(X_all, newsgroups_all.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [184]:
# Indices of the ten features whose weights in the fitted classifier have the
# largest absolute value (argsort is ascending, so take the last ten).
# `coef_` is sparse here, so densify its first row before ranking.
weight_magnitudes = np.abs(clf.coef_.toarray()[0])
word_indexes = np.argsort(weight_magnitudes)[-10:]

In [185]:
# Display the selected feature indices.
word_indexes

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871,
       24019], dtype=int64)

In [187]:
# Translate each selected feature index into its vocabulary token.
arr = [feature_mapping[ind] for ind in word_indexes]

In [188]:
# Sort the selected words in place (default string ordering) and print them.
arr.sort()
print(arr)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']
