In [0]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np
import pandas as pd

In [2]:
# Pull the two newsgroup categories used for this binary classification task
# (both the train and test subsets), then keep the raw documents and labels.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)
X, y = newsgroups.data, newsgroups.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Turn the raw documents into a TF-IDF feature matrix.
# The vectorizer is fitted on newsgroups.data (the same documents X was bound
# to when loaded) instead of on X itself, so this cell can be re-run safely:
# the original `X = vectorizer.fit_transform(X)` would feed the already
# vectorized matrix back into the vectorizer on a second execution.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)

In [4]:
# Search the regularisation strength C over 10^-5 ... 10^5 for a linear SVM,
# scoring each candidate by accuracy under shuffled 5-fold cross-validation.
grid = {'C': 10.0 ** np.arange(-5, 6)}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(
    clf,
    grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
    n_jobs=-1,
).fit(X, y)

# Keep the winning C for the final model trained in the next cell.
C = gs.best_params_['C']
print(C)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  1.9min finished


1.0


In [5]:
# Train the final linear SVM on the whole dataset with the C chosen by the
# grid search; fit() returns the estimator, so it can be bound directly.
clf = SVC(kernel='linear', C=C, random_state=241).fit(X, y)
print(clf.coef_)

  (0, 11098)	0.11331531787773684
  (0, 6775)	0.0513432082410742
  (0, 5107)	0.0544519626112152
  (0, 98)	0.05976641330900942
  (0, 27042)	0.10471864296583751
  (0, 22622)	0.10471864296583751
  (0, 6135)	0.10471864296583751
  (0, 27130)	0.006843181403656412
  (0, 27083)	0.006843181403656412
  (0, 26026)	0.007218660562038832
  (0, 23036)	0.006201295542734525
  (0, 22982)	0.007218660562038832
  (0, 22762)	0.007218660562038832
  (0, 22739)	0.007218660562038832
  (0, 22595)	0.007218660562038832
  (0, 21945)	0.006843181403656412
  (0, 20801)	0.014437321124077664
  (0, 20800)	0.04790226982559489
  (0, 20042)	0.021655981686116498
  (0, 16400)	0.01667822904543791
  (0, 16224)	0.007218660562038832
  (0, 13928)	0.007218660562038832
  (0, 11716)	0.007218660562038832
  (0, 10127)	0.006843181403656412
  (0, 8938)	0.007218660562038832
  :	:
  (0, 9935)	0.32483997519389374
  (0, 9711)	-0.09516871224270008
  (0, 9622)	0.039908813011477126
  (0, 9368)	-0.31125053382095413
  (0, 9107)	-0.1575245967541240

In [6]:
# Vocabulary terms, positioned so that words[j] is the term for feature column j.
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# accessor only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = np.asarray(vectorizer.get_feature_names_out())
else:
    words = np.array(vectorizer.get_feature_names())

# clf.coef_ is a sparse single-row matrix here: .data holds the stored weights
# and .indices the matching feature columns, so the two line up one-to-one.
word_weights = pd.Series(clf.coef_.data, index=words[clf.coef_.indices], name='weight')
word_weights.index.name = "word"

# Rank by magnitude so strongly weighted words of either sign are kept.
top_words = word_weights.abs().sort_values(ascending=False).head(10)
print(top_words)

word
space       2.663165
god         1.920379
atheism     1.254690
atheists    1.249180
moon        1.201611
sky         1.180132
religion    1.139081
bible       1.130612
keith       1.097094
sci         1.029307
Name: weight, dtype: float64


In [7]:
# Emit the ten highest-weight words in alphabetical order, space-separated.
alphabetical = sorted(top_words.index)
print(" ".join(alphabetical))

atheism atheists bible god keith moon religion sci sky space
