In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
def runLR(xtrain, ytrain, xtest, ytest):
    lr = LogisticRegression(penalty='l1')
    lr.fit(xtrain, ytrain)
    print('Accuracy: ', lr.score(xtest, ytest))
    return lr

In [28]:
def runDTC(xtrain, ytrain, xtest, ytest, names=None, f=30):
    dt = DecisionTreeClassifier()
    dt.fit(xtrain, ytrain)
    print('Accuracy: ', dt.score(xtest, ytest))
    return dt

In [4]:
ngtrain = fetch_20newsgroups(subset='train', data_home='../Data')
ngtest = fetch_20newsgroups(subset='test', data_home='../Data')

ngtrain_vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
_ngtrain_vectors = ngtrain_vectorizer.fit_transform(ngtrain.data)
ngtest_vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
_ngtest_vectors = ngtest_vectorizer.fit_transform(ngtest.data)

# remove features in test that are not found in train
get_indices = lambda x, y: list(set(np.searchsorted(x, y)))
common_features = np.intersect1d(ngtrain_vectorizer.get_feature_names(), ngtest_vectorizer.get_feature_names())
ngtrain_vectors = _ngtrain_vectors[:, get_indices(ngtrain_vectorizer.get_feature_names(), common_features)]
ngtest_vectors = _ngtest_vectors[:, get_indices(ngtest_vectorizer.get_feature_names(), common_features)]

In [10]:
nglr = runLR(ngtrain_vectors, ngtrain.target, ngtest_vectors, ngtest.target)

Accuracy:  0.749203398832


In [25]:
f = 200
indices = np.array([])
for i in range(nglr.coef_.shape[0]):
    print('Top {} features for class `{}`:'.format(f, ngtrain.target_names[i]))
    _indices = np.argsort(nglr.coef_[i])[::-1][:f]
    indices = np.concatenate([indices, _indices])
    print(common_features[_indices])
    print('--'*15)
indices = np.unique(indices)

Top 200 features for class `alt.atheism`:
['atheism' 'bmd' 'benedikt' 'keith' 'livesey' 'okcforum' 'atheists'
 'jaeger' 'islamic' 'wwc' 'wingate' 'mathew' 'kmr4' 'cobb' 'rushdie' 'ico'
 'liar' 'videocart' 'bsa' 'lippard' 'caltech' 'psilink' 'tek' 'rice' 'ultb'
 'tammy' 'psuvm' 'umd' 'islam' 'bible' 'bake' 'mangoe' 'sex' 'evidence'
 'compaq' 'moral' 'people' 'say' 'au' 'sandvik' 'freedom' 'halat' 'qur'
 'bobby' 'god' 'charley' 'wesleyan' 'religion' 'argument' 'khan'
 'princeton' 'book' 'values' 'karner' 'penalty' 'crchh410' 'jim' 'satan'
 'monash' 'ingles' 'maddi' 'mozumder' 'cmu' 'things' 'perry' 'gods'
 'humans' 'science' 'morality' 'religious' 'think' 'motto' 'natural' 'edu'
 'timmons' 'uk' 'allah' 'claim' 'doctine' 'dogfight' 'docs' 'doctor'
 'doghouse' 'docks' 'dogma' 'doctor8' 'documented' 'dogmas' 'dockery'
 'dobson' 'dobyns' 'dokumentation' 'doj' 'doc' 'dock' 'doings' 'docked'
 'doing' 'dogmatic' 'docket' 'docking' 'dockmaster' 'doin' 'dohc' 'dof'
 'doh' 'dogs' 'dog' 'does' 'doe

In [27]:
ngtrain_x = ngtrain_vectors[:,indices]
ngtest_x = ngtest_vectors[:, indices]

In [29]:
nglr = runLR(ngtrain_x, ngtrain.target, ngtest_x, ngtest.target)

Accuracy:  0.748672331386


In [30]:
ngdtc = runDTC(ngtrain_x, ngtrain.target, ngtest_x, ngtest.target)

Accuracy:  0.570366436537
