In [10]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [2]:
def runLR(xtrain, ytrain, xtest, ytest):
    lr = LogisticRegression()
    lr.fit(xtrain, ytrain)
    print('Accuracy: ', lr.score(xtest, ytest))
    return lr

In [8]:
def runDTC(xtrain, ytrain, xtest, ytest, names=None, f=30):
    dt = DecisionTreeClassifier()
    dt.fit(xtrain, ytrain)
    print('Accuracy: ', dt.score(xtest, ytest))
#     print('Top {} features using DTC'.format(f))
#     if names is not None:
#         if type(names) is list:
#             print([names[z] for z in np.argsort(dt.feature_importances_)[::-1][:f]])
#         else:
#             print(names[np.argsort(dt.feature_importances_)[::-1][:f]])
#     else:
#         print([np.argsort(dt.feature_importances_)[::-1][:f]])
    return dt

In [4]:
ngtrain = fetch_20newsgroups(subset='train', data_home='../Data')
ngtest = fetch_20newsgroups(subset='test', data_home='../Data')

ngtrain_vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
_ngtrain_vectors = ngtrain_vectorizer.fit_transform(ngtrain.data)
ngtest_vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
_ngtest_vectors = ngtest_vectorizer.fit_transform(ngtest.data)

# remove features in test that are not found in train
get_indices = lambda x, y: list(set(np.searchsorted(x, y)))
common_features = np.intersect1d(ngtrain_vectorizer.get_feature_names(), ngtest_vectorizer.get_feature_names())
ngtrain_vectors = _ngtrain_vectors[:, get_indices(ngtrain_vectorizer.get_feature_names(), common_features)]
ngtest_vectors = _ngtest_vectors[:, get_indices(ngtest_vectorizer.get_feature_names(), common_features)]

# compute top features using chi2

In [5]:
select_best = SelectKBest(chi2, k=200)
Xchi_train = select_best.fit_transform(ngtrain_vectors, ngtrain.target)
Xchi_test = select_best.transform(ngtest_vectors)

In [7]:
runLR(Xchi_train, ngtrain.target, Xchi_test, ngtest.target)

Accuracy:  0.611391396707


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
runDTC(Xchi_train, ngtrain.target, Xchi_test, ngtest.target)

Accuracy:  0.544609665428


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# using mutual information

In [11]:
select_best = SelectKBest(mutual_info_classif, k=200)
Xmi_train = select_best.fit_transform(ngtrain_vectors, ngtrain.target)
Xmi_test = select_best.transform(ngtest_vectors)

In [12]:
runLR(Xmi_train, ngtrain.target, Xmi_test, ngtest.target)

Accuracy:  0.336298459904


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
runDTC(Xmi_train, ngtrain.target, Xmi_test, ngtest.target)

Accuracy:  0.230350504514


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')