# Text Classification #

### 20newsgroups dataset ###
The 20 Newsgroups dataset is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

In [36]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 'train' and 'test' subsets (in that order), both shuffled,
# and bind them to `train` and `test` respectively.
train, test = (
    fetch_20newsgroups(subset=subset_name, shuffle=True)
    for subset_name in ('train', 'test')
)

In [37]:
import numpy as np
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

### Pipeline ###

In [38]:
# Three-stage text classification pipeline; the step names ('vect',
# 'tfidf', 'clf') are the prefixes used to address each stage's
# parameters (e.g. 'vect__ngram_range').
pipe_clf = Pipeline(
    steps=[
        # Token counts, with the built-in English stop-word list applied.
        ('vect', CountVectorizer(stop_words='english')),
        # Re-weight the counts: idf enabled, sublinear tf scaling enabled.
        ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=True)),
        # Linear classifier trained with SGD, all defaults.
        ('clf', SGDClassifier()),
    ],
    verbose=True,  # print the elapsed time of each step while fitting
)

In [39]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid, keyed as <pipeline step>__<parameter name>:
# the vectorizer's n-gram range plus the classifier's loss and alpha.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__loss=('hinge', 'log_loss', 'squared_error'),
    clf__alpha=(1e-3, 1e-4, 1e-5),
)

# Grid search over the pipeline with 5-fold cross-validation;
# n_jobs=-1 requests all available processors.
gs_clf = GridSearchCV(
    estimator=pipe_clf,
    param_grid=parameters,
    n_jobs=-1,
    cv=5,
)
# fit() returns the search object itself, so no rebinding is needed.
gs_clf.fit(train.data, train.target)

# Report the best mean cross-validated score and the parameters behind it.
# The one-element tuples make the %-formatting operand unambiguous.
print("Best score: %s" % (gs_clf.best_score_,))
print("Best param: %s" % (gs_clf.best_params_,))

[Pipeline] .............. (step 1 of 3) Processing vect, total=   6.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.2s
Best score: 0.9348595578820944
Best param: {'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'vect__ngram_range': (1, 2)}
