# Text Classification #

### 20newsgroups dataset ###
The 20 Newsgroups dataset contains approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

In [10]:
from sklearn.datasets import fetch_20newsgroups
# Load the predefined train and test splits of 20 Newsgroups, in that order;
# shuffle=True randomizes the document order within each split.
train, test = (
    fetch_20newsgroups(subset=split, shuffle=True)
    for split in ('train', 'test')
)

In [11]:
import numpy as np
# Print NumPy floating-point values with two digits of precision so that
# displayed arrays stay compact.
np.set_printoptions(precision=2)
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

### Pipeline ###

In [12]:
# English Snowball stemmer; ignore_stopwords=True leaves stop words unstemmed.
stemmer = SnowballStemmer('english', ignore_stopwords=True)


def preprocessor(x):
    """Return *x* with every word replaced by its Snowball stem.

    ``stemmer.stem`` is designed for a single word: given a whole document
    it treats the entire text as one word. Each run of word characters is
    therefore stemmed on its own, while the whitespace and punctuation
    between words are kept unchanged. A single-word input gives the same
    result as calling ``stemmer.stem`` on it directly.

    Args:
        x: Text to stem (a single word or a whole document).

    Returns:
        The text with each word stemmed (the stemmer also lower-cases it).
    """
    import re  # local import keeps this notebook cell self-contained

    return re.sub(r'\w+', lambda match: stemmer.stem(match.group()), x)

# Classification pipeline: raw token counts -> TF-IDF weighting -> linear
# model trained with stochastic gradient descent. verbose=True prints the
# time spent fitting each step.
pipe_clf = Pipeline(
    steps=list(zip(
        ('vect', 'tfidf', 'clf'),
        (CountVectorizer(), TfidfTransformer(), SGDClassifier()),
    )),
    verbose=True,
)

# pipe_clf = pipe_clf.fit(train.data, train.target)

In [13]:
# cv_predictions = cross_val_predict(pipe_clf, test.data, test.target, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
# cv_accuracy = accuracy_score(test.target, cv_predictions)
# print(cv_accuracy)

# predicted = pipe_clf.predict(test.data)
# np.mean(predicted == test.target)

In [16]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the pipeline. Keys follow the
# "<step name>__<parameter>" convention; single-element entries pin a
# parameter to one value, so only the n-gram range, loss and alpha vary.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    vect__stop_words=['english'],
    tfidf__use_idf=[True],
    tfidf__sublinear_tf=[True],
    clf__loss=['hinge', 'log_loss', 'squared_error'],
    clf__alpha=[1e-3, 1e-4, 1e-5],
)

# Exhaustive search with 5-fold cross-validation on the training split,
# using all available CPU cores (n_jobs=-1). fit() returns the fitted
# search object itself, so the call can be chained.
gs_clf = GridSearchCV(
    estimator=pipe_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
).fit(train.data, train.target)

# Report the best mean cross-validated score and the parameters behind it.
print(f"Best score: {gs_clf.best_score_!s}")
print(f"Best param: {gs_clf.best_params_!s}")

[Pipeline] .............. (step 1 of 3) Processing vect, total=   7.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.4s
Best score: 0.934771062410601
Best param: {'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
