In [15]:
from sklearn.datasets import fetch_20newsgroups


# Restrict the corpus to these four newsgroup categories.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split; shuffling with a fixed seed keeps the order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Tokenizing text with scikit-learn - CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and build the
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=twenty_train.data)

# Displayed as the cell output.
X_train_counts.shape

(2257, 35788)

# From occurrence counts to term frequencies with scikit-learn - TfidfTransformer - fit -> transform

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
# Term frequencies only (IDF re-weighting switched off), done in two explicit
# steps: fit the transformer on the counts, then transform those same counts.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Displayed as the cell output.
X_train_tf.shape

(2257, 35788)

# From occurrence counts to tf-idf weights with scikit-learn - TfidfTransformer - fit_transform

In [21]:
# Same conversion with default settings, using the one-step fit_transform
# shortcut instead of separate fit and transform calls.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

# Displayed as the cell output.
X_train_tfidf.shape

(2257, 35788)

# Let's train the Naive Bayes classifier

In [26]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the tf-idf features and the
# integer category labels of the training set.
clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

In [27]:
# Two unseen example documents to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [29]:
# Reuse the already-fitted vectorizer and transformer: only transform() is
# called here, so the new documents are encoded with the training vocabulary.
X_new_counts = count_vect.transform(raw_documents=docs_new)
X_new_tfidf = tfidf_transformer.transform(X=X_new_counts)

In [30]:
# Predict a category index for each new document.
predicted = clf.predict(X=X_new_tfidf)

In [31]:
# Show each new document next to the name of its predicted category;
# target_names maps the predicted integer index back to a readable label.
label_names = twenty_train.target_names
for doc, category in zip(docs_new, predicted):
    print("%r = %s " % (doc, label_names[category]))

'God is love' = soc.religion.christian 
'OpenGL on the GPU is fast' = comp.graphics 


# Let's build a pipeline

In [32]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf transformer -> classifier into one estimator,
# so raw text can be passed straight to fit() and predict().
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [33]:
# Train every pipeline stage on the raw training documents and their labels.
# Kept as the cell's final expression so the fitted pipeline is displayed.
text_clf.fit(X=twenty_train.data, y=twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])