## Loading the built-in dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only; shuffling uses a fixed seed (42).
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

## Accessing the data

In [15]:
# Inspect the loaded training data. A notebook cell only echoes its final
# expression, so the earlier values are printed explicitly instead of being
# evaluated and silently discarded.
print(data.target_names)    # the category names
print(len(data.data))       # number of documents
print(len(data.filenames))  # number of source file paths

data.filenames[0]           # path of the first document (echoed as the cell output)

'C:\\Users\\Prudhvinath\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.graphics\\38440'

## Accessing the target names

In [28]:
# Label index of the first training document, resolved to its category name.
first_label = data.target[0]
print(data.target_names[first_label])

# Integer labels of the first ten documents.
data.target[:10]

comp.graphics


array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

## Tokenization 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training documents and encode them in one step.
# `count_vect` is kept because the prediction cell later calls its
# .transform() on new text.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.data)
# Echo the matrix repr.
# NOTE(review): the recorded output for this cell shows a 1x35788 matrix,
# while the TF-IDF cell reports 2257 rows for the transform of this same
# matrix. The recorded output looks stale (possibly from a single-row
# expression) - re-run the notebook top to bottom to refresh it.
X_train_counts

<1x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>

## Term Frequency times Inverse Document Frequency

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the training counts and re-weight them in one
# step. `tfidf_transformer` is kept because the prediction cell later calls
# its .transform() on new counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Echo the shape of the weighted matrix.
X_train_tfidf.shape

(2257, 35788)

## Training a classifier

In [23]:
from sklearn.naive_bayes import MultinomialNB

# Create the classifier, then train it on the TF-IDF features and the integer
# labels. The trailing semicolon keeps the cell output empty, as before.
model = MultinomialNB()
model.fit(X_train_tfidf, data.target);

### When predicting, use `.transform` instead of `.fit_transform`, so new documents are encoded with the already-fitted vectorizer and TF-IDF transformer

In [26]:
# Two unseen example documents to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Reuse the already-fitted vectorizer and TF-IDF transformer: only
# .transform() is called here, nothing is refitted on the new text.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# The predictions are integer labels (see the output); index
# data.target_names with them to get the category names.
# Note: `predicted` is reassigned later by the test-set evaluation cell.
predicted = model.predict(X_new_tfidf)
predicted

array([3, 1], dtype=int64)

## Building a pipeline

In [34]:
from sklearn.pipeline import Pipeline

# The same three stages used manually above, chained in order:
# counting -> TF-IDF weighting -> naive Bayes classifier.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

# Fit on the raw training text and labels; the result of fit() is echoed.
text_clf.fit(data.data, data.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ear_tf=False, use_idf=True)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Testing our trained model

In [35]:
import numpy as np

# Test split, restricted to the same categories and shuffled with the same
# seed as the training split.
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data

# Run the fitted pipeline over the test documents once; `predicted` is reused
# by the report and confusion-matrix cells.
predicted = text_clf.predict(docs_test)

# Accuracy is the fraction of predictions that equal the true label. It is
# computed from `predicted` rather than via text_clf.score(...), which would
# run the whole pipeline over the test documents a second time.
np.mean(predicted == twenty_test.target)

0.83488681757656458

## Classification report

In [38]:
from sklearn import metrics

# Per-class precision, recall, F1 and support for the test-set predictions,
# labelled with the category names instead of integer indices.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



## Confusion matrix

In [37]:
# Rows are the true classes and columns the predicted ones: each row of the
# recorded output sums to that class's support in the classification report.
metrics.confusion_matrix(
    y_true=twenty_test.target,
    y_pred=predicted,
)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)