In [1]:
# Load the train and test splits of the newsgroups data from local folders (offline copy).
import sklearn.datasets as skd

# Only these four newsgroups are loaded; the same list is used for both splits.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Files are decoded as ISO-8859-1 so that .data holds text strings rather than raw bytes.
news_train = skd.load_files(
    'Fetch20newsgroups/train',
    categories=categories,
    encoding='ISO-8859-1',
)
news_test = skd.load_files(
    'Fetch20newsgroups/test/',
    categories=categories,
    encoding='ISO-8859-1',
)

In [2]:
# Word Count-CountVectorize -> Count the occurance of each word, basically encoding documents
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(news_train.data)
X_train_tf.shape

(2257, 35788)

The count matrix has one row per training document (2257 samples) and one column per vocabulary term (35788 features), so no documents were lost during vectorization.

In [3]:
# TF-IDF (Term Frequency - Inverse Document Frequency) re-weighting of the count matrix.
#   Term Frequency             -> summarizes how often a given word appears within a document.
#   Inverse Document Frequency -> downscales words that appear a lot across documents.

from sklearn.feature_extraction.text import TfidfTransformer

# Fit on the training counts first, then apply the fitted weighting to the same
# matrix; this two-step form is what fit_transform does for this transformer.
tfidf_transformer = TfidfTransformer().fit(X_train_tf)
X_train_tfidf = tfidf_transformer.transform(X_train_tf)

# Re-weighting changes the values only, so the shape should match X_train_tf.shape.
X_train_tfidf.shape

(2257, 35788)

# MultinomialNB

`MultinomialNB` is the Naive Bayes classifier for multinomial models. scikit-learn already ships a built-in implementation in `sklearn.naive_bayes`.

Using this class, we can train our model directly on the matrix produced by the TF-IDF transformer.

##### Naive Bayes Classifier

In [4]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of the training set.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
# Labels are the integer category ids stored in news_train.target.
clf.fit(X_train_tfidf, news_train.target)

# Display the fitted estimator (and its hyper-parameters) as the cell output.
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [5]:
# Encode the test documents with the vectorizer and TF-IDF transformer that were
# already fitted on the training data. Only transform() is called here (never
# fit/fit_transform), so the test set is projected onto the training vocabulary
# and weighting instead of learning its own.
X_test_tf = count_vect.transform(news_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
# NOTE(review): no later cell visible in this notebook uses X_test_tfidf (or clf)
# for prediction; the evaluation further down goes through the text_clf pipeline.

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [7]:
# Bundle vectorization and classification into a single estimator so that raw
# text can be passed straight to fit() / predict().
text_clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),   # raw text -> TF-IDF features
        ('clf', MultinomialNB()),      # TF-IDF features -> predicted category
    ]
)

In [8]:
# Fit the whole pipeline on the raw training documents, then label the test
# documents with it. fit() returns the pipeline itself, so the two steps chain.
predicted = (
    text_clf
    .fit(news_train.data, news_train.target)
    .predict(news_test.data)
)


In [9]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

In [10]:
# Evaluate the pipeline's predictions against the true test labels.

# Overall accuracy: the fraction of test documents whose predicted label equals the true one.
print('Accuracy achieved is ' + str(np.mean(predicted == news_test.target)))

# Per-class precision / recall / F1 / support, labelled with the newsgroup names.
# (The original line ended with a stray trailing comma, a Python 2 idiom for
# suppressing print's newline; in Python 3 it only builds a discarded tuple.)
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))

# Confusion matrix, shown as the cell's output: rows are the true classes and
# columns the predicted classes, in the same order as news_test.target_names.
metrics.confusion_matrix(news_test.target, predicted)

Accuracy achieved is 0.8348868175765646
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)