In [3]:
import pandas as pd
import gensim #the library for Topic modelling
from gensim import corpora, models
import warnings
warnings.simplefilter('ignore')
from itertools import chain

from sklearn.datasets import fetch_20newsgroups

In [4]:
categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med" ]
news_train = fetch_20newsgroups(subset='train', categories=categories,shuffle=True)
news_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
count_vector = CountVectorizer()
X_train_tf = count_vector.fit_transform(news_train.data)
X_train_tf.shape

(2257, 35788)

In [47]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

(2257, 35788)

In [48]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, news_train.target)

In [53]:
X_test_tf = count_vector.transform(news_test.data)
X_train_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = clf.predict(X_test_tfidf)

In [55]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(news_test.target, predicted))
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))
metrics.confusion_matrix(news_test.target, predicted)

Accuracy:  0.6058588548601864
                        precision    recall  f1-score   support

           alt.atheism       0.95      0.34      0.50       319
         comp.graphics       0.97      0.58      0.73       389
               sci.med       0.94      0.45      0.61       396
soc.religion.christian       0.41      1.00      0.58       398

              accuracy                           0.61      1502
             macro avg       0.82      0.59      0.60      1502
          weighted avg       0.81      0.61      0.61      1502



array([[108,   0,   4, 207],
       [  2, 227,   7, 153],
       [  3,   7, 178, 208],
       [  1,   0,   0, 397]])

In [56]:
# Using pipeline

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', MultinomialNB())])

In [68]:
text_clf.fit(news_train.data, news_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [70]:
predicted = text_clf.predict(news_test.data)
predicted

array([2, 2, 3, ..., 2, 2, 1])

In [71]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(news_test.target, predicted))
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))
metrics.confusion_matrix(news_test.target, predicted)

Accuracy:  0.8348868175765646
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])