<a href="https://colab.research.google.com/github/smasterparth/opencv/blob/master/Text_Category_Classifier_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Import Data

In [0]:
from sklearn.datasets import fetch_20newsgroups

In [0]:
# The four newsgroups (out of the twenty available) that this notebook classifies.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "sci.med",
]

In [14]:
# Load the training subset, restricted to the newsgroups listed in `categories`.
# The result is a Bunch exposing data / target / target_names (see the keys printed below).
news_train = fetch_20newsgroups(subset='train' , categories=categories , shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [15]:
type(news_train)

sklearn.utils.Bunch

In [16]:
print(news_train.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [18]:
print(news_train.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [19]:
print(news_train.target_names)

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [0]:
# Load the held-out test subset for the same `categories`; it is used only for
# evaluation further down, never for fitting.
news_test = fetch_20newsgroups(subset='test' , categories=categories , shuffle=True)

In [21]:
print(news_test.target_names)

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


# CountVectorizer

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# Toy corpus of three short sentences used to show what CountVectorizer produces.
text = [
    'The quick brown fox jumped over the lazy dog',
    'The dog',
    'The fox',
]
vector = CountVectorizer()

In [24]:
vector.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [25]:
print(vector.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [26]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement (requires scikit-learn >= 1.0).
# .tolist() converts the returned ndarray so the cell still displays a plain list of str.
vector.get_feature_names_out().tolist()

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [0]:
# Encode every sentence as a row of per-word counts over the fitted vocabulary
# (returned as a sparse matrix; see the repr in the next cell).
counts = vector.transform(text)

In [28]:
counts

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [29]:
counts.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 2],
       [0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1]])

In [30]:
counts.shape

(3, 8)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vector_x = CountVectorizer()

In [55]:
vector_x.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [56]:
vector_x.vocabulary_

{'brown': 0,
 'dog': 1,
 'fox': 2,
 'jumped': 3,
 'lazy': 4,
 'over': 5,
 'quick': 6,
 'the': 7}

In [57]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement (requires scikit-learn >= 1.0).
# .tolist() converts the returned ndarray so the cell still displays a plain list of str.
vector_x.get_feature_names_out().tolist()

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [0]:
count_x = vector_x.transform(text)

In [59]:
count_x.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 2],
       [0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1]])

In [60]:
text

['The quick brown fox jumped over the lazy dog', 'The dog', 'The fox']

In [61]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement (requires scikit-learn >= 1.0).
# .tolist() converts the returned ndarray so the cell still displays a plain list of str.
vector_x.get_feature_names_out().tolist()

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [62]:
print(vector_x.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [63]:
count_x.shape

(3, 8)

In [0]:
# Learn the vocabulary from the training posts and build their document-term
# count matrix in a single step.
# NOTE: despite the `_tf` suffix, x_train_tf holds raw counts; the TF-IDF
# weighting is applied later by a separate TfidfTransformer.
count_vect = CountVectorizer()
x_train_tf = count_vect.fit_transform(news_train.data)

In [65]:
x_train_tf.shape

(2257, 35788)

# TF - IDF

TF - Term Frequency - Tells us how often a given word appears within a doc

IDF - Inverse Document Frequency - It provides weights for words, downscaling those words that appear in many documents.

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

In [0]:
# TF-IDF re-weighter for the toy example. Despite the variable name, this is a
# TfidfTransformer: it operates on an existing count matrix, not on raw text.
vectorizer = TfidfTransformer()

In [71]:
count_x.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 2],
       [0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1]])

In [72]:
vectorizer.fit(count_x)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [73]:
print(vectorizer.idf_)

[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [0]:
# Apply the fitted IDF weights to the toy counts. With the settings shown in the
# transformer's repr (norm='l2'), each row of the result is L2-normalised.
freq = vectorizer.transform(count_x)

In [76]:
freq.toarray()

array([[0.36388646, 0.27674503, 0.27674503, 0.36388646, 0.36388646,
        0.36388646, 0.36388646, 0.42983441],
       [0.        , 0.78980693, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.61335554],
       [0.        , 0.        , 0.78980693, 0.        , 0.        ,
        0.        , 0.        , 0.61335554]])

In [77]:
text

['The quick brown fox jumped over the lazy dog', 'The dog', 'The fox']

In [0]:
tfidf_trans = TfidfTransformer()

In [0]:
# Fit the IDF weights on the training counts and convert them to TF-IDF features
# in one step; the matrix shape is unchanged (documents x vocabulary).
x_train_tfidf = tfidf_trans.fit_transform(x_train_tf)

In [69]:
x_train_tfidf.shape

(2257, 35788)

# Multinomial Naive Bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
model = MultinomialNB()

In [80]:
# Train the Naive Bayes classifier on the TF-IDF features, using the newsgroup
# index of each training post (news_train.target) as the label.
model.fit(x_train_tfidf , news_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Preparing x_test data

In [0]:
# transform() only, no refit: the test posts are encoded with the vocabulary
# learned from the training set, so the columns line up with x_train_tf.
x_test_tf = count_vect.transform(news_test.data)

In [0]:
# Re-weight the test counts with the IDF values fitted on the training data
# (again transform() only, so nothing is learned from the test set).
x_test_tfidf = tfidf_trans.transform(x_test_tf)

In [0]:
pred = model.predict(x_test_tfidf)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [87]:
print(classification_report(news_test.target , pred))

              precision    recall  f1-score   support

           0       0.97      0.60      0.74       319
           1       0.96      0.89      0.92       389
           2       0.97      0.81      0.88       396
           3       0.65      0.99      0.78       398

    accuracy                           0.83      1502
   macro avg       0.89      0.82      0.83      1502
weighted avg       0.88      0.83      0.84      1502



In [90]:
# Same report as above, but with rows labelled by newsgroup name instead of class index.
print(
    classification_report(
        news_test.target,
        pred,
        target_names=news_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



In [91]:
# Rows are the true classes and columns the predicted classes (each row sums to
# that class's support in the report above). The last column shows that most
# errors are posts wrongly predicted as class 3, soc.religion.christian.
print(confusion_matrix(news_test.target , pred))

[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]


In [92]:
print(accuracy_score(news_test.target , pred))

0.8348868175765646
