In [1]:
import numpy as np
import pgmpy
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Naive Bayes for document classification
# getting started

In [3]:
corpus = ['This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?']

In [4]:
# Tokenizes and vectorizes. ie, from text to list of tokens 'words or particular characters split
#single elements in a list'. Thes individual words or groups of characters are then converted to vectors
#by assigning number to represent each work or group of characters(token).

In [5]:
# Defining the vectorising function
vectorizer = CountVectorizer(min_df=1)

In [10]:
# fitting the vectorizer function to the corpus
X = vectorizer.fit_transform(corpus)

In [11]:
# Prints the list of tokens corresponding to the columns of the document term matrix.
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [12]:
#Document term matrix. 
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [13]:
#binary vectorizer. 1 if word is present in document 0 otherwise
vectorizer_binary = CountVectorizer(min_df=1, binary=True)

In [14]:
#fitting binary vectorizer
X_binary = vectorizer_binary.fit_transform(corpus)

In [15]:
print(X_binary.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 1 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [16]:
# The tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Assigning a variable to the vectorizer
vectorizer_tfidf = TfidfVectorizer(min_df=1)

In [26]:
#fit_transform method basically Learn the vocabulary dictionary
# and return term-document matrix.
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

In [27]:
# checking the feature names  for the term document matrix
print(vectorizer_tfidf.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [28]:
# Returnig the corresponding term documents
print(X_tfidf.toarray())

[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


# Building a Naive Bayes

In [29]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

## Brief description
The dataset used in this example is the 20 newsgroups dataset.The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. It will be automatically downloaded, then cached.

In [30]:
# For our simple example we are only going to use 4 news group
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']

In [31]:
# Loading training data
data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [32]:
#Loading test data
data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [33]:
# pulling out the target variables
y_train, y_test = data_train.target, data_test.target

To convert the text documents into numerical features, we need to use a feature extractor. In this example we are using HashingVectorizer as it would be memory efficient in case of large datasets

In [34]:
# In case of HashingVectorizer we don't need to fit the data, just transform would work.
vectorizer_hash = HashingVectorizer(stop_words='english')

In [35]:
# Performing the harshing transformation
X_train = vectorizer_hash.transform(data_train.data)
X_test = vectorizer_hash.transform(data_test.data)

In the event that we choose the tf-idf or anyone of the countvectorizers we need to fit them on the joint data set of train and test because we need the complete vocabulary to create the matrix. Example below;

In [36]:
# fitting the data to the binary vectorizer defined in eairlier cells
vectorizer_binary.fit_transform(data_train.data + data_test.data)

<3387x43562 sparse matrix of type '<class 'numpy.int64'>'
	with 546941 stored elements in Compressed Sparse Row format>

In [37]:
#Then transform it
X_train = vectorizer.transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

In [38]:
#alpha is additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
clf = BernoulliNB(alpha=.01)

In [39]:
#Training the classifier
clf.fit(X_train, y_train)

BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)

In [40]:
# Predicting results
y_predicted = clf.predict(X_test)

In [41]:
#Comparing the predicted and the true values
score = metrics.accuracy_score(y_test, y_predicted)

In [42]:
#Lets see how good our clasifier has performed on this text
print("accuracy: %0.3f" % score)

accuracy: 0.334
