## Gaussian Naive Bayes Classifier

In [None]:
#Importing the libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB

In [None]:
#Loading the breast cancer dataset and unpacking it into the feature matrix (x) and the target vector (y)
ds = load_breast_cancer()
x, y = ds.data, ds.target

In [None]:
#Inspecting the feature vector of the first sample
x[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [None]:
#Inspecting the target label of the first sample
y[0]

np.int64(0)

In [None]:
#Creating a GaussianNB model and holding out 25% of the samples for testing (fixed seed so the split is reproducible)
model = GaussianNB()
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=47,
)

In [None]:
#Fitting the GaussianNB model on the training split
model.fit(xtrain, ytrain)

In [None]:
#Obtaining the predictions and evaluating the model using accuracy score and confusion matrix.
#Both metrics take the true labels first and the predictions second; with that order the
#confusion matrix has one row per true class and one column per predicted class.
ypred = model.predict(xtest)
acc = accuracy_score(ytest, ypred)
conf = confusion_matrix(ytest, ypred)
print(acc, "\n",conf)

0.9230769230769231 
 [[49  4]
 [ 7 83]]


## Bernoulli and Multinomial Naive Bayes Classifiers

In [None]:
#Importing the libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [None]:
#Fetching every record of the 20 newsgroups dataset (subset='all') and unpacking it into the documents (x) and the targets (y)
ds = fetch_20newsgroups(subset='all')
x, y = ds.data, ds.target

In [None]:
#Listing the names of the 20 target classes (newsgroup topics); each value in y is an index into this list
ds.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
#Inspecting the first document (raw text, including the message headers) together with its class index
x[0], y[0]

("From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n",
 np.int64(10))

In [None]:
#Total number of documents in the dataset
len(x)

18846

In [None]:
#As the input is text, we use a CountVectorizer to create a vector representation of the text.
vect1 = CountVectorizer(binary = True) #This is for BinomialNB, as binary=True indicates that the vectorizer will only provide presence or absence of the word.
vect2 = CountVectorizer(binary = False) #This is for MultinomialNB, as binary=False indicates that the vectorizer will provide count of the word.

x1 = vect1.fit_transform(x)
x2 =  vect2.fit_transform(x)

In [None]:
#Creating new training and testing data based on the CountVectorizer data
xtrain1, xtest1, ytrain, ytest = train_test_split(x1, y, test_size = 0.25, random_state = 7)
xtrain2, xtest2, ytrain, ytest = train_test_split(x2, y, test_size = 0.25, random_state = 7)

In [None]:
#Instantiating both Naive Bayes variants with their default parameters
bnb, mnb = BernoulliNB(), MultinomialNB()

In [None]:
#Fitting each model on its own representation of the training documents:
#BernoulliNB on the binary presence/absence matrix, MultinomialNB on the word-count matrix
bnb.fit(xtrain1, ytrain)
mnb.fit(xtrain2, ytrain)

In [None]:
#Predicting the test labels with each model on its matching test matrix
ypred1, ypred2 = bnb.predict(xtest1), mnb.predict(xtest2)

In [None]:
#Evaluating the accuracy of both the models on the same test split.
#The metric functions take the true labels first and the predictions second; accuracy is
#symmetric so the value is the same either way, but the documented order keeps this cell
#correct if the metric is later replaced by an asymmetric one (precision, recall, confusion matrix).
acc1 = accuracy_score(ytest, ypred1)
acc2 = accuracy_score(ytest, ypred2)

In [None]:
#Displaying the two accuracies: acc1 is BernoulliNB (binary features), acc2 is MultinomialNB (count features)
acc1, acc2

(0.6910016977928692, 0.8452886247877759)

## Naive Bayes Classifier with TF-IDF Vectorizer

In [None]:
#Splitting the raw documents BEFORE vectorizing, so that the vocabulary and the IDF weights are
#learned from the training documents only and nothing leaks from the test documents.
#The test_size and random_state match the CountVectorizer split, so the same documents land in each split.
xtrain_text, xtest_text, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 7)

#Creating a TfidfVectorizer object that will provide a vector representation of the texts with binary=False for MultinomialNB.
vect3 = TfidfVectorizer(binary=False)
xtrain3 = vect3.fit_transform(xtrain_text) #Fitting the vocabulary and IDF weights on the training text only
xtest3 = vect3.transform(xtest_text) #Re-using the fitted vocabulary and IDF weights on the test text

In [None]:
#Creating and fitting the model on the above data to obtain the predictions and subsequent accuracy.
#Note: this rebinds mnb, replacing the MultinomialNB that was fitted on the word-count features.
mnb = MultinomialNB()
mnb.fit(xtrain3, ytrain)
ypred3 = mnb.predict(xtest3)
#accuracy_score takes the true labels first and the predictions second
acc3 = accuracy_score(ytest, ypred3)
print(acc3)

0.8435908319185059
