In [None]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Read the SMS spam CSV using the ISO-8859-1 encoding and preview the first rows.
csv_path = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
df.head()

# Processing Data

In [None]:
# Label encoder instance.
# NOTE(review): `le` is not used anywhere in the visible cells — confirm it is still needed.
le = LabelEncoder()

In [None]:
# Convert the DataFrame into a NumPy array (one entry per row).
data = df.to_numpy() # same as df.values
data[0]  # peek at the first row

In [None]:
# Second column -> inputs (cleaned as text further down); first column -> targets.
X, y = data[:, 1], data[:, 0]

### Tokenizing and Stemming

In [None]:
# Shared text-processing helpers used by getStem().
# The pattern is a raw string: '\w' inside a normal string literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning on recent Pythons.
tokenizer = RegexpTokenizer(r'\w+')  # every run of word characters becomes a token
sw = set(stopwords.words('english'))  # set gives O(1) stop-word membership checks
ps = PorterStemmer()

In [None]:
def getStem(review):
    """Normalise one message into a space-separated string of stemmed tokens.

    The text is lower-cased, split with the module-level ``tokenizer``,
    filtered against the stop-word set ``sw``, and every remaining token is
    stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw text of a single message.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    words = tokenizer.tokenize(review.lower())
    return ' '.join(ps.stem(word) for word in words if word not in sw)

In [None]:
def getDoc(document):
    """Clean every text in ``document`` with ``getStem``.

    Parameters
    ----------
    document : iterable of str
        The raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in the original order.
    """
    return [getStem(text) for text in document]

In [None]:
# Clean every entry of X, then show the first three results as a sanity check.
stemmed_doc = getDoc(X)
stemmed_doc[:3]

### Count Vectorizer

In [None]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [None]:
# Fit the vectorizer on the cleaned messages (this builds the vocabulary) and
# transform them into a document-term count matrix in a single step.
vc = cv.fit_transform(stemmed_doc)

In [None]:
# Densify the sparse count matrix into a plain ndarray.
# toarray() is used instead of todense(): todense() returns an np.matrix, which
# recent scikit-learn versions refuse as estimator input (TypeError).
X = vc.toarray()
X

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Comparing Naive Bayes
* **Gaussian Naive Bayes classifier**: used when features are continuous (not discrete) and assumed to follow a normal distribution.

* **Multinomial Naive Bayes Classifier**: used when features follow a multinomial distribution.

* **Bernoulli Naive Bayes classifier**: used when features are of the boolean type.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Multinomial NB: fit on the training split, then record the test-set accuracy.
model = MultinomialNB().fit(X_train, y_train)
ans = model.score(X_test, y_test)
ans

In [None]:
# Gaussian NB: fit on the training split, then record the test-set accuracy.
model1 = GaussianNB().fit(X_train, y_train)
ans1 = model1.score(X_test, y_test)
ans1

In [None]:
# Bernoulli NB: fit on the training split, then record the test-set accuracy.
model2 = BernoulliNB().fit(X_train, y_train)
ans2 = model2.score(X_test, y_test)
ans2

In [None]:
# Bar chart comparing the test accuracy of the three Naive Bayes models.
left = [1, 2, 3]  # x positions of the bars
x = ['MultinomialNB', 'GaussianNB', 'BernoulliNB']
# Keep the accuracies under their own name: assigning them to `y` would overwrite
# the label array, and re-running the train/test split cell would then break.
scores = [ans, ans1, ans2]
plt.ylim(0.7, 1.005)  # narrowed y-range set for the plot
plt.title('Comparing Naive Bayes')
plt.bar(left, scores, tick_label=x, width=0.4, color=['red', 'green', 'blue'])

#### As we can see, Multinomial NB performs best of the three, followed closely by Bernoulli NB.

In [None]:
# Report whichever model actually scored highest instead of hard-coding the winner.
# On a tie, max() keeps the first entry, i.e. the order listed here.
accuracies = {'MultinomialNB': ans, 'GaussianNB': ans1, 'BernoulliNB': ans2}
best_name = max(accuracies, key=accuracies.get)
print(f'The best model is {best_name} with the accuracy : ', accuracies[best_name] * 100)