In [None]:
#importing required libraries
import csv

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Load the SMS dataset: tab-separated, no header row, two columns that we
# name 'Label' and 'SMS'.
# quoting=csv.QUOTE_NONE makes the parser treat double quotes as ordinary
# characters. The messages are free text and may contain unbalanced quote
# characters; with the default quoting such a quote can swallow the following
# newline and silently merge several messages into one row.
spamdata = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
    quoting=csv.QUOTE_NONE,
)
spamdata.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Splitting the dataset

In [None]:
# Hold out 25% of the messages for testing.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same score); stratify keeps the ham/spam
# proportion the same in both the training and the test portion.
X_train, X_test, y_train, y_test = train_test_split(
    spamdata.SMS,
    spamdata.Label,
    test_size=0.25,
    random_state=42,
    stratify=spamdata.Label,
)

In [None]:
# A small CountVectorizer demo: count how often each token (unique word)
# appears in every sentence.
samples = [
    'welcome to dsc',
    'we are doing spam detection',
    'spam detection in dsc',
    'we are at dsc',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(samples)  # learn the vocabulary and encode in one go
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() converts the returned array to a plain Python
# list so the printed vocabulary is a regular list of strings.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())
# Prints the tokens (one per column) followed by one row per sentence holding
# the number of times each token occurs in that sentence.

['are', 'at', 'detection', 'doing', 'dsc', 'in', 'spam', 'to', 'we', 'welcome']
[[0 0 0 0 1 0 0 1 0 1]
 [1 0 1 1 0 0 1 0 1 0]
 [0 0 1 0 1 1 1 0 0 0]
 [1 1 0 0 1 0 0 0 1 0]]




In [None]:
# Apply the same bag-of-words encoding to the training messages, because the
# sklearn MultinomialNB classifier is fed token-count features.
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Peek at the first two rows only. Slice the sparse matrix *before* calling
# toarray() so that just those two rows are densified instead of the whole
# document-term matrix.
X_train_count[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Word counts are discrete features, which is the case MultinomialNB is meant
# for; continuous features would call for GaussianNB instead.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_count, y_train)
model

MultinomialNB()

In [None]:
# Classify two hand-written messages as a quick sanity check.
msg = [
    'Hello Rewa, you have a meeting at 7:00 pm today.',
    'WINNER!! This is the secret code to unlock the money: LA667.',
]
# Encode with the vectorizer that was fitted on the training messages
# (transform only) so the columns line up with what the model was trained on.
emails_count = v.transform(msg)
test = model.predict(emails_count)
# One predicted label per line, in the same order as `msg`.
print(*test, sep='\n')

ham
spam


In [None]:
# Measure accuracy on the held-out messages: encode them with the
# already-fitted vectorizer (no refit), then score the model's predictions
# against the true labels.
X_test_count = v.transform(X_test)
test_accuracy = model.score(X_test_count, y_test)
test_accuracy

0.9820531227566404

# **Accuracy: 98.2%**