# Importing a Data

In [6]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/train-dataset.csv')

# shuffling all our data
df = df.sample(frac=1)

# reading only Message_body and labal
df = df[['Message_body','Label']]
df

Unnamed: 0,Message_body,Label
1279,Bored _! Chat _ _ now! 0871750. 77. _! _ - _ _...,Spam
338,I just cooked a rather nice salmon a la you,Non-Spam
1564,2p per min to call Germany 08448350055 from yo...,Spam
170,Somebody set up a website where you can play h...,Non-Spam
224,"K, fyi I'm back in my parents' place in south ...",Non-Spam
...,...,...
873,_ only! Had your _ 11mths +? _ are _ to _ to t...,Spam
1504,FREE GAME. Get Golf Rayman 4 FREE from the Arc...,Spam
447,"Geeeee ... Your internet is really bad today, ...",Non-Spam
1009,Double Mins & Double Txt & / 2 price Linerenta...,Spam


In [7]:
# reading our test data
df_test = pd.read_csv('/content/train-dataset.csv',encoding='cp1252')
df_test

Unnamed: 0.1,Unnamed: 0,S. No.,Message_body,Label
0,0,1.0,Rofl. Its true to its name,Non-Spam
1,1,2.0,The guy did some bitching but I acted like i'd...,Non-Spam
2,2,3.0,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,3,4.0,Will Ã¼ b going to esplanade fr home?,Non-Spam
4,6,7.0,Huh y lei...,Non-Spam
...,...,...,...,...
1672,920,921.0,We tried to contact you re your reply to our o...,Spam
1673,924,925.0,You are being ripped off! Get your mobile cont...,Spam
1674,938,939.0,Todays Voda numbers ending with 7634 are selec...,Spam
1675,939,940.0,Please call our customer service representativ...,Spam


# Mapping Text labels to Numbers

In [8]:
df = df[['Message_body','Label']]

df_test = df[['Message_body',"Label"]]

# as our target variable is categorical it is important to convert our categorical varibale to Numeric variable
# Non-Spam --> 0
# SPam --> 1

final_dict = {'Non-Spam':0,'Spam':1}
df['Label'] = df['Label'].map(final_dict)
df_test['Label'] = df_test['Label'].map(final_dict)

In [9]:
# importing our Bag of Words model from sklearn
from sklearn.feature_extraction.text import CountVectorizer

train_documents_for_bow  = df['Message_body'].tolist()

test_docs = df_test['Message_body'].tolist()

# Create a Vectorizer Object
vectorizer = CountVectorizer(max_features=100)

vectorizer.fit(train_documents_for_bow )

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the Document
X_train = vectorizer.fit_transform(train_documents_for_bow )


# Naive Bayes 
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train.toarray(), df['Label'])


X_test = vectorizer.transform(test_docs)
# Predict Class
y_pred = classifier.predict(X_test.toarray())

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(df_test['Label'].tolist(), y_pred)
print("Accuracy is --> ",accuracy*100)

Vocabulary:  {'now': 53, 'min': 47, 'only': 57, 'from': 27, 'just': 42, 'you': 98, 'to': 80, 'call': 14, 'your': 99, 'com': 20, 'for': 25, 'text': 75, 'stop': 73, 'out': 60, 'up': 84, 'can': 15, 'in': 38, 'my': 49, 'so': 71, 'do': 23, 'the': 77, '150p': 0, 'an': 5, 'txt': 82, 'this': 78, 'is': 39, 'has': 33, 'been': 12, 'as': 8, 'uk': 83, 'latest': 43, 'reply': 66, 'or': 58, 'we': 89, 'contact': 21, 'our': 59, 'of': 55, 'phone': 61, 'have': 34, 'with': 94, 'go': 29, 'no': 51, 'www': 96, 'are': 7, 'and': 6, 'was': 88, 'it': 40, 'me': 46, 'its': 41, 'not': 52, 'like': 44, 'that': 76, 'won': 95, 'prize': 64, 'cash': 16, 'what': 91, 'urgent': 86, 'free': 26, 'on': 56, 'but': 13, 'll': 45, 'see': 67, 'pls': 63, 'all': 4, 'her': 35, 'if': 37, 'be': 11, 'yes': 97, 'send': 68, 'will': 92, 'customer': 22, 'at': 9, 'please': 62, 'ur': 85, 'get': 28, 'how': 36, 'statement': 72, 'account': 3, 'shows': 69, 'code': 19, 'expires': 24, 'mobile': 48, 'sms': 70, 'time': 79, 'claim': 17, '50': 2, 'want':