In [1]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt

In [2]:
# Read the tab-separated SMS corpus. Passing names= supplies the column labels,
# so the first line of the file is treated as data rather than as a header.
data = pd.read_csv(
    "spam.tsv",
    names=['Class', 'Message'],
    sep='\t',
)

In [3]:
data.head(8)

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
5,ham,As per your request 'Melle Melle (Oru Minnamin...
6,spam,WINNER!! As a valued network customer you have...
7,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
data.loc[:0]

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
# Character count of every message. The vectorized .str accessor replaces the
# Python-level apply(len) and yields NaN for a missing message instead of raising.
data['Length'] = data['Message'].str.len()

In [7]:
data['Length']

0       196
1       155
2        61
3        77
4        36
       ... 
5562    160
5563     36
5564     57
5565    125
5566     26
Name: Length, Length: 5567, dtype: int64

In [8]:
data.head(10)

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,ham,As per your request 'Melle Melle (Oru Minnamin...,160
6,spam,WINNER!! As a valued network customer you have...,157
7,spam,Had your mobile 11 months or more? U R entitle...,154
8,ham,I'm gonna be home soon and i don't want to tal...,109
9,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136


In [9]:
data.groupby('Class').count()

Unnamed: 0_level_0,Message,Length
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4821,4821
spam,746,746


# Exploratory Data Analysis

In [10]:
data['Length'].describe()

count    5567.000000
mean       80.450153
std        59.891023
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

In [11]:
data['Length']==910

0       False
1       False
2       False
3       False
4       False
        ...  
5562    False
5563    False
5564    False
5565    False
5566    False
Name: Length, Length: 5567, dtype: bool

In [12]:
# Every message whose length equals the maximum. The maximum is computed from
# the column instead of being copied by hand from the describe() output, so the
# cell keeps working if the dataset changes.
data.loc[data['Length'] == data['Length'].max(), 'Message']

1080    For me the love should start with attraction.i...
Name: Message, dtype: object

In [13]:
# Full text of the longest message (the first one if several tie for the maximum).
# idxmax() locates the row directly, so no hard-coded length is needed.
data.loc[data['Length'].idxmax(), 'Message']

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [14]:
# Full text of the shortest message (the first one if several tie for the minimum).
# idxmin() locates the row directly, so no hard-coded length is needed.
data.loc[data['Length'].idxmin(), 'Message']

'Ok'

# Text Preprocessing

In [15]:
dObject = data['Class'].values
dObject

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [16]:
# Encode the "ham" label as 1, in place. Only the matching rows are rewritten;
# the column keeps its object dtype, so it is cast to int before modelling.
ham_rows = data['Class'] == "ham"
data.loc[ham_rows, "Class"] = 1

In [17]:
# Encode the "spam" label as 0, in place. Only the matching rows are rewritten;
# the column keeps its object dtype, so it is cast to int before modelling.
spam_rows = data['Class'] == "spam"
data.loc[spam_rows, "Class"] = 0

In [18]:
dObject2=data['Class'].values
dObject2

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [19]:
data.head()

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


Removing Punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
# Translation table that deletes every character listed in string.punctuation.
# It is built once so each call is a single pass over the text instead of a
# per-character membership test in Python.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Only the characters in ``string.punctuation`` are dropped. Letters, digits,
    whitespace and non-ASCII symbols (for example "£") are kept unchanged.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        ``text`` without punctuation; an empty string stays empty.
    """
    return text.translate(_PUNCT_TABLE)
# Punctuation-free copy of every message, stored next to the original text.
# The function is passed directly; no lambda wrapper is needed.
data["clean_text"] = data['Message'].apply(remove_punct)


In [22]:
data.head(10)

Unnamed: 0,Class,Message,Length,clean_text
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL
5,1,As per your request 'Melle Melle (Oru Minnamin...,160,As per your request Melle Melle Oru Minnaminun...
6,0,WINNER!! As a valued network customer you have...,157,WINNER As a valued network customer you have b...
7,0,Had your mobile 11 months or more? U R entitle...,154,Had your mobile 11 months or more U R entitled...
8,1,I'm gonna be home soon and i don't want to tal...,109,Im gonna be home soon and i dont want to talk ...
9,0,"SIX chances to win CASH! From 100 to 20,000 po...",136,SIX chances to win CASH From 100 to 20000 poun...


In [23]:
# CountVectorizer turns each message into a vector of per-word counts.
# stop_words="english" makes it drop common English words from its built-in list.
CV=CountVectorizer(stop_words="english")

In [24]:
xSet=data["clean_text"].values
ySet=data["Class"].values
ySet

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [30]:
ySet = ySet.astype('int')
ySet

array([1, 0, 1, ..., 1, 1, 1])

In [31]:
xSet

array(['Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You have been wonderful and a blessing at all times',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       'Nah I dont think he goes to usf he lives around here though', ...,
       'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

In [32]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps the
# split reproducible between runs.
xSet_train, xSet_test, ySet_train, ySet_test = train_test_split(
    xSet,
    ySet,
    test_size=0.2,
    random_state=10,
)

In [33]:

# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term matrix (one row per message, one column per word,
# each cell holding how often the word occurs in the message).
xSet_train_CV = CV.fit_transform(xSet_train)
xSet_train_CV

<4453x8159 sparse matrix of type '<class 'numpy.int64'>'
	with 34532 stored elements in Compressed Sparse Row format>

# Training the model

In [34]:
NB = MultinomialNB() 

In [35]:
NB.fit(xSet_train_CV,ySet_train)

In [36]:
xSet_test_CV = CV.transform(xSet_test)

In [37]:
ySet_predict = NB.predict(xSet_test_CV)
ySet_predict

array([1, 1, 1, ..., 1, 1, 1])

In [38]:

# Share of held-out messages that were classified correctly, as a percentage.
accuracyScore = 100 * accuracy_score(ySet_test, ySet_predict)

print("Prediction Accuracy :", accuracyScore)

Prediction Accuracy : 98.29443447037703


In [40]:
# Classify a message typed by the user with the count-vectorizer + Naive Bayes model.
msg = input("Enter Message: ")  # raw text typed by the user

# The vectorizer was fitted on punctuation-stripped text, so the same cleaning
# must be applied here; otherwise a word such as "don't" is tokenized
# differently from the "dont" seen during training.
msgInput = CV.transform([remove_punct(msg)])
predict = NB.predict(msgInput)

# Labels follow the encoding applied to data['Class']: 0 = spam, 1 = ham.
if predict[0] == 0:
    print("------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------------------")
else:
    print("---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------------")

Enter Message:  i have fever 


---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------------


# Bag of words

In [42]:
# Four toy sentences used to illustrate the bag-of-words representation.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
# Normalise each sentence: drop the full stops and lower-case the text.
processed_docs = [sentence.replace(".", "").lower() for sentence in documents]
processed_docs[3]

'man eats food'

In [43]:
# The corpus is the collection of (already normalised) sentences.
print("Our corpus: ", processed_docs)

# A separate vectorizer for the demo; no stop-word list is passed here.
count_vect = CountVectorizer()

# Build the bag-of-words matrix for the corpus.
bow_rep = count_vect.fit_transform(processed_docs)

# Vocabulary mapping: word -> column index in the matrix.
print("Our vocabulary: ", count_vect.vocabulary_)

# The first two documents contain the same words in a different order, so their
# bag-of-words rows are identical: the representation discards word order.
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog': ", bow_rep[1].toarray())

# Encode unseen text with the fitted vocabulary; words that are not part of the
# vocabulary ("and", "are", "friends") contribute nothing to the vector.
temp = count_vect.transform(["dog and dog are friends"])
print("BoW representation for 'dog and dog are friends':", temp.toarray())

Our corpus:  ['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']
Our vocabulary:  {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog:  [[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [44]:
X = data['clean_text'].values
y = data['Class'].values
y

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [45]:
y = y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

In [46]:
type(X)

numpy.ndarray

In [47]:
## text preprocessing and feature vectorizer
# To extract features from a document of words, we import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


tf = TfidfVectorizer()  # TF-IDF vectorizer with default settings

# Vectorize straight from the DataFrame column rather than overwriting X with
# a function of itself: re-running the cell would otherwise feed the resulting
# matrix back into the vectorizer and fail.
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split, so the vocabulary and IDF weights are also learned from messages that
# end up in the test set. Consider splitting first and fitting on the training
# part only, as is done for the CountVectorizer model.
X = tf.fit_transform(data['clean_text'].values)


In [48]:
X.shape

(5567, 9537)

In [49]:
X

<5567x9537 sparse matrix of type '<class 'numpy.float64'>'
	with 72701 stored elements in Compressed Sparse Row format>

In [50]:
# Keep the TF-IDF features sparse. A dense 5567 x 9537 float64 array takes
# roughly 425 MB although only 72,701 entries are non-zero, and both
# train_test_split and BernoulliNB accept SciPy sparse matrices directly.
# tocsr() leaves a matrix that is already in CSR format unchanged, and unlike
# X = X.toarray() this cell can be re-run without raising.
X = X.tocsr()

In [51]:
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default split proportion applies;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=6,
)

In [52]:
## Model creation
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with a small smoothing constant (alpha=0.01).
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): BernoulliNB binarizes its input by default (binarize=0.0), so
# the TF-IDF weights are reduced to word presence/absence -- confirm this is
# the intended model for these features.
nb = BernoulliNB(alpha=0.01).fit(X_train, y_train)

# Predicted labels for the held-out messages.
y_hat = nb.predict(X_test)

In [53]:
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [54]:
from sklearn.metrics import classification_report,confusion_matrix


In [55]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       186
           1       0.99      0.99      0.99      1206

    accuracy                           0.99      1392
   macro avg       0.98      0.98      0.98      1392
weighted avg       0.99      0.99      0.99      1392



In [56]:

# Confusion table with labelled axes: rows are the true labels, columns the
# predicted ones (0 = spam, 1 = ham). Without the names the axes are shown as
# row_0 / col_0 and cannot be told apart.
pd.crosstab(y_test, y_hat, rownames=['actual'], colnames=['predicted'])

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,178,8
1,7,1199
