# Problem Statement
### Spam filtering using naive Bayes classifiers to predict whether a new mail, based on its content, should be categorized as spam or not-spam.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated message file. `names` supplies the two column labels,
# so the first line of the file is read as data rather than as a header row.
data = pd.read_csv("spam.tsv",sep='\t',names=['Class','Message'])
data

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
# Count, number of distinct values, most frequent value and its frequency for each text column
data.describe()

Unnamed: 0,Class,Message
count,5567,5567
unique,2,5164
top,ham,"Sorry, I'll call later"
freq,4821,30


In [5]:
# Add a 'Length' column holding the number of characters in each message
data['Length'] = data['Message'].map(len)

In [6]:
# Preview the per-message character counts just computed
data['Length']

0       196
1       155
2        61
3        77
4        36
       ... 
5562    160
5563     36
5564     57
5565    125
5566     26
Name: Length, Length: 5567, dtype: int64

In [7]:
# First rows, now including the Length column
data.head()

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [8]:
# Number of rows per class (non-null count of every other column within each class)
data.groupby('Class').count()

Unnamed: 0_level_0,Message,Length
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4821,4821
spam,746,746


## Data Visualization

In [9]:
# Summary statistics of message length; `min` and `max` are the shortest and longest message
data['Length'].describe() # find max length of message

count    5567.000000
mean       80.450153
std        59.891023
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

In [10]:
# Boolean mask marking the longest message(s). The maximum is computed from the
# data rather than typed in by hand, so the cell stays correct if the dataset changes.
data['Length'] == data['Length'].max()

0       False
1       False
2       False
3       False
4       False
        ...  
5562    False
5563    False
5564    False
5565    False
5566    False
Name: Length, Length: 5567, dtype: bool

In [11]:
# message with max character
# A single .loc[row_mask, column] lookup replaces chained [] indexing, and the
# maximum length is computed instead of being hard-coded.
data.loc[data['Length'] == data['Length'].max(), 'Message']

1080    For me the love should start with attraction.i...
Name: Message, dtype: object

In [12]:
# Full text of the longest message: idxmax() gives the label of the first row
# with the maximum length, so no hard-coded length is needed.
data.loc[data['Length'].idxmax(), 'Message']

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [13]:
# Full text of the shortest message: idxmin() gives the label of the first row
# with the minimum length, so no hard-coded length is needed.
data.loc[data['Length'].idxmin(), 'Message']

'Ok'

## Text Pre-Processing

In [14]:
# creating an object for the target value
# (NumPy array of the 'ham'/'spam' string labels, taken before they are encoded as integers)
dObject = data['Class'].values
dObject

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [15]:
# assigning ham as 1
# (overwrites the string label in place; the column keeps its object dtype)
data.loc[data['Class']=='ham','Class'] = 1

In [16]:
# assigning spam as 0, so from here on spam is class 0 and ham is class 1
data.loc[data['Class']=='spam','Class'] = 0

In [17]:
# Labels after encoding: integers 0/1, still held in an object-dtype array
dObject2 = data['Class'].values
dObject2

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [18]:
# Check that the Class column now holds the integer codes
data.head()

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [19]:
# First step: remove punctuation. Python's built-in `string` module provides a
# ready-made string containing the ASCII punctuation characters.
# (`string` is already imported in the first cell; importing it again is harmless.)
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
"messages"=="message's"
"This message is spam" == "This message is spam."

False

In [21]:
def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` removed.

    All other characters, including whitespace, digits and non-ASCII letters,
    are kept in their original order.
    """
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation:
            continue  # drop punctuation characters
        kept_chars.append(symbol)
    return "".join(kept_chars)
# Store a punctuation-free copy of every message in a new 'text_clean' column
data['text_clean'] = data['Message'].apply(remove_punct)
data.head()

Unnamed: 0,Class,Message,Length,text_clean
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL


__Tokenization__ is the process of converting normal text strings into a list of tokens (the individual words of a message, which can be further normalized to their lemmas).

Now we need to convert each of those messages into a vector that scikit-learn's algorithms, and the machine learning model we are going to use, can work with.

In [22]:
# CountVectorizer converts text into numeric token-count vectors.
# Create the vectorizer, using scikit-learn's built-in English stop-word list.
CV = CountVectorizer(stop_words="english")
# Stop words are very common words (a, an, the, you, your, ...) that add little meaning to a sentence.
# They appear frequently in text documents but are rarely helpful for text analysis,
# so removing them lets the model focus on the more informative words.

In [23]:
# Splitting x and y
# xSet: punctuation-free message text; ySet: encoded labels (spam = 0, ham = 1)
xSet = data['text_clean'].values
ySet = data['Class'].values
ySet

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [24]:
# The label array still has object dtype; convert it to an integer array before training
ySet = ySet.astype('int')
ySet

array([1, 0, 1, ..., 1, 1, 1])

In [25]:
# Inspect the cleaned message texts that will be vectorized
xSet

array(['Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You have been wonderful and a blessing at all times',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       'Nah I dont think he goes to usf he lives around here though', ...,
       'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

## Splitting Train Test Data

In [26]:
# Hold out 20% of the messages for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): no `stratify=` is passed although the classes are imbalanced (4821 ham vs 746 spam),
# so the spam share may differ between the two splits.
xSet_train,xSet_test,ySet_train,ySet_test = train_test_split(xSet,ySet,test_size=0.2, random_state=10)

In [27]:
# Learn the vocabulary from the training messages only and encode them as token counts
# (sparse matrix: one row per training message, one column per vocabulary term).
xSet_train_CV = CV.fit_transform(xSet_train)
xSet_train_CV

<4453x8159 sparse matrix of type '<class 'numpy.int64'>'
	with 34532 stored elements in Compressed Sparse Row format>

## Training a Model

In [28]:
# With messages represented as vectors, we can finally train our spam/ham classifier.
# Almost any classification algorithm could be used at this point; for a variety of
# reasons, the Naive Bayes classifier is a good choice. The multinomial variant is used here.
NB = MultinomialNB()

In [29]:
## feed data to model
# Fit on the training token counts and their integer labels
NB.fit(xSet_train_CV,ySet_train)

In [30]:
# Let's test CV on our test data
# `transform` (not `fit_transform`) reuses the vocabulary learned from the training split;
# words that were not seen during fitting are ignored.
xSet_test_CV = CV.transform(xSet_test)

In [31]:
# prediction for xSet_test_CV
# (one predicted label per test message: 0 = spam, 1 = ham)
ySet_predict = NB.predict(xSet_test_CV)
ySet_predict

array([1, 1, 1, ..., 1, 1, 1])

In [32]:
# Checking accuracy (fraction of test messages classified correctly, as a percentage)
# NOTE(review): about 87% of all messages are ham (4821 of 5567), so accuracy alone
# says little about how well spam is caught.
accuracyScore = accuracy_score(ySet_test,ySet_predict)*100

print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 98.29443447037703


## Spam Classification Application

In [33]:
# Classify one user-supplied message with the count-vector Naive Bayes model.
msg = input("Enter Message: ") # to get the input message
# Strip punctuation exactly as was done for the training text (`text_clean`), so the
# message is tokenized the same way as the data the vectorizer was fitted on
# (e.g. "don't" must become "dont", not "don").
msgInput = CV.transform([remove_punct(msg)])
predict = NB.predict(msgInput)
# Labels were encoded as spam = 0, ham = 1.
if predict[0] == 0:
    print("------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------------------")
else:
    print("---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------------")

Enter Message:  the product is good


---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------------


In [35]:
# Classify another user-supplied message with the count-vector Naive Bayes model.
msg = input("Enter Message: ") # to get the input message
# Strip punctuation exactly as was done for the training text (`text_clean`), so the
# message is tokenized the same way as the data the vectorizer was fitted on
# (e.g. "don't" must become "dont", not "don").
msgInput = CV.transform([remove_punct(msg)])
predict = NB.predict(msgInput)
# Labels were encoded as spam = 0, ham = 1.
if predict[0] == 0:
    print("------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------------------")
else:
    print("---------------------------MESSAGE-SENT-[CHECK-INBOX]------------------------------")
    

Enter Message:  you won a lottery ticket


------------------------MESSAGE-SENT-[CHECK-SPAM-FOLDER]---------------------------


## TF-IDF

In [36]:
# Splitting x and y
# Same inputs as the count-vector experiment: cleaned text and encoded labels (spam = 0, ham = 1)

X = data['text_clean'].values
y = data['Class'].values
y

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [37]:
# The label array still has object dtype; convert it to an integer array before training
y = y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

In [38]:
# X is still the raw array of message strings at this point
type(X)

numpy.ndarray

In [39]:
# text preprocessing and feature vectorizer
# TfidfVectorizer (imported in the first cell) turns each message into a vector of TF-IDF weights.
tf=TfidfVectorizer() # object creation
# Vectorize straight from the cleaned text column instead of overwriting X with a
# function of itself, so the cell can be re-run without feeding the sparse matrix
# back into the vectorizer.
# NOTE(review): the vocabulary and IDF weights are fitted on every message, before the
# train/test split is made, so the test messages influence the features.
X=tf.fit_transform(data['text_clean'].values) # fitting and transforming the data into vectors


In [40]:
# (number of messages, size of the TF-IDF vocabulary)
X.shape

(5567, 9537)

In [41]:
# Sparse-matrix summary: dtype, storage format and number of stored (non-zero) entries
X

<5567x9537 sparse matrix of type '<class 'numpy.float64'>'
	with 72701 stored elements in Compressed Sparse Row format>

In [42]:
# getting the feature vectors
# Converts the sparse TF-IDF matrix to a dense NumPy array. At 5567 x 9537 float64 values
# that is roughly 0.4 GB, whereas the sparse form stores only 72701 entries.
# NOTE(review): scikit-learn's naive Bayes estimators are documented to accept sparse
# input, so this step looks avoidable — confirm before removing.
# NOTE(review): re-running this cell fails, because X is then already an ndarray and
# has no `.toarray()` method.
X=X.toarray()

In [43]:
# Creating training and testing
# No test_size is given, so scikit-learn's default hold-out of 25% applies;
# random_state fixes the shuffle so the split is reproducible.
# (train_test_split is already imported in the first cell; this re-import is redundant but harmless.)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=6)

In [44]:
# Model creation
from sklearn.naive_bayes import BernoulliNB

# model object creation
# alpha is the additive smoothing parameter.
# NOTE(review): BernoulliNB binarizes its input (default threshold `binarize=0.0`), so the
# TF-IDF weights are reduced to word presence/absence — confirm this is intended.
nb=BernoulliNB(alpha=0.01) 

# fitting the model
nb.fit(X_train,y_train)

# getting the prediction
# (one predicted label per held-out message: 0 = spam, 1 = ham)
y_hat=nb.predict(X_test) 

In [45]:
# Preview the predicted labels
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [46]:
# Evaluating the model
# NOTE(review): confusion_matrix is imported here, but the confusion matrix shown
# later in the notebook is built with pd.crosstab.
from sklearn.metrics import classification_report,confusion_matrix

In [47]:
# Per-class precision, recall and F1 on the held-out set (class 0 = spam, class 1 = ham)
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       186
           1       0.99      0.99      0.99      1206

    accuracy                           0.99      1392
   macro avg       0.98      0.98      0.98      1392
weighted avg       0.99      0.99      0.99      1392



In [48]:
# confusion matrix
# rows: true labels (y_test); columns: predicted labels (y_hat); 0 = spam, 1 = ham
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,178,8
1,7,1199
