# Sms Spam-Classification Project using Naive Bayes

In [1]:
import pandas as pd

In [2]:
# data= pd.read_csv("D:\Study\Data Science\Dataset\SMSSpamCollection.csv",sep = "\t", header= None, names = ["label", "Messages"],index_col= 0 )
data= pd.read_csv("D:\Study\Data Science\Dataset\SMSSpamCollection.csv",sep = "\t", header= None, names = ["label", "messages"])
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.messages[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [4]:
data.shape

(5572, 2)

In [5]:
data.messages[5571]

'Rofl. Its true to its name'

In [6]:
# nltk is a suite of libraries and programs for symbolic and statistical natural language processing for English
import nltk

In [9]:
# Regular expression to predict or analyze the text
import re

In [10]:
# Stopwords are the English words which does not add much meaning to a sentence.
# They can safely be ignored without sacrificing the meaning of the sentence. 
# For example, the words like the, he, have etc.
# Such words are already captured this in corpus named corpus.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ravi0dubey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Difference between Stemming and Lemmatization

### Stemming 
 1. It is the process of removing the last few characters of a given word, to obtain a shorter form also called based word, even if that form doesn’t  have any meaning. eg: It will replace words “history” and “historical” with base word “histori”. 
 2.  Stemming is used to get that base word. In cases such as sentiment analysis, spam classification, restaurant reviews etc., getting base word is important to know whether the word is positive or negative. 

### Lemmatization
1. It overcomes the drawbacks of stemming,base word does not have any meaning, whereas lemmatization gives meaningful word.

2. It takes more time then stemming because it finds meaningful word/ representation. Stemming just needs to get a base word and therefore takes less time.

3. Stemming has its application in Sentiment Analysis while Lemmatization has its application in Chatbots, human-answering.

### Preprocessing of Message(X variable) using Stemming Approach

In [13]:
from nltk.stem.porter import PorterStemmer

In [14]:
ps = PorterStemmer()
ps

<PorterStemmer>

In [15]:
len(data)

5572

In [16]:
# Stemming  is used to get root word of different variety of the words
#i.e. likely, liked, likes -> root word is like

In [17]:
## Preprocessing of the input data

corpus = []
for i in range(0,len(data)):
    # It will remove any words in the message wich does not belong to a-z or A-Z
    review= re.sub('[^a-zA-Z]',' ' , data['messages'][i])   
    # It will convert case of the message to lower
    review_lower= review.lower()
    # It will split entire messages into individual words
    review_split = review_lower.split()
    # it will remove all the words from the messages which are part of stopwords
    review_split_stem= [ps.stem(word) for word in review_split if not word in stopwords.words('english')]
    # words will be joined within ' '
    review_final = ' '.join(review_split_stem)
    corpus.append(review_final)
    

### Printing output of each steps peformed above to showcase what happened in background

In [23]:
# Review holds all words but except the ones which does not have valid alphabets in it

review

'Rofl  Its true to its name'

In [24]:
# review_lower will store the lowercase of the the sentence
review_lower

'rofl  its true to its name'

In [25]:
# review_split will store the individual words of the sentences which are divided/splitted by spaces
review_split

['rofl', 'its', 'true', 'to', 'its', 'name']

In [26]:
# review_split_stem stores words which do not belongs to stopwords 

review_split_stem

['rofl', 'true', 'name']

In [27]:
# review_final will store the sentence formed after above all process performed on the words
review_final

'rofl true name'

### After data cleanup is done, countvectorizer is called to convert the message(X variable) into vector

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer()

In [29]:
X= cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
X.shape

(5572, 6296)

### Explanation of Bag of Words and how Vector gets created for the message(X variable) using Bag of words

In [31]:
# Let say we have 4 text

# Text1 = Burger is delicious and expensive
# Text2 = Burger is not delicious and expensive
# Text3 = Burger is okay and cheap
# Text4 = Burger is okay and burger is not cheap

# Step 1 : Create bag of words which are part of Text1 to 4 
# Bagofwords = [Burger, delicious, expensive, not, okay, cheap]

# Now check what all text from Bagwords list belongs to each text(From 1 to 4)and place them in the array
# if word is present mark it 1 and if not then mark it 0
# if word is present more than once then count the number of occurence and place it in the vector

# Text1 Vector = [1,1,1,0,0,0]
# Text2 Vector = [1,1,1,1,0,0]
# Text3 Vector = [1,0,0,0,1,1]
# Text4 Vector = [2,0,0,1,1,1] -> Burger occured twice hence the count is 2

In [32]:
# Create Count Vector with maximum features of 2500

cv= CountVectorizer(max_features= 2500)

In [33]:
X= cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
X.shape

(5572, 2500)

### Converting label(y_variable) using one hot encoding 

In [37]:
y= pd.get_dummies(data['label'], drop_first= True)
y

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


### Splitting of the dataset into train and test column 

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test,y_train, y_test= train_test_split(X,y,test_size= 0.25,random_state=10)

In [40]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Gaussian Naive Bayes is used for model training 

In [42]:
from sklearn.naive_bayes import GaussianNB

In [43]:
model= GaussianNB()

In [44]:
model.fit(X_train, y_train)

  return f(*args, **kwargs)


GaussianNB()

In [45]:
y_predict = model.predict(X_test)
y_predict

array([0, 0, 0, ..., 0, 0, 1], dtype=uint8)

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score(y_test,y_predict)

0.8628858578607322

### Multinomial Naive Bayes is used for model training 

In [48]:
from sklearn.naive_bayes import MultinomialNB
model2= MultinomialNB()
model2.fit(X_train,y_train)
y_predict1 = model.predict(X_test)
accuracy_score(y_test,y_predict1)

  return f(*args, **kwargs)


0.8628858578607322

### Prediction 

In [49]:
Test_msg = "common you are not in good mood"

In [50]:
corpus1 = []
# It will remove any words in the message wich does not belong to a-z or A-Z
review= re.sub('[^a-zA-Z]',' ' , Test_msg)   
# It will convert case of the message to lower
review_lower= review.lower()
# It will split entire messages into individual words
review_split = review_lower.split()
# it will remove all the words from the messages which are part of stopwords
review_split_stem= [ps.stem(word) for word in review_split if not word in stopwords.words('english')]
# words will be joined within ' '
review_final = ' '.join(review_split_stem)
corpus1.append(review_final)

In [51]:
corpus1

['common good mood']

In [52]:
X1= cv.fit_transform(corpus1).toarray()
X1

array([[1, 1, 1]], dtype=int64)

In [53]:
y_predict1 = model.predict(X1)
y_predict1

ValueError: operands could not be broadcast together with shapes (1,3) (2500,) 