# SPAM CLASSIFIER

### Importing libraries :

In [2]:
import pandas as pd
import re
import nltk

### Reading the dataset :

In [3]:
# Load the SMS corpus. The file is tab-separated; because `names=` is passed,
# the first line is read as data and the two columns are labelled here.
# NOTE(review): with default quote handling, stray '"' characters in the raw
# text may cause neighbouring rows to be merged — confirm the row count
# against the line count of the file.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)


In [4]:
# Peek at the first five rows to check that both columns parsed as expected.
messages.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data cleaning and preprocessing :

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once. Inside the comprehension,
# stopwords.words('english') would be evaluated again for every token and
# searched linearly; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in messages['message']:
    # Keep letters only, lower-case, and tokenise on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))
print(len(corpus))

5572


In [8]:
# corpus1 = []
# for i in range(0, len(messages)):
#     review1 = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
#     review1 = review1.lower()
#     review1 = review1.split()
    
#     review2 = [ps.stem(word) for word in review1 if not word in stopwords.words('english')]
#     review2 = ' '.join(review2)
#     corpus1.append(review2)
# print(len(corpus1))

### Comparing the cleaned corpus to the original messages :

In [6]:
# Show each message in full instead of truncating the column.
# None means "no limit"; the former sentinel -1 was deprecated in pandas 1.0
# and is no longer accepted by later releases.
pd.set_option('display.max_colwidth', None)
print('Corpus :')
print(corpus[:4],'\n')
print('Original form :')
print(messages['message'][:4])

Corpus :
['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli', 'u dun say earli hor u c alreadi say'] 

Original form :
0    Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            
1    Ok lar... Joking wif u oni...                                                                                                                              
2    Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3    U dun say so early hor... U c already then say...                                                                                                          
Name: message, dtype: object


### Creating the TF-IDF model :

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 2500 terms and turn the cleaned corpus into a dense
# TF-IDF matrix (one row per message).
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
# Print only the non-zero weights; the full matrix is mostly zeros.
print(X[X > 0])

# Encode the target explicitly: 1 = spam, 0 = ham.
# Selecting the second get_dummies column relied on the alphabetical order of
# the label values and produced a dtype that varies between pandas versions.
y = (messages['label'] == 'spam').astype(int).values
#y[:5]

[0.30390947 0.38787044 0.34325553 ... 0.4841431  0.69815878 0.52742751]


### Splitting the data into training and test sets :

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [9]:
# Train a multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = spam_detect_model.predict(X_test)

### Accuracy of the model on the test set :

In [10]:
# Mean accuracy of the fitted classifier on the held-out split.
spam_detect_model.score(X=X_test, y=y_test)

0.979372197309417

### Testing my own messages :

In [11]:
# Hand-written messages used as a quick sanity check of the trained model.
sample_messages = ['winner gets ten lakhs cash prize',
                   'Exchange your mobile today at a very minimal price',
                   'I will be home at night',
                   'Congratulations on winning a brand new iPhone 11',
                   'Your meeting is scheduled on this saturday at 10am']

# Build the stop-word set once instead of re-evaluating it for every token.
sample_stopwords = set(stopwords.words('english'))

# Apply the same cleaning steps used for the training corpus. The result is
# stored under its own name so the training `corpus` is not overwritten and
# earlier cells stay safe to re-run.
sample_corpus = []
for message in sample_messages:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in sample_stopwords]
    sample_corpus.append(' '.join(stems))

# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the training corpus.
sample_features = cv.transform(sample_corpus).toarray()

pred = spam_detect_model.predict(sample_features)
# Pair each prediction with its message directly instead of tracking an index.
for label, message in zip(pred, sample_messages):
    if label == 1:
        print("Spam : ", message)
    else:
        print("Ham : ", message)

# if label==1:
#   print('Spam')
# else:
#   print('Ham')


Spam :  winner gets ten lakhs cash prize
Spam :  Exchange your mobile today at a very minimal price
Ham :  I will be home at night
Spam :  Congratulations on winning a brand new iPhone 11
Ham :  Your meeting is scheduled on this saturday at 10am


# Thank you  :)

                                                                                                    Source :- YouTube