In [1]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords

# Fetch the NLTK resources the notebook relies on so it runs on a fresh
# environment: 'punkt' is needed by nltk.word_tokenize / nltk.sent_tokenize,
# and 'stopwords' by stopwords.words('english'), which otherwise raises
# LookupError when the corpus has never been downloaded.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for the tokenizers - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the raw dataset from spam.csv, decoding the file as ISO-8859-1.
data = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Keep only the two columns used below, message text ('v2') first and
# label ('v1') second; every other column of the CSV is dropped.
data = data.loc[:, ['v2', 'v1']]

In [4]:
# Replace the opaque CSV headers with descriptive column names.
column_names = {'v2': 'message', 'v1': 'labels'}
data = data.rename(columns=column_names)

In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

message    0
labels     0
dtype: int64

In [6]:
# Remove repeated rows, keeping the first occurrence of each.
# The index is not reset, so it keeps gaps where duplicates were dropped.
data = data.drop_duplicates(keep='first')

In [7]:
# Simple length statistics for every message: characters, word tokens
# and sentences (the latter two via the NLTK tokenizers).
messages = data['message']
data['num_character'] = messages.map(len)
data['num_words'] = messages.map(lambda msg: len(nltk.word_tokenize(msg)))
data['num_sentence'] = messages.map(lambda msg: len(nltk.sent_tokenize(msg)))

# Encode the string labels as integers: ham -> 0, spam -> 1.
# Re-running this cell on already-encoded labels would map them to NaN,
# because 0 and 1 are not keys of the mapping.
label_codes = {'ham': 0, 'spam': 1}
data['labels'] = data['labels'].map(label_codes)

**preprocessing**

In [8]:
# English stop words from NLTK, stored as a set so the membership test
# in clean_text is a hash lookup rather than a list scan.
english_stopwords = stopwords.words('english')
STOPWORD = set(english_stopwords)

In [9]:
def clean_text(text, stop_words=None):
    """Normalise a message into a single-space-separated string of tokens.

    The text is lower-cased, every run of non-word characters and digits is
    turned into a separator, and tokens found in the stop-word collection
    are dropped. Underscores count as word characters and are kept.

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : collection of str, optional
        Tokens to remove. Defaults to the module-level ``STOPWORD`` set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = STOPWORD
    # Raw string: '\W' / '\d' in a plain literal are invalid escape sequences.
    # One pass over "non-word or digit" runs yields the same tokens as
    # replacing \W, then \d+, then collapsing whitespace, because split()
    # below treats any run of whitespace as one separator.
    separated = re.sub(r'[\W\d]+', ' ', text.lower())
    return " ".join(word for word in separated.split() if word not in stop_words)

In [10]:
# Run every message through clean_text and store the result next to the
# original text, then preview the first rows.
cleaned_messages = data['message'].map(clean_text)
data['clean_text'] = cleaned_messages
data.head()

Unnamed: 0,message,labels,num_character,num_words,num_sentence,clean_text
0,"Go until jurong point, crazy.. Available only ...",0,111,24,2,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,0,29,8,2,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,37,2,free entry wkly comp win fa cup final tkts st ...
3,U dun say so early hor... U c already then say...,0,49,13,1,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",0,61,15,1,nah think goes usf lives around though


**split the input**

**model build**

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the TF-IDF vocabulary to at most 3000 terms.
MAX_FEATURES = 3000
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)

In [12]:
# Vectorise the cleaned messages and densify the result; the labels are
# taken out of the frame as a plain NumPy array.
tfidf_matrix = tfidf.fit_transform(data['clean_text'])
X = tfidf_matrix.toarray()
y = data['labels'].to_numpy()

In [13]:
# Hold out 20% of the messages for evaluation (fixed seed for repeatability).
# The split is made on the cleaned text rather than on the already
# vectorised matrix, and the vectorizer is then (re)fitted on the training
# portion only. Fitting TF-IDF on all rows before splitting lets the test
# messages influence the vocabulary and IDF weights, which leaks test
# information into the features and inflates the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_text'], y, test_size=0.20, random_state=1
)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

In [14]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [15]:
# One instance of each Naive Bayes variant, all with default settings.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

**Finding the best model**

In [16]:
# Candidate estimators to compare, in evaluation order.
model = [
    gnb,
    mnb,
    bnb,
]

In [17]:
# Fit each candidate on the training split and record its
# [accuracy, precision] on the held-out split, keyed by the estimator.
best_model = {}
for clf in model:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    # The confusion matrix is computed for inspection; it is not stored in best_model.
    cm = confusion_matrix(y_test, y_pred)
    ac, ps = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)
    best_model[clf] = [ac, ps]
    

In [18]:
# Pick the winning estimator. Scores are [accuracy, precision] lists, which
# compare element by element: accuracy decides, precision breaks ties.
# From here on `model` is that single estimator, no longer the candidate list.
model = max(best_model, key=best_model.get)

In [19]:
import pickle
# Persist the fitted vectorizer and the best-scoring estimator. The `with`
# blocks make sure each file is flushed and closed once it is written.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(max(best_model, key=lambda x: best_model[x]), model_file)

In [21]:
# Classify one message typed in by the user.
input_sms = input("Enter the sms:")

# Apply the same cleaning that was applied to the training messages.
transformed_sms = clean_text(input_sms)

# The estimators were trained on dense arrays (.toarray()), so densify here
# too; GaussianNB in particular does not accept a sparse matrix.
vector_input = tfidf.transform([transformed_sms]).toarray()

# predict returns an array with one entry per input row; one row was passed.
result = model.predict(vector_input)

if result[0] == 1:
    print("\n\n\n****----->Spam<------*****\n\n\n")
else:
    print("\n\n\n****----->Not Spam<------*****\n\n\n")

Enter the sms: Yay! It’s another Two for Tuesday. Order a box of candy today and get another one FREE. Hurry now and visit chocolates.com to order. Valid for orders above $20.





\****----->Spam<------*****



