In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5173,ham,U 2.,,,
3682,ham,When did i use soc... I use it only at home......,,,
4616,ham,Good. do you think you could send me some pix?...,,,
4917,ham,Sitting in mu waiting for everyone to get out ...,,,
3522,ham,Try neva mate!!,,,


In [4]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [5]:
# renaming...
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

Unnamed: 0,target,text
319,ham,"Coffee cake, i guess..."
3744,ham,"Aight, let me know when you're gonna be around..."
5052,ham,Lmao you know me so well...
1631,ham,We not watching movie already. Xy wants 2 shop...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [6]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [7]:
df['target']=encoder.fit_transform(df['target'])

In [8]:
df.sample(5)

Unnamed: 0,target,text
850,0,U reach orchard already? U wan 2 go buy ticket...
2913,1,Sorry! U can not unsubscribe yet. THE MOB offe...
1557,0,Wat r u doing?
4332,0,Y bishan lei... I tot Ì_ say lavender?
4581,0,Wow didn't think it was that common. I take it...


In [9]:
df = df.drop_duplicates(keep='first')

In [10]:
df.duplicated().sum()

0

In [11]:
df['target'].value_counts()

0    4516
1     653
Name: target, dtype: int64

In [12]:
import nltk

In [13]:
df['num_characters']=df['text'].apply(len)

In [14]:
df.head()

Unnamed: 0,target,text,num_characters
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [15]:
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [16]:
df.head()

Unnamed: 0,target,text,num_characters,num_words
0,0,"Go until jurong point, crazy.. Available only ...",111,24
1,0,Ok lar... Joking wif u oni...,29,8
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37
3,0,U dun say so early hor... U c already then say...,49,13
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15


In [17]:
df['num_sentence']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [18]:
df.head()

Unnamed: 0,target,text,num_characters,num_words,num_sentence
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2
1,0,Ok lar... Joking wif u oni...,29,8,2
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2
3,0,U dun say so early hor... U c already then say...,49,13,1
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1


In [19]:
df[df['target']==0][['num_characters','num_words', 'num_sentence']].describe()

# Ham Messages

Unnamed: 0,num_characters,num_words,num_sentence
count,4516.0,4516.0,4516.0
mean,70.459256,17.120903,1.799601
std,56.358207,13.493725,1.278465
min,2.0,1.0,1.0
25%,34.0,8.0,1.0
50%,52.0,13.0,1.0
75%,90.0,22.0,2.0
max,910.0,220.0,28.0


In [20]:
df[df['target']==1][['num_characters','num_words', 'num_sentence']].describe()

# SPAM MSG

Unnamed: 0,num_characters,num_words,num_sentence
count,653.0,653.0,653.0
mean,137.891271,27.667688,2.967841
std,30.137753,7.008418,1.483201
min,13.0,2.0,1.0
25%,132.0,25.0,2.0
50%,149.0,29.0,3.0
75%,157.0,32.0,4.0
max,224.0,46.0,8.0


In [21]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [22]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
            
    return " ".join(y)

In [23]:
transform_text('Save extra Rs 2000 over existing offers on appliances & electronics + up to 10% instant card offers! Get them now on Tata Neu https://tneu.in/x07QWVQ T&C apply')

'save extra rs 2000 exist offer applianc electron 10 instant card offer get tata neu http c appli'

In [24]:
df['transformed_text']=df['text'].apply(transform_text)

In [25]:
df.head()

Unnamed: 0,target,text,num_characters,num_words,num_sentence,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,29,8,2,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,49,13,1,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,nah think goe usf live around though


In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [27]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [28]:
y=df['target'].values

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=21)

In [30]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [31]:
mnb=MultinomialNB()

In [32]:
mnb.fit(X_train,y_train)
y_pred1 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

0.965183752417795
1.0


In [33]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model1.pkl','wb'))

# Exporting Model And Vectorizer.