### Import File and libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

### Explore Data

In [2]:
# Load the spam/ham email dataset; the relative path means the CSV must sit in the working directory.
data = pd.read_csv('spam_ham_dataset.csv')

In [3]:
# Preview the first rows to see the available columns and the raw text format.
data.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
# (rows, columns) before any cleaning.
data.shape

(5171, 3)

In [5]:
# List the column names used below ('text' as input, 'label_num' as target).
data.columns

Index(['label', 'text', 'label_num'], dtype='object')

In [6]:
# Drop fully duplicated rows. Rebinding (instead of inplace=True) keeps the cell
# chain-friendly and avoids silently mutating the frame earlier cells displayed.
data = data.drop_duplicates()

In [7]:
# Re-check the shape to see how many rows the de-duplication removed.
data.shape

(4993, 3)

In [8]:
# Count missing values per column.
data.isnull().sum()

label        0
text         0
label_num    0
dtype: int64

### Process Text

In [9]:
_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK list is fetched only once


def process_text(text, stop_words=None):
  """Tokenize a message: strip punctuation, split on whitespace, drop stopwords.

  Used as the ``analyzer`` callable of ``CountVectorizer``, so it receives one
  raw document at a time and returns that document's tokens.

  Parameters
  ----------
  text : str
      Raw message text.
  stop_words : container of str, optional
      Lower-cased words to discard. When omitted, NLTK's English stopword
      list is used (requires the NLTK ``stopwords`` corpus to be available).

  Returns
  -------
  list of str
      Remaining tokens, in their original order and original casing.
  """
  global _ENGLISH_STOPWORDS
  if stop_words is None:
    if _ENGLISH_STOPWORDS is None:
      # Build the lookup set once. Fetching stopwords.words() for every token
      # and scanning it as a list repeated the same work for each word.
      _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOPWORDS

  # 1 - remove punctuation (every character listed in string.punctuation)
  no_punct = text.translate(str.maketrans('', '', string.punctuation))

  # 2 - remove stopwords; compare case-insensitively but keep the token's casing
  return [word for word in no_punct.split() if word.lower() not in stop_words]

### Split Dataset

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label_num'],
    test_size=0.2,
    random_state=0,
)

### Create Pipeline

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [12]:
# Stages run in order: token counts (tokenized by process_text),
# TF-IDF re-weighting of those counts, then a linear-kernel SVM classifier.
pipeline_stages = (
    CountVectorizer(analyzer=process_text),
    TfidfTransformer(),
    SVC(kernel='linear'),
)
pipeline = make_pipeline(*pipeline_stages)

In [13]:
# Fit every pipeline stage on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(analyzer=<function process_text at 0x00000195F60C4B80>)),
                ('tfidftransformer', TfidfTransformer()),
                ('svc', SVC(kernel='linear'))])

In [14]:
# Predict labels for the held-out test messages.
pred = pipeline.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [16]:
# Per-class precision/recall/F1, followed by the confusion matrix
# (rows = true labels, columns = predicted labels) and the overall accuracy.
print(classification_report(y_test, pred))
print()
print('Confusion Matrix \n', confusion_matrix(y_test, pred))
print()
print('Accuracy Score \n', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       722
           1       0.99      1.00      0.99       277

    accuracy                           1.00       999
   macro avg       0.99      1.00      1.00       999
weighted avg       1.00      1.00      1.00       999


Confusion Metrix 
 [[718   4]
 [  0 277]]

Accuracy Score 
 0.995995995995996


In [17]:
import pickle

In [18]:
# Persist the fitted pipeline. The context manager closes the file even if pickling fails.
# Note: the pipeline stores process_text by reference, so that function must be
# defined/importable under the same name wherever model.pkl is loaded.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

### Test on Random data sample

In [20]:
# Reload the saved pipeline; the context manager guarantees the file handle is closed.
# SECURITY: unpickling can execute arbitrary code - only load model files you trust.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [19]:
# Hand-pasted promotional email used as an ad-hoc input for the reloaded model.
text = '''McAfee(TM)
Recommended by:   Lenovo
BUY NOW
Your trial expired
13 Mar 2022
Your McAfee protection expired 3 days ago

Save an extra 10% with
this email exclusive!
That's a total savings of 70%
on protection!
Get protection and save
Your all-in-one protection
includes these great features


Online privacy with Secure VPN

Award-winning antivirus

Mobile protection app

Safer web browsing

Multi-device compatibility
Award-winning internet security
Protecting more than 600 million consumer‑connected devices.
PC EDITORS CHOICE
AV TEST | TOP PRODUCT'''

In [21]:
# predict expects an iterable of documents, hence the single-item list.
# In the data preview above, label_num 1 is paired with the 'spam' label.
model.predict([text])

array([1], dtype=int64)