In [1]:
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
spam = pd.read_csv("spam_ham_dataset.csv")
spam.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
spam.shape

(5171, 4)

In [4]:
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [5]:
spam = spam.drop(columns = ["Unnamed: 0"])
spam.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
spam.label.value_counts(), spam.label_num.value_counts()

(ham     3672
 spam    1499
 Name: label, dtype: int64,
 0    3672
 1    1499
 Name: label_num, dtype: int64)

In [7]:
stopword = set(stopwords.words("english"))
words = set(nltk.corpus.words.words())

def remove_subject(text):
    text = [word for word in str(text).split()]
    text = text[1:]
    return " ".join(text)

def lower_case(text):
    text = [word.lower() for word in str(text).split()]
    return " ".join(text)

def stop_words(text):
    text = [word for word in str(text).split() if word not in stopword]
    return " ".join(text)

def remove_numbers(text):
    text = [word for word in str(text).split() if not word.isdigit()]
    return " ".join(text)

def remove_spl(text):
    spl = ['!','@','#','$','%','^','&','*','(',')','_','-','+','=','~','`','[',']','{','}',';',"'",':','"',',','<','.','>','/','?','|']
    text = [word for word in str(text).split() if word not in spl]
    return " ".join(text)

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(i) for i in str(text).split()]       # lemmatizing
    text = [word for word in text if word in words]     # only including english words
    return " ".join(text)

def preprocess(df):
    df.text = df.text.apply(lambda x: remove_subject(x))
    df.text = df.text.apply(lambda x: lower_case(x))
    df.text = df.text.apply(lambda x: stop_words(x))
    df.text = df.text.apply(lambda x: remove_numbers(x))
    df.text = df.text.apply(lambda x: remove_spl(x))
    df.text = df.text.apply(lambda x: lemmatization(x))
    return df

In [8]:
spam = preprocess(spam)

spam.head()

Unnamed: 0,label,text,label_num
0,ham,meter follow note gave preliminary flow data p...,0
1,ham,see attached file,0
2,ham,neon retreat ho ho ho around wonderful time ye...,0
3,spam,window office cheap main abasement darer prude...,1
4,ham,spring deal book revenue understanding u check...,0


In [9]:
data = spam["text"]
target = spam["label_num"]

In [10]:
# Making a model using TF-IDF vectorizer i.e. each unique word is given a tf-idf score which is then used in model input

vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data)

In [11]:
train_x, test_x, train_y, test_y = train_test_split(X_tfidf, target, test_size = 0.2, random_state = 50)

In [12]:
model = SVC()
model.fit(train_x, train_y)

In [13]:
pred = model.predict(test_x)
pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [14]:
print("Accuracy = {}%".format(round(accuracy_score(test_y, pred)*10,2)))

Accuracy = 9.81%


In [15]:
print(classification_report(test_y, pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       744
           1       0.95      0.99      0.97       291

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [16]:
# Making a model using Count vectorizer i.e. Bag of words

vectorizer = CountVectorizer()
X_CV = vectorizer.fit_transform(data)

In [17]:
train_x, test_x, train_y, test_y = train_test_split(X_CV, target, test_size = 0.2, random_state = 50)

In [18]:
model = SVC()

model.fit(train_x, train_y)

In [19]:
pred = model.predict(test_x)
pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [20]:
print("Accuracy = {}%".format(round(accuracy_score(test_y, pred)*100,2)))

Accuracy = 96.71%


In [21]:
print(classification_report(test_y, pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       744
           1       0.92      0.96      0.94       291

    accuracy                           0.97      1035
   macro avg       0.95      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035

