# 1. Importing libraries

In [1]:
import regex as re
import pandas as pd
import numpy as np
import nltk

from nltk import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# 2. Loading the dataset

In [2]:
data = pd.read_csv("dataset/spam classification 3 (csv)/Datasets/spam_normal_emails.csv")
print("The dataset has ", len(data), " emails")
data.head()

The dataset has  5728  emails


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


# 3. Text cleaning

In [3]:
clean = []
for text in data['text']:
    cleaned_text = ""
    for char in text:
        if re.match(r"([a-zA-Z])|([0-9])", char) != None or char == " ":
            cleaned_text+=char.lower()
    clean.append(cleaned_text)

# 4. Tokenization

In [4]:
tokens = [word_tokenize(x) for x in clean]
# tokens = [word_tokenize(x) for x in clean]

filtered_tokens = []
en_stopwords = list(set(nltk.corpus.stopwords.words('english')))

# tokens that are not stopwords collected here
for i in tokens:
    filtered_tokens.append([])
    for j in i:
        if j in en_stopwords:
            continue
        else: filtered_tokens[-1].append(j)

# 5. Stemming

In [5]:
# initialize Lancaster Stemmer
LS = LancasterStemmer()
stemmed_words = []
for l in filtered_tokens: stemmed_words.append([LS.stem(w) for w in l])

# 6. Generating words occurrence matrix

In [6]:
flat_list = [i for sublist in stemmed_words for i in sublist]

flat_list_reduced = list(set(flat_list))

In [7]:
data_stemmed_words = pd.DataFrame(columns = flat_list_reduced, index=np.arange(len(stemmed_words)))
index=0
for sequence in stemmed_words:
    words_in_seq = [sequence.count(word) for word in flat_list_reduced]
    data_stemmed_words.iloc[index] = words_in_seq
    index+=1

# 7. Preparing the train/test sets

In [8]:
scaler = MinMaxScaler()
X = scaler.fit_transform(data_stemmed_words.values)
y = data["spam"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# 8. SVM Classification

In [9]:
# Perform classification with SVM, kernel=linear
classifier_linear = SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)
acc = accuracy_score(y_test, prediction_linear)*100
print("acc = ", "{:.2f}".format(acc), "%")

acc =  96.04 %
