In [1]:
# gensim and spaCy are imported by the import cell below, so install them with the rest.
# %pip (unlike !pip) installs into the environment of the running kernel.
# TODO: pin versions once the working set is known, for reproducibility.
%pip install -q pandas numpy scikit-learn matplotlib gensim spacy

In [1]:
# Fetch the small English spaCy pipeline that spacy.load("en_core_web_sm") needs.
!python -m spacy download en_core_web_sm -q

[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec, FastText
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
class SpamClassifier:
    """Email spam classifier: averaged word embeddings fed to logistic regression.

    Intended call order: ``preprocess_data`` -> ``split_data`` ->
    ``create_model`` -> ``evaluate_model``.

    Attributes:
        data: DataFrame read from ``data_path``; ``preprocess_data`` reads its
            ``email`` and ``label`` columns.
        nlp: spaCy ``en_core_web_sm`` pipeline used to tokenise and lemmatise.
        random_state: Seed passed to the train/test split, the embedding model
            and the logistic regression.
        vector_size: Embedding dimensionality used when training models.
        X, y: Token lists and labels, populated by ``preprocess_data``.
        X_train, X_test, y_train, y_test: Populated by ``split_data``.
    """

    def __init__(self, data_path, random_state=42):
        """Read the CSV at ``data_path`` and load the spaCy pipeline.

        Args:
            data_path: Path of the CSV file handed to ``pandas.read_csv``.
            random_state: Seed reused by every stochastic step of the class.
        """
        self.y_test = None
        self.y_train = None
        self.X_test = None
        self.X_train = None
        self.y = None
        self.X = None
        self.data = pd.read_csv(data_path)
        self.nlp = spacy.load("en_core_web_sm")
        self.random_state = random_state
        self.vector_size = 256

    def _tokenize(self, text):
        """Return the lowercase lemmas of ``text`` that survive the noise filter.

        Stop words, punctuation, digits, number-like tokens, e-mail-like tokens
        and whitespace tokens are dropped.
        """
        return [
            token.lemma_.lower() for token in self.nlp(text) if
            not token.is_stop
            and not token.is_punct
            and not token.is_digit
            and not token.like_email
            and not token.like_num
            and not token.is_space
        ]

    def preprocess_data(self):
        """Drop rows without an email, tokenise the rest, and set ``X`` / ``y``.

        Adds a ``cleaned_text`` column (one token list per email) to ``data``.
        """
        # Rows with a missing email cannot be passed to the spaCy pipeline.
        self.data.dropna(subset=['email'], inplace=True)

        self.data['cleaned_text'] = self.data['email'].apply(self._tokenize)

        self.X = self.data['cleaned_text']
        self.y = self.data['label']

    def split_data(self, test_size=0.2):
        """Split ``X`` / ``y`` into train and test parts.

        Args:
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=self.random_state
        )

    def create_model(self, vectorization_method="Word2Vec", sg=1):
        """Train an embedding model on the training token lists.

        Args:
            vectorization_method: ``'Word2Vec'`` or ``'FastText'``.
            sg: Forwarded to gensim as ``sg`` (the notebook uses 1 for
                skip-gram and 0 for CBOW).

        Returns:
            The trained gensim model.

        Raises:
            ValueError: If ``vectorization_method`` is not a supported name.
        """
        if vectorization_method == 'Word2Vec':
            model_cls = Word2Vec
        elif vectorization_method == 'FastText':
            model_cls = FastText
        else:
            raise ValueError("Invalid vectorization method. Choose from ['Word2Vec', 'FastText']")

        # Both model classes take the same hyperparameters, so they are listed once.
        # vector_size comes from the instance so it always matches the dimension
        # get_avg_word_vector falls back to.
        # NOTE(review): gensim documents that `seed` alone does not give fully
        # deterministic training unless workers=1 — confirm if exact
        # reproducibility matters.
        return model_cls(self.X_train,
                         vector_size=self.vector_size,
                         window=7,
                         min_count=10,
                         sg=sg,
                         hs=0,
                         negative=5,
                         epochs=25,
                         seed=self.random_state)

    def evaluate_model(self, model):
        """Fit logistic regression on averaged embeddings and score the test split.

        Args:
            model: Embedding model exposing word vectors through ``model.wv``.

        Returns:
            Accuracy of the classifier on ``X_test`` / ``y_test``.
        """
        train_vectors = self.get_avg_word_vector(self.X_train, model)
        test_vectors = self.get_avg_word_vector(self.X_test, model)

        lr = LogisticRegression(random_state=self.random_state)
        lr.fit(train_vectors, self.y_train)
        y_pred_test = lr.predict(test_vectors)

        accuracy_test = accuracy_score(self.y_test, y_pred_test)
        return accuracy_test

    def get_avg_word_vector(self, text, model):
        """Represent each document as the mean of its known word vectors.

        Args:
            text: Iterable of token lists (one list per document).
            model: Embedding model exposing word vectors through ``model.wv``.

        Returns:
            A list with one float64 vector per document. Documents with no
            token known to the model get an all-zero vector.
        """
        keyed_vectors = model.wv
        # Prefer the model's own dimensionality so a model trained with a
        # different size still yields consistently shaped vectors.
        dim = getattr(keyed_vectors, 'vector_size', self.vector_size)
        # NOTE(review): for gensim FastText vectors the membership test below is
        # expected to succeed for out-of-vocabulary words too (n-gram lookup) —
        # confirm against the installed gensim version.
        vectors = []
        for tokens in text:
            total = np.zeros(dim)
            n_known = 0
            for token in tokens:
                if token in keyed_vectors:
                    total += keyed_vectors[token]
                    n_known += 1
            # An exact integer count keeps the mean unbiased and makes the
            # "no known token" case reachable; `total` is still all zeros there.
            vectors.append(total / n_known if n_known else total)
        return vectors

In [8]:
# The constructor reads the CSV and loads the spaCy pipeline; the seed is reused by every later step.
spam_classifier = SpamClassifier(data_path='spam_or_not_spam.csv', random_state=42)

In [9]:
%%time
# Runs spaCy over every email, so this cell is timed.
spam_classifier.preprocess_data()

CPU times: total: 21.6 s
Wall time: 1min 11s


In [10]:
# Hold out 20% of the emails as the test set.
spam_classifier.split_data(0.2)

## Word2Vec Skip-gram (SG)

In [30]:
# Train skip-gram Word2Vec (sg=1) on the training emails, then report held-out accuracy.
w2v_sg_model = spam_classifier.create_model(vectorization_method='Word2Vec', sg=1)
w2v_sg_accuracy = spam_classifier.evaluate_model(w2v_sg_model)
print("Accuracy of test sample:", w2v_sg_accuracy)

Accuracy of test sample: 0.985


In [31]:
# Top-5 words most similar to 'virus' under the skip-gram embeddings.
w2v_sg_model.wv.most_similar(positive=['virus'], topn=5)

[('hazardous', 0.5417191982269287),
 ('unwanted', 0.4414699375629425),
 ('destructive', 0.41856658458709717),
 ('prey', 0.4155874252319336),
 ('norton', 0.40235215425491333)]

In [34]:
# Which of these five words fits least with the others according to the skip-gram embeddings?
w2v_sg_model.wv.doesnt_match(['buy', 'money', 'hacker', 'virus', 'linux'])

'money'

In [32]:
# Cosine similarity between 'virus' and 'hacker' under the skip-gram embeddings.
w2v_sg_model.wv.similarity('virus', 'hacker')

0.40045553

## Word2Vec Continuous Bag-of-Words (CBOW)

In [28]:
# Train CBOW Word2Vec (sg=0) on the training emails, then report held-out accuracy.
w2v_cbow_model = spam_classifier.create_model(vectorization_method='Word2Vec', sg=0)
w2v_cbow_accuracy = spam_classifier.evaluate_model(w2v_cbow_model)
print("Accuracy of test sample:", w2v_cbow_accuracy)

Accuracy of test sample: 0.9933333333333333


In [18]:
# Top-5 words most similar to 'virus' under the CBOW embeddings.
w2v_cbow_model.wv.most_similar(positive=['virus'], topn=5)

[('prey', 0.670638382434845),
 ('destructive', 0.6245026588439941),
 ('hacker', 0.6128127574920654),
 ('computer', 0.5817774534225464),
 ('hazardous', 0.5692459344863892)]

In [33]:
# Which of these five words fits least with the others according to the CBOW embeddings?
w2v_cbow_model.wv.doesnt_match(['buy', 'money', 'hacker', 'virus', 'linux'])

'money'

In [27]:
# Cosine similarity between 'virus' and 'hacker' under the CBOW embeddings.
w2v_cbow_model.wv.similarity('virus', 'hacker')

0.61281276

## FastText

In [29]:
# Train FastText (sg=1) on the training emails, then report held-out accuracy.
fasttext_model = spam_classifier.create_model(vectorization_method='FastText', sg=1)
fasttext_accuracy = spam_classifier.evaluate_model(fasttext_model)
print("Accuracy of test sample:", fasttext_accuracy)

Accuracy of test sample: 0.985


In [19]:
# Top-5 words most similar to 'virus' under the FastText embeddings.
fasttext_model.wv.most_similar(positive=['virus'], topn=5)

[('hazardous', 0.5704951882362366),
 ('unwanted', 0.48871809244155884),
 ('hacker', 0.4587479531764984),
 ('valuable', 0.4294900596141815),
 ('norton', 0.40088531374931335)]

In [22]:
# Which of these five words fits least with the others according to the FastText embeddings?
fasttext_model.wv.doesnt_match(['buy', 'money', 'hacker', 'virus', 'linux'])

'money'

In [26]:
# Cosine similarity between 'virus' and 'hacker' under the FastText embeddings.
fasttext_model.wv.similarity('virus', 'hacker')

0.45874798