In [1]:
import pandas as pd
import re
import numpy as np
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

  "class": algorithms.Blowfish,


In [2]:
def load_data(filepath, sep_type):
    """Load dataset from file and return dataframe."""
    return pd.read_csv(filepath, sep=f'{sep_type}', names=['label', 'message'])

In [3]:
def preprocess_email(email):
    lemmatizer = WordNetLemmatizer()
    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word, pos='v') for word in email_cleaned if not word in stopwords.words('english')]
    email_cleaned = ' '.join(email_cleaned)
    
    return email_cleaned

In [4]:
def avg_word2vec(words):
    """Chuyển văn bản thành vector trung bình của Word2Vec (áp dụng cho cả huấn luyện và dự đoán)."""
    vectors = [model.wv[word] for word in words if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [5]:
def tokenizer(corpus):
    words = []
    for sent in corpus:
        sent_token = sent_tokenize(sent)
        for sent in sent_token:
            words.append(simple_preprocess(sent))
    return words

In [6]:
def data_preparation(corpus, words):
    X = []
    for i in range(len(words)):
        X.append(avg_word2vec(words[i]))

    # Dependent feature
    y = messages[list(map(lambda x: len(x) > 0, corpus))]
    y = pd.get_dummies(y['label']).astype(int)
    y = y.iloc[:, 0].values

    df_list = [pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))]
    df = pd.concat(df_list, ignore_index=True)
    df['Output'] = y
    df.dropna(inplace = True)

    # Independent feature

    X = df.drop('Output', axis=1)

    y = df['Output']

    return X, y

In [7]:
# ------------------ LOAD NECESSARY DATASET AND MODEL ------------------ 

In [8]:
messages = load_data('SMSSpamCollection.txt', '\t')

wv = api.load('word2vec-google-news-300')

# Data pre-processing
corpus = [preprocess_email(text) for text in messages['message']]

words = tokenizer(corpus)

# CREATE WORD2VEC MODEL
# model = gensim.models.Word2Vec(words, vector_size=300, window=10, min_count=2, workers=4)

model = gensim.models.Word2Vec(
    words,              # your tokenized corpus
    vector_size=100,    # embedding size
    window=5,           # context window size
    min_count=5,        # include all words
    sg=1,               # use Skip-Gram
    workers=4,          # parallelization
    epochs=10           # number of iterations
)

In [9]:
# ------------------ X Y IMPLEMENT ------------------ 

In [10]:
X, y = data_preparation(corpus, words)

In [11]:
# ------------------ TRAINING MODEL ------------------ 

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier()

# classifier.fit(X_train, y_train)

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

In [13]:
# ------------------ EVALUATE ------------------ 

In [14]:
from sklearn.metrics import accuracy_score, classification_report

In [15]:
y_pred = classifier.predict(X_test)

In [16]:
accuracy_score(y_test, y_pred)

0.9712488769092543

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89       151
           1       0.98      0.99      0.98       962

    accuracy                           0.97      1113
   macro avg       0.95      0.93      0.94      1113
weighted avg       0.97      0.97      0.97      1113



In [18]:
# ------------------ PREDICT ------------------ 

In [19]:
# Ví dụ email mới
new_email = "Hey mohan, can we get together to watch footbal game tomorrow?"

# Bước 1: Xử lý dữ liệu email mới
def predict_email(email, classifier):
    lemmatizer = WordNetLemmatizer()

    def preprocess_email(email):
        # Loại bỏ ký tự không mong muốn
        email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
        # Chuyển thành chữ thường và tách từ
        email_cleaned = email_cleaned.lower().split()
        # Lemmatization
        email_cleaned = [lemmatizer.lemmatize(word, pos='v') for word in email_cleaned if not word in stopwords.words('english')]

        print(email_cleaned)

        email_vector = avg_word2vec(email_cleaned)
        email_vector = email_vector.reshape(1, -1)
        predicted_label = classifier.predict(email_vector)

        if predicted_label[0] == 0:
            print("Đây là email spam.")
        else:
            print("Đây là email bình thường (ham).")

    if(type(email) == str):
      preprocess_email(email)
    if(type(email) == list):
      for corpus in email:
        preprocess_email(corpus)
        

predict_email(new_email, classifier)
predict_email('Get rich quick with our investment opportunity! Join now and start earning big profits!', classifier)

['hey', 'mohan', 'get', 'together', 'watch', 'footbal', 'game', 'tomorrow']
Đây là email bình thường (ham).
['get', 'rich', 'quick', 'investment', 'opportunity', 'join', 'start', 'earn', 'big', 'profit']
Đây là email bình thường (ham).


In [22]:
emails=[
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'hi, how are you?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    'i won free vacation , you can win click below link',
    "Hey Sarah, are you available for a coffee chat this weekend?",
    "Congratulations! You've been selected as the winner of a $1000 cash prize. Click the link to claim your reward now!"
    'Claim your exclusive discount code now and save 30% on your next purchase! Limited time offer!',
    "You've won a luxury vacation package! Click the link to claim your prize and enjoy a dream getaway!",
    'Get rich quick with our investment opportunity! Join now and start earning big profits!',
    'Hey there, how about catching up for lunch this weekend? It\'s been a while since we last met!',
    'Reminder: Your appointment with the doctor is scheduled for tomorrow at 10 AM. Please remember to bring any necessary documents.',
    'Invitation: Join us for a team-building event this Friday at the local park. Food and games provided!',
]

predict_email(emails, classifier)

['hey', 'mohan', 'get', 'together', 'watch', 'footbal', 'game', 'tomorrow']
Đây là email bình thường (ham).
['hi']
Đây là email bình thường (ham).
['upto', 'discount', 'park', 'exclusive', 'offer', 'dont', 'miss', 'reward']
Đây là email spam.
['free', 'vacation', 'win', 'click', 'link']
Đây là email spam.
['hey', 'sarah', 'available', 'coffee', 'chat', 'weekend']
Đây là email bình thường (ham).
['congratulations', 'select', 'winner', 'cash', 'prize', 'click', 'link', 'claim', 'reward', 'claim', 'exclusive', 'discount', 'code', 'save', 'next', 'purchase', 'limit', 'time', 'offer']
Đây là email spam.
['luxury', 'vacation', 'package', 'click', 'link', 'claim', 'prize', 'enjoy', 'dream', 'getaway']
Đây là email spam.
['get', 'rich', 'quick', 'investment', 'opportunity', 'join', 'start', 'earn', 'big', 'profit']
Đây là email bình thường (ham).
['hey', 'catch', 'lunch', 'weekend', 'since', 'last', 'meet']
Đây là email bình thường (ham).
['reminder', 'appointment', 'doctor', 'schedule', 'tomo

In [21]:
predict_email('free 1000 claim please kill self', classifier)

['free', 'claim', 'please', 'kill', 'self']
Đây là email spam.
