In [1]:
import pandas as pd
import re
import numpy as np
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

  "class": algorithms.Blowfish,


In [2]:
def load_data(filepath, sep_type):
    """Load dataset from file and return dataframe."""
    return pd.read_csv(filepath, sep=f'{sep_type}', names=['label', 'message'])

In [3]:
def preprocess_email(email):
    lemmatizer = WordNetLemmatizer()
    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word, pos='v') for word in email_cleaned if not word in stopwords.words('english')]
    email_cleaned = ' '.join(email_cleaned)
    
    return email_cleaned

In [4]:
def avg_word2vec(words):
    """Chuyển văn bản thành vector trung bình của Word2Vec (áp dụng cho cả huấn luyện và dự đoán)."""
    vectors = [model.wv[word] for word in words if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [5]:
def tokenizer(corpus):
    words = []
    for sent in corpus:
        sent_token = sent_tokenize(sent)
        for sent in sent_token:
            words.append(simple_preprocess(sent))
    return words

In [6]:
def data_preparation(corpus, words):
    X = []
    for i in range(len(words)):
        X.append(avg_word2vec(words[i]))

    # Dependent feature
    y = messages[list(map(lambda x: len(x) > 0, corpus))]
    y = pd.get_dummies(y['label']).astype(int)
    y = y.iloc[:, 0].values

    df_list = [pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))]
    df = pd.concat(df_list, ignore_index=True)
    df['Output'] = y
    df.dropna(inplace = True)

    # Independent feature

    X = df.drop('Output', axis=1)

    y = df['Output']

    return X, y

In [7]:
# ------------------ LOAD NECESSARY DATASET AND MODEL ------------------ 

In [8]:
messages = load_data('SMSSpamCollection.txt', '\t')

wv = api.load('word2vec-google-news-300')

# Data pre-processing
corpus = [preprocess_email(text) for text in messages['message']]

words = tokenizer(corpus)

# CREATE WORD2VEC MODEL
# model = gensim.models.Word2Vec(words, vector_size=300, window=10, min_count=2, workers=4)

model = gensim.models.Word2Vec(
    words,              # your tokenized corpus
    vector_size=100,    # embedding size
    window=5,           # context window size
    min_count=5,        # include all words
    sg=1,               # use Skip-Gram
    workers=4,          # parallelization
    epochs=10           # number of iterations
)

In [9]:
# ------------------ X Y IMPLEMENT ------------------ 

In [10]:
X, y = data_preparation(corpus, words)

In [11]:
# ------------------ TRAINING MODEL ------------------ 

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier()
# classifier.fit(X_train, y_train)

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

In [13]:
# ------------------ EVALUATE ------------------ 

In [14]:
from sklearn.metrics import accuracy_score, classification_report

In [15]:
y_pred = classifier.predict(X_test)

In [16]:
accuracy_score(y_test, y_pred)

0.9694519317160827

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       139
           1       0.99      0.98      0.98       974

    accuracy                           0.97      1113
   macro avg       0.92      0.95      0.93      1113
weighted avg       0.97      0.97      0.97      1113



In [18]:
# ------------------ PREDICT ------------------ 

In [19]:
# Ví dụ email mới
new_email = "Hey mohan, can we get together to watch footbal game tomorrow?"

# Bước 1: Xử lý dữ liệu email mới
def predict_email(email, classifier):
    lemmatizer = WordNetLemmatizer()

    def preprocess_email(email):
        # Loại bỏ ký tự không mong muốn
        email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
        # Chuyển thành chữ thường và tách từ
        email_cleaned = email_cleaned.lower().split()
        # Lemmatization
        email_cleaned = [lemmatizer.lemmatize(word, pos='v') for word in email_cleaned if not word in stopwords.words('english')]

        email_vector = avg_word2vec(email_cleaned)
        email_vector = email_vector.reshape(1, -1)
        predicted_label = classifier.predict(email_vector)

        return email, 'Spam' if predicted_label[0] == 0 else 'Ham'

    results = []
    if isinstance(email, str):
        results.append(preprocess_email(email))
    elif isinstance(email, list):
        for corpus in email:
            results.append(preprocess_email(corpus))

    # Chuyển kết quả sang DataFrame
    results_df = pd.DataFrame(results, columns=['Email', 'Label'])
    print(results_df)
        

predict_email(new_email, classifier)
predict_email('Get rich quick with our investment opportunity! Join now and start earning big profits!', classifier)

                                               Email Label
0  Hey mohan, can we get together to watch footba...   Ham
                                               Email Label
0  Get rich quick with our investment opportunity...   Ham


In [20]:
emails=[
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'hi, how are you?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    'i won free vacation , you can win click below link',
    "Hey Sarah, are you available for a coffee chat this weekend?",
    "Congratulations! You've been selected as the winner of a $1000 cash prize. Click the link to claim your reward now!"
    'Claim your exclusive discount code now and save 30% on your next purchase! Limited time offer!',
    "You've won a luxury vacation package! Click the link to claim your prize and enjoy a dream getaway!",
    'Get rich quick with our investment opportunity! Join now and start earning big profits!',
    'Hey there, how about catching up for lunch this weekend? It\'s been a while since we last met!',
    'Reminder: Your appointment with the doctor is scheduled for tomorrow at 10 AM. Please remember to bring any necessary documents.',
    'Invitation: Join us for a team-building event this Friday at the local park. Food and games provided!',
]

predict_email(emails, classifier)

                                                Email Label
0   Hey mohan, can we get together to watch footba...   Ham
1                                    hi, how are you?   Ham
2   Upto 20% discount on parking, exclusive offer ...  Spam
3   i won free vacation , you can win click below ...  Spam
4   Hey Sarah, are you available for a coffee chat...   Ham
5   Congratulations! You've been selected as the w...  Spam
6   You've won a luxury vacation package! Click th...  Spam
7   Get rich quick with our investment opportunity...   Ham
8   Hey there, how about catching up for lunch thi...   Ham
9   Reminder: Your appointment with the doctor is ...   Ham
10  Invitation: Join us for a team-building event ...   Ham


In [21]:
predict_email('free 1000 claim please kill self', classifier)

                              Email Label
0  free 1000 claim please kill self  Spam


In [22]:
predict_email('It’s the last day of our sale. Shop now and save.', classifier)

                                               Email Label
0  It’s the last day of our sale. Shop now and save.   Ham


In [23]:
spam_email_contents = [
    "You have been selected to receive a $10,000 gift card from our exclusive rewards program!",
    "Your application has been pre-approved for a $50,000 loan with no credit check required!",
    "We have detected suspicious activity on your account.",
    "Looking for a way to earn money from the comfort of your home?",
    "You have been selected to receive a FREE cruise to the Bahamas!",
    "Your PayPal account has been temporarily suspended due to suspicious activity.",
    "You have been selected to win a brand-new iPhone 15!",
    "Invest in Bitcoin today and double your money within 48 hours!",
    "You have won a $500 Amazon gift card!",
    "Sign up today for a FREE trial and gain access to premium resources that will change your life."
]


predict_email(spam_email_contents, classifier)

                                               Email Label
0  You have been selected to receive a $10,000 gi...  Spam
1  Your application has been pre-approved for a $...   Ham
2  We have detected suspicious activity on your a...   Ham
3  Looking for a way to earn money from the comfo...   Ham
4  You have been selected to receive a FREE cruis...  Spam
5  Your PayPal account has been temporarily suspe...   Ham
6  You have been selected to win a brand-new iPho...  Spam
7  Invest in Bitcoin today and double your money ...   Ham
8              You have won a $500 Amazon gift card!  Spam
9  Sign up today for a FREE trial and gain access...   Ham


In [24]:
spam_email_contents = [
    "Congratulations! You've won a $1000 gift card.",
    "Claim your exclusive discount code for 50% off your next purchase!",
    "You've been selected for a free vacation package!",
    "Get a limited-time offer: $500 off your next order!",
    "Your entry has won a cash prize! Click here to claim it.",
    "Unlock your special coupon code for a free trial!",
    "You've received a reward for being a loyal customer!",
    "Act now! This offer expires in 24 hours.",
    "Congratulations on your recent purchase! Here’s a bonus discount.",
    "Claim your free gift now! No purchase necessary.",
     "You have been selected to receive a $10,000 gift card from our exclusive rewards program!",
    "Your application has been pre-approved for a $50,000 loan with no credit check required!",
    "We have detected suspicious activity on your account.",
    "Looking for a way to earn money from the comfort of your home?",
    "You have been selected to receive a FREE cruise to the Bahamas!",
    "Your PayPal account has been temporarily suspended due to suspicious activity.",
    "You have been selected to win a brand-new iPhone 15!",
    "Invest in Bitcoin today and double your money within 48 hours!",
    "You have won a $500 Amazon gift card!",
    "Sign up today for a FREE trial and gain access to premium resources that will change your life."
]
predict_email(spam_email_contents, classifier)

                                                Email Label
0      Congratulations! You've won a $1000 gift card.  Spam
1   Claim your exclusive discount code for 50% off...  Spam
2   You've been selected for a free vacation package!  Spam
3   Get a limited-time offer: $500 off your next o...  Spam
4   Your entry has won a cash prize! Click here to...  Spam
5   Unlock your special coupon code for a free trial!  Spam
6   You've received a reward for being a loyal cus...  Spam
7            Act now! This offer expires in 24 hours.  Spam
8   Congratulations on your recent purchase! Here’...  Spam
9    Claim your free gift now! No purchase necessary.  Spam
10  You have been selected to receive a $10,000 gi...  Spam
11  Your application has been pre-approved for a $...   Ham
12  We have detected suspicious activity on your a...   Ham
13  Looking for a way to earn money from the comfo...   Ham
14  You have been selected to receive a FREE cruis...  Spam
15  Your PayPal account has been tempora

In [25]:
email_samples = [
    "Congratulations! You've won a $1000 gift card.",
    "Hi team, can we schedule a meeting for next week to discuss the project?",
    "Claim your exclusive discount code for 50% off your next purchase!",
    "Don't forget to submit your expense reports by Friday.",
    "You've been selected for a free vacation package!",
    "The project deadline has been moved to next month.",
    "Get a limited-time offer: $500 off your next order!",
    "Attached is the presentation for our upcoming client meeting.",
    "Your entry has won a cash prize! Click here to claim it.",
    "Let's review the budget proposal during our next meeting.",
    "Unlock your special coupon code for a free trial!",
    "Please find the updated document for your review.",
    "You've received a reward for being a loyal customer!",
    "Reminder: Our team-building event is scheduled for Saturday.",
    "Act now! This offer expires in 24 hours.",
    "Thank you for your feedback on the last project.",
    "Congratulations on your recent purchase! Here’s a bonus discount.",
    "Can you send me the latest sales figures by end of day?",
    "Claim your free gift now! No purchase necessary.",
    "I appreciate your hard work and dedication to this project."
]


predict_email(email_samples, classifier)

                                                Email Label
0      Congratulations! You've won a $1000 gift card.  Spam
1   Hi team, can we schedule a meeting for next we...   Ham
2   Claim your exclusive discount code for 50% off...  Spam
3   Don't forget to submit your expense reports by...   Ham
4   You've been selected for a free vacation package!  Spam
5   The project deadline has been moved to next mo...   Ham
6   Get a limited-time offer: $500 off your next o...  Spam
7   Attached is the presentation for our upcoming ...   Ham
8   Your entry has won a cash prize! Click here to...  Spam
9   Let's review the budget proposal during our ne...   Ham
10  Unlock your special coupon code for a free trial!  Spam
11  Please find the updated document for your review.  Spam
12  You've received a reward for being a loyal cus...  Spam
13  Reminder: Our team-building event is scheduled...   Ham
14           Act now! This offer expires in 24 hours.  Spam
15   Thank you for your feedback on the 