In [1]:
import pandas as pd
import re
import numpy as np
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

  "class": algorithms.Blowfish,


In [2]:
def load_data(filepath, sep_type):
    """Load dataset from file and return dataframe."""
    return pd.read_csv(filepath, sep=f'{sep_type}', names=['label', 'message'])

In [3]:
def preprocess_email(email):
    lemmatizer = WordNetLemmatizer()
    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word) for word in email_cleaned]
    email_cleaned = ' '.join(email_cleaned)
    
    return email_cleaned

In [4]:
def avg_word2vec(words):
    """Chuyển văn bản thành vector trung bình của Word2Vec (áp dụng cho cả huấn luyện và dự đoán)."""
    vectors = [model.wv[word] for word in words if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [5]:
def tokenizer(corpus):
    words = []
    for sent in corpus:
        sent_token = sent_tokenize(sent)
        for sent in sent_token:
            words.append(simple_preprocess(sent))
    return words

In [None]:
def data_preparation(corpus, words):
    X = []
    for i in range(len(words)):
        X.append(avg_word2vec(words[i]))

    # Dependent feature
    y = messages[list(map(lambda x: len(x) > 0, corpus))]
    y = pd.get_dummies(y['label']).astype(int)
    y = y.iloc[:, 0].values

    df_list = [pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))]
    df = pd.concat(df_list, ignore_index=True)
    df['Output'] = y
    df.dropna(inplace = True)

    # Independent feature

    X = df.drop('Output', axis=1)

    y = df['Output']

    return X, y

In [None]:
# ------------------ LOAD NECESSARY DATASET AND MODEL ------------------ 

In [6]:
messages = load_data('SMSSpamCollection.txt', '\t')

wv = api.load('word2vec-google-news-300')

# Data pre-processing
corpus = [preprocess_email(text) for text in messages['message']]

words = tokenizer(corpus)

# CREATE WORD2VEC MODEL
model = gensim.models.Word2Vec(words)

In [None]:
# ------------------ X Y IMPLEMENT ------------------ 

In [7]:
X, y = data_preparation(corpus, words)

[-0.18742245  0.23839049  0.12214083  0.11458489  0.08225346 -0.47694078
  0.16064315  0.46297437 -0.2576366  -0.1325823  -0.14289823 -0.38258544
 -0.0494565   0.10641685  0.19028778 -0.14978538  0.10438324 -0.30171856
 -0.00830846 -0.5492333   0.1968523   0.11854635  0.12248059 -0.21289255
 -0.03403502  0.01122623 -0.2014506  -0.18108334 -0.2372589   0.04153708
  0.3093704   0.02310358  0.10164714 -0.16371997 -0.10810269  0.39712524
  0.08485963 -0.13236417 -0.1339178  -0.45190564  0.1108321  -0.25132397
 -0.1891016   0.02107293  0.15716642 -0.01503386 -0.14312    -0.06373135
  0.2147276   0.11258324  0.1925603  -0.2073191  -0.05834823  0.06850018
 -0.1231152   0.0719227   0.1635811  -0.01569516 -0.42661825  0.13437368
 -0.01297845  0.16048715  0.00496727 -0.09118137 -0.3016874   0.283421
  0.07070527  0.23263355 -0.3346211   0.3840157  -0.28539798  0.18525784
  0.38782722 -0.10626635  0.3371229   0.05741106  0.09948619 -0.09830821
 -0.17273721  0.08543866 -0.22726259 -0.11239064 -0.2

In [None]:
# ------------------ TRAINING MODEL ------------------ 

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)

In [None]:
# ------------------ EVALUATE ------------------ 

In [9]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
y_pred = classifier.predict(X_test)

In [10]:
accuracy_score(y_test, y_pred)

0.9685816876122083

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88       145
           1       0.98      0.99      0.98       969

    accuracy                           0.97      1114
   macro avg       0.94      0.92      0.93      1114
weighted avg       0.97      0.97      0.97      1114



In [None]:
# ------------------ PREDICT ------------------ 

In [12]:
new_email = "Hey mohan, can we get together to watch footbal game tomorrow?"

# Bước 1: Xử lý dữ liệu email mới
def predict_email(email, classifier):
    lemmatizer = WordNetLemmatizer()

    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word) for word in email_cleaned]
    
    print(email_cleaned)

    email_vector = avg_word2vec(email_cleaned)
    email_vector = email_vector.reshape(1, -1)
    predicted_label = classifier.predict(email_vector)

    if predicted_label[0] == 0:
        print("Đây là email spam.")
    else:
        print("Đây là email bình thường (ham).")

predict_email(new_email, classifier)
predict_email('Get rich quick with our investment opportunity! Join now and start earning big profits!', classifier)

['hey', 'mohan', 'can', 'we', 'get', 'together', 'to', 'watch', 'footbal', 'game', 'tomorrow']
Đây là email bình thường (ham).
['get', 'rich', 'quick', 'with', 'our', 'investment', 'opportunity', 'join', 'now', 'and', 'start', 'earning', 'big', 'profit']
Đây là email spam.


In [13]:
emails=[
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'hi, how are you?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    'i won free vacation , you can win click below link',
    "Hey Sarah, are you available for a coffee chat this weekend?",
    "Congratulations! You've been selected as the winner of a $1000 cash prize. Click the link to claim your reward now!"
    'Claim your exclusive discount code now and save 30% on your next purchase! Limited time offer!',
    "You've won a luxury vacation package! Click the link to claim your prize and enjoy a dream getaway!",
    'Get rich quick with our investment opportunity! Join now and start earning big profits!',
    'Hey there, how about catching up for lunch this weekend? It\'s been a while since we last met!',
    'Reminder: Your appointment with the doctor is scheduled for tomorrow at 10 AM. Please remember to bring any necessary documents.',
    'Invitation: Join us for a team-building event this Friday at the local park. Food and games provided!',
]

for email in emails:
  predict_email(email, classifier)

['hey', 'mohan', 'can', 'we', 'get', 'together', 'to', 'watch', 'footbal', 'game', 'tomorrow']
Đây là email bình thường (ham).
['hi', 'how', 'are', 'you']
Đây là email bình thường (ham).
['upto', 'discount', 'on', 'parking', 'exclusive', 'offer', 'just', 'for', 'you', 'dont', 'miss', 'this', 'reward']
Đây là email bình thường (ham).
['i', 'won', 'free', 'vacation', 'you', 'can', 'win', 'click', 'below', 'link']
Đây là email spam.
['hey', 'sarah', 'are', 'you', 'available', 'for', 'a', 'coffee', 'chat', 'this', 'weekend']
Đây là email bình thường (ham).
['congratulation', 'you', 've', 'been', 'selected', 'a', 'the', 'winner', 'of', 'a', 'cash', 'prize', 'click', 'the', 'link', 'to', 'claim', 'your', 'reward', 'now', 'claim', 'your', 'exclusive', 'discount', 'code', 'now', 'and', 'save', 'on', 'your', 'next', 'purchase', 'limited', 'time', 'offer']
Đây là email spam.
['you', 've', 'won', 'a', 'luxury', 'vacation', 'package', 'click', 'the', 'link', 'to', 'claim', 'your', 'prize', 'and', 