In [1]:
import pandas as pd
import re
import numpy as np
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

  "class": algorithms.Blowfish,


In [2]:
def load_data(filepath, sep_type):
    """Load dataset from file and return dataframe."""
    return pd.read_csv(filepath, sep=f'{sep_type}', names=['label', 'message'])

In [3]:
def preprocess_email(email):
    lemmatizer = WordNetLemmatizer()
    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word) for word in email_cleaned]
    email_cleaned = ' '.join(email_cleaned)
    
    return email_cleaned

In [4]:
def avg_word2vec(words):
    """Chuyển văn bản thành vector trung bình của Word2Vec (áp dụng cho cả huấn luyện và dự đoán)."""
    vectors = [model.wv[word] for word in words if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [5]:
def tokenizer(corpus):
    words = []
    for sent in corpus:
        sent_token = sent_tokenize(sent)
        for sent in sent_token:
            words.append(simple_preprocess(sent))
    return words

In [6]:
from sklearn.model_selection import train_test_split

messages = load_data('SMSSpamCollection.txt', '\t')

wv = api.load('word2vec-google-news-300')

corpus = [preprocess_email(text) for text in messages['message']]

words = tokenizer(corpus)

model = gensim.models.Word2Vec(words)

In [7]:
X = []
for i in range(len(words)):
    X.append(avg_word2vec(words[i]))
    
print(X[0])

# Dependent feature
y = messages[list(map(lambda x: len(x) > 0, corpus))]
y = pd.get_dummies(y['label']).astype(int)
y = y.iloc[:, 0].values


df_list = [pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))]
df = pd.concat(df_list, ignore_index=True)
df['Output'] = y
df.dropna(inplace = True)

X = df.drop('Output', axis=1)

y = df['Output']

[-0.16094023  0.24429367  0.12193104  0.11323108  0.08375572 -0.4852251
  0.18646455  0.46235788 -0.26611784 -0.16720085 -0.11719898 -0.3821741
 -0.0273889   0.09587178  0.19931193 -0.15951428  0.08937047 -0.30973163
 -0.04517631 -0.5176502   0.20041043  0.10936268  0.10300862 -0.22962676
 -0.04926636 -0.0127106  -0.20762627 -0.1821898  -0.25954726  0.03918045
  0.2960206   0.02348683  0.10209922 -0.1774608  -0.10078663  0.4112987
  0.10113541 -0.11921144 -0.15400642 -0.44177574  0.10847908 -0.23453476
 -0.17050324 -0.00461487  0.15402652 -0.00668712 -0.14629723 -0.03287847
  0.21666364  0.1222676   0.1606414  -0.21153969 -0.04806506  0.06725422
 -0.10126086  0.09384183  0.15467389 -0.02278042 -0.40434954  0.14738019
  0.01313257  0.15763931 -0.00119324 -0.07919238 -0.30182502  0.2780778
  0.07563531  0.23247293 -0.32184085  0.39147112 -0.26944056  0.1782777
  0.40150598 -0.10626078  0.35069466  0.08477375  0.1115392  -0.09624092
 -0.18632297  0.07318625 -0.22273752 -0.10988794 -0.2533

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

In [10]:
accuracy_score(y_test, y_pred)

0.9658886894075404

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.85      0.88       171
           1       0.97      0.99      0.98       943

    accuracy                           0.97      1114
   macro avg       0.95      0.92      0.93      1114
weighted avg       0.97      0.97      0.97      1114



In [14]:
# Ví dụ email mới
new_email = "Hey mohan, can we get together to watch footbal game tomorrow?"

# Bước 1: Xử lý dữ liệu email mới
def predict_email(email, classifier):
    lemmatizer = WordNetLemmatizer()

    # Loại bỏ ký tự không mong muốn
    email_cleaned = re.sub('[^a-zA-Z]', ' ', email)
    # Chuyển thành chữ thường và tách từ
    email_cleaned = email_cleaned.lower().split()
    # Lemmatization
    email_cleaned = [lemmatizer.lemmatize(word) for word in email_cleaned]
    
    print(email_cleaned)

    email_vector = avg_word2vec(email_cleaned)
    email_vector = email_vector.reshape(1, -1)
    predicted_label = classifier.predict(email_vector)

    if predicted_label[0] == 0:
        print("Đây là email spam.")
    else:
        print("Đây là email bình thường (ham).")

predict_email(new_email, classifier)
predict_email('Get rich quick with our investment opportunity! Join now and start earning big profits!', classifier)

Đây là email spam.
Đây là email spam.
