In [22]:
import math
import os
import re
import unicodedata

import joblib
import numpy as np
import pandas as pd
from pyvi import ViTokenizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [23]:
def normalize_to_unicode(text):
    """Return ``text`` in canonical composed (NFC) Unicode form.

    The same Vietnamese letter can be encoded either as one precomposed code
    point (``'\u00e0'``) or as a base letter followed by combining marks
    (``'a'`` + U+0300). The two render identically but compare unequal, which
    splits one word into several vocabulary entries. NFC folds every
    decomposed sequence into its precomposed code point and leaves text that
    is already composed unchanged.

    A per-character lookup table cannot do this: a decomposed letter spans
    several characters, so it never matches a single-character key.

    Args:
        text: The string to normalise.

    Returns:
        The NFC-normalised string.
    """
    return unicodedata.normalize("NFC", text)

In [24]:
# Stop-word sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path):
    """Read the stop-word file at ``path`` once and return its entries as a set."""
    if path not in _STOPWORDS_CACHE:
        with open(path, "r", encoding="utf-8") as f:
            # Strip and lower-case each entry: tokens are compared lower-cased
            # and contain no surrounding whitespace, so an entry with a
            # trailing space or a capital letter could otherwise never match.
            _STOPWORDS_CACHE[path] = {
                line.strip().lower() for line in f if line.strip()
            }
    return _STOPWORDS_CACHE[path]


def remove_stopwords(text, stopwords_path="vietnamese-stopwords.txt"):
    """Drop every whitespace-separated token of ``text`` that is a stop word.

    Tokens are compared case-insensitively; the surviving tokens keep their
    original casing and order and are re-joined with single spaces.

    The stop-word file is read only on the first call for a given path and
    cached afterwards, so cleaning a whole corpus does not reopen the file
    once per document. Restart the kernel (or clear ``_STOPWORDS_CACHE``)
    after editing the file.

    Args:
        text: Whitespace-tokenised text.
        stopwords_path: UTF-8 file with one stop word per line.

    Returns:
        ``text`` without its stop words.
    """
    # NOTE(review): multi-word entries in the file only match if they are
    # written the same way the tokenizer joins words - confirm the file format.
    stopwords = _load_stopwords(stopwords_path)
    return ' '.join(word for word in text.split() if word.lower() not in stopwords)

In [25]:
def normalize_text(text):
    """Clean one raw text for feature extraction.

    Steps, in order: Unicode normalisation, lower-casing, removal of every
    character that is neither a word character nor whitespace, trimming of
    leading/trailing whitespace, word segmentation with ``ViTokenizer`` and
    stop-word removal.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, tokenised string with stop words removed.
    """
    # Normalise before stripping punctuation: ``[^\w\s]`` also matches
    # combining diacritics, so any mark that normalize_to_unicode can compose
    # must be composed before the regex runs or the tone mark is deleted.
    text_to_unicode = normalize_to_unicode(text)
    text_lower = text_to_unicode.lower()
    text_remove_special = re.sub(r'[^\w\s]', '', text_lower)
    text_remove_space = text_remove_special.strip()
    tokenized_text = ViTokenizer.tokenize(text_remove_space)
    text_remove_stopword = remove_stopwords(tokenized_text)
    return text_remove_stopword

In [None]:
pathUrl = 'sort_label.csv'


def preprocess_text(text):
    """Clean a single raw text; thin wrapper around ``normalize_text``."""
    return normalize_text(text)


df = pd.read_csv(pathUrl)
# Empty cells are read as float NaN, which has no ``.lower()``; turn them into
# empty strings (and force everything else to str) before cleaning.
texts = df['text'].fillna('').astype(str).tolist()
labels = df['label'].tolist()
processed_texts = [preprocess_text(text) for text in texts]
# Print a small sample only; dumping the whole corpus floods the cell output.
print(processed_texts[:5])
processed_df = pd.DataFrame({'text': processed_texts, 'label': labels})
# NOTE(review): the training cell reads 'dataset.csv', not this file - confirm
# how 'clean_data.csv' is meant to reach the training step.
processed_df.to_csv('clean_data.csv', index=False, encoding='utf-8')

In [26]:
def compute_tf(document):
    """Compute the relative term frequency of each token in ``document``.

    The document is lower-cased and split on whitespace; each distinct token
    maps to ``occurrences / total number of tokens``.

    Args:
        document: A whitespace-tokenised string.

    Returns:
        Dict mapping token -> term frequency (empty for an empty document).
    """
    tokens = document.lower().split()
    n_tokens = len(tokens)

    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    # An empty document yields an empty dict, so the division never runs with
    # n_tokens == 0.
    frequencies = {}
    for token, count in occurrences.items():
        frequencies[token] = count / n_tokens
    return frequencies

def compute_idf(documents):
    """Compute smoothed inverse document frequencies for a corpus.

    Each document is lower-cased and split on whitespace - the same
    tokenisation ``compute_tf`` uses - so every term that has a TF entry also
    has an IDF entry. The weight of a term is
    ``log((N + 1) / (doc_freq + 1)) + 1`` where ``N`` is the number of
    documents and ``doc_freq`` the number of documents containing the term.

    Args:
        documents: Sequence of whitespace-tokenised strings.

    Returns:
        Dict mapping term -> IDF weight (empty for an empty corpus).
    """
    n_documents = len(documents)
    doc_freq = {}
    for doc in documents:
        # A set, so that a term counts once per document however often it repeats.
        for word in set(doc.lower().split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {
        word: math.log((n_documents + 1) / (freq + 1)) + 1
        for word, freq in doc_freq.items()
    }

def compute_tf_idf(documents, idf_scores=None):
    """Compute a TF-IDF weight dictionary for every document.

    Args:
        documents: Sequence of whitespace-tokenised strings.
        idf_scores: Optional precomputed term -> IDF mapping. When ``None``
            the IDF is derived from ``documents`` itself.

    Returns:
        A list with one dict per document, mapping each term of that document
        to ``tf * idf``. Terms missing from ``idf_scores`` get a weight of 0.
    """
    weights = compute_idf(documents) if idf_scores is None else idf_scores

    vectors = []
    for document in documents:
        vector = {}
        for term, frequency in compute_tf(document).items():
            vector[term] = frequency * weights.get(term, 0)
        vectors.append(vector)
    return vectors

In [27]:
def convert_to_matrix(tf_idf_scores, vocabulary):
    """Lay sparse per-document score dicts out as dense rows.

    Args:
        tf_idf_scores: Iterable of dicts mapping term -> score.
        vocabulary: Ordered terms; position ``j`` defines column ``j``.

    Returns:
        A list of rows (one per document), each a list with one value per
        vocabulary term; terms absent from a document contribute 0.
    """
    rows = []
    for doc_scores in tf_idf_scores:
        row = []
        for term in vocabulary:
            row.append(doc_scores.get(term, 0))
        rows.append(row)
    return rows

In [28]:
# Locations of the artifacts persisted with joblib: train_model() writes them,
# predict_with_model() and predict_text() load them back.
MODEL_PATH = 'train/svm_model.pkl'  # fitted SVC classifier
VOCAB_PATH = 'train/vocabulary.pkl'  # sorted term list; fixes the feature-column order
IDF_PATH = 'train/idf_scores.pkl'  # dict mapping term -> IDF weight

def predict_with_model(X_test_list):
    """Classify a batch of texts with the persisted model.

    Args:
        X_test_list: List of whitespace-tokenised text strings.

    Returns:
        The array of predicted labels, or ``None`` (after printing a hint)
        when any of the saved artifacts is missing.
    """
    required_files = (MODEL_PATH, VOCAB_PATH, IDF_PATH)
    if not all(os.path.exists(path) for path in required_files):
        print("No saved model or data found. Please train the model first!")
        return None

    # Restore the classifier together with the vocabulary and IDF weights it
    # was trained with, so the feature columns line up.
    classifier = joblib.load(MODEL_PATH)
    vocab = joblib.load(VOCAB_PATH)
    idf = joblib.load(IDF_PATH)

    # Vectorise the incoming texts using the training-time IDF weights.
    weighted_docs = compute_tf_idf(X_test_list, idf)
    features = np.array(convert_to_matrix(weighted_docs, vocab))

    return classifier.predict(features)

def predict_text(text):
    """Classify a single text and print the label with its probability.

    Args:
        text: One whitespace-tokenised text string.
            NOTE(review): the text is vectorised as given, without
            ``normalize_text`` - presumably callers pass text cleaned the same
            way as the training data; confirm.

    Returns:
        ``(prediction, confidence)`` - the predicted label and the estimated
        probability of that label - or ``None`` (after printing a hint) when
        any of the saved artifacts is missing.
    """
    if not (os.path.exists(MODEL_PATH) and os.path.exists(VOCAB_PATH) and os.path.exists(IDF_PATH)):
        print("No saved model or data found. Please train the model first!")
        return None
    svm_model = joblib.load(MODEL_PATH)
    vocabulary = joblib.load(VOCAB_PATH)
    idf_scores = joblib.load(IDF_PATH)

    text_list = [text]
    tf_idf = compute_tf_idf(text_list, idf_scores)
    text_matrix = convert_to_matrix(tf_idf, vocabulary)
    text_array = np.array(text_matrix)

    # Predict class and probability.
    prediction = svm_model.predict(text_array)[0]
    probabilities = svm_model.predict_proba(text_array)[0]
    # predict_proba columns follow the order of ``classes_``; look the label's
    # position up instead of using the label itself as an index, which only
    # works when the labels happen to be exactly 0..n-1.
    class_index = list(svm_model.classes_).index(prediction)
    # SVC probabilities come from a separate calibration step and can
    # occasionally disagree with predict(); treat the value as an estimate.
    confidence = probabilities[class_index]

    result = "Lừa đảo" if prediction == 1 else "Hợp lệ"
    print(f"Dự đoán: {result} (Độ tin cậy: {confidence:.2%})")
    return prediction, confidence


In [29]:
def train_model(X_train_list, y_train):
    """Fit the SVM on TF-IDF features and persist everything needed to predict.

    Args:
        X_train_list: List of whitespace-tokenised training texts.
        y_train: Labels aligned with ``X_train_list``.

    Returns:
        ``(svm_model, vocabulary, idf_scores)`` - the fitted classifier, the
        sorted list of terms that defines the feature columns, and the
        term -> IDF mapping.
    """
    idf_scores = compute_idf(X_train_list)
    # Computed once and used both to derive the vocabulary and to build the
    # feature matrix.
    tf_idf_train = compute_tf_idf(X_train_list, idf_scores)
    # Sorted, so the column order is deterministic and reproducible on reload.
    vocabulary = sorted(set().union(*(doc_scores.keys() for doc_scores in tf_idf_train)))

    X_train_matrix = convert_to_matrix(tf_idf_train, vocabulary)
    X_train_array = np.array(X_train_matrix)

    # Train the SVM model.
    svm_model = SVC(
        kernel='rbf',      # RBF kernel
        C=100,             # regularisation strength
        gamma='scale',     # kernel coefficient derived from the data
        probability=True,  # required for predict_proba in predict_text
        random_state=42
    )
    svm_model.fit(X_train_array, y_train)

    # joblib.dump does not create missing parent directories.
    for path in (MODEL_PATH, VOCAB_PATH, IDF_PATH):
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

    joblib.dump(svm_model, MODEL_PATH)
    joblib.dump(vocabulary, VOCAB_PATH)
    joblib.dump(idf_scores, IDF_PATH)

    print("Model trained and saved successfully!")
    return svm_model, vocabulary, idf_scores


In [30]:
data = pd.read_csv('dataset.csv')
X = data['text'].fillna('')  # missing texts become empty documents
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

# Prediction needs the model, the vocabulary AND the IDF weights. Checking
# only the model file would skip training after a partial save and leave the
# notebook unable to predict, so retrain unless all three are present.
artifacts_ready = all(os.path.exists(path) for path in (MODEL_PATH, VOCAB_PATH, IDF_PATH))
if not artifacts_ready:
    svm_model, vocabulary, idf_scores = train_model(X_train_list, y_train)
else:
    # NOTE(review): a reused model is only evaluated fairly if it was trained
    # on this same split of this same dataset - confirm, or delete the saved
    # artifacts to retrain.
    print("Model already exists, skipping training.")

y_pred = predict_with_model(X_test_list)

if y_pred is not None:
    # NOTE(review): a perfect score is unusual; worth checking the dataset for
    # duplicate texts shared between the train and test parts.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


Model trained and saved successfully!
Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       478
           1       1.00      1.00      1.00       523

    accuracy                           1.00      1001
   macro avg       1.00      1.00      1.00      1001
weighted avg       1.00      1.00      1.00      1001



In [34]:
if __name__ == "__main__":
    # Two sample messages to run through the saved classifier.
    example = "mua hàng giá rẻ"
    example_2 = "bán khóa học giá rẻ"

    for position, sample in enumerate((example, example_2)):
        if position:
            # Blank separator between consecutive predictions.
            print("\n")
        print("Text: ", sample)
        predict_text(sample)


Text:  mua hàng giá rẻ
Dự đoán: Hợp lệ (Độ tin cậy: 73.03%)


Text:  bán khóa học giá rẻ
Dự đoán: Lừa đảo (Độ tin cậy: 93.30%)
