1. Import Libraries

In [2]:
# สำหรับ WangchanBERTa
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

#สำหรับ Random Forest
import joblib
import pandas as pd
import numpy as np

2. Load Model

In [5]:
content_model_dir = "content_classifier/best_sms_content_model"

# Load the WangchanBERTa model & tokenizer (content_classifier) from the local directory above
tokenizer = AutoTokenizer.from_pretrained(content_model_dir)
wangchanberta_model = AutoModelForSequenceClassification.from_pretrained(content_model_dir)

# Class labels; predict_sms_content indexes this list with the model's argmax class id.
# NOTE(review): assumes this order matches the label ids used at fine-tuning — confirm.
wangchanberta_labels = ['safe', 'spam', 'scam']

In [6]:
rf_model_path = "link_classifier_73/best_sms_link_model.pkl"
tfidf_path = "link_classifier_73/tfidf_vectorizer.pkl"

# Load the Random Forest model & TF-IDF vectorizer (link_classifier)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
rf_model = joblib.load(rf_model_path)
tfidf = joblib.load(tfidf_path)

# Hand-crafted features used by the link model; predict_sms_link selects them in this order
rf_features = ['short_url', 'sus_tld', 'has_https', 'has_http', 'sus_redirect', 'ip_th', 'domain_age_lesser_than_three_month']


3. Predict function
- predict_sms_link (RF)
- predict_sms_content (WangchanBERTa)

In [8]:
def predict_sms_link(link_text, rf_features_dict):
    """
    Classify an SMS link with the Random Forest link model.

    link_text: str
        Raw link (e.g. 'http://example.com').
    rf_features_dict: dict
        One value for every name listed in rf_features, e.g.
        {
            'short_url': 1,
            'sus_tld': 0,
            'has_https': 1,
            'has_http': 1,
            'sus_redirect': 0,
            'ip_th': 0,
            'domain_age_lesser_than_three_month': 1
        }

    Returns a dict with:
        'label'            - predicted class name ('safe', 'spam' or 'scam')
        'probabilities'    - dict of class name -> probability
        'scam_probability' - scam probability rounded to 2 decimals
    """
    # Numeric class id -> text label
    label_by_class = {0: 'safe', 1: 'spam', 2: 'scam'}

    # Dense TF-IDF vector of the raw link
    link_vector = tfidf.transform([link_text]).toarray()

    # Hand-crafted features, selected in the order given by rf_features
    handcrafted = pd.DataFrame([rf_features_dict])[rf_features].values

    # Model input: hand-crafted features first, then the TF-IDF vector
    model_input = np.hstack([handcrafted, link_vector])

    class_id = rf_model.predict(model_input)[0]
    class_probs = rf_model.predict_proba(model_input)[0]

    predicted_label = label_by_class[class_id]
    scam_probability = float(class_probs[2])

    # Probability of each class, keyed by its text label
    probabilities = {
        label_by_class[idx]: float(class_probs[idx]) for idx in (0, 1, 2)
    }

    return {
        "label": predicted_label,
        "probabilities": probabilities,
        "scam_probability": round(scam_probability, 2),
    }

In [9]:
def predict_sms_content(text, max_length=416):
    """
    Classify the body of an SMS with the fine-tuned content model.

    text: str
        SMS message text to classify.
    max_length: int
        Maximum number of tokens kept after tokenization; longer inputs are
        truncated. Without an explicit value the tokenizer warns that no
        maximum length is known and silently skips truncation.
        NOTE(review): 416 is the sequence length reported for WangchanBERTa
        pretraining — confirm it matches this fine-tuned checkpoint.

    Returns a dict with:
        'label'            - predicted class name from wangchanberta_labels
        'probabilities'    - dict of class name -> softmax probability
        'scam_probability' - probability of 'scam' rounded to 2 decimals
    """
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = wangchanberta_model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probs, dim=1).item()

    # Flatten once and reuse for both the dict and the scam probability
    class_probs = probs.flatten().tolist()

    return {
        "label": wangchanberta_labels[predicted_class],
        "probabilities": dict(zip(wangchanberta_labels, class_probs)),
        "scam_probability": round(class_probs[wangchanberta_labels.index('scam')], 2)
    }

4. Model Prediction Test
input
- test_sms_text = ข้อความ SMS ที่ต้องการทำนาย (เฉพาะส่วนเนื้อหา)
- test_link = ลิงก์ที่แนบมากับ sms ที่ต้องการทำนาย

In [10]:
# SMS message body to classify
test_sms_text = "DE899 เว็บNo.1โอนlว ให้เพิ่ม3เท่า คืน12% ถอนได้เป็นล้าน"

# Link attached to the SMS to classify
test_link = "http://cutt.ly/DETT11"

# Manually supplied link features (temporary); keys must cover every name in rf_features
test_rf_features = {
    'short_url': 1,
    'sus_tld': 0,
    'has_https': 0,
    'has_http': 1,
    'sus_redirect': 1,
    'ip_th': 0,
    'domain_age_lesser_than_three_month': 0
}

# Predict from the message content
content_result = predict_sms_content(test_sms_text)
print("SMS Content Prediction:")
print(content_result)

# Predict from the link
link_result = predict_sms_link(test_link, test_rf_features)
print("SMS Link Prediction:")
print(link_result)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SMS Content Prediction:
{'label': 'scam', 'probabilities': {'safe': 0.0014672125689685345, 'spam': 0.029311269521713257, 'scam': 0.9692214727401733}, 'scam_probability': 0.97}
SMS Link Prediction:
{'label': 'scam', 'probabilities': {'safe': 0.1659689482565863, 'spam': 0.04982016520391997, 'scam': 0.7842108865394941}, 'scam_probability': 0.78}


In [11]:
#format result
def format_prediction(title, result):
    """
    Print a prediction result as a readable multi-line summary.

    title: str
        Heading printed on the first line.
    result: dict
        Prediction dict holding 'label', 'probabilities' (with 'safe',
        'spam' and 'scam' entries) and 'scam_probability'.
    """
    print(f"{title}")
    print(f"Label: {result['label']} ")
    print("probabilities: ")
    # One line per class; no trailing "}, " residue after the scam value
    for class_name in ('safe', 'spam', 'scam'):
        print(f"-{class_name}: {result['probabilities'][class_name]}")
    print(f"**scam_probability: {result['scam_probability']:.2f}")

# Show both model results, separated by a blank line
format_prediction("SMS Content Prediction:", content_result)
print()
format_prediction("SMS Link Prediction:", link_result)

SMS Content Prediction:
Label: scam 
probabilities: 
-safe: 0.0014672125689685345
-spam: 0.029311269521713257
-scam: 0.9692214727401733}, 
**scam_probability: 0.97

SMS Link Prediction:
Label: scam 
probabilities: 
-safe: 0.1659689482565863
-spam: 0.04982016520391997
-scam: 0.7842108865394941}, 
**scam_probability: 0.78


5. Risk Assessment
- Majority Vote (implemented below as the per-class maximum of the two models' probabilities)

In [12]:
# Class probabilities from the content model result (p(WangchanBERTa))
content_probs = content_result['probabilities']
content_safe = content_probs['safe']
content_spam = content_probs['spam']
content_scam = content_probs['scam']

# Class probabilities from the link model result (p(RF))
link_probs = link_result['probabilities']
link_safe = link_probs['safe']
link_spam = link_probs['spam']
link_scam = link_probs['scam']

# Print the probabilities as a sanity check
print("Content Probabilities:")
print(f"p_safe: {content_safe:.4f}")
print(f"p_spam: {content_spam:.4f}")
print(f"p_scam: {content_scam:.4f}\n")

print("Link Probabilities:")
print(f"p_safe: {link_safe:.4f}")
print(f"p_spam: {link_spam:.4f}")
print(f"p_scam: {link_scam:.4f}")
print('-'*30)

# Variables for the risk scoring equation (labelled "Majority Vote" by the authors):
# for each class, take the larger of the two models' probabilities.
# The three values are taken independently, so they need not sum to 1.
p_scam_var = max(content_scam,link_scam)
p_safe_var = max(content_safe,link_safe)
p_spam_var = max(content_spam,link_spam)
print('Risk Scoring Equation Variables')
print(f"p_scam = {p_scam_var:.4f}")
print(f"p_safe = {p_safe_var:.4f}")
print(f"p_spam = {p_spam_var:.4f}")

Content Probabilities:
p_safe: 0.0015
p_spam: 0.0293
p_scam: 0.9692

Link Probabilities:
p_safe: 0.1660
p_spam: 0.0498
p_scam: 0.7842
------------------------------
Risk Scoring Equation Variables
p_scam = 0.9692
p_safe = 0.1660
p_spam = 0.0498


5.1 Risk Scoring (Risk Computation)

In [13]:
# Risk scoring equation: the scam variable's share of the three combined
# class variables, expressed as a percentage.
risk_score = (p_scam_var/(p_scam_var+p_safe_var+p_spam_var))*100
print(f"Risk Score = {risk_score:.2f}%")

Risk Score = 81.79%


5.2 Risk Likelihood

In [14]:
# Risk Score --> SMS Category
#   [0, 30)   = Safe/Ham
#   [30, 60)  = Spam
#   [60, 100] = Scam
# risk_score is a float, so the bands are contiguous half-open intervals.
# Integer-style bounds (<= 29, 30..59) would leave gaps: a score such as 29.5
# would match neither band and fall through to 'scam'.
if risk_score < 30:
    sms_category = 'safe'
elif risk_score < 60:
    sms_category = 'spam'
else:
    sms_category = 'scam'

print('SMS Category:', sms_category)


SMS Category: scam
