In [5]:
import joblib

# Restore the persisted model and vectorizer (works from any notebook or service).
# NOTE: joblib.load unpickles its input, so only load artifact files you trust.
clf, vectorizer = map(
    joblib.load,
    ('rf_phishing_model.joblib', 'tfidf_vectorizer.joblib'),
)


In [6]:
import joblib
import pandas as pd

# Restore the persisted model and vectorizer used by the prediction helpers.
# NOTE: joblib.load unpickles its input, so only load artifact files you trust.
clf, vectorizer = map(
    joblib.load,
    ('rf_phishing_model.joblib', 'tfidf_vectorizer.joblib'),
)

def simple_preprocess(url):
    """Normalize a URL string before it is vectorized.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    NaN) become the empty string. Otherwise every occurrence of ``https://``,
    ``http://`` and ``www.`` is removed, then the result is lower-cased and
    surrounding whitespace is trimmed.

    Note that the removals are case-sensitive and run *before* lower-casing,
    and that they match anywhere in the string, not only at the start.
    """
    if pd.isna(url):
        return ""

    stripped = url
    # Order matches the original chain: scheme variants first, then 'www.'.
    for fragment in ('https://', 'http://', 'www.'):
        stripped = stripped.replace(fragment, '')

    return stripped.lower().strip()

def predict_url(url, threshold=0.4):
    """Classify a single URL using a dash heuristic plus the loaded model.

    Relies on the module-level ``clf`` and ``vectorizer`` objects.

    Args:
        url: The URL to classify; it is echoed back unchanged in the result.
        threshold: Minimum 'phishing' probability at which the URL is
            labelled phishing. Not consulted when the dash heuristic fires.

    Returns:
        A dict with keys ``'url'``, ``'prediction'`` and ``'confidence'``.
        ``'confidence'`` is 1.0 for the dash heuristic, otherwise the
        probability of the returned class.

    Raises:
        ValueError: If ``'phishing'`` is not among ``clf.classes_``, or if
            the non-phishing branch is reached with no other class present.
    """
    cleaned = simple_preprocess(url)

    # Heuristic short-circuit: any '-' in the cleaned URL is treated as
    # phishing with full confidence, without consulting the model.
    # NOTE(review): this covers the whole cleaned URL (path included), so
    # every hyphenated URL is flagged - confirm that is intended.
    if '-' in cleaned:
        return {
            'url': url,
            'prediction': 'phishing',
            'confidence': 1.0
        }

    features = vectorizer.transform([cleaned])
    scores = clf.predict_proba(features)[0]
    labels = list(clf.classes_)

    phishing_score = scores[labels.index('phishing')]
    if phishing_score >= threshold:
        return {'url': url, 'prediction': 'phishing', 'confidence': phishing_score}

    # Below the threshold: report the most probable of the remaining classes
    # (the first one wins on ties).
    others = [
        (label, score)
        for label, score in zip(labels, scores)
        if label != 'phishing'
    ]
    best_label, best_score = max(others, key=lambda pair: pair[1])
    return {'url': url, 'prediction': best_label, 'confidence': best_score}

def real_time_url_check(url, threshold=0.4):
    """Turn a URL prediction into a block/allow decision.

    Args:
        url: The URL to check.
        threshold: Passed through to ``predict_url``.

    Returns:
        A dict with ``'url'``, ``'action'`` (``'block'`` when the prediction
        is ``'phishing'``, otherwise ``'allow'``), ``'alert'`` (a message for
        blocked URLs, ``None`` otherwise) and ``'confidence'`` (copied from
        the prediction).
    """
    verdict = predict_url(url, threshold)
    flagged = verdict['prediction'] == 'phishing'

    return {
        'url': url,
        'action': 'block' if flagged else 'allow',
        'alert': f"Phishing URL detected: {url}" if flagged else None,
        'confidence': verdict['confidence']
    }

# Example URLs to check: the first has no '-' and is scored by the model; the
# other two contain '-', which predict_url labels phishing via its dash heuristic.
test_urls = [
    "https://www.paypal.com/login",
    "http://fake-paypal-login.com",
    "secure-payment.com"
]

# Real-time detection: run each URL through the block/allow wrapper (default
# threshold) and print the decision with its confidence to two decimals.
for url in test_urls:
    res = real_time_url_check(url)
    print(f"URL: {res['url']}, Action: {res['action']}, Confidence: {res['confidence']:.2f}")
    # 'alert' is None for allowed URLs, so this prints only for blocked ones.
    if res['alert']:
        print(f"Alert: {res['alert']}")


URL: https://www.paypal.com/login, Action: allow, Confidence: 0.66
URL: http://fake-paypal-login.com, Action: block, Confidence: 1.00
Alert: Phishing URL detected: http://fake-paypal-login.com
URL: secure-payment.com, Action: block, Confidence: 1.00
Alert: Phishing URL detected: secure-payment.com
