In [1]:
# CTI-NLP Enhanced Analyzer - Model Training Notebook
print("CTI-NLP Enhanced Threat Analyzer - Model Training v2.0")
print("This notebook trains TWO models:")
print("  1. CTI Report Classifier (Sentiment + Severity)")
print("  2. URL Threat Detector (55 features)")

print("\nStarting training process...")
print("This will take approximately 5-10 minutes.\n")

CTI-NLP Enhanced Threat Analyzer - Model Training v2.0
This notebook trains TWO models:
  1. CTI Report Classifier (Sentiment + Severity)
  2. URL Threat Detector (55 features)

Starting training process...
This will take approximately 5-10 minutes.



In [2]:
#Import Libraries
print("=" * 60)
print("CELL 2: Importing Required Libraries")
print("=" * 60)

import pandas as pd
import numpy as np
import re
import pickle
import json
from pathlib import Path
from urllib.parse import urlparse
from datetime import datetime

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    precision_recall_fscore_support
)

# Warnings
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully\n")

CELL 2: Importing Required Libraries
✓ All libraries imported successfully



In [3]:

#Configuration

print("=" * 60)
print("CELL 3: Configuration Settings")
print("=" * 60)

# CTI Report Configuration
CTI_FILE_PATHS = [
    'data/Cybersecurity_Dataset.csv',
    'data/cyber-threat-intelligence-all.csv',
    'data/cyber-threat-intelligence-splited_train.csv',
    'data/cyber-threat-intelligence-splited_test.csv',
    'data/cyber-threat-intelligence-splited_val.csv'
]
CTI_LABEL_COLUMN = 'Threat Category'
CTI_FEATURE_COLS = ['Sentiment in Forums', 'Severity Score']

# URL Configuration
URL_FILE_PATH = 'data/url_dataset.csv'
URL_LABEL_COLUMN = 'type'
URL_INPUT_COLUMN = 'url'

# Trusted Domains for URL Analysis
TRUSTED_DOMAINS = {
    'google.com', 'youtube.com', 'facebook.com', 'amazon.com', 'wikipedia.org',
    'twitter.com', 'instagram.com', 'linkedin.com', 'reddit.com', 'github.com',
    'microsoft.com', 'apple.com', 'netflix.com', 'yahoo.com', 'bing.com',
    'stackoverflow.com', 'medium.com', 'dropbox.com', 'adobe.com', 'paypal.com',
    'ebay.com', 'cnn.com', 'bbc.com', 'nytimes.com', 'spotify.com','paypal.com','https://geethashishu.in/'
}

LEGITIMATE_TLDS = {'.com', '.org', '.net', '.edu', '.gov', '.co', '.io', '.ai', '.in'}
SUSPICIOUS_TLDS = {'.tk', '.ml', '.ga', '.cf', '.gq', '.zip', '.review', '.xyz', '.top'}

print("✓ Configuration loaded")
print(f"  CTI Files: {len(CTI_FILE_PATHS)}")
print(f"  URL Dataset: {URL_FILE_PATH}")
print(f"  Trusted Domains: {len(TRUSTED_DOMAINS)}\n")

CELL 3: Configuration Settings
✓ Configuration loaded
  CTI Files: 5
  URL Dataset: data/url_dataset.csv
  Trusted Domains: 26



In [4]:
#Load CTI Report Data
print("=" * 60)
print("CELL 4: Loading CTI Report Training Data")
print("=" * 60)

all_dfs = []
for path in CTI_FILE_PATHS:
    try:
        df_part = pd.read_csv(path, usecols=[CTI_LABEL_COLUMN] + CTI_FEATURE_COLS)
        all_dfs.append(df_part)
        print(f"✓ Loaded: {path} ({len(df_part)} records)")
    except FileNotFoundError:
        print(f"⏭  Skipped: {path} (not found)")
    except ValueError as e:
        print(f"⚠  Error: {path} - Column mismatch")
        continue

if not all_dfs:
    print("\n⚠ WARNING: No CTI files loaded. Using simulated data.")
    data = {
        'Threat Category': ['DDoS', 'Malware', 'Phishing', 'Ransomware', 'Benign'] * 20,
        'Sentiment in Forums': np.random.uniform(0.1, 0.9, 100),
        'Severity Score': np.random.randint(1, 6, 100)
    }
    df_cti = pd.DataFrame(data)
else:
    df_cti = pd.concat(all_dfs, ignore_index=True)

# Clean data
for col in CTI_FEATURE_COLS:
    df_cti[col] = pd.to_numeric(df_cti[col], errors='coerce')

df_cti.dropna(subset=[CTI_LABEL_COLUMN] + CTI_FEATURE_COLS, inplace=True)
df_cti.drop_duplicates(inplace=True)

print(f"\n✓ CTI Data Loaded: {len(df_cti)} records after cleaning")
print(f"  Features: {CTI_FEATURE_COLS}")
print(f"  Classes: {df_cti[CTI_LABEL_COLUMN].unique()}\n")

CELL 4: Loading CTI Report Training Data
✓ Loaded: data/Cybersecurity_Dataset.csv (1100 records)
⏭  Skipped: data/cyber-threat-intelligence-all.csv (not found)
⚠  Error: data/cyber-threat-intelligence-splited_train.csv - Column mismatch
⚠  Error: data/cyber-threat-intelligence-splited_test.csv - Column mismatch
⏭  Skipped: data/cyber-threat-intelligence-splited_val.csv (not found)

✓ CTI Data Loaded: 686 records after cleaning
  Features: ['Sentiment in Forums', 'Severity Score']
  Classes: ['DDoS' 'Malware' 'Phishing' 'Ransomware']



In [5]:
#URL Feature Extraction Function
print("=" * 60)
print("CELL 5: Defining URL Feature Extraction")
print("=" * 60)

def extract_url_features(url):
    """
    Extracts 55 comprehensive features from a URL for threat detection.
    This MUST match the features in app.py!
    """
    features = {}
    
    try:
        parsed = urlparse(url.strip())
        domain = parsed.netloc.lower() if parsed.netloc else ""
        path = parsed.path if parsed.path else ""
        query = parsed.query if parsed.query else ""
        scheme = parsed.scheme if parsed.scheme else ""
    except:
        domain = path = query = scheme = ""
    
    # Clean domain
    clean_domain = re.sub(r':\d+$', '', domain)
    clean_domain = re.sub(r'^www\.', '', clean_domain)
    
    #TRUST INDICATORS
    features['is_trusted_domain'] = 1 if any(trusted in clean_domain for trusted in TRUSTED_DOMAINS) else 0
    features['has_legitimate_tld'] = 1 if any(clean_domain.endswith(tld) for tld in LEGITIMATE_TLDS) else 0
    features['has_suspicious_tld'] = 1 if any(clean_domain.endswith(tld) for tld in SUSPICIOUS_TLDS) else 0
    
    #LENGTH FEATURES
    features['url_length'] = len(url)
    features['domain_length'] = len(domain)
    features['path_length'] = len(path)
    features['query_length'] = len(query)
    
    #CHARACTER COUNTS
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['num_underscores'] = url.count('_')
    features['num_slashes'] = url.count('/')
    features['num_at_symbol'] = url.count('@')
    features['num_question_mark'] = url.count('?')
    features['num_ampersand'] = url.count('&')
    features['num_equals'] = url.count('=')
    features['num_percent'] = url.count('%')
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_letters'] = sum(c.isalpha() for c in url)
    features['num_special_chars'] = sum(not c.isalnum() and c not in './-_:?' for c in url)
    
    #DOMAIN ANALYSIS 
    features['num_dots_domain'] = domain.count('.')
    features['num_hyphens_domain'] = domain.count('-')
    features['has_ip_address'] = 1 if re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url) else 0
    features['subdomain_count'] = max(0, len(domain.split('.')) - 2) if domain else 0
    features['domain_has_digits'] = 1 if any(c.isdigit() for c in domain) else 0
    
    #SECURITY FEATUREs
    features['has_https'] = 1 if scheme == 'https' else 0
    features['has_http'] = 1 if scheme == 'http' else 0
    features['port_in_url'] = 1 if re.search(r':\d{2,5}', domain) else 0
    
    #PHISHING INDICATORS
    features['excessive_dots_in_path'] = 1 if path.count('..') > 0 or path.count('...') > 0 else 0
    features['has_wp_includes'] = 1 if 'wp-includes' in url.lower() or 'wp-admin' in url.lower() else 0
    features['has_admin_path'] = 1 if '/admin' in url.lower() or '/administrator' in url.lower() else 0
    
    phishing_keywords = ['login', 'signin', 'verify', 'update', 'secure', 'account', 
                         'banking', 'confirm', 'suspended', 'locked', 'paypal', 'dropbox']
    features['phishing_keyword_count'] = sum(1 for kw in phishing_keywords if kw in url.lower())
    
    trusted_brands = ['google', 'paypal', 'amazon', 'microsoft', 'apple', 'facebook', 'dropbox']
    features['brand_impersonation'] = 1 if any(brand in clean_domain for brand in trusted_brands) and clean_domain not in TRUSTED_DOMAINS else 0
    
    features['has_php_file'] = 1 if '.php' in path.lower() else 0
    features['has_html_file'] = 1 if '.htm' in path.lower() or '.html' in path.lower() else 0
    
    #ENTROPY (Randomness)
    def calculate_entropy(text):
        if not text or len(text) < 2:
            return 0
        prob = [float(text.count(c)) / len(text) for c in set(text)]
        return -sum(p * np.log2(p) for p in prob if p > 0)
    
    features['url_entropy'] = calculate_entropy(url)
    features['domain_entropy'] = calculate_entropy(domain)
    features['path_entropy'] = calculate_entropy(path) if path else 0
    features['high_domain_entropy'] = 1 if features['domain_entropy'] > 4.0 else 0
    features['high_path_entropy'] = 1 if features['path_entropy'] > 4.5 else 0
    
    #RATIO FEATURES
    url_len = max(len(url), 1)
    features['digit_ratio'] = features['num_digits'] / url_len
    features['special_char_ratio'] = features['num_special_chars'] / url_len
    features['dots_to_length_ratio'] = features['num_dots'] / url_len
    
    #SUSPICIOUS PATTERNS
    features['excessive_subdomains'] = 1 if features['subdomain_count'] > 3 else 0
    features['url_shortener'] = 1 if any(s in clean_domain for s in ['bit.ly', 'tinyurl', 'goo.gl', 't.co']) else 0
    features['hyphen_in_domain'] = 1 if '-' in clean_domain else 0
    
    path_parts = [p for p in path.split('/') if p and len(p) > 10]
    features['has_long_random_path'] = 1 if any(len(p) > 32 for p in path_parts) else 0
    
    #PATH ANALYSIS
    features['path_depth'] = len([p for p in path.split('/') if p])
    features['deep_path'] = 1 if features['path_depth'] > 5 else 0
    
    features['has_hex_encoding'] = 1 if re.search(r'%[0-9a-fA-F]{2}', url) else 0
    features['has_at_symbol'] = 1 if '@' in url else 0
    features['has_double_slash'] = 1 if '//' in path else 0
    
    #QUERY PARAMETERS
    features['num_query_params'] = query.count('&') + (1 if query else 0)
    features['has_redirect'] = 1 if any(p in query.lower() for p in ['redirect', 'url=', 'next=', 'goto=']) else 0
    
    #DOMAIN STRUCTURE
    if domain:
        parts = domain.split('.')
        if len(parts) >= 2:
            features['main_domain_length'] = len(parts[-2])
            features['tld_length'] = len(parts[-1])
        else:
            features['main_domain_length'] = 0
            features['tld_length'] = 0
    else:
        features['main_domain_length'] = 0
        features['tld_length'] = 0
    
    return features

print("✓ URL feature extraction function defined")
print("  Total features: 55\n")

CELL 5: Defining URL Feature Extraction
✓ URL feature extraction function defined
  Total features: 55



In [6]:
#Load URL Data
print("=" * 60)
print("CELL 6: Loading URL Training Data")
print("=" * 60)

try:
    df_url = pd.read_csv(URL_FILE_PATH)
    print(f"✓ Loaded: {URL_FILE_PATH}")
    print(f"  Total records: {len(df_url)}")
    
    # Check columns
    if URL_INPUT_COLUMN not in df_url.columns:
        print(f"\n⚠ Warning: Column '{URL_INPUT_COLUMN}' not found")
        print(f"  Available columns: {list(df_url.columns)}")
        # Try to find URL column
        url_cols = [col for col in df_url.columns if 'url' in col.lower()]
        if url_cols:
            URL_INPUT_COLUMN = url_cols[0]
            print(f"  Using: {URL_INPUT_COLUMN}")
    
    if URL_LABEL_COLUMN not in df_url.columns:
        label_cols = [col for col in df_url.columns if col != URL_INPUT_COLUMN]
        if label_cols:
            URL_LABEL_COLUMN = label_cols[0]
            print(f"  Using label column: {URL_LABEL_COLUMN}")
    
    # Clean data
    initial_count = len(df_url)
    df_url.dropna(subset=[URL_INPUT_COLUMN], inplace=True)
    df_url[URL_INPUT_COLUMN] = df_url[URL_INPUT_COLUMN].str.strip()
    df_url.drop_duplicates(subset=[URL_INPUT_COLUMN], inplace=True)
    
    print(f"  After cleaning: {len(df_url)} records ({initial_count - len(df_url)} removed)")
    print(f"\n  Label distribution:")
    for label, count in df_url[URL_LABEL_COLUMN].value_counts().items():
        print(f"    - {label}: {count} ({count/len(df_url)*100:.1f}%)")

except FileNotFoundError:
    print(f"⚠ ERROR: {URL_FILE_PATH} not found!")
    print("  Creating sample dataset...")
    
    # Create sample dataset
    sample_urls = {
        'legitimate': [
            'https://www.google.com',
            'https://www.facebook.com',
            'https://github.com/user/repo',
            'https://www.amazon.com/product',
            'https://stackoverflow.com/questions/123',
        ] * 100,
        'phishing': [
            'http://secure-paypal-login.tk',
            'http://verify-account-apple.ml',
            'http://update-banking-info.ga',
            'http://suspended-account-fix.cf',
            'http://bit.ly/malware123',
        ] * 100
    }
    
    urls = []
    labels = []
    for label, url_list in sample_urls.items():
        urls.extend(url_list)
        labels.extend([label] * len(url_list))
    
    df_url = pd.DataFrame({URL_INPUT_COLUMN: urls, URL_LABEL_COLUMN: labels})
    print(f"✓ Created sample dataset: {len(df_url)} records")


CELL 6: Loading URL Training Data
✓ Loaded: data/url_dataset.csv
  Total records: 450176
  After cleaning: 450176 records (0 removed)

  Label distribution:
    - legitimate: 345738 (76.8%)
    - phishing: 104438 (23.2%)


In [7]:
#Extract URL Features
print("=" * 60)
print("CELL 7: Extracting URL Features")
print("=" * 60)
print("This may take a few minutes for large datasets")

url_features = df_url[URL_INPUT_COLUMN].apply(lambda x: pd.Series(extract_url_features(x)))

print(f"\n✓ Feature extraction complete")
print(f"  Samples processed: {len(url_features)}")
print(f"  Features extracted: {len(url_features.columns)}")
print(f"  Feature names: {list(url_features.columns[:10])}... (showing first 10)\n")


CELL 7: Extracting URL Features
This may take a few minutes for large datasets

✓ Feature extraction complete
  Samples processed: 450176
  Features extracted: 55
  Feature names: ['is_trusted_domain', 'has_legitimate_tld', 'has_suspicious_tld', 'url_length', 'domain_length', 'path_length', 'query_length', 'num_dots', 'num_hyphens', 'num_underscores']... (showing first 10)



In [8]:
#Train CTI Report Model
print("=" * 60)
print("CELL 8: Training CTI Report Classifier")
print("=" * 60)

# Prepare CTI data
X_cti = df_cti[CTI_FEATURE_COLS]
y_cti = df_cti[CTI_LABEL_COLUMN].astype(str)

# Encode labels
cti_encoder = LabelEncoder()
y_cti_encoded = cti_encoder.fit_transform(y_cti)

# Split data
X_cti_train, X_cti_test, y_cti_train, y_cti_test = train_test_split(
    X_cti, y_cti_encoded, test_size=0.3, random_state=42, stratify=y_cti_encoded
)

# Train model
cti_model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000, random_state=42)
cti_model.fit(X_cti_train, y_cti_train)

# Evaluate
y_cti_pred = cti_model.predict(X_cti_test)
cti_accuracy = accuracy_score(y_cti_test, y_cti_pred)

print(f"\n✓ CTI Model Training Complete")
print(f"  Training samples: {len(X_cti_train)}")
print(f"  Test samples: {len(X_cti_test)}")
print(f"  Accuracy: {cti_accuracy:.4f} ({cti_accuracy*100:.2f}%)")
print(f"  Classes: {list(cti_encoder.classes_)}")

# Save CTI model
with open('model.pkl', 'wb') as f:
    pickle.dump(cti_model, f)
with open('feature_list.pkl', 'wb') as f:
    pickle.dump(CTI_FEATURE_COLS, f)
with open('threat_encoder.pkl', 'wb') as f:
    pickle.dump(cti_encoder, f)

print(f"\n✓ Saved:")
print(f"  - model.pkl")
print(f"  - feature_list.pkl")
print(f"  - threat_encoder.pkl\n")

CELL 8: Training CTI Report Classifier

✓ CTI Model Training Complete
  Training samples: 480
  Test samples: 206
  Accuracy: 0.2184 (21.84%)
  Classes: ['DDoS', 'Malware', 'Phishing', 'Ransomware']

✓ Saved:
  - model.pkl
  - feature_list.pkl
  - threat_encoder.pkl



In [9]:
#Train URL Threat Model
print("=" * 60)
print("CELL 9: Training URL Threat Detector")
print("=" * 60)

# Prepare URL data
X_url = url_features
y_url = df_url[URL_LABEL_COLUMN]

# Encode labels
y_url_encoded, url_label_encoder = pd.factorize(y_url)
url_feature_names = list(X_url.columns)

# Split data
X_url_train, X_url_test, y_url_train, y_url_test = train_test_split(
    X_url, y_url_encoded, test_size=0.2, random_state=42, stratify=y_url_encoded
)

print(f"Training set: {len(X_url_train)} samples")
print(f"Test set: {len(X_url_test)} samples")

# Train Random Forest model
url_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    bootstrap=True,
    oob_score=True
)

print("\nTraining Random Forest... (this may take 1-2 minutes)")
url_model.fit(X_url_train, y_url_train)

# Evaluate
train_acc = url_model.score(X_url_train, y_url_train)
test_acc = url_model.score(X_url_test, y_url_test)
oob_score = url_model.oob_score_ if hasattr(url_model, 'oob_score_') else 0

print(f"\n✓ URL Model Training Complete")
print(f"  Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"  Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"  OOB Score: {oob_score:.4f} ({oob_score*100:.2f}%)")
print(f"  Classes: {list(url_label_encoder)}")

# Save URL model
with open('url_model.pkl', 'wb') as f:
    pickle.dump(url_model, f)
with open('url_feature_names.pkl', 'wb') as f:
    pickle.dump(url_feature_names, f)
with open('url_label_encoder.pkl', 'wb') as f:
    pickle.dump(url_label_encoder, f)
with open('url_trusted_domains.pkl', 'wb') as f:
    pickle.dump(TRUSTED_DOMAINS, f)

print(f"\n✓ Saved:")
print(f"  - url_model.pkl")
print(f"  - url_feature_names.pkl")
print(f"  - url_label_encoder.pkl")
print(f"  - url_trusted_domains.pkl\n")

CELL 9: Training URL Threat Detector
Training set: 360140 samples
Test set: 90036 samples

Training Random Forest... (this may take 1-2 minutes)

✓ URL Model Training Complete
  Training Accuracy: 0.9975 (99.75%)
  Test Accuracy: 0.9964 (99.64%)
  OOB Score: 0.9963 (99.63%)
  Classes: ['legitimate', 'phishing']

✓ Saved:
  - url_model.pkl
  - url_feature_names.pkl
  - url_label_encoder.pkl
  - url_trusted_domains.pkl



In [10]:
#Detailed Evaluation - URL Model
print("=" * 60)
print("CELL 10: URL Model - Detailed Evaluation")
print("=" * 60)

y_url_pred = url_model.predict(X_url_test)

print("\nClassification Report:")
print("-" * 60)
print(classification_report(y_url_test, y_url_pred, target_names=url_label_encoder))

print("\nConfusion Matrix:")
print("-" * 60)
cm = confusion_matrix(y_url_test, y_url_pred)
print(cm)
print(f"\nRows = Actual | Columns = Predicted")
print(f"Labels: {list(url_label_encoder)}")

# Feature importance
print("\nTop 15 Most Important Features:")
print("-" * 60)
feature_importance = sorted(
    zip(url_feature_names, url_model.feature_importances_),
    key=lambda x: x[1],
    reverse=True
)

for i, (feature, importance) in enumerate(feature_importance[:15], 1):
    bar = '█' * int(importance * 100)
    print(f"{i:2d}. {feature:30s} {importance:.4f} {bar}")

# Save feature importance
importance_df = pd.DataFrame(feature_importance, columns=['Feature', 'Importance'])
importance_df.to_csv('url_feature_importance.csv', index=False)
print(f"\n✓ Saved: url_feature_importance.csv\n")

CELL 10: URL Model - Detailed Evaluation

Classification Report:
------------------------------------------------------------
              precision    recall  f1-score   support

  legitimate       1.00      1.00      1.00     69148
    phishing       1.00      0.99      0.99     20888

    accuracy                           1.00     90036
   macro avg       1.00      0.99      0.99     90036
weighted avg       1.00      1.00      1.00     90036


Confusion Matrix:
------------------------------------------------------------
[[69097    51]
 [  271 20617]]

Rows = Actual | Columns = Predicted
Labels: ['legitimate', 'phishing']

Top 15 Most Important Features:
------------------------------------------------------------
 1. has_https                      0.3344 █████████████████████████████████
 2. has_http                       0.2716 ███████████████████████████
 3. subdomain_count                0.0971 █████████
 4. num_dots_domain                0.0765 ███████
 5. has_legitimate_tld

In [11]:
#Test Predictions

print("=" * 60)
print("CELL 11: Testing Predictions")
print("=" * 60)

# Test CTI model
print("\n1. CTI Report Model Test:")
print("-" * 60)
test_cti_samples = [
    {'sentiment': 0.9, 'severity': 1, 'expected': 'Low Risk'},
    {'sentiment': 0.5, 'severity': 3, 'expected': 'Medium Risk'},
    {'sentiment': 0.2, 'severity': 5, 'expected': 'High Risk'}
]

for i, sample in enumerate(test_cti_samples, 1):
    input_data = np.array([[sample['sentiment'], sample['severity']]])
    input_df = pd.DataFrame(input_data, columns=CTI_FEATURE_COLS)
    prediction = cti_model.predict(input_df)[0]
    predicted_class = cti_encoder.inverse_transform([prediction])[0]
    confidence = cti_model.predict_proba(input_df)[0][int(prediction)]
    
    print(f"  Test {i}: Sentiment={sample['sentiment']}, Severity={sample['severity']}")
    print(f"    → Predicted: {predicted_class} ({confidence*100:.1f}% confidence)")
    print(f"    → Expected: {sample['expected']}")

# Test URL model
print("\n2. URL Threat Model Test:")
print("-" * 60)
test_urls = [
    ('https://www.google.com', 'Safe'),
    ('http://secure-paypal-verify.tk', 'Phishing'),
    ('https://github.com/user/repo', 'Safe')
]

for i, (test_url, expected) in enumerate(test_urls, 1):
    features_dict = extract_url_features(test_url)
    features_list = [features_dict.get(name, 0) for name in url_feature_names]
    input_df = pd.DataFrame([features_list], columns=url_feature_names)
    
    prediction_proba = url_model.predict_proba(input_df)[0]
    prediction = int(np.argmax(prediction_proba))
    predicted_class = str(url_label_encoder[prediction])
    confidence = prediction_proba[prediction]
    
    print(f"  Test {i}: {test_url[:50]}...")
    print(f"    → Predicted: {predicted_class} ({confidence*100:.1f}% confidence)")
    print(f"    → Expected: {expected}")

print()

CELL 11: Testing Predictions

1. CTI Report Model Test:
------------------------------------------------------------
  Test 1: Sentiment=0.9, Severity=1
    → Predicted: Malware (27.0% confidence)
    → Expected: Low Risk
  Test 2: Sentiment=0.5, Severity=3
    → Predicted: DDoS (27.1% confidence)
    → Expected: Medium Risk
  Test 3: Sentiment=0.2, Severity=5
    → Predicted: DDoS (30.8% confidence)
    → Expected: High Risk

2. URL Threat Model Test:
------------------------------------------------------------
  Test 1: https://www.google.com...
    → Predicted: legitimate (97.8% confidence)
    → Expected: Safe
  Test 2: http://secure-paypal-verify.tk...
    → Predicted: phishing (100.0% confidence)
    → Expected: Phishing
  Test 3: https://github.com/user/repo...
    → Predicted: phishing (93.9% confidence)
    → Expected: Safe



In [12]:
#Final Summary

print("=" * 60)
print("CELL 12: Training Complete - Summary")
print("=" * 60)

print("\n✓ ALL MODELS TRAINED SUCCESSFULLY!\n")

print(" Model Performance Summary:")
print("-" * 60)
print(f"1. CTI Report Classifier:")
print(f"   - Accuracy: {cti_accuracy*100:.2f}%")
print(f"   - Classes: {len(cti_encoder.classes_)}")
print(f"   - Features: {len(CTI_FEATURE_COLS)}")

print(f"\n2. URL Threat Detector:")
print(f"   - Test Accuracy: {test_acc*100:.2f}%")
print(f"   - OOB Score: {oob_score*100:.2f}%")
print(f"   - Classes: {len(url_label_encoder)}")
print(f"   - Features: {len(url_feature_names)}")

print("\nGenerated Files:")
print("-" * 60)
generated_files = [
    'model.pkl',
    'feature_list.pkl',
    'threat_encoder.pkl',
    'url_model.pkl',
    'url_feature_names.pkl',
    'url_label_encoder.pkl',
    'url_trusted_domains.pkl',
    'url_feature_importance.csv'
]

for file in generated_files:
    exists = Path(file).exists()
    status = "✓" if exists else "✗"
    size = Path(file).stat().st_size if exists else 0
    size_kb = size / 1024
    print(f"  {status} {file} ({size_kb:.1f} KB)")

print("\nNext Steps:")
print("-" * 60)
print("1. Close this notebook")
print("2. Start backend: python app.py")
print("3. Start frontend: python -m http.server 8000")
print("4. Open: http://127.0.0.1:8000/index.html")

print("\n" + "=" * 60)
print("  CTI-NLP Enhanced Analyzer - Ready for Deployment")
print("=" * 60)

CELL 12: Training Complete - Summary

✓ ALL MODELS TRAINED SUCCESSFULLY!

 Model Performance Summary:
------------------------------------------------------------
1. CTI Report Classifier:
   - Accuracy: 21.84%
   - Classes: 4
   - Features: 2

2. URL Threat Detector:
   - Test Accuracy: 99.64%
   - OOB Score: 99.63%
   - Classes: 2
   - Features: 55

Generated Files:
------------------------------------------------------------
  ✓ model.pkl (0.9 KB)
  ✓ feature_list.pkl (0.1 KB)
  ✓ threat_encoder.pkl (0.3 KB)
  ✓ url_model.pkl (34437.6 KB)
  ✓ url_feature_names.pkl (0.9 KB)
  ✓ url_label_encoder.pkl (0.2 KB)
  ✓ url_trusted_domains.pkl (0.4 KB)
  ✓ url_feature_importance.csv (2.0 KB)

Next Steps:
------------------------------------------------------------
1. Close this notebook
2. Start backend: python app.py
3. Start frontend: python -m http.server 8000
4. Open: http://127.0.0.1:8000/index.html

  CTI-NLP Enhanced Analyzer - Ready for Deployment
