# Banking Trojan Detection ML Model

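This notebook trains an XGBoost classifier for detecting Android banking trojans from a combination of static and dynamic analysis features. The cells below use a synthetic dataset for demonstration, then evaluate the model, inspect feature importance, save the artifacts, and show a usage example.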

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category for the rest of the
# kernel session, including deprecation notices from pandas, scikit-learn
# and xgboost. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import joblib
import shap
from tqdm import tqdm

# Add parent directory to path for imports
# '..' is a relative entry, so it is resolved against the kernel's working
# directory at import time. Assumes the notebook is launched from a folder
# whose parent contains the two extractor modules below -- TODO confirm.
sys.path.append('..')
# NOTE(review): none of the four extractor helpers imported here is called in
# the cells visible in this notebook, yet a missing module makes this first
# cell fail. Confirm they are still needed.
from static_feature_extractor import extract_static_features, extract_features_batch
from dynamic_feature_extractor import extract_dynamic_features, create_mock_dynamic_features

# Select the matplotlib style sheet used by every figure in the notebook.
plt.style.use('seaborn-v0_8')
print("Libraries imported successfully!")

In [None]:
# Filesystem layout. All locations are relative to the notebook's working
# directory; MODELS_DIR receives the artifacts written at the end.
DATA_DIR = Path('../data')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
MODELS_DIR = Path('../models')

# Ensure the output folders exist. Creating PROCESSED_DIR with parents=True
# also creates DATA_DIR; exist_ok=True makes re-running the cell harmless.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

In [None]:
# Create synthetic dataset for demonstration
def create_synthetic_features(n_malware=200, n_benign=200, seed=42):
    """Build a labelled synthetic feature table for demonstration purposes.

    Every row is one fake sample: an identifier in ``sha256`` (a readable
    placeholder such as ``malware_0003``, not a real hash), ten numeric
    features, and a binary ``label`` (1 = malware, 0 = benign). Malware rows
    come first, followed by the benign rows.

    Args:
        n_malware: Number of malware rows to generate.
        n_benign: Number of benign rows to generate.
        seed: Seed for the function's private random generator. The default
            (42) reproduces the values of the earlier implementation, which
            called ``np.random.seed(42)``.

    Returns:
        pandas.DataFrame with the columns ``sha256``, the ten feature columns
        and ``label``. The columns are present even when zero rows are
        requested, so downstream column selection keeps working.
    """
    # A private legacy RandomState yields the same stream as the global
    # generator does after np.random.seed(seed), but leaves NumPy's global
    # random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Sampling spec per class, listed in draw order. The order decides which
    # random numbers each feature receives, so do not reorder the entries.
    #   ('int', low, high)   -> rng.randint(low, high)   (high is exclusive)
    #   ('flag', p0, p1)     -> rng.choice([0, 1], p=[p0, p1])
    #   ('float', low, high) -> rng.uniform(low, high)
    malware_spec = {
        'total_permissions': ('int', 15, 40),
        'sensitive_api_count': ('int', 10, 50),
        'obfuscation_score': ('int', 20, 200),
        'exported_components': ('int', 2, 15),
        'has_native_code': ('flag', 0.4, 0.6),
        'pkg_has_bank_keyword': ('flag', 0.3, 0.7),
        'sensitive_api_runtime': ('int', 10, 60),
        'suspicious_syscalls': ('int', 5, 40),
        'suspicious_domain_hits': ('int', 1, 8),
        'malicious_behavior_score': ('float', 10, 50),
    }
    benign_spec = {
        'total_permissions': ('int', 8, 25),
        'sensitive_api_count': ('int', 0, 15),
        'obfuscation_score': ('int', 0, 50),
        'exported_components': ('int', 0, 8),
        'has_native_code': ('flag', 0.6, 0.4),
        # NOTE(review): benign samples carry the bank keyword more often
        # (0.8) than malware (0.7). Confirm this is intended (e.g. benign
        # set = legitimate banking apps) and not swapped probabilities.
        'pkg_has_bank_keyword': ('flag', 0.2, 0.8),
        'sensitive_api_runtime': ('int', 0, 20),
        'suspicious_syscalls': ('int', 0, 15),
        'suspicious_domain_hits': ('int', 0, 3),
        'malicious_behavior_score': ('float', 0, 15),
    }

    samplers = {
        'int': lambda low, high: rng.randint(low, high),
        'flag': lambda p0, p1: rng.choice([0, 1], p=[p0, p1]),
        'float': lambda low, high: rng.uniform(low, high),
    }

    def _make_record(prefix, index, label, spec):
        # One row: identifier, the sampled features in spec order, the label.
        record = {'sha256': f'{prefix}_{index:04d}'}
        for name, (kind, first, second) in spec.items():
            record[name] = samplers[kind](first, second)
        record['label'] = label
        return record

    records = [_make_record('malware', i, 1, malware_spec) for i in range(n_malware)]
    records += [_make_record('benign', i, 0, benign_spec) for i in range(n_benign)]

    # Passing the columns explicitly keeps the schema stable for empty input.
    columns = ['sha256', *malware_spec, 'label']
    return pd.DataFrame(records, columns=columns)

# Generate the demo dataset with the default class sizes and summarise it
df = create_synthetic_features()
n_samples = len(df)
class_counts = df['label'].value_counts().to_dict()
print(f"Created dataset with {n_samples} samples")
print(f"Class distribution: {class_counts}")

In [None]:
# Model inputs are every column except the sample identifier and the target
non_feature_columns = ('sha256', 'label')
feature_columns = [name for name in df.columns if name not in non_feature_columns]
X = df.loc[:, feature_columns]
y = df.loc[:, 'label']

print(f"Feature matrix shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# ratio equal in both parts, and the fixed random_state makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Fit the gradient-boosted classifier on the training split
print("Training XGBoost model...")

# Hyperparameters collected in one place so they are easy to review and tune
xgb_params = dict(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Hard class labels plus the predicted probability of class 1 on the test split
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Report per-class precision/recall/F1 and the ROC-AUC on the held-out split
test_report = classification_report(y_test, y_pred, digits=4)
test_auc = roc_auc_score(y_test, y_pred_proba)
print("\nMODEL EVALUATION RESULTS")
print("=" * 40)
print(test_report)
print(f"ROC-AUC Score: {test_auc:.4f}")

In [None]:
# Rank the features by the importance scores reported by the fitted model
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
)
top_features = feature_importance.head(10)

# Horizontal bar chart of the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Most Important Features')
ax.set_xlabel('Feature Importance')
fig.tight_layout()
plt.show()

# Same ranking as plain text, for readers who only see the cell output
print("Top 10 Most Important Features:")
for name, score in zip(top_features['feature'], top_features['importance']):
    print(f"{name}: {score:.4f}")

In [None]:
# Output locations for the trained classifier and its feature metadata
model_path = MODELS_DIR / 'banking_trojan_detector.joblib'
feature_names_path = MODELS_DIR / 'feature_names.csv'
importance_path = MODELS_DIR / 'feature_importance.csv'

# Serialise the fitted classifier
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

# One-column CSV ('feature') listing the training columns in order
training_columns = pd.Series(X.columns, name='feature')
training_columns.to_csv(feature_names_path, index=False)
print(f"Feature names saved to: {feature_names_path}")

# Full importance ranking computed in the feature-importance cell
feature_importance.to_csv(importance_path, index=False)
print(f"Feature importance saved to: {importance_path}")

## Model Usage Example

In [None]:
# Example: Predict on new sample
def predict_sample(sample_features, clf=None):
    """Classify a single sample as malware or benign.

    Args:
        sample_features: Mapping of feature name -> value for one sample.
            When the classifier exposes ``feature_names_in_``, key order does
            not matter and keys the classifier was not trained on are
            ignored; otherwise the mapping is passed through in its own
            order.
        clf: Fitted classifier providing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``model``.

    Returns:
        Tuple ``(label, probability)``. ``label`` is ``"MALWARE"`` when the
        predicted class is 1 and ``"BENIGN"`` otherwise; ``probability`` is
        the predicted probability of class 1 as a plain ``float`` (so for a
        benign verdict it is expected to be low).

    Raises:
        ValueError: If the classifier exposes its training feature names and
            the sample lacks one or more of them.
    """
    if clf is None:
        clf = model  # notebook-level classifier trained in an earlier cell

    sample_df = pd.DataFrame([sample_features])

    # A one-row frame inherits the dict's key order. Align it with the order
    # used at fit time so a differently ordered dict does not trip the
    # classifier's feature-name validation, and fail early with a readable
    # message when something is missing.
    expected = getattr(clf, 'feature_names_in_', None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in sample_df.columns]
        if missing:
            raise ValueError(f"Sample is missing required features: {missing}")
        sample_df = sample_df[expected]

    prediction = clf.predict(sample_df)[0]
    probability = float(clf.predict_proba(sample_df)[0, 1])

    label = "MALWARE" if prediction == 1 else "BENIGN"
    return label, probability

# Hand-written sample whose values all fall inside the malware ranges of the
# synthetic generator; the classifier is expected to flag it.
suspicious_sample = dict(
    total_permissions=35,
    sensitive_api_count=45,
    obfuscation_score=150,
    exported_components=12,
    has_native_code=1,
    pkg_has_bank_keyword=1,
    sensitive_api_runtime=50,
    suspicious_syscalls=30,
    suspicious_domain_hits=5,
    malicious_behavior_score=40.0,
)

# The printed "confidence" is the probability returned by predict_sample
label, prob = predict_sample(suspicious_sample)
print(f"Prediction: {label} (confidence: {prob:.3f})")