In [3]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')



In [4]:
class LogPreprocessor:
    """Turn raw log records into a scaled numeric feature matrix.

    The output columns are, in order: the TF-IDF terms of ``message``, the
    five numeric columns listed in ``NUMERIC_COLUMNS`` (with ``latency_ms``
    standardised), and a rolling mean of ``failed_attempts``.

    The vectoriser, the latency statistics and the scaler are learned once
    (on the first call, or whenever ``fit=True``) and then re-used, so that
    data seen at inference time is projected into the same feature space as
    the training data.
    """

    # Numeric input columns, in the order they appear in the feature matrix.
    NUMERIC_COLUMNS = (
        'failed_attempts',
        'success_attempts',
        'latency_ms',
        'geolocation_risk',
        'privilege_level',
    )

    def __init__(self, window_size=5):
        """Create an unfitted preprocessor.

        Args:
            window_size: number of rows in the ``failed_attempts`` moving
                average.
        """
        self.tfidf = TfidfVectorizer(max_features=100)
        self.scaler = StandardScaler()
        self.window_size = window_size
        self.is_fitted = False
        # Latency statistics captured at fit time and re-used at inference.
        self._latency_mean = 0.0
        self._latency_std = 1.0

    def preprocess(self, logs, fit=None):
        """Process raw logs into structured features.

        Args:
            logs: DataFrame with a ``message`` column and every column named
                in ``NUMERIC_COLUMNS``. It is not modified.
            fit: ``True`` to (re)learn vocabulary and statistics from
                ``logs``; ``False`` to only apply what was learned earlier;
                ``None`` (default) to fit on the first call and transform on
                every later call.

        Returns:
            2-D float array with one row per log record.

        Raises:
            ValueError: if ``logs`` has no rows.
        """
        if len(logs) == 0:
            raise ValueError("logs must contain at least one row")
        if fit is None:
            fit = not getattr(self, 'is_fitted', False)

        # Text features from log messages.
        messages = logs['message'].fillna('').astype(str)
        vectorize = self.tfidf.fit_transform if fit else self.tfidf.transform
        text_features = vectorize(messages).toarray()

        # Standardise latency with statistics from the fitting data. A
        # per-batch z-score would be NaN for a single-row batch (std == 0).
        latency = logs['latency_ms'].to_numpy(dtype=float)
        if fit:
            self._latency_mean = float(np.mean(latency))
            spread = float(np.std(latency))
            self._latency_std = spread if spread > 0 else 1.0
        latency_z = (latency - self._latency_mean) / self._latency_std

        num_features = np.column_stack([
            logs['failed_attempts'].to_numpy(dtype=float),
            logs['success_attempts'].to_numpy(dtype=float),
            latency_z,
            logs['geolocation_risk'].to_numpy(dtype=float),
            logs['privilege_level'].to_numpy(dtype=float),
        ])

        # Temporal feature: moving average of failures. min_periods=1 keeps
        # the first rows finite instead of NaN; the result is kept local so
        # the caller's DataFrame is left untouched.
        temporal_features = (
            logs['failed_attempts']
            .rolling(window=self.window_size, min_periods=1)
            .mean()
            .to_numpy(dtype=float)
            .reshape(-1, 1)
        )

        # Combine all features and scale them.
        features = np.concatenate(
            [text_features, num_features, temporal_features], axis=1
        )
        scale = self.scaler.fit_transform if fit else self.scaler.transform
        scaled = scale(features)
        if fit:
            self.is_fitted = True
        return scaled

In [5]:
class ThreatDetector:
    """Ensemble threat detector over preprocessed log features.

    Combines an unsupervised IsolationForest with three supervised models
    (RandomForest, XGBoost and an LSTM over sliding windows of events) into
    a single weighted risk score.
    """

    # Number of consecutive events in one LSTM input window.
    SEQ_LENGTH = 10

    # Relative weight of each model in the composite score.
    MODEL_WEIGHTS = {
        'isolation': 0.3,
        'rf_proba': 0.25,
        'xgb_proba': 0.25,
        'lstm_proba': 0.2,
    }

    def __init__(self):
        self.models = {
            'isolation_forest': IsolationForest(n_estimators=100, contamination=0.05),
            'random_forest': RandomForestClassifier(n_estimators=150),
            'xgboost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            'lstm': self._build_lstm_model()
        }
        self.preprocessor = LogPreprocessor()

    def _build_lstm_model(self, n_features=15, seq_length=10):
        """Build and compile the LSTM used for sequential log analysis.

        Args:
            n_features: width of one preprocessed event vector.
            seq_length: number of events per input window.

        Returns:
            A compiled Keras ``Sequential`` model with a sigmoid output.
        """
        model = Sequential([
            LSTM(64, input_shape=(seq_length, n_features), return_sequences=True),
            Dropout(0.2),
            LSTM(32),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=Adam(0.001), loss='binary_crossentropy')
        return model

    def _lstm_matches(self, n_features, seq_length):
        """Return True when the current LSTM accepts (seq_length, n_features) windows."""
        shape = getattr(self.models.get('lstm'), 'input_shape', None)
        return shape is not None and tuple(shape[1:]) == (seq_length, n_features)

    def train(self, X, y=None):
        """Train the unsupervised model and, when labels are given, the supervised ones.

        Args:
            X: DataFrame of raw logs accepted by the preprocessor.
            y: optional array-like of 0/1 labels, one per row of ``X``.

        Raises:
            ValueError: if ``y`` is given but its length differs from ``X``.
        """
        # Start from a fresh preprocessor so vocabulary and scaling
        # statistics always come from the current training set.
        self.preprocessor = LogPreprocessor()
        X_processed = self.preprocessor.preprocess(X)

        # Unsupervised model needs no labels.
        self.models['isolation_forest'].fit(X_processed)

        if y is None:
            return

        y = np.asarray(y)
        if len(y) != len(X_processed):
            raise ValueError(
                f"X has {len(X_processed)} rows but y has {len(y)} labels"
            )

        X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2)

        # Train Random Forest
        self.models['random_forest'].fit(X_train, y_train)
        print(f"Random Forest Accuracy: {self.models['random_forest'].score(X_val, y_val):.2f}")

        # Train XGBoost
        self.models['xgboost'].fit(X_train, y_train)
        print(f"XGBoost Accuracy: {self.models['xgboost'].score(X_val, y_val):.2f}")

        # Train LSTM on sliding windows; skipped when there are fewer rows
        # than one full window.
        X_seq = self._create_sequences(X_processed, self.SEQ_LENGTH)
        if len(X_seq) == 0:
            return

        # The feature width depends on the fitted TF-IDF vocabulary, so
        # rebuild the network if the one from __init__ does not fit.
        n_features = X_processed.shape[1]
        if not self._lstm_matches(n_features, self.SEQ_LENGTH):
            self.models['lstm'] = self._build_lstm_model(n_features, self.SEQ_LENGTH)

        # Window i covers rows i .. i+SEQ_LENGTH-1; taking the trailing
        # labels pairs each window with the label of its last event.
        y_seq = y[len(y) - len(X_seq):]
        self.models['lstm'].fit(X_seq, y_seq, epochs=10, batch_size=32)

    def _create_sequences(self, data, seq_length=10):
        """Convert tabular data to overlapping windows for the LSTM.

        Args:
            data: 2-D array of shape (n_rows, n_features).
            seq_length: rows per window.

        Returns:
            Array of shape (n_rows - seq_length + 1, seq_length, n_features);
            the first dimension is 0 when there are fewer than ``seq_length``
            rows.
        """
        data = np.asarray(data)
        n_windows = len(data) - seq_length + 1
        if n_windows <= 0:
            return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        return np.stack([data[i:i + seq_length] for i in range(n_windows)])

    def predict_risk(self, new_logs):
        """Generate a composite threat risk score in [0, 1].

        Each tabular model contributes its mean score over all rows of
        ``new_logs``; the LSTM, when usable, contributes its score for the
        window formed by the last ``SEQ_LENGTH`` rows. The result is the
        weighted average of the available model scores, with the weights
        re-normalised when the LSTM is not used.

        Args:
            new_logs: DataFrame of raw logs accepted by the preprocessor.

        Returns:
            float risk score between 0 and 1.
        """
        X = self.preprocessor.preprocess(new_logs)

        # decision_function is higher for inliers, so invert it to get risk.
        isolation_raw = self.models['isolation_forest'].decision_function(X)
        scores = {
            'isolation': float(np.mean(1.0 - self._normalize(isolation_raw))),
            'rf_proba': float(np.mean(self.models['random_forest'].predict_proba(X)[:, 1])),
            'xgb_proba': float(np.mean(self.models['xgboost'].predict_proba(X)[:, 1])),
        }

        # LSTM prediction only when a full window exists and the network
        # was built for this feature width.
        lstm = self.models.get('lstm')
        if (
            lstm is not None
            and len(X) >= self.SEQ_LENGTH
            and self._lstm_matches(X.shape[1], self.SEQ_LENGTH)
        ):
            window = self._create_sequences(X[-self.SEQ_LENGTH:], self.SEQ_LENGTH)
            scores['lstm_proba'] = float(np.ravel(lstm.predict(window))[-1])

        # Weighted average over the models that actually produced a score.
        total_weight = sum(self.MODEL_WEIGHTS[name] for name in scores)
        weighted_sum = sum(self.MODEL_WEIGHTS[name] * value for name, value in scores.items())
        risk_score = weighted_sum / total_weight

        return float(np.clip(risk_score, 0.0, 1.0))

    def _normalize(self, scores):
        """Scale isolation forest scores from [-0.5, 0.5] to [0, 1], clipping outliers.

        NOTE(review): the [-0.5, 0.5] input range is taken from the original
        constants; confirm it matches the scores seen in practice.
        """
        scaled = (np.asarray(scores, dtype=float) - (-0.5)) / (0.5 - (-0.5))
        return np.clip(scaled, 0.0, 1.0)

    def save_models(self, path='models/'):
        """Persist the trained models and the fitted preprocessor.

        Args:
            path: prefix prepended to each file name (a directory path should
                end with a separator). Its directory part is created when
                missing.
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        joblib.dump(self.models['isolation_forest'], f'{path}isolation_forest.pkl')
        joblib.dump(self.models['random_forest'], f'{path}random_forest.pkl')
        joblib.dump(self.models['xgboost'], f'{path}xgboost.pkl')
        # The models are only meaningful together with the vocabulary and
        # scaling they were trained on, so store the preprocessor as well.
        joblib.dump(self.preprocessor, f'{path}preprocessor.pkl')
        self.models['lstm'].save(f'{path}lstm_model.h5')

In [6]:
class AnomalyMonitor:
    """Buffers incoming logs, scores them and recommends an action.

    Wraps a ``ThreatDetector``; when the composite risk score reaches
    ``threshold`` the recommended action is ``"key_rotation"``, otherwise
    ``"monitor"``.
    """

    # Names of the non-text feature columns, in the order the preprocessor
    # appends them after the TF-IDF terms.
    NUMERIC_FEATURE_NAMES = (
        'failed_attempts',
        'success_attempts',
        'latency_ms',
        'geolocation_risk',
        'privilege_level',
    )
    TEMPORAL_FEATURE_NAMES = ('rolling_failures',)

    def __init__(self, model_path=None, threshold=0.85, buffer_size=1000):
        """Create a monitor, optionally loading pre-trained models.

        Args:
            model_path: prefix passed to ``load_models``; nothing is loaded
                when falsy.
            threshold: risk score at or above which key rotation is advised.
            buffer_size: maximum number of most recent logs kept in memory.

        Raises:
            ValueError: if ``buffer_size`` is smaller than 1.
        """
        if buffer_size < 1:
            raise ValueError("buffer_size must be at least 1")
        self.detector = ThreatDetector()
        self.threshold = threshold  # Rotation threshold
        self.buffer_size = buffer_size
        self.log_buffer = []

        if model_path:
            self.load_models(model_path)

    def load_models(self, path):
        """Load pre-trained models stored under the ``path`` prefix.

        The three tree models are required. The fitted preprocessor and the
        LSTM are restored only when their files exist.

        SECURITY: ``joblib.load`` unpickles arbitrary objects; only load
        files from a trusted location.
        """
        self.detector.models['isolation_forest'] = joblib.load(f'{path}isolation_forest.pkl')
        self.detector.models['random_forest'] = joblib.load(f'{path}random_forest.pkl')
        self.detector.models['xgboost'] = joblib.load(f'{path}xgboost.pkl')

        # Without the fitted preprocessor, inference features would not line
        # up with the features the models were trained on.
        preprocessor_file = f'{path}preprocessor.pkl'
        if os.path.exists(preprocessor_file):
            self.detector.preprocessor = joblib.load(preprocessor_file)

        lstm_file = f'{path}lstm_model.h5'
        if os.path.exists(lstm_file):
            self.detector.models['lstm'] = load_model(lstm_file)

    def ingest_logs(self, logs):
        """Add incoming logs to the buffer and score the buffered window.

        Args:
            logs: iterable of log records (dicts with the preprocessor's
                input columns).

        Returns:
            dict with ``risk_score`` (float), ``action`` (``"key_rotation"``
            or ``"monitor"``) and ``top_features`` (name -> importance).
        """
        self.log_buffer.extend(logs)

        # Keep only the most recent logs to prevent memory overload.
        if len(self.log_buffer) > self.buffer_size:
            self.log_buffer = self.log_buffer[-self.buffer_size:]

        # Nothing to score yet.
        if not self.log_buffer:
            return {"risk_score": 0.0, "action": "monitor", "top_features": {}}

        log_df = pd.DataFrame(self.log_buffer)
        risk_score = self.detector.predict_risk(log_df)
        action = "key_rotation" if risk_score >= self.threshold else "monitor"

        return {
            "risk_score": risk_score,
            "action": action,
            "top_features": self._get_risk_factors(log_df)
        }

    def _feature_names(self, n_features):
        """Return one name per feature column, in feature-matrix order.

        Falls back to generic ``feature_<i>`` names when the names derived
        from the preprocessor do not match ``n_features``.
        """
        names = []
        tfidf = getattr(self.detector.preprocessor, 'tfidf', None)
        # vocabulary_ only exists once the vectoriser has been fitted.
        if tfidf is not None and hasattr(tfidf, 'vocabulary_'):
            names = [f'tfidf:{term}' for term in tfidf.get_feature_names_out()]
        names += list(self.NUMERIC_FEATURE_NAMES) + list(self.TEMPORAL_FEATURE_NAMES)

        if len(names) != n_features:
            names = [f'feature_{i}' for i in range(n_features)]
        return names

    def _get_risk_factors(self, logs, top_n=3):
        """Identify the globally most important features of the Random Forest.

        Args:
            logs: unused; kept so existing callers keep working.
            top_n: number of features to report.

        Returns:
            dict mapping feature name to importance, highest first; empty
            when the Random Forest has not been fitted.
        """
        forest = self.detector.models['random_forest']
        if not hasattr(forest, 'estimators_'):
            return {}

        importances = np.asarray(forest.feature_importances_)
        names = self._feature_names(len(importances))
        top_indices = np.argsort(importances)[::-1][:top_n]
        return {names[i]: float(importances[i]) for i in top_indices}

In [None]:
# Configuration for the demo run.
SEED = 42            # fixed seed so the simulated data is reproducible
N_SAMPLES = 300      # number of simulated training logs
MODEL_DIR = 'models/'
SAMPLE_MESSAGES = ["Failed login", "SSH brute force", "Login success"]

rng = np.random.default_rng(SEED)

# 1. Simulate training data (replace with real logs)
training_data = pd.DataFrame({
    'message': [SAMPLE_MESSAGES[i % len(SAMPLE_MESSAGES)] for i in range(N_SAMPLES)],
    'failed_attempts': rng.integers(0, 5, N_SAMPLES),
    'success_attempts': rng.integers(0, 2, N_SAMPLES),
    'latency_ms': rng.normal(200, 50, N_SAMPLES),
    'geolocation_risk': rng.uniform(0, 1, N_SAMPLES),
    'privilege_level': rng.choice([0, 1, 2], N_SAMPLES)
})
labels = rng.integers(0, 2, N_SAMPLES)  # Mock labels (0=normal, 1=threat)

# 2. Train models
detector = ThreatDetector()
detector.train(training_data, labels)
os.makedirs(MODEL_DIR, exist_ok=True)  # the target directory must exist before saving
detector.save_models(MODEL_DIR)

# 3. Real-time monitoring
monitor = AnomalyMonitor(model_path=MODEL_DIR)

new_logs = [{
    'message': "Repeated failed login attempts from unusual IP",
    'failed_attempts': 8,
    'success_attempts': 0,
    'latency_ms': 1200,
    'geolocation_risk': 0.9,
    'privilege_level': 2
}]

result = monitor.ingest_logs(new_logs)
print(f"Risk Score: {result['risk_score']:.2f} → Action: {result['action'].upper()}")
print("Key Risk Factors:", result['top_features'])