# Tenhle prvni zatím nejlíp to vystihnul, ale je to až moc statický a zatím nevím, jak to udělat více dynamicky


In [7]:
!pip install lime




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from lime.lime_text import LimeTextExplainer
import numpy as np
import pandas as pd
import regex as re
from IPython.display import display
import gc
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

class LogSecurityAnalyzer:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Load SecBERT specifically for sequence classification
        model_name = "jackaduma/SecBERT"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # We'll use a sequence classification model with 3 classes:
        # 0: Non-login event, 1: Failed login, 2: Successful login
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3,
            ignore_mismatched_sizes=True  # Needed because original SecBERT is for MLM
        ).to(self.device)
        
        # Separate embedding model (using the base SecBERT model)
        self.embedding_model = AutoModel.from_pretrained(model_name).to(self.device)
        
        self.class_names = ["Non-login event", "Failed login", "Successful login"]
        self.explainer = LimeTextExplainer(class_names=self.class_names)
        
        # Enhanced pattern matching
        self.success_patterns = [
            r"accepted\s+password", r"login\s+successful", 
            r"authentication\s+granted", r"sign[-_]in\s+successful",
            r"credentials\s+accepted", r"result\":\"SUCCESS\"",
            r"consolelogin\":\"success\"", r"status\":\"success\"",
            r"eventid=4624", r"access\s+granted",
            r"authentication\s+succeeded", r"user\s+logged\s+in"
        ]
        
        self.failure_patterns = [
            r"login\s+failed", r"authentication\s+failure",
            r"access\s+denied", r"sign[-_]in\s+denied",
            r"invalid\s+credentials", r"failed\s+password",
            r"result\":\"FAILURE\"", r"result\":\"DENIED\"",
            r"consolelogin\":\"failure\"", r"status\":\"failure\"",
            r"eventid=4625", r"authentication\s+unsuccessful",
            r"user\s+not\s+found", r"account\s+locked"
        ]
        
        # Compile regex patterns for better performance
        self.success_regex = [re.compile(pattern, re.IGNORECASE) for pattern in self.success_patterns]
        self.failure_regex = [re.compile(pattern, re.IGNORECASE) for pattern in self.failure_patterns]
    
    def _free_memory(self):
        torch.cuda.empty_cache()
        gc.collect()
    
    def _ensure_device_consistency(self, model_inputs):
        return {k: v.to(self.device) for k, v in model_inputs.items()}
    
    def _get_embedding(self, text):
        """Generate embedding for a given text"""
        with torch.no_grad():
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(self.device)
            outputs = self.embedding_model(**inputs)
            # Use mean pooling to get a single vector representation
            return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    
    def predict_proba(self, texts):
        if isinstance(texts, str):
            texts = [texts]
            
        try:
            encodings = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            )
            encodings = self._ensure_device_consistency(encodings)
            
            with torch.no_grad():
                outputs = self.model(**encodings)
                probabilities = torch.softmax(outputs.logits, dim=1)
            
            return probabilities.cpu().numpy()
        except RuntimeError as e:
            print(f"Prediction error: {str(e)}")
            return np.array([[1.0, 0.0, 0.0]])  # Default to non-login event
    
    def is_login_event(self, log_line):
        """Check if the log line contains any login-related keywords"""
        lower_log = log_line.lower()
        login_keywords = [
            'login', 'logon', 'authenticat', 'signin', 'sign-in',
            'session', 'access', 'credential', 'password', 'auth'
        ]
        return any(keyword in lower_log for keyword in login_keywords)
    
    def analyze_log(self, log_line):
        try:
            # First check for clear patterns
            for pattern in self.success_regex:
                if pattern.search(log_line):
                    return {
                        "log": log_line,
                        "prediction": "Successful login",
                        "confidence": 1.0,
                        "explanation": f"Matched success pattern: '{pattern.pattern}'"
                    }
                    
            for pattern in self.failure_regex:
                if pattern.search(log_line):
                    return {
                        "log": log_line,
                        "prediction": "Failed login",
                        "confidence": 1.0,
                        "explanation": f"Matched failure pattern: '{pattern.pattern}'"
                    }
            
            # Skip SecBERT if clearly not a login event
            if not self.is_login_event(log_line):
                return {
                    "log": log_line,
                    "prediction": "Non-login event",
                    "confidence": 1.0,
                    "explanation": "No login-related keywords found"
                }
            
            # Use SecBERT for ambiguous cases
            probabilities = self.predict_proba(log_line)[0]
            predicted_class = np.argmax(probabilities)
            confidence = probabilities[predicted_class]
            
            if confidence < 0.6:  # Low confidence threshold
                return {
                    "log": log_line,
                    "prediction": "Ambiguous event",
                    "confidence": float(confidence),
                    "explanation": "Low confidence classification"
                }
            
            # Generate explanation if needed
            original_device = self.device
            if original_device.type == 'cuda':
                self.model = self.model.to('cpu')
                self.device = torch.device('cpu')
            
            exp = self.explainer.explain_instance(
                log_line,
                self.predict_proba,
                num_features=5,
                labels=(predicted_class,))
            
            explanation = f"Key phrases: {', '.join([f[0] for f in exp.as_list()])}"
            
            if original_device.type == 'cuda':
                self.model = self.model.to(original_device)
                self.device = original_device
            
            self._free_memory()
            
            return {
                "log": log_line,
                "prediction": self.class_names[predicted_class],
                "confidence": float(confidence),
                "explanation": explanation
            }
            
        except Exception as e:
            self._free_memory()
            return {
                "log": log_line,
                "prediction": "Error",
                "confidence": 0.0,
                "explanation": f"Processing error: {str(e)}"
            }
    
    def analyze_logs(self, log_lines, batch_size=4):
        results = []
        progress_bar = tqdm(log_lines, desc="Analyzing logs")
        
        for i, log in enumerate(progress_bar):
            try:
                results.append(self.analyze_log(log))
                if (i + 1) % batch_size == 0:
                    self._free_memory()
            except Exception as e:
                print(f"Error processing log {i}: {str(e)}")
                results.append({
                    "log": log,
                    "prediction": "Error",
                    "confidence": 0.0,
                    "explanation": f"Processing error: {str(e)}"
                })
        return pd.DataFrame(results)

class EnhancedLogSecurityAnalyzer(LogSecurityAnalyzer):
    def __init__(self, success_logs_path='data/sample-logs/successful_login_logs.csv', 
                 failure_logs_path='data/sample-logs/failed_login_logs.csv'):
        super().__init__()
        
        # Load and preprocess reference logs
        self.success_logs = self._load_reference_logs(success_logs_path)
        self.failure_logs = self._load_reference_logs(failure_logs_path)
        
        # Precompute embeddings for reference logs
        self.success_embeddings = self._precompute_embeddings(self.success_logs['Log'])
        self.failure_embeddings = self._precompute_embeddings(self.failure_logs['Log'])
        
    def _load_reference_logs(self, filepath):
        """Load reference logs from CSV"""
        try:
            df = pd.read_csv(filepath)
            # Ensure required columns exist
            if 'Log' not in df.columns:
                raise ValueError("CSV must contain 'Log' column")
            return df
        except Exception as e:
            print(f"Error loading reference logs: {str(e)}")
            return pd.DataFrame(columns=['ID', 'Source', 'Log'])
    
    def _precompute_embeddings(self, logs):
        """Precompute embeddings for reference logs"""
        embeddings = []
        for log in logs:
            try:
                embedding = self._get_embedding(log)
                embeddings.append(embedding)
            except Exception as e:
                print(f"Error embedding log: {str(e)}")
                continue
        return np.vstack(embeddings) if embeddings else np.array([])
    
    def _calculate_semantic_similarity(self, log_text):
        """Calculate semantic similarity with reference logs"""
        if len(self.success_embeddings) == 0 or len(self.failure_embeddings) == 0:
            return {'success_similarity': 0.0, 'failure_similarity': 0.0}
        
        try:
            # Get embedding for the new log
            new_embedding = self._get_embedding(log_text)
            
            # Calculate similarity with success and failure logs
            success_sim = cosine_similarity(new_embedding, self.success_embeddings).mean()
            failure_sim = cosine_similarity(new_embedding, self.failure_embeddings).mean()
            
            return {
                'success_similarity': float(success_sim),
                'failure_similarity': float(failure_sim)
            }
        except Exception as e:
            print(f"Similarity calculation error: {str(e)}")
            return {'success_similarity': 0.0, 'failure_similarity': 0.0}
    
    def analyze_log(self, log_line):
        """Enhanced analysis with semantic similarity"""
        result = super().analyze_log(log_line)
        
        # Calculate semantic similarity
        similarities = self._calculate_semantic_similarity(log_line)
        
        # Add similarity scores to result
        result.update({
            'success_semantic_similarity': similarities['success_similarity'],
            'failure_semantic_similarity': similarities['failure_similarity']
        })
        
        # Adjust confidence based on semantic similarity if needed
        if result['prediction'] in ['Successful login', 'Failed login']:
            # If pattern matched but semantic similarity contradicts, lower confidence
            if (result['prediction'] == 'Successful login' and 
                similarities['failure_similarity'] > similarities['success_similarity']):
                result['confidence'] *= 0.7
            elif (result['prediction'] == 'Failed login' and 
                  similarities['success_similarity'] > similarities['failure_similarity']):
                result['confidence'] *= 0.7
        
        return result

def highlight_row(row):
    colors = {
        'Successful login': 'lightgreen',
        'Failed login': 'lightcoral',
        'Non-login event': 'lightblue',
        'Ambiguous event': 'lightyellow',
        'Error': 'yellow'
    }
    
    # Base color based on prediction
    base_color = colors.get(row['prediction'], '')
    
    # Additional highlighting for strong semantic matches
    if 'success_semantic_similarity' in row and row['success_semantic_similarity'] > 0.8:
        base_color = 'lime'  # Strong success match
    elif 'failure_semantic_similarity' in row and row['failure_semantic_similarity'] > 0.8:
        base_color = 'salmon'  # Strong failure match
        
    return ['background-color: ' + base_color] * len(row)

if __name__ == "__main__":
    analyzer = EnhancedLogSecurityAnalyzer(
        success_logs_path='successful_login_logs.csv',
        failure_logs_path='failed_login_logs.csv'
    )
    
    test_logs = [
        # Successful logins
        "May 18 09:05:23 ubuntu sshd[3456]: Accepted password for user1",
        '{"eventTime":"2025-05-18T09:20:55Z","eventName":"ConsoleLogin","responseElements":{"ConsoleLogin":"Success"}}',
        # Failed logins
        "2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed",
        '{"eventTime":"2025-05-18T10:25:56Z","responseElements":{"ConsoleLogin":"Failure"}}',
        # Other events
        '{"published":"2025-05-18T11:40:29Z","eventType":"user.repository.delete","outcome":{"result":"SUCCESS"}}',
        "2025-05-18 12:10:00 Fortinet FortiGate: system_reboot_initiated",
        # Edge cases
        "This is not a security log at all",
        "12424",
    ]
    
    results = analyzer.analyze_logs(test_logs)
    display(results.style.apply(highlight_row, axis=1))

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jackaduma/SecBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error loading reference logs: [Errno 2] No such file or directory: 'successful_login_logs.csv'
Error loading reference logs: [Errno 2] No such file or directory: 'failed_login_logs.csv'


Analyzing logs: 100%|██████████| 8/8 [00:00<00:00, 25.92it/s]


Unnamed: 0,log,prediction,confidence,explanation,success_semantic_similarity,failure_semantic_similarity
0,May 18 09:05:23 ubuntu sshd[3456]: Accepted password for user1,Successful login,1.0,Matched success pattern: 'accepted\s+password',0.0,0.0
1,"{""eventTime"":""2025-05-18T09:20:55Z"",""eventName"":""ConsoleLogin"",""responseElements"":{""ConsoleLogin"":""Success""}}",Successful login,1.0,"Matched success pattern: 'consolelogin\"":\""success\""'",0.0,0.0
2,2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed,Failed login,1.0,Matched failure pattern: 'eventid=4625',0.0,0.0
3,"{""eventTime"":""2025-05-18T10:25:56Z"",""responseElements"":{""ConsoleLogin"":""Failure""}}",Failed login,1.0,"Matched failure pattern: 'consolelogin\"":\""failure\""'",0.0,0.0
4,"{""published"":""2025-05-18T11:40:29Z"",""eventType"":""user.repository.delete"",""outcome"":{""result"":""SUCCESS""}}",Successful login,1.0,"Matched success pattern: 'result\"":\""SUCCESS\""'",0.0,0.0
5,2025-05-18 12:10:00 Fortinet FortiGate: system_reboot_initiated,Non-login event,1.0,No login-related keywords found,0.0,0.0
6,This is not a security log at all,Non-login event,1.0,No login-related keywords found,0.0,0.0
7,12424,Non-login event,1.0,No login-related keywords found,0.0,0.0


In [3]:
test_logs_bert = [
    # 5 successful login events
        "2025-05-18T09:00:12Z Windows Server: EventID=4624 Authentication granted for user021 IP=192.0.2.200",
        "May 18 09:05:23 ubuntu sshd[3456]: Access granted for user022 from 192.0.2.201 port 56321 ssh2",
        "2025-05-18 09:10:45 macOS loginwindow[6001]: Sign-in successful for user user023 (UID 560)",
        "2025-05-18 09:15:30 Cisco ASA: %ASA-6-722051: VPN Connection: credentials accepted: User=user024 IP=192.0.2.202 Duration=00:05:00",
        '{"eventTime":"2025-05-18T09:20:55Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user025"},"sourceIPAddress":"192.0.2.203","responseElements":{"ConsoleLogin":"Success"}}',

    # 10 failed login attempts
        "2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed for user026 IP=192.0.2.204",
        "May 18 10:05:12 ubuntu sshd[3567]: Access denied for user027 from 192.0.2.205 port 57322 ssh2",
        "2025-05-18 10:10:23 macOS loginwindow[6002]: Sign-in denied for user user028 (UID 561)",
        "2025-05-18 10:15:34 Cisco ASA: %ASA-6-722051: VPN Connection: authentication unsuccessful: User=user029 IP=192.0.2.206 Duration=00:00:30",
        "2025/05/18 10:20:45,001801000014,SYSTEM,gplogin,0,2025/05/18 10:20:45,192.0.2.207,10.0.0.2,Login,globalprotect,user=user030,tunnel=no,result=denied",
        '{"eventTime":"2025-05-18T10:25:56Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user031"},"sourceIPAddress":"192.0.2.208","responseElements":{"ConsoleLogin":"Failure"}}',
        '{"TimeGenerated":"2025-05-18T10:30:07Z","UserPrincipalName":"user032@example.com","AppDisplayName":"Azure Portal","Status":{"value":"1","additionalDetails":"Failure"},"IPAddress":"192.0.2.209"}',
        "timestamp=2025-05-18T10:35:18Z event=login login_type=authorized principal=user033@example.com ip_address=192.0.2.210 outcome=FAILURE",
        '{"published":"2025-05-18T10:40:29Z","eventType":"user.session.start","outcome":{"result":"DENIED"},"actor":{"displayName":"user034"},"client":{"ipAddress":"192.0.2.211"}}',
        "2025-05-18T10:45:40,user035@example.com,Login,login.salesforce.com,Failed,192.0.2.212,OAuth",

    # 15 other types of events
        "2025-05-18T11:00:00Z Windows Server: EventID=4647 User logoff for user021",
        "May 18 11:05:12 ubuntu sshd[4000]: session closed for user022",
        "2025-05-18 11:10:23 macOS loginwindow[6003]: User logout for user user023 (UID 562)",
        "2025-05-18 11:15:34 Cisco ASA: %ASA-6-722052: VPN Logout: User=user024 IP=192.0.2.202 Duration=00:05:10",
        "2025/05/18 11:20:45,001801000015,SYSTEM,gplogin,0,2025/05/18 11:20:45,192.0.2.207,10.0.0.2,Logout,globalprotect,user=user026,tunnel=yes",
        '{"eventTime":"2025-05-18T11:25:56Z","eventName":"ModifyUser","userIdentity":{"type":"IAMUser","userName":"user027"},"sourceIPAddress":"192.0.2.208","requestParameters":{"groupName":"Admins"}}',
        '{"TimeGenerated":"2025-05-18T11:30:07Z","Operation":"UserLoggedOut","UserId":"user028@example.com","ClientIP":"192.0.2.209"}',
        "timestamp=2025-05-18T11:35:18Z event=logout principal=user029@example.com ip_address=192.0.2.210",
        '{"published":"2025-05-18T11:40:29Z","eventType":"user.repository.delete","outcome":{"result":"SUCCESS"},"actor":{"displayName":"user030"},"client":{"ipAddress":"192.0.2.211"}}',
        "2025-05-18T11:45:40,user031@example.com,API,updateRecord,login.salesforce.com,Success",
        '{"CreationTime":"2025-05-18T11:50:00","Operation":"FileDownloaded","UserId":"user032@example.com","ClientIP":"192.0.2.212","ItemName":"report.pdf"}',
        "2025-05-18T11:55:00 UTC [3900]: [user033]@hrdb LOG:  statement: SELECT * FROM employees WHERE department='Sales';",
        "2025-05-18T12:00:00Z 50 Query user034@192.0.2.213 on payrolldb: execute UPDATE payroll SET amount=5000;",
        "192.0.2.91 - - [18/May/2025:12:05:00 +0000] \"GET /api/data HTTP/1.1\" 200 1285 \"-\" \"curl/7.68.0\"",
        "2025-05-18 12:10:00 Fortinet FortiGate device_id=FGT002 log_id=0100030001 type=event subtype=system level=notice action=system_reboot_initiated"
    ]



results = analyzer.analyze_logs(test_logs_bert)
display(results.style.apply(highlight_row, axis=1))

Analyzing logs: 100%|██████████| 30/30 [00:01<00:00, 20.33it/s]


Unnamed: 0,log,prediction,confidence,explanation,success_semantic_similarity,failure_semantic_similarity
0,2025-05-18T09:00:12Z Windows Server: EventID=4624 Authentication granted for user021 IP=192.0.2.200,Successful login,1.0,Matched success pattern: 'authentication\s+granted',0.583255,0.51965
1,May 18 09:05:23 ubuntu sshd[3456]: Access granted for user022 from 192.0.2.201 port 56321 ssh2,Successful login,1.0,Matched success pattern: 'access\s+granted',0.526285,0.497641
2,2025-05-18 09:10:45 macOS loginwindow[6001]: Sign-in successful for user user023 (UID 560),Successful login,1.0,Matched success pattern: 'sign[-_]in\s+successful',0.509473,0.486721
3,2025-05-18 09:15:30 Cisco ASA: %ASA-6-722051: VPN Connection: credentials accepted: User=user024 IP=192.0.2.202 Duration=00:05:00,Successful login,1.0,Matched success pattern: 'credentials\s+accepted',0.588285,0.548283
4,"{""eventTime"":""2025-05-18T09:20:55Z"",""eventName"":""ConsoleLogin"",""userIdentity"":{""type"":""IAMUser"",""userName"":""user025""},""sourceIPAddress"":""192.0.2.203"",""responseElements"":{""ConsoleLogin"":""Success""}}",Successful login,1.0,"Matched success pattern: 'consolelogin\"":\""success\""'",0.601685,0.598055
5,2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed for user026 IP=192.0.2.204,Failed login,0.7,Matched failure pattern: 'eventid=4625',0.59148,0.536765
6,May 18 10:05:12 ubuntu sshd[3567]: Access denied for user027 from 192.0.2.205 port 57322 ssh2,Failed login,0.7,Matched failure pattern: 'access\s+denied',0.550402,0.534344
7,2025-05-18 10:10:23 macOS loginwindow[6002]: Sign-in denied for user user028 (UID 561),Failed login,0.7,Matched failure pattern: 'sign[-_]in\s+denied',0.542227,0.515524
8,2025-05-18 10:15:34 Cisco ASA: %ASA-6-722051: VPN Connection: authentication unsuccessful: User=user029 IP=192.0.2.206 Duration=00:00:30,Failed login,0.7,Matched failure pattern: 'authentication\s+unsuccessful',0.596626,0.552554
9,"2025/05/18 10:20:45,001801000014,SYSTEM,gplogin,0,2025/05/18 10:20:45,192.0.2.207,10.0.0.2,Login,globalprotect,user=user030,tunnel=no,result=denied",Ambiguous event,0.533959,Low confidence classification,0.554545,0.542106


In [2]:
sample_logs = [
    "May 26 10:00:45 server sshd[1234]: Failed password for root",
    "User admin logged in successfully",
    "Connection closed by 192.168.1.2",
    "Invalid user guest from 10.0.0.1",
    "Session opened for user admin by (uid=0)",
]


results = analyzer.analyze_logs(sample_logs)
display(results.style.apply(highlight_row, axis=1))

Analyzing logs: 100%|██████████| 5/5 [00:00<00:00, 14.06it/s]


Unnamed: 0,log,prediction,confidence,explanation,success_semantic_similarity,failure_semantic_similarity
0,May 26 10:00:45 server sshd[1234]: Failed password for root,Failed login,1.0,Matched failure pattern: 'failed\s+password',0.0,0.0
1,User admin logged in successfully,Non-login event,1.0,No login-related keywords found,0.0,0.0
2,Connection closed by 192.168.1.2,Non-login event,1.0,No login-related keywords found,0.0,0.0
3,Invalid user guest from 10.0.0.1,Non-login event,1.0,No login-related keywords found,0.0,0.0
4,Session opened for user admin by (uid=0),Ambiguous event,0.354891,Low confidence classification,0.0,0.0


In [None]:
second_test_logs = [
    "<Event><System><EventID>4624</EventID></System><EventData><Data Name=\"SubjectUserName\">user000</Data><Data Name=\"IpAddress\">192.0.2.0</Data></EventData></Event>",
    "May 08 12:03:00 ubuntu sshd[2584]: Accepted password for user001 from 192.0.2.7 port 51763 ssh2",
    "2025-05-08 loginwindow[5333]: User login succeeded for user user002 (UID 559)",
    "May 08 2025 12:09:00: %ASA-6-722051: VPN Login: User=user003 IP=192.0.2.21 Duration=00:00:00 GroupPolicy=vpngroup",
    "2025/05/08 12:12:00,001801000013,SYSTEM,gplogin,0,2025/05/08 12:12:00,192.0.2.28,10.0.0.1,Login,globalprotect,user=user004,tunnel=yes,result=success",
    "{\"eventTime\": \"2025-05-08T12:15:00Z\", \"eventName\": \"ConsoleLogin\", \"userIdentity\": {\"type\": \"IAMUser\", \"userName\": \"user005\"}, \"sourceIPAddress\": \"192.0.2.35\", \"responseElements\": {\"ConsoleLogin\": \"Success\"}}",
    "{\"TimeGenerated\": \"2025-05-08T12:18:00Z\", \"UserPrincipalName\": \"user006@example.com\", \"AppDisplayName\": \"Azure Portal\", \"Status\": {\"value\": \"0\", \"additionalDetails\": \"Success\"}, \"IPAddress\": \"192.0.2.42\"}",
    "timestamp=2025-05-08T12:21:00Z event=login login_type=authorized principal=user007@example.com ip_address=192.0.2.49 outcome=SUCCESS",
    "{\"published\": \"2025-05-08T12:24:00Z\", \"eventType\": \"user.session.start\", \"outcome\": {\"result\": \"SUCCESS\"}, \"actor\": {\"displayName\": \"user008\"}, \"client\": {\"ipAddress\": \"192.0.2.56\"}}",
    "\"2025-05-08T12:27:00\",\"user009@example.com\",\"Login\",\"login.salesforce.com\",\"Successful\",\"192.0.2.63\",\"OAuth\"",
    "{\"CreationTime\": \"2025-05-08T12:30:00\", \"Operation\": \"UserLoggedIn\", \"UserId\": \"user010@example.com\", \"ClientIP\": \"192.0.2.70\", \"ResultStatus\": \"Succeeded\"}",
    "2025-05-08T12:33:00 UTC [3803]: [user011]@hrdb LOG:  connection authorized: user=user011 database=hrdb SSL enabled",
    "2025-05-08T12:36:00Z  49 Connect user012@192.0.2.84 on payrolldb",
    "192.0.2.91 - user013 [08/May/2025:12:39:00 +0000] \"POST /login HTTP/1.1\" 302 518 \"-\" \"Mozilla/5.0\"",
    "8.May.2025 12:42:00 SAPR3 DEV100 0 USER_LOGON      user014 192.0.2.98 SUCCESS",
    "8-MAY-2025 12:45:00 * (CONNECT_DATA=(SERVICE_NAME=prod)) (ADDRESS=(PROTOCOL=tcp)(HOST=192.0.2.105)(PORT=48773)) * establish * user015 * 0",
    "date=2025-05-08 time=12:48:00 device_id=FGT001 log_id=0100020001 type=event subtype=system level=notice vd=root user=user016 ui=ssh action=login status=success srcip=192.0.2.112",
    "2025-05-08 12:51:00 srx01 sshd[7482]: Accepted password for user017 from 192.0.2.119 port 55035 ssh2",
    "May 10 2025 12:54:00 bigip notice tmm[3287]: 01490100:5: AUDIT - user user018 (partition[Common]) from 192.0.2.126, Successful login.",
    "{\"@timestamp\": \"2025-05-08T12:57:00Z\", \"action\": \"user.login\", \"actor\": \"user019\", \"result\": \"success\", \"ip\": \"192.0.2.133\"}",
    "<Event><System><EventID>4624</EventID></System><EventData><Data Name=\"SubjectUserName\">user020</Data><Data Name=\"IpAddress\">192.0.2.140</Data></EventData></Event>",
    "May 08 13:03:00 ubuntu sshd[4593]: Accepted password for user021 from 192.0.2.147 port 44532 ssh2",
    "2025-05-08 loginwindow[2489]: User login succeeded for user user022 (UID 520)",
    "May 08 2025 13:09:00: %ASA-6-722051: VPN Login: User=user023 IP=192.0.2.161 Duration=00:00:00 GroupPolicy=vpngroup",
    "2025/05/08 13:12:00,001801000013,SYSTEM,gplogin,0,2025/05/08 13:12:00,192.0.2.168,10.0.0.1,Login,globalprotect,user=user024,tunnel=yes,result=success",
    "{\"eventTime\": \"2025-05-08T13:15:00Z\", \"eventName\": \"ConsoleLogin\", \"userIdentity\": {\"type\": \"IAMUser\", \"userName\": \"user025\"}, \"sourceIPAddress\": \"192.0.2.175\", \"responseElements\": {\"ConsoleLogin\": \"Success\"}}",
    "{\"TimeGenerated\": \"2025-05-08T13:18:00Z\", \"UserPrincipalName\": \"user026@example.com\", \"AppDisplayName\": \"Azure Portal\", \"Status\": {\"value\": \"0\", \"additionalDetails\": \"Success\"}, \"IPAddress\": \"192.0.2.182\"}",
    "timestamp=2025-05-08T13:21:00Z event=login login_type=authorized principal=user027@example.com ip_address=192.0.2.189 outcome=SUCCESS",
    "{\"published\": \"2025-05-08T13:24:00Z\", \"eventType\": \"user.session.start\", \"outcome\": {\"result\": \"SUCCESS\"}, \"actor\": {\"displayName\": \"user028\"}, \"client\": {\"ipAddress\": \"192.0.2.196\"}}",
    "\"2025-05-08T13:27:00\",\"user029@example.com\",\"Login\",\"login.salesforce.com\",\"Successful\",\"192.0.2.203\",\"OAuth\""
]



results = analyzer.analyze_logs(second_test_logs)
display(results.style.apply(highlight_row, axis=1))

Analyzing logs:  13%|█▎        | 4/30 [00:22<00:01, 18.10it/s]

In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from lime.lime_text import LimeTextExplainer
import numpy as np
import pandas as pd
from IPython.display import display
import gc
from tqdm import tqdm
import re
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel

class LogSecurityAnalyzer:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
        self.classifier = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=2
        ).to(self.device)
        
        self.embedding_model = BertModel.from_pretrained('bert-base-uncased').to(self.device)
        
        self.class_names = ["Failed login", "Successful login"]
        
        self.explainer = LimeTextExplainer(class_names=self.class_names)
        
        # Common patterns
        self.success_patterns = [
            "accepted password for", "login successful", 
            "authentication granted", "sign-in successful",
            "credentials accepted", "result\":\"SUCCESS\"",
            "consolelogin\":\"success\"", "status\":\"success\""
        ]
        
        self.failure_patterns = [
            "login failed", "authentication failure",
            "access denied", "sign-in denied",
            "invalid credentials", "failed password for",
            "result\":\"FAILURE\"", "result\":\"DENIED\"",
            "consolelogin\":\"failure\"", "status\":\"failure\""
        ]
        
        # Reference embeddings for cosine similarity
        self.reference_embeddings = {
            "success": self._get_embedding("successful login authentication granted"),
            "failure": self._get_embedding("failed login authentication denied")
        }
    
    def _free_memory(self):
        torch.cuda.empty_cache()
        gc.collect()
    
    def _get_embedding(self, text):
        with torch.no_grad():
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(self.device)
            outputs = self.embedding_model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    
    def _calculate_cosine_similarity(self, text):
        """Calculate cosine similarity with reference embeddings"""
        try:
            embedding = self._get_embedding(text)
            similarities = {
                "success": cosine_similarity(embedding, self.reference_embeddings["success"])[0][0],
                "failure": cosine_similarity(embedding, self.reference_embeddings["failure"])[0][0]
            }
            return similarities
        except Exception as e:
            print(f"Similarity calculation error: {str(e)}")
            return {"success": 0.0, "failure": 0.0}
    
    def predict_proba(self, texts):
        """Predict probabilities with device safety"""
        if isinstance(texts, str):
            texts = [texts]
            
        try:
            # Tokenize and move to device
            encodings = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(self.device)
            
            with torch.no_grad():
                outputs = self.classifier(**encodings)
                probabilities = torch.softmax(outputs.logits, dim=1)
            
            return probabilities.cpu().numpy()
        except RuntimeError as e:
            print(f"Prediction error: {str(e)}")
            return np.array([[0.5, 0.5]])  # Neutral fallback
    
    def analyze_log(self, log_line):
        """Analyze a single log line with device safety"""
        try:
            lower_log = log_line.lower()
            
            for pattern in self.success_patterns:
                if pattern.lower() in lower_log:
                    similarities = self._calculate_cosine_similarity(log_line)
                    return {
                        "log": log_line,
                        "prediction": "Successful login",
                        "confidence": 1.0,
                        "explanation": f"Matched success pattern: '{pattern}'",
                        "cosine_similarity_success": similarities["success"],
                        "cosine_similarity_failure": similarities["failure"]
                    }
                    
            for pattern in self.failure_patterns:
                if pattern.lower() in lower_log:
                    similarities = self._calculate_cosine_similarity(log_line)
                    return {
                        "log": log_line,
                        "prediction": "Failed login",
                        "confidence": 1.0,
                        "explanation": f"Matched failure pattern: '{pattern}'",
                        "cosine_similarity_success": similarities["success"],
                        "cosine_similarity_failure": similarities["failure"]
                    }
            
            probabilities = self.predict_proba(log_line)[0]
            predicted_class = np.argmax(probabilities)
            confidence = probabilities[predicted_class]
            similarities = self._calculate_cosine_similarity(log_line)
            

            if confidence < 0.7:
                return {
                    "log": log_line,
                    "prediction": "Other event",
                    "confidence": float(confidence),
                    "explanation": "Not recognized as a login event",
                    "cosine_similarity_success": similarities["success"],
                    "cosine_similarity_failure": similarities["failure"]
                }
            
            original_device = self.device
            if original_device.type == 'cuda':
                self.classifier = self.classifier.to('cpu')
                self.device = torch.device('cpu')
            
            exp = self.explainer.explain_instance(
                log_line,
                self.predict_proba,
                num_features=5,
                labels=(predicted_class,))
            
            explanation = f"Key phrases: {', '.join([f[0] for f in exp.as_list()])}"

            if original_device.type == 'cuda':
                self.classifier = self.classifier.to(original_device)
                self.device = original_device
            
            self._free_memory()
            
            return {
                "log": log_line,
                "prediction": self.class_names[predicted_class],
                "confidence": float(confidence),
                "explanation": explanation,
                "cosine_similarity_success": similarities["success"],
                "cosine_similarity_failure": similarities["failure"]
            }
            
        except Exception as e:
            self._free_memory()
            return {
                "log": log_line,
                "prediction": "Other event",
                "confidence": 0.0,
                "explanation": f"Not a login event: {str(e)}",
                "cosine_similarity_success": 0.0,
                "cosine_similarity_failure": 0.0
            }
    
    def analyze_logs(self, log_lines, batch_size=4):
        results = []
        progress_bar = tqdm(log_lines, desc="Analyzing logs")
        
        for i, log in enumerate(progress_bar):
            try:
                results.append(self.analyze_log(log))
                if (i + 1) % batch_size == 0:
                    self._free_memory()
            except Exception as e:
                print(f"Error processing log {i}: {str(e)}")
                results.append({
                    "log": log,
                    "prediction": "Error",
                    "confidence": 0.0,
                    "explanation": f"Processing error: {str(e)}",
                    "cosine_similarity_success": 0.0,
                    "cosine_similarity_failure": 0.0
                })
        return pd.DataFrame(results)

# Example usage
if __name__ == "__main__":
    analyzer = LogSecurityAnalyzer()
    
    test_logs = [
        # Successful logins
        "May 18 09:05:23 ubuntu sshd[3456]: Accepted password for user1",
        '{"eventTime":"2025-05-18T09:20:55Z","eventName":"ConsoleLogin","responseElements":{"ConsoleLogin":"Success"}}',
        
        # Failed logins
        "2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed",
        '{"eventTime":"2025-05-18T10:25:56Z","responseElements":{"ConsoleLogin":"Failure"}}',
        
        # Other events
        '{"published":"2025-05-18T11:40:29Z","eventType":"user.repository.delete","outcome":{"result":"SUCCESS"}}',
        "2025-05-18 12:10:00 Fortinet FortiGate: system_reboot_initiated",
        "jmore tohle - neni - login success - mocka"
    ]
    
    test_logs_bert = [
        # 5 successful login events
        "2025-05-18T09:00:12Z Windows Server: EventID=4624 Authentication granted for user021 IP=192.0.2.200",
        "May 18 09:05:23 ubuntu sshd[3456]: Access granted for user022 from 192.0.2.201 port 56321 ssh2",
        "2025-05-18 09:10:45 macOS loginwindow[6001]: Sign-in successful for user user023 (UID 560)",
        "2025-05-18 09:15:30 Cisco ASA: %ASA-6-722051: VPN Connection: credentials accepted: User=user024 IP=192.0.2.202 Duration=00:05:00",
        '{"eventTime":"2025-05-18T09:20:55Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user025"},"sourceIPAddress":"192.0.2.203","responseElements":{"ConsoleLogin":"Success"}}',

        # 10 failed login attempts
        "2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed for user026 IP=192.0.2.204",
        "May 18 10:05:12 ubuntu sshd[3567]: Access denied for user027 from 192.0.2.205 port 57322 ssh2",
        "2025-05-18 10:10:23 macOS loginwindow[6002]: Sign-in denied for user user028 (UID 561)",
        "2025-05-18 10:15:34 Cisco ASA: %ASA-6-722051: VPN Connection: authentication unsuccessful: User=user029 IP=192.0.2.206 Duration=00:00:30",
        "2025/05/18 10:20:45,001801000014,SYSTEM,gplogin,0,2025/05/18 10:20:45,192.0.2.207,10.0.0.2,Login,globalprotect,user=user030,tunnel=no,result=denied",
        '{"eventTime":"2025-05-18T10:25:56Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user031"},"sourceIPAddress":"192.0.2.208","responseElements":{"ConsoleLogin":"Failure"}}',
        '{"TimeGenerated":"2025-05-18T10:30:07Z","UserPrincipalName":"user032@example.com","AppDisplayName":"Azure Portal","Status":{"value":"1","additionalDetails":"Failure"},"IPAddress":"192.0.2.209"}',
        "timestamp=2025-05-18T10:35:18Z event=login login_type=authorized principal=user033@example.com ip_address=192.0.2.210 outcome=FAILURE",
        '{"published":"2025-05-18T10:40:29Z","eventType":"user.session.start","outcome":{"result":"DENIED"},"actor":{"displayName":"user034"},"client":{"ipAddress":"192.0.2.211"}}',
        "2025-05-18T10:45:40,user035@example.com,Login,login.salesforce.com,Failed,192.0.2.212,OAuth",

        # 15 other types 
        "2025-05-18T11:00:00Z Windows Server: EventID=4647 User logoff for user021",
        "May 18 11:05:12 ubuntu sshd[4000]: session closed for user022",
        "2025-05-18 11:10:23 macOS loginwindow[6003]: User logout for user user023 (UID 562)",
        "2025-05-18 11:15:34 Cisco ASA: %ASA-6-722052: VPN Logout: User=user024 IP=192.0.2.202 Duration=00:05:10",
        "2025/05/18 11:20:45,001801000015,SYSTEM,gplogin,0,2025/05/18 11:20:45,192.0.2.207,10.0.0.2,Logout,globalprotect,user=user026,tunnel=yes",
        '{"eventTime":"2025-05-18T11:25:56Z","eventName":"ModifyUser","userIdentity":{"type":"IAMUser","userName":"user027"},"sourceIPAddress":"192.0.2.208","requestParameters":{"groupName":"Admins"}}',
        '{"TimeGenerated":"2025-05-18T11:30:07Z","Operation":"UserLoggedOut","UserId":"user028@example.com","ClientIP":"192.0.2.209"}',
        "timestamp=2025-05-18T11:35:18Z event=logout principal=user029@example.com ip_address=192.0.2.210",
        '{"published":"2025-05-18T11:40:29Z","eventType":"user.repository.delete","outcome":{"result":"SUCCESS"},"actor":{"displayName":"user030"},"client":{"ipAddress":"192.0.2.211"}}',
        "2025-05-18T11:45:40,user031@example.com,API,updateRecord,login.salesforce.com,Success",
        '{"CreationTime":"2025-05-18T11:50:00","Operation":"FileDownloaded","UserId":"user032@example.com","ClientIP":"192.0.2.212","ItemName":"report.pdf"}',
        "2025-05-18T11:55:00 UTC [3900]: [user033]@hrdb LOG:  statement: SELECT * FROM employees WHERE department='Sales';",
        "2025-05-18T12:00:00Z 50 Query user034@192.0.2.213 on payrolldb: execute UPDATE payroll SET amount=5000;",
        "192.0.2.91 - - [18/May/2025:12:05:00 +0000] \"GET /api/data HTTP/1.1\" 200 1285 \"-\" \"curl/7.68.0\"",
        "2025-05-18 12:10:00 Fortinet FortiGate device_id=FGT002 log_id=0100030001 type=event subtype=system level=notice action=system_reboot_initiated"
    ]
    
    results = analyzer.analyze_logs(test_logs)
    
    def highlight_row(row):
        if row['prediction'] == 'Successful login':
            return ['background-color: lightgreen'] * len(row)
        elif row['prediction'] == 'Failed login':
            return ['background-color: lightcoral'] * len(row)
        elif row['prediction'] == 'Error':
            return ['background-color: yellow'] * len(row)
        else:
            return [''] * len(row)
    
    # Format cosine similarity columns to show 4 decimal places
    styled_results = results.style.format({
        'cosine_similarity_success': '{:.4f}',
        'cosine_similarity_failure': '{:.4f}',
        'confidence': '{:.4f}'
    }).apply(highlight_row, axis=1)
    
    display(styled_results)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Analyzing logs: 100%|██████████| 7/7 [00:00<00:00, 12.95it/s]


Unnamed: 0,log,prediction,confidence,explanation,cosine_similarity_success,cosine_similarity_failure
0,May 18 09:05:23 ubuntu sshd[3456]: Accepted password for user1,Successful login,1.0,Matched success pattern: 'accepted password for',0.6074,0.6424
1,"{""eventTime"":""2025-05-18T09:20:55Z"",""eventName"":""ConsoleLogin"",""responseElements"":{""ConsoleLogin"":""Success""}}",Successful login,1.0,"Matched success pattern: 'consolelogin"":""success""'",0.5822,0.6315
2,2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed,Other event,0.6155,Not recognized as a login event,0.6341,0.6779
3,"{""eventTime"":""2025-05-18T10:25:56Z"",""responseElements"":{""ConsoleLogin"":""Failure""}}",Failed login,1.0,"Matched failure pattern: 'consolelogin"":""failure""'",0.5843,0.6408
4,"{""published"":""2025-05-18T11:40:29Z"",""eventType"":""user.repository.delete"",""outcome"":{""result"":""SUCCESS""}}",Successful login,1.0,"Matched success pattern: 'result"":""SUCCESS""'",0.5753,0.6239
5,2025-05-18 12:10:00 Fortinet FortiGate: system_reboot_initiated,Other event,0.6181,Not recognized as a login event,0.5953,0.6364
6,jmore tohle - neni - login success - mocka,Other event,0.5867,Not recognized as a login event,0.424,0.4557
