In [1]:
import os
import csv
from syslog_processing import Processing

# Helper from the project-local syslog_processing module; the lookup cell
# below calls it to pull one device's log descriptions out of the input CSV.
obj = Processing()
# Alternative inputs kept for reference:
# input_csv_name = "GPN_Syslog_500.csv"
# input_csv_path = "GPN_Syslog_500.csv"
# CSV file handed to the device lookup (relative path).
input_csv_path = 'processed_data/output_batch_1.csv'

STEP 1: Preprocess the Log Data for a particular router.

STEP 1a. Take router info from user

In [2]:
# Collect the identifiers of the router to analyse; they are passed to the
# device lookup in the following cell.
# Surrounding whitespace is stripped so an accidental leading/trailing space
# in the typed value does not end up in the lookup arguments.
physical_site = input("Enter physical site id (eg-0001): ").strip()
print(f"Physical site id chosen= {physical_site}")
geo_code = input("Enter geolocation code (eg-TLK): ").strip().upper()
print(f"Geolocation code chosen= {geo_code}")
device_role = input("Enter device role (eg-CR for core router): ").strip().upper()
print(f"Device role chosen= {device_role}")
device_model_number = input("Enter device model number (eg-M14): ").strip().upper()
print(f"Device model number chosen= {device_model_number}")
# Optional device-importance prompt, currently disabled:
# device_importance = input("Enter device importance (eg-01 for primary router): ")
# print(f"Device importance chosen= {device_importance}")

Physical site id chosen= 0008
Geolocation code chosen= SBA
Device role chosen= CR
Device model number chosen= M14


In [3]:
# Fetch the formatted log descriptions for the router chosen above.
# (An earlier literal example of this call also passed device_importance='01'.)
desc = obj.get_specific_device_with_descriptions_formatted(
    input_csv_path=input_csv_path,
    physical_site_id=physical_site,
    geolocation_code=geo_code,
    device_role=device_role,
    device_model_number=device_model_number,
)

In [4]:
# Show every formatted description, then the number of entries
# (the bare last expression is rendered as the cell's output).
print(desc)
len(desc)

['gpn-0008-sba-cr--m14-01 -> the source ip was unlocked.', 'gpn-0008-sba-cr--m14-01 -> failed to login through snmp.', 'gpn-0008-sba-cr--m14-01 -> the source ip was locked because of the failure of login through snmp.', 'gpn-0008-sba-cr--m14-01 -> the source ip was unlocked.', 'gpn-0008-sba-cr--m14-01 -> the source ip was locked because of the failure of login through snmp.', 'gpn-0008-sba-cr--m14-01 -> the source ip was unlocked.', 'gpn-0008-sba-cr--m14-01 -> log snmp/authenticationfailure is suppressed 1 in last 60 seconds.', 'gpn-0008-sba-cr--m14-01 -> log snmp/snmp_authen_failed is suppressed 1 in last 60 seconds.', 'gpn-0008-sba-cr--m14-01 -> failed to login through snmp.', 'gpn-0008-sba-cr--m14-01 -> the source ip was locked because of the failure of login through snmp.', 'gpn-0008-sba-cr--m14-01 -> the source ip was unlocked.', 'gpn-0008-sba-cr--m14-01 -> component plug in or plug out information.', 'gpn-0008-sba-cr--m14-01 -> component plug in or plug out information.']


13

Step 2: Initialize DistilBERT Model and Tokenizer

In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Initialize the DistilBERT model and tokenizer.
# Both are loaded from the same 'distilbert-base-uncased' checkpoint; they are
# used as module-level globals by get_embeddings below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


Step 3: Function to Generate Log Embeddings

In [6]:
def get_embeddings(logs):
    """Embed log messages with DistilBERT using attention-masked mean pooling.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        logs: List of log message strings. Each message is truncated to at
            most 128 tokens and the batch is padded to its longest member.

    Returns:
        A NumPy array with one row per log message. Each row is the average
        of the last-layer hidden states over that message's real tokens.
    """
    # Tokenize and encode each log message
    inputs = tokenizer(logs, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Get the embeddings from the last hidden layer of DistilBERT
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state

    # Average only over real tokens. A plain mean over dim=1 would also
    # average the padding positions, making a message's embedding depend on
    # how long the longest message in the batch happens to be.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp guards against division by zero for a row with no real tokens
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    embeddings = summed / token_counts
    return embeddings.numpy()


Step 4: Generate Embeddings for All Logs

In [7]:
# Embed every description in one batch; get_embeddings returns a NumPy array
# with one row per entry of `desc`.
log_embeddings = get_embeddings(desc)


Step 5: Normalize the Embeddings (Optional but Recommended)

In [8]:
from sklearn.preprocessing import StandardScaler

# Normalize the embeddings: the scaler is fitted on this same batch and then
# applied to it, so the scaled values are relative to the logs of this device.
scaler = StandardScaler()
log_embeddings_scaled = scaler.fit_transform(log_embeddings)

Step 6: Apply Isolation Forest for Anomaly Detection

In [9]:
from sklearn.ensemble import IsolationForest

# Fit Isolation Forest model.
# contamination=0.05 is the assumed share of anomalous logs; random_state is
# fixed so repeated runs on the same input give the same result.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(log_embeddings_scaled)


Step 7: Predict Anomalies

In [10]:
# Label every log with the fitted model; anomalies are the entries labelled -1
anomaly_labels = iso_forest.predict(log_embeddings_scaled)

# Report only the descriptions the model marked as outliers
for i, log in enumerate(desc):
    if anomaly_labels[i] != -1:
        continue
    print(f"Rare Event Detected: {log}")


Rare Event Detected: gpn-0008-sba-cr--m14-01 -> log snmp/snmp_authen_failed is suppressed 1 in last 60 seconds.


In the log messages, many failures recur (such as SNMP login failures and source-IP lock/unlock and authentication failures). The anomalies detected above are the events that occur rarely in the logs, possibly hinting that they are more important to look at.

In [11]:
# NOTE(review): `log` is the loop variable left over from the previous cell's
# for-loop, so this displays the last entry of `desc`, not a detected anomaly.
log

'gpn-0008-sba-cr--m14-01 -> component plug in or plug out information.'

In [12]:
def get_frequencies(lst):
    """Count how many times each item occurs in ``lst``.

    Returns a dict mapping every distinct item to its number of occurrences,
    with keys in order of first appearance.
    """
    counts = {}
    for entry in lst:
        # Missing keys start from 0, so the first sighting yields 1.
        counts[entry] = counts.get(entry, 0) + 1
    return counts

In [13]:
# Occurrence count of each distinct formatted description for this device.
frequency_dict = get_frequencies(desc)

In [14]:
# Display the description -> count mapping.
frequency_dict

{'gpn-0008-sba-cr--m14-01 -> the source ip was unlocked.': 4,
 'gpn-0008-sba-cr--m14-01 -> failed to login through snmp.': 2,
 'gpn-0008-sba-cr--m14-01 -> the source ip was locked because of the failure of login through snmp.': 3,
 'gpn-0008-sba-cr--m14-01 -> log snmp/authenticationfailure is suppressed 1 in last 60 seconds.': 1,
 'gpn-0008-sba-cr--m14-01 -> log snmp/snmp_authen_failed is suppressed 1 in last 60 seconds.': 1,
 'gpn-0008-sba-cr--m14-01 -> component plug in or plug out information.': 2}

In [15]:
# Build the report table: a header row followed by one row per flagged log.
data = [['DEVICE', 'ANOMALY EVENTS', 'FREQUENCY']]
# Store logs flagged as anomalies
for i, log in enumerate(desc):
    if anomaly_labels[i] == -1:
        # Each description has the form "<device> -> <event>". Split on the
        # first '->' only, so an event text that itself contains '->' stays
        # in one piece instead of breaking the two-way unpacking.
        device, event = log.split('->', 1)
        # Fall back to 1 if the description is missing from the counts.
        data.append([device.strip(), event.strip(), frequency_dict.get(log, 1)])

In [16]:
# Display the report rows (header first) before they are written to disk.
data

[['DEVICE', 'ANOMALY EVENTS', 'FREQUENCY'],
 ['gpn-0008-sba-cr--m14-01',
  'log snmp/snmp_authen_failed is suppressed 1 in last 60 seconds.',
  1]]

In [17]:
# Specify the output file path for the CSV report.
# NOTE(review): a later cell reassigns this same name to the .xlsx path, so
# re-running the CSV-writing cell after that one would write CSV text to the
# .xlsx file.
output_filepath = 'anomaly_detection.csv'

In [18]:
# Dump the report rows to CSV. newline='' is passed so the csv module
# controls the line endings itself.
with open(output_filepath, mode='w', newline='', encoding='utf-8') as report_file:
    csv.writer(report_file).writerows(data)

In [19]:
from openpyxl import Workbook
from openpyxl.styles import PatternFill

In [20]:
# Create a new workbook and select the active worksheet.
# `ws` is the sheet the report rows are written to in the cells below.
wb = Workbook()
ws = wb.active

In [21]:
# Define solid background fills: one for the header row and one shared by
# all data rows (applied when the worksheet is populated below).
header_fill = PatternFill(start_color='FF6666', end_color='FF6666', fill_type='solid')  # Light red
entry_fill = PatternFill(start_color='FFB6C1', end_color='FFB6C1', fill_type='solid')  # Light pink


In [22]:
# Per-column fills for the device, event and frequency columns.
# NOTE(review): none of these three fills is referenced by the cells visible
# in this notebook; data rows are coloured with entry_fill instead.
device_entry_fill = PatternFill(start_color='CCFFCC', end_color='CCFFCC', fill_type='solid')  # Light green
event_entry_fill = PatternFill(start_color='FFA07A', end_color='FFA07A', fill_type='solid')  # Light salmon
frequence_entry_fill = PatternFill(start_color='ADD8E6', end_color='ADD8E6', fill_type='solid')  # Light blue

In [23]:
# Write the table into the worksheet cell by cell; the header row and the
# data rows each get their own background fill.
for row_number, row_data in enumerate(data, start=1):
    # Worksheet rows are 1-based, so row 1 is the header.
    row_fill = header_fill if row_number == 1 else entry_fill
    for column_number, cell_value in enumerate(row_data, start=1):
        cell = ws.cell(row=row_number, column=column_number, value=cell_value)
        cell.fill = row_fill


In [24]:
# Adjust column widths based on the length of the data
for col in ws.columns:
    column = col[0].column_letter  # Get the column letter
    # Length of the longest rendered value in this column. Empty cells (None)
    # are skipped rather than being measured as the 4-character text "None".
    # No blanket try/except: a failure here should surface, not be hidden.
    max_length = max(
        (len(str(cell.value)) for cell in col if cell.value is not None),
        default=0,
    )
    adjusted_width = max_length + 2  # Add some padding
    ws.column_dimensions[column].width = adjusted_width

In [25]:
# Save the workbook to a file.
# NOTE(review): this rebinds `output_filepath`, which earlier held the CSV
# path; a distinct name would keep the CSV-writing cell safe to re-run.
output_filepath = 'anomaly_detection.xlsx'
wb.save(output_filepath)