In [1]:
import csv
import re
import os
from datetime import datetime

In [2]:
# Define the input and output CSV file paths
input_csv_name = "TACACS.csv"
input_dir = "data"
output_dir = "processed_data"
if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
output_csv_name = input_csv_name[:-4] + "_output" + ".csv"
input_csv_path = os.path.join(input_dir,input_csv_name)
output_csv_path = os.path.join(output_dir,output_csv_name)

In [3]:
def extract_time_info(message):
        # Extract timestamp
        split_message = message.split(' ')
        if split_message[1] == '':
            split_message.remove('')
        timestamp_str = split_message[0:3]    
        timestamp_str = ' '.join(timestamp_str)
        dt = datetime.strptime(timestamp_str, "%b %d %H:%M:%S")
        month = dt.month
        day_of_month = dt.day
        hour = dt.hour
        minute = dt.minute
        second = dt.second
        millisecond = dt.microsecond // 1000
        day_of_week = dt.weekday()  # Monday = 0, Sunday = 6
        day_of_year = dt.timetuple().tm_yday
        week_of_year = dt.isocalendar()[1]
        is_weekend = int(dt.weekday() >= 5)
        seconds_since_midnight = hour * 3600 + minute * 60 + second
        time_of_day_bucket = 'Morning' if hour < 12 else 'Afternoon' if hour < 18 else 'Evening' if hour < 21 else 'Night'
        time_dict = {
            'timestamp': timestamp_str,
            'month': month,
            'day_of_month': day_of_month,
            'hour': hour,
            'minute': minute,
            'second': second,
            'millisecond': millisecond,
            'day_of_week': day_of_week,
            'day_of_year': day_of_year,
            'week_of_year': week_of_year,
            'is_weekend': is_weekend,
            'seconds_since_midnight': seconds_since_midnight,
            'time_of_day_bucket': time_of_day_bucket
        }
        return time_dict


In [4]:
def remove_time_info_from_message(message):
        # Find the position of the first colon
        first_colon_pos = message.find(':')

        # Find the position of the second colon by starting the search after the first colon
        second_colon_pos = message.find(':', first_colon_pos + 1)

        # Slice the string two characters after the second colon
        truncated_message = message[second_colon_pos + 5:]

        # Print the truncated message
        return truncated_message

In [5]:

# Sample message
message = "Jul 29 08:06:46  172.31.210.13 nms_admin vty0  172.31.25.125 stop  task_id=2094590 timezone=2  service=shell disc-cause=3  disc-cause-ext=1022 elapsed_time=597"
            
            
time_info = extract_time_info(message=message)



In [6]:
truncated_message = remove_time_info_from_message(message)
message = truncated_message


In [7]:
print(time_info)

{'timestamp': 'Jul 29 08:06:46', 'month': 7, 'day_of_month': 29, 'hour': 8, 'minute': 6, 'second': 46, 'millisecond': 0, 'day_of_week': 6, 'day_of_year': 210, 'week_of_year': 30, 'is_weekend': 1, 'seconds_since_midnight': 29206, 'time_of_day_bucket': 'Morning'}


In [8]:
print(message)

172.31.210.13 nms_admin vty0  172.31.25.125 stop  task_id=2094590 timezone=2  service=shell disc-cause=3  disc-cause-ext=1022 elapsed_time=597


In [9]:
pattern = re.compile(r"""
    ^(?P<source_ip>\d+\.\d+\.\d+\.\d+)\s+
    (?P<user>\w+)\s+
    (?P<terminal>\w+)\s+
    (?P<destination_ip>\d+\.\d+\.\d+\.\d+)\s+stop\s+
    task_id=(?P<task_id>\d+)\s+
    timezone=(?P<timezone>\d+)\s+
    service=(?P<service>\w+)\s+
    disc-cause=(?P<disc_cause>\d+)\s+
    disc-cause-ext=(?P<disc_cause_ext>\d+)\s+
    elapsed_time=(?P<elapsed_time>\d+)
""", re.VERBOSE)


In [10]:
pattern2 = re.compile(r"""
    ^(?P<source_ip>\d+\.\d+\.\d+\.\d+)\s+
    (?P<user>\w+)\s+
    (?P<terminal>\w+)\s+
    (?P<destination_ip>\d+\.\d+\.\d+\.\d+)\s+stop\s+
    (?P<description>.+)""", re.VERBOSE)


In [11]:

# Extract fields using the combined regex pattern
def extract_fields(message):
    match = pattern2.search(message)
    if match:
        return match.groupdict()
    return {}

# Extract the fields
extracted_fields = extract_fields(message)
print(extracted_fields)
# Print the extracted fields
for key, value in extracted_fields.items():
    print(f"{key}: {value}")


{'source_ip': '172.31.210.13', 'user': 'nms_admin', 'terminal': 'vty0', 'destination_ip': '172.31.25.125', 'description': 'task_id=2094590 timezone=2  service=shell disc-cause=3  disc-cause-ext=1022 elapsed_time=597'}
source_ip: 172.31.210.13
user: nms_admin
terminal: vty0
destination_ip: 172.31.25.125
description: task_id=2094590 timezone=2  service=shell disc-cause=3  disc-cause-ext=1022 elapsed_time=597


In [12]:
# NOTE!
# For extracting the known fields and 
# not having to deal with uncertainties like random spaces, 
# misorder of fields, use this approach. 

# Basically, here we just look for what fields we want
# in the message and apart from that anything can be there, 
# and it wont harm us, unlike the case when we match one 
# big pattern for extracting the entire info. 

# It is slightly less optimal computationally but 
# more effective when there is uncertainty about the connectors present
# between useful fields in the raw message.
# For eg: suppose in a particular data source, for 10000 raw messages ingested, 
# in between the useful fields of 'time' and 'hostname', 
# there is 1 space in some messages and there are 2 spaces in rest of messages.


import re

# Sample message
message = '''172.31.210.13  
            nms_admin vty0  172.31.25.125 stop  
            task_id=2094590 timezone=2  
            service=shell disc-cause=3  
            disc-cause-ext=1022 elapsed_time=597'''

# Define regex patterns for each field
patterns = {
    'source_ip': r'^(\d+\.\d+\.\d+\.\d+)',
    'destination_ip': r'\s+(\d+\.\d+\.\d+\.\d+)\s+stop',
    'task_id': r'task_id=(\d+)',
    'timezone': r'timezone=(\d+)',
    'service': r'service=([\w]+)',
    'disc_cause': r'disc-cause=(\d+)',
    'disc_cause_ext': r'disc-cause-ext=(\d+)',
    'elapsed_time': r'elapsed_time=(\d+)'
}

# Extract fields using regex patterns
def extract_fields(message):
    fields = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, message)
        if match:
            fields[key] = match.group(1)
    return fields

# Extract the fields
extracted_fields = extract_fields(message)

# Print the extracted fields
for key, value in extracted_fields.items():
    print(f"{key}: {value}")


source_ip: 172.31.210.13
destination_ip: 172.31.25.125
task_id: 2094590
timezone: 2
service: shell
disc_cause: 3
disc_cause_ext: 1022
elapsed_time: 597
