In [96]:
import csv
import re
import os
import pandas as pd
from datetime import datetime

In [97]:
# Define the input and output CSV file paths
input_csv_name = "GPN_Syslog_10000.csv"
input_dir = "data"
output_dir = "processed_data"
if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
output_csv_name = input_csv_name[:-4] + "_output" + ".csv"
input_csv_path = os.path.join(input_dir,input_csv_name)
output_csv_path = os.path.join(output_dir,output_csv_name)

In [98]:
pattern1 = r"(?P<vendor>[A-Z]+)-(?P<physical_site_id>\d+)_(?P<geolocation_code>[A-Z\d]+)-(?P<router_type>[A-Z\d]+)_(?P<router_model_number>[A-Z\d]+)-(?P<router_importance>[A-Z\d]+) (?P<vendor_identifier>%%)(?P<version_number>\d+)(?P<module_name>[A-Z]+)/(?P<severity>\d+)/(?P<log_type>[A-Z_]+)(?P<log_status>[\(\)a-z]+):CID=(?P<cid>[0-9a-zA-Z]+);(?P<description>.+)"


In [3]:
def extract_time_info(message):
    # Extract timestamp
    timestamp_str = message.split(' ')[0:4]
    timestamp_str = ' '.join(timestamp_str)
    dt = datetime.strptime(timestamp_str, "%b %d %Y %H:%M:%S")
    year = dt.year
    month = dt.month
    day_of_month = dt.day
    hour = dt.hour
    minute = dt.minute
    second = dt.second
    millisecond = dt.microsecond // 1000
    day_of_week = dt.weekday()  # Monday = 0, Sunday = 6
    day_of_year = dt.timetuple().tm_yday
    week_of_year = dt.isocalendar()[1]
    is_weekend = int(dt.weekday() >= 5)
    seconds_since_midnight = hour * 3600 + minute * 60 + second
    time_of_day_bucket = 'Morning' if hour < 12 else 'Afternoon' if hour < 18 else 'Evening' if hour < 21 else 'Night'
    time_dict = {
        'timestamp': timestamp_str,
        'year': year,
        'month': month,
        'day_of_month': day_of_month,
        'hour': hour,
        'minute': minute,
        'second': second,
        'millisecond': millisecond,
        'day_of_week': day_of_week,
        'day_of_year': day_of_year,
        'week_of_year': week_of_year,
        'is_weekend': is_weekend,
        'seconds_since_midnight': seconds_since_midnight,
        'time_of_day_bucket': time_of_day_bucket
    }
    return time_dict

In [100]:
def remove_time_info_from_message(message):
    # Find the position of the first colon
    first_colon_pos = message.find(':')

    # Find the position of the second colon by starting the search after the first colon
    second_colon_pos = message.find(':', first_colon_pos + 1)

    # Slice the string two characters after the second colon
    truncated_message = message[second_colon_pos + 4:]

    # Print the truncated message
    return truncated_message

In [101]:
def extract_metadata_from_description(description):
    if description:
        opening_bracket_pos = description.find('(')
        if opening_bracket_pos >= 0:
            
            # Slice the string and separate metadata
            metadata = description[opening_bracket_pos + 1:-1]

            
        else:
            print("no metadata available inside the description")
            metadata = "" 
    else:
        print("no description and hence no metadata")
        metadata = ""
        
    return metadata    

In [102]:
def extract_network_info(pattern,message):
    matching = re.search(pattern, message)
    if matching:
        extracted_info = matching.groupdict()
        description = extracted_info.get("description")
        metadata = extract_metadata_from_description(description=description)
        extracted_info["metadata"] = metadata
        return extracted_info
    else:
        # Return empty strings for all expected fields if pattern does not match
        return {
            "vendor": "",
            "physical_site_id": "",
            "geolocation_code": "",
            "router_type": "",
            "router_model_number": "",
            "router_importance": "",
            "vendor_identifier": "",
            "version_number": "",
            "module_name": "",
            "severity": "",
            "log_type": "",
            "log_status": "",
            "cid": "",
            "description": "",
            "metadata": "",
        }   


In [103]:
# Define headers for the new CSV file
headers = [
    'timestamp_of_received_log',
    'timestamp_of_message',
    'year',
    'month',
    'day_of_month',
    'hour',
    'minute',
    'second',
    'millisecond',
    'day_of_week',
    'day_of_year',
    'week_of_year',
    'is_weekend',
    'seconds_since_midnight',
    'time_of_day_bucket',
    'vendor', 
    'physical_site_id', 
    'geolocation_code', 
    'router_type', 
    'router_model_number',
    'router_importance', 
    'vendor_identifier', 
    'version_number', 
    'module_name', 
    'severity',
    'log_type', 
    'log_status', 
    'cid', 
    'description', 
    'metadata']

In [104]:
# Open the input CSV file for reading and the output CSV file for writing
with open(input_csv_path, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_csv_path, mode='w', newline='', encoding='utf-8') as outfile:

    # Create CSV reader and writer
    csv_reader = csv.reader(infile)
    next(csv_reader) # skipping the header row
    csv_writer = csv.DictWriter(outfile, fieldnames=headers)

    # Write the header to the new CSV file
    csv_writer.writeheader()

    # Process each row in the input CSV
    for row in csv_reader:
        print(row)
        if len(row) < 3:
            continue

        
        timestamp_of_received_log = row[0]
        # Extract the message from the third column
        message = row[2]

        time_dict = extract_time_info(message=message)
        final_time_dict = {"timestamp_of_received_log": timestamp_of_received_log} | time_dict
        truncated_message = remove_time_info_from_message(message=message)
        network_info_dict = extract_network_info(pattern=pattern1, message=truncated_message)
        extracted_data = final_time_dict | network_info_dict

        # Write the extracted data to the new CSV file
        
        data_to_write = {
            'timestamp_of_received_log': extracted_data.get('timestamp_of_received_log', ''),
            'timestamp_of_message': extracted_data.get('timestamp', ''),
            'year': extracted_data.get('year', ''),
            'month': extracted_data.get('month', ''),
            'day_of_month': extracted_data.get('day_of_month', ''),
            'hour': extracted_data.get('hour', ''),
            'minute': extracted_data.get('minute', ''),
            'second': extracted_data.get('second', ''),
            'millisecond': extracted_data.get('millisecond', ''),
            'day_of_week': extracted_data.get('day_of_week', ''),
            'day_of_year': extracted_data.get('day_of_year', ''),
            'week_of_year': extracted_data.get('week_of_year', ''),
            'is_weekend': extracted_data.get('is_weekend', ''),
            'seconds_since_midnight': extracted_data.get('seconds_since_midnight', ''),
            'time_of_day_bucket': extracted_data.get('time_of_day_bucket', ''),
            'vendor': extracted_data.get('vendor', ''),
            'physical_site_id': extracted_data.get('physical_site_id', ''),
            'geolocation_code': extracted_data.get('geolocation_code', ''),
            'router_type': extracted_data.get('router_type', ''),
            'router_model_number': extracted_data.get('router_model_number', ''),
            'router_importance': extracted_data.get('router_importance', ''),
            'vendor_identifier': extracted_data.get('vendor_identifier', ''),
            'version_number': extracted_data.get('version_number', ''),
            'module_name': extracted_data.get('module_name', ''),
            'severity': extracted_data.get('severity', ''),
            'log_type': extracted_data.get('log_type', ''),
            'log_status': extracted_data.get('log_status', ''),
            'cid': extracted_data.get('cid', ''),
            'description': extracted_data.get('description', ''),
            'metadata': extracted_data.get('metadata', '')
        }
        csv_writer.writerow(data_to_write)

print("CSV processing complete. Data written to:", output_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\GPN_Syslog.csv'

In [105]:
import json


def standardising_live_message(message):
        message_dict = json.loads(message)
        raw_msg = message_dict.get('payload','')
        if raw_msg != '':
            index = raw_msg.find('>')
            msg_str = raw_msg[index+1:]
            return msg_str
        else:
            return ''   



In [107]:
json_string_live_message = '{"schema":null,"payload":"<188>Sep  3 2024 05:19:17 GPN-0484_GEO-MUK-HOS_AR12-01 %%01IFPDT/4/IF_STATE(l)[133047]:Interface GigabitEthernet0/0/2 has turned into UP state."}'

In [108]:
standard_live_message = standardising_live_message(json_string_live_message)

In [109]:
print(standard_live_message)

Sep  3 2024 05:19:17 GPN-0484_GEO-MUK-HOS_AR12-01 %%01IFPDT/4/IF_STATE(l)[133047]:Interface GigabitEthernet0/0/2 has turned into UP state.


In [110]:
time_dict = extract_time_info(message=standard_live_message)


here
['Sep', '', '3', '2024', '05:19:17', 'GPN-0484_GEO-MUK-HOS_AR12-01', '%%01IFPDT/4/IF_STATE(l)[133047]:Interface', 'GigabitEthernet0/0/2', 'has', 'turned', 'into', 'UP', 'state.']
None
['Sep', '3', '2024', '05:19:17']
Sep 3 2024 05:19:17


In [92]:
d = ['Sep', '', '3', '2024', '05:19:17', 'GPN-0484_GEO-MUK-HOS_AR12-01', '%%01IFPDT/4/IF_STATE(l)[133047]:Interface', 'GigabitEthernet0/0/2', 'has', 'turned', 'into', 'UP', 'state.']


In [93]:

d.remove('')

In [94]:
print(d)

['Sep', '3', '2024', '05:19:17', 'GPN-0484_GEO-MUK-HOS_AR12-01', '%%01IFPDT/4/IF_STATE(l)[133047]:Interface', 'GigabitEthernet0/0/2', 'has', 'turned', 'into', 'UP', 'state.']


In [118]:
from syslog_processing import Processing

In [130]:
obj = Processing()

In [150]:
desc = 'Automatic record:'

In [152]:
obj.extract_unhappy_status_from_description('j')

'1'

In [148]:
desc = 'hellog'

In [153]:
'error' in 'd'

False

In [156]:
descrip = 'd'

In [34]:

def extract_time_info(message):
    # Extract timestamp
    split_message = message.split(' ')
    if split_message[1] == '':
        print('here')
        print(split_message)
        split_message_2 = split_message.remove('')
        print(split_message_2)
    timestamp_str = split_message[0:4]    
    print(timestamp_str)
    timestamp_str = ' '.join(timestamp_str)
    print(timestamp_str)
    try:
        dt = datetime.strptime(timestamp_str, "%b %d %Y %H:%M:%S")
    except Exception as e:
        dt = datetime.strptime('Jan 01 0001 01:01:01', "%b %d %Y %H:%M:%S")  
    print(dt)    
    print(type(dt))    
    year = dt.year
    month = dt.month
    day_of_month = dt.day
    hour = dt.hour
    minute = dt.minute
    second = dt.second
    millisecond = dt.microsecond // 1000
    day_of_week = dt.weekday()  # Monday = 0, Sunday = 6
    day_of_year = dt.timetuple().tm_yday
    week_of_year = dt.isocalendar()[1]
    is_weekend = int(dt.weekday() >= 5)
    seconds_since_midnight = hour * 3600 + minute * 60 + second
    time_of_day_bucket = 'Morning' if hour < 12 else 'Afternoon' if hour < 18 else 'Evening' if hour < 21 else 'Night'
    time_dict = {
        'timestamp': timestamp_str,
        'year': year,
        'month': month,
        'day_of_month': day_of_month,
        'hour': hour,
        'minute': minute,
        'second': second,
        'millisecond': millisecond,
        'day_of_week': day_of_week,
        'day_of_year': day_of_year,
        'week_of_year': week_of_year,
        'is_weekend': is_weekend,
        'seconds_since_midnight': seconds_since_midnight,
        'time_of_day_bucket': time_of_day_bucket
    }
    return time_dict



In [38]:
from datetime import datetime
message = '2024-9-24 10:02:17 GPN-0041_CHA-MAX-HOS_AR12-01 %%01INFO/4/SUPPRESS_LOG(l)[26051]:Last message repeated 3 times.(InfoID=3247640584, ModuleName=LLDP, InfoAlias=BAD_PACKET)'
# message ='Sep 24 2024 10:02:08 GPN-1205_BOI-SEC-SCH_AR12-01 %%01QOS/4/SACL_LOG(l)[10240]:Ipv4 acl 3500,rule 30 permit 17 10.204.183.254(48618)-> 178.62.244.63(53) (1) packets.'


In [39]:
d = extract_time_info(message)

['2024-9-24', '10:02:17', 'GPN-0041_CHA-MAX-HOS_AR12-01', '%%01INFO/4/SUPPRESS_LOG(l)[26051]:Last']
2024-9-24 10:02:17 GPN-0041_CHA-MAX-HOS_AR12-01 %%01INFO/4/SUPPRESS_LOG(l)[26051]:Last
0001-01-01 01:01:01
<class 'datetime.datetime'>


In [40]:
d

{'timestamp': '2024-9-24 10:02:17 GPN-0041_CHA-MAX-HOS_AR12-01 %%01INFO/4/SUPPRESS_LOG(l)[26051]:Last',
 'year': 1,
 'month': 1,
 'day_of_month': 1,
 'hour': 1,
 'minute': 1,
 'second': 1,
 'millisecond': 0,
 'day_of_week': 0,
 'day_of_year': 1,
 'week_of_year': 1,
 'is_weekend': 0,
 'seconds_since_midnight': 3661,
 'time_of_day_bucket': 'Morning'}