<a href="https://colab.research.google.com/github/sravyasambaturu/preprocessing_logs/blob/main/PreProcessing_logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# loguru is pinned to the version the recorded run of this notebook installed.
%pip install loguru==0.7.3 pandas

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3


In [None]:
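# Log preprocessing: read an uploaded log file, strip timestamps and system
# messages, tabulate the remaining lines with a severity label, and re-log them via loguru.
# (Generated from the Colab AI prompt: "modify the above code to give more specific error data".)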

import pandas as pd
from loguru import logger
import re
from google.colab import files
import traceback

# Function to process the error log
def process_log_file(log_file_path):
    """Read a log file and return its cleaned, non-empty lines.

    Every timestamp found in a line is removed, lines that match the
    system-message pattern are dropped, and the remaining text is stripped of
    surrounding whitespace. Lines that end up empty are discarded.

    Args:
        log_file_path: Path of the text log file to read.

    Returns:
        A list of cleaned log lines, or None when the file is missing or
        cannot be read (the failure is reported through ``logger.error``).
    """
    try:
        # utf-8-sig decodes plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise stay glued to the first log message.
        with open(log_file_path, 'r', encoding='utf-8-sig') as file:
            logs = file.readlines()
    except FileNotFoundError:
        logger.error(f"Error: Log file not found at {log_file_path}")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred while reading the log file: {e}")
        logger.error(traceback.format_exc()) # Log detailed traceback
        return None

    # Accepts both "YYYY-MM-DD HH:MM:SS" and ISO-8601 stamps such as
    # "2025-03-03T23:16:34.4743654Z" (optional fractional seconds and "Z").
    timestamp_pattern = re.compile(
        r'\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z\b)?'
    )
    system_message_pattern = re.compile(r'\[INFO\] System.*|System shutting down')

    def clean_log(log):
        # Remove timestamps first so the system-message check sees only the message text.
        log = timestamp_pattern.sub("", log)
        if system_message_pattern.search(log):
            return None
        return log.strip()

    cleaned_logs = [clean_log(log) for log in logs]
    # Drop both filtered-out system messages (None) and lines left empty.
    cleaned_logs = [log for log in cleaned_logs if log]
    return cleaned_logs

def structure_logs(cleaned_logs):
    """Build a DataFrame of log messages with a severity label per line.

    A line is labelled 'ERROR' when it contains an ``[error]`` tag in any
    letter case (so both ``[ERROR]`` and ``##[error]`` count); every other
    line is labelled 'INFO'.

    Args:
        cleaned_logs: List of log line strings, or None.

    Returns:
        A DataFrame with the columns 'Log Message' and 'Severity', or None
        when ``cleaned_logs`` is None.
    """
    if cleaned_logs is None: # Handle the case where process_log_file returned None
        return None
    severities = [
        'ERROR' if '[error]' in log.lower() else 'INFO'
        for log in cleaned_logs
    ]
    log_data = {
        'Log Message': cleaned_logs,
        'Severity': severities,
    }
    return pd.DataFrame(log_data)

def log_with_loguru(cleaned_logs):
    """Re-emit every cleaned log line through loguru.

    Lines are written to the existing loguru sinks and to "cleaned_logs.log"
    (rotated at 1 MB). Lines containing "ERROR", or an ``[error]`` tag in any
    letter case, are logged at error level; all others at info level.

    The file sink is registered only for the duration of this call. Without
    removing it, each call (for example re-running the notebook cell) would
    add another sink for the same file and every line would be written
    multiple times.

    Args:
        cleaned_logs: Iterable of log line strings.
    """
    sink_id = logger.add("cleaned_logs.log", rotation="1 MB")
    try:
        for log in cleaned_logs:
            if "ERROR" in log or "[error]" in log.lower():
                logger.error(log)
            else:
                logger.info(log)
    finally:
        # Always detach the file sink, even if logging a line raised.
        logger.remove(sink_id)

# Upload a log file through the Colab file picker, clean it, then display the
# structured table and re-log the cleaned lines.
uploaded = files.upload()
# If nothing was uploaded the mapping is empty; fall back to None instead of
# letting next() raise StopIteration.
log_file_path = next(iter(uploaded), None)

cleaned_logs = process_log_file(log_file_path) if log_file_path else None

if cleaned_logs: # Check if cleaned logs were successfully retrieved
    df = structure_logs(cleaned_logs)
    if df is not None: # Check if DataFrame creation was successful
        display(df)
        log_with_loguru(cleaned_logs)
else:
    # Make the "nothing happened" case visible instead of finishing silently.
    logger.warning(
        "No log lines to process: nothing was uploaded, the file could not "
        "be read, or every line was filtered out."
    )




Saving new_job_logs.txt to new_job_logs (1).txt


Unnamed: 0,Log Message,Severity
0,﻿2025-03-03T23:16:34.4743654Z Current runner v...,INFO
1,2025-03-03T23:16:34.4769602Z ##[group]Operatin...,INFO
2,2025-03-03T23:16:34.4770487Z Ubuntu,INFO
3,2025-03-03T23:16:34.4771030Z 24.04.2,INFO
4,2025-03-03T23:16:34.4771497Z LTS,INFO
...,...,...
155,2025-03-03T23:16:35.8144554Z [command]/usr/bin...,INFO
156,2025-03-03T23:16:35.8164647Z http.https://gith...,INFO
157,2025-03-03T23:16:35.8177521Z [command]/usr/bin...,INFO
158,2025-03-03T23:16:35.8209097Z [command]/usr/bin...,INFO


[32m2025-03-04 17:42:46.968[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m﻿2025-03-03T23:16:34.4743654Z Current runner version: '2.322.0'[0m
[32m2025-03-04 17:42:46.972[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4769602Z ##[group]Operating System[0m
[32m2025-03-04 17:42:46.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4770487Z Ubuntu[0m
[32m2025-03-04 17:42:46.974[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4771030Z 24.04.2[0m
[32m2025-03-04 17:42:46.975[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4771497Z LTS[0m
[32m2025-03-04 17:42:46.975[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4772021Z ##[endgroup][0m
[32m2025-03-04 17:

In [None]:
# Regular expression to remove ISO-8601 timestamps such as 2025-03-03T23:16:34.4743654Z
timestamp_pattern = r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\b"

# Identify indices where 'ERROR' occurs (any letter case)
error_indices = df[df["Log Message"].str.contains("ERROR", case=False, na=False)].index

# Collect relevant rows (error + the line before it).
# A set keeps each row once: with two consecutive error lines the first one is
# both "an error" and "the line before an error", and a repeated label would
# make df.loc return that row twice.
# NOTE(review): assumes df has the default 0..n-1 integer index — confirm.
rows_to_include = sorted(
    {row for idx in error_indices for row in (idx - 1, idx) if row >= 0}
)

df_filtered = df.loc[rows_to_include].copy()

# Extract error message and error code
def extract_error_details(log_message):
    """Split a log line into its error text and exit code.

    Timestamps matching the module-level ``timestamp_pattern`` are removed
    first. If the line carries a ``##[error]`` marker, only the text after the
    marker is kept; otherwise the whole stripped line is used.

    Args:
        log_message: One raw log line.

    Returns:
        A ``(error_msg, error_code)`` tuple. ``error_code`` is the digits
        following "exit code" (matched case-insensitively) in the error text,
        or "N/A" when no such phrase is present.
    """
    stripped_line = re.sub(timestamp_pattern, "", log_message).strip()

    # Keep only what follows the ##[error] marker, when there is one.
    marker = re.search(r"##\[error\](.+)", stripped_line)
    if marker is None:
        error_msg = stripped_line
    else:
        error_msg = marker.group(1).strip()

    # The exit code is looked up in the extracted message, not the full line.
    exit_code = re.search(r"exit code (\d+)", error_msg, re.IGNORECASE)
    if exit_code is None:
        return error_msg, "N/A"
    return error_msg, exit_code.group(1)

# Apply extraction.
# The tuples are collected in a plain list and assigned column by column. This
# also works when df_filtered is empty (no error rows), where assigning the
# result of .apply(...pd.Series...) to two columns fails with a length mismatch.
error_details = [extract_error_details(message) for message in df_filtered["Log Message"]]
df_filtered["Error Message"] = [message for message, _ in error_details]
df_filtered["Error Code"] = [code for _, code in error_details]

# Select final columns and reset index to remove row numbers
df_final = df_filtered[["Error Message", "Error Code", "Severity"]].reset_index(drop=True)

# Display the table nicely formatted
print(df_final.to_string(index=False))

                                                                                       Error Message Error Code Severity
/home/runner/work/_temp/39ca960c-7e83-451b-8ee6-87466defac22.sh: line 1: test.txt: Permission denied        N/A     INFO
                                                                 Process completed with exit code 1.          1     INFO


In [None]:
# Write the full structured log table (not the filtered error table) to CSV without the index column.
# NOTE(review): the ':' in the file name is not a valid character on Windows file systems — confirm the name is intended.
df.to_csv(path_or_buf='df_12:50PM.csv', index=False)
