<a href="https://colab.research.google.com/github/sravyasambaturu/preprocessing_logs/blob/main/PreProcessing_logs_with_LLama_7b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install loguru pandas

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3


In [None]:
# prompt: modify the above code to give more specific error data

import pandas as pd
from loguru import logger
import re
from google.colab import files
import traceback

# Function to process the error log
def process_log_file(log_file_path):
    try:
        with open(log_file_path, 'r') as file:
            logs = file.readlines()
    except FileNotFoundError:
        logger.error(f"Error: Log file not found at {log_file_path}")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred while reading the log file: {e}")
        logger.error(traceback.format_exc()) # Log detailed traceback
        return None

    timestamp_pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
    system_message_pattern = r'\[INFO\] System.*|System shutting down'

    def clean_log(log):
        log = re.sub(timestamp_pattern, "", log)
        if re.search(system_message_pattern, log):
            return None
        return log.strip()

    cleaned_logs = [clean_log(log) for log in logs]
    cleaned_logs = [log for log in cleaned_logs if log]
    return cleaned_logs

def structure_logs(cleaned_logs):
    if cleaned_logs is None: # Handle the case where process_log_file returned None
        return None
    log_data = {
        'Log Message': cleaned_logs,
        'Severity': ['ERROR' if '[ERROR]' in log else 'INFO' for log in cleaned_logs],
    }
    df = pd.DataFrame(log_data)
    return df

def log_with_loguru(cleaned_logs):
    logger.add("cleaned_logs.log", rotation="1 MB")
    for log in cleaned_logs:
        if "ERROR" in log:
            logger.error(log)
        else:
            logger.info(log)

# ... (rest of the code remains the same)

uploaded = files.upload()
log_file_path = next(iter(uploaded))

cleaned_logs = process_log_file(log_file_path)

if cleaned_logs: # Check if cleaned logs were successfully retrieved
    df = structure_logs(cleaned_logs)
    if df is not None: # Check if DataFrame creation was successful
      display(df)
      log_with_loguru(cleaned_logs)




Saving new_job_logs.txt to new_job_logs (1).txt


Unnamed: 0,Log Message,Severity
0,﻿2025-03-03T23:16:34.4743654Z Current runner v...,INFO
1,2025-03-03T23:16:34.4769602Z ##[group]Operatin...,INFO
2,2025-03-03T23:16:34.4770487Z Ubuntu,INFO
3,2025-03-03T23:16:34.4771030Z 24.04.2,INFO
4,2025-03-03T23:16:34.4771497Z LTS,INFO
...,...,...
155,2025-03-03T23:16:35.8144554Z [command]/usr/bin...,INFO
156,2025-03-03T23:16:35.8164647Z http.https://gith...,INFO
157,2025-03-03T23:16:35.8177521Z [command]/usr/bin...,INFO
158,2025-03-03T23:16:35.8209097Z [command]/usr/bin...,INFO


[32m2025-03-04 17:42:46.968[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m﻿2025-03-03T23:16:34.4743654Z Current runner version: '2.322.0'[0m
[32m2025-03-04 17:42:46.972[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4769602Z ##[group]Operating System[0m
[32m2025-03-04 17:42:46.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4770487Z Ubuntu[0m
[32m2025-03-04 17:42:46.974[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4771030Z 24.04.2[0m
[32m2025-03-04 17:42:46.975[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4771497Z LTS[0m
[32m2025-03-04 17:42:46.975[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_with_loguru[0m:[36m51[0m - [1m2025-03-03T23:16:34.4772021Z ##[endgroup][0m
[32m2025-03-04 17:

In [None]:
# Regular expression to remove timestamps
timestamp_pattern = r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\b"

# Identify indices where 'ERROR' occurs
error_indices = df[df["Log Message"].str.contains("ERROR", case=False, na=False)].index

# Collect relevant rows (error + the line before it)
rows_to_include = []
for idx in error_indices:
    if idx > 0:  # Ensure there is a previous line
        rows_to_include.append(idx - 1)
    rows_to_include.append(idx)

df_filtered = df.loc[sorted(rows_to_include)].copy()

# Extract error message and error code
def extract_error_details(log_message):
    log_message = re.sub(timestamp_pattern, "", log_message).strip()

    # Extract meaningful error message
    error_msg_match = re.search(r"##\[error\](.+)", log_message)
    error_msg = error_msg_match.group(1).strip() if error_msg_match else log_message

    # Extract error code if available
    error_code_match = re.search(r"exit code (\d+)", error_msg, re.IGNORECASE)
    error_code = error_code_match.group(1) if error_code_match else "N/A"

    return error_msg, error_code

# Apply extraction
df_filtered[["Error Message", "Error Code"]] = df_filtered["Log Message"].apply(lambda x: pd.Series(extract_error_details(x)))

# Select final columns and reset index to remove row numbers
df_final = df_filtered[["Error Message", "Error Code", "Severity"]].reset_index(drop=True)

# Display the table nicely formatted
print(df_final.to_string(index=False))

                                                                                       Error Message Error Code Severity
/home/runner/work/_temp/39ca960c-7e83-451b-8ee6-87466defac22.sh: line 1: test.txt: Permission denied        N/A     INFO
                                                                 Process completed with exit code 1.          1     INFO


In [None]:
df.to_csv('df_12:50PM.csv', index=False)


In [None]:
!pip install --upgrade langchain langchain-community llama-cpp-python


Collecting langchain
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.7.tar.gz (66.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain)
  Downloading langchain_core-0.3.41-py3-none-any.whl.metadata (5.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.

In [30]:
import pandas as pd
import re
import time
from langchain.llms import LlamaCpp
from langchain.utilities import GoogleSearchAPIWrapper
from google.colab import drive

drive.mount('/content/drive')

model_path = "/content/drive/My Drive/Colab_Notebooks/codellama-7b.Q4_K_M.gguf"

import os
print("File exists:", os.path.exists(model_path))

# Load the model
llm = LlamaCpp(model_path=model_path)

# Function to get AI-generated insights with sanitization
def analyze_logs_with_llm(error_summary):
    prompt = (
        f"Analyze the following error log and suggest possible fixes:\n"
        f"Error Log:\n{error_summary}\n"
        f"Provide a short and clear summary of the issue and possible solutions."
    )

    # Invoke LLM and sanitize output
    raw_response = llm.invoke(prompt)
    cleaned_response = (
        raw_response.replace("```", "")  # Remove code blocks
        .replace("\n", " ")  # Remove excessive newlines
        .strip()  # Trim spaces
    )

    # Limit output length to avoid large responses
    max_length = 300  # Adjust as needed
    return cleaned_response[:max_length] if cleaned_response else "Unable to generate a meaningful summary."

# Processed error logs DataFrame
df_final = df_filtered[["Error Message", "Error Code", "Severity"]].reset_index(drop=True)

# Add a new column for AI-generated insights
df_final["AI Summary"] = ""

# Process each error message
for index, row in df_final.iterrows():
    error_summary = row["Error Message"]

    print(f"🔍 Processing Error: {error_summary}")

    # Get AI-generated analysis
    ai_response = analyze_logs_with_llm(error_summary)

    # Update DataFrame with AI summary
    df_final.at[index, "AI Summary"] = ai_response

    print(f"✅ AI Analysis Completed.\n")

# Display final DataFrame in a readable format
print("\n📊 Final Log Analysis Report:")
print(df_final.to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File exists: True


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /content/drive/My Drive/Colab_Notebooks/codellama-7b.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = codellama_codellama-7b-hf
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:        

🔍 Processing Error: /home/runner/work/_temp/39ca960c-7e83-451b-8ee6-87466defac22.sh: line 1: test.txt: Permission denied


llama_perf_context_print:        load time =   40284.31 ms
llama_perf_context_print: prompt eval time =   40284.11 ms /    88 tokens (  457.77 ms per token,     2.18 tokens per second)
llama_perf_context_print:        eval time =  186440.92 ms /   255 runs   (  731.14 ms per token,     1.37 tokens per second)
llama_perf_context_print:       total time =  227141.46 ms /   343 tokens
Llama.generate: 18 prefix-match hit, remaining 24 prompt tokens to eval


✅ AI Analysis Completed.

🔍 Processing Error: Process completed with exit code 1.


llama_perf_context_print:        load time =   40284.31 ms
llama_perf_context_print: prompt eval time =   10291.97 ms /    24 tokens (  428.83 ms per token,     2.33 tokens per second)
llama_perf_context_print:        eval time =  192465.06 ms /   255 runs   (  754.76 ms per token,     1.32 tokens per second)
llama_perf_context_print:       total time =  203175.53 ms /   279 tokens


✅ AI Analysis Completed.


📊 Final Log Analysis Report:
                                                                                       Error Message Error Code Severity                                                                                                                                                                                                                                                                                                   AI Summary
/home/runner/work/_temp/39ca960c-7e83-451b-8ee6-87466defac22.sh: line 1: test.txt: Permission denied        N/A     INFO Describe the bug in detail and include a stack trace (use `heroku logs --tail` to generate this). - If possible, attach an example file that is causing the problem.  ### Suggested Fixes ### Provide any suggested fixes or other ideas on how to solve the problem. Several suggestions may be provided an
                                                                 Process completed with exit code 1.