<a href="https://colab.research.google.com/github/rahulreva35/Oauth2_20_7_2018/blob/master/AI_Project_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load normalized logs
with open("/content/sample_data/normalized_spring_logs.json", "r") as f:
    logs = json.load(f)

df = pd.DataFrame(logs)

# Extract normalized messages
texts = df["normalized_message"].tolist()

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=500)  # limit features for speed

# Fit and transform
X = vectorizer.fit_transform(texts)

print(f"TF-IDF matrix shape: {X.shape}")  # (num_samples, num_features)

# Example: Show top 10 features
print("Top 10 TF-IDF features:", vectorizer.get_feature_names_out()[:10])



TF-IDF matrix shape: (1001, 31)
Top 10 TF-IDF features: ['api' 'application' 'cache' 'cleared' 'connection' 'context'
 'credentials' 'data' 'database' 'down']


In [None]:
# src/vectorizer.py

import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # for saving vectorizer

def load_normalized_logs(json_path: str) -> pd.DataFrame:
    with open(json_path, "r") as f:
        logs = json.load(f)
    return pd.DataFrame(logs)

def vectorize_logs(df: pd.DataFrame, max_features: int = 500):
    texts = df["normalized_message"].tolist()

    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(texts)

    return X, vectorizer

def save_vectorizer(vectorizer, path: str):
    joblib.dump(vectorizer, path)

# Example usage (can be run from notebook or __main__)
if __name__ == "__main__":
    df = load_normalized_logs("/content/sample_data/normalized_spring_logs.json")
    X, vectorizer = vectorize_logs(df)
    save_vectorizer(vectorizer, "/content/sample_data/models/tfidf_vectorizer.pkl")
    print(f"TF-IDF feature matrix shape: {X.shape}")


TF-IDF feature matrix shape: (1001, 31)


In [None]:
# src/model.py

import pandas as pd
import json
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

def load_data_and_vectorizer(log_path, vectorizer_path):
    with open(log_path, "r") as f:
        logs = json.load(f)
    df = pd.DataFrame(logs)

    vectorizer = joblib.load(vectorizer_path)
    X = vectorizer.transform(df["normalized_message"])

    return df, X

def train_log_level_classifier(X, y):
    model = LogisticRegression(max_iter=200)
    model.fit(X, y)
    return model

def main():
    df, X = load_data_and_vectorizer(
        "/content/sample_data/normalized_spring_logs.json",
        "/content/sample_data/models/tfidf_vectorizer.pkl"
    )

    # Encode log levels (target)
    le = LabelEncoder()
    y = le.fit_transform(df["level"])  # e.g., INFO → 1, ERROR → 0

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train model
    model = train_log_level_classifier(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Save model and label encoder
    joblib.dump(model, "/content/sample_data/models/log_level_classifier.pkl")
    joblib.dump(le, "/content/sample_data/models/label_encoder.pkl")

if __name__ == "__main__":
    main()


              precision    recall  f1-score   support

       DEBUG       0.32      0.44      0.37        50
       ERROR       0.23      0.10      0.14        48
        INFO       0.33      0.40      0.36        55
        WARN       0.30      0.27      0.29        48

    accuracy                           0.31       201
   macro avg       0.29      0.30      0.29       201
weighted avg       0.30      0.31      0.29       201



In [None]:
# run_local_inference.py

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Load models


vectorizer = joblib.load("/content/sample_data/models/tfidf_vectorizer.pkl")
classifier = joblib.load("/content/sample_data/models/log_level_classifier.pkl")
label_encoder = joblib.load("/content/sample_data/models/label_encoder.pkl")

def normalize_message(log_line: str) -> str:
    # Very basic normalization - adjust based on your normalizer
    msg = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}[.,]?\d*', '', log_line)  # remove timestamps
    msg = re.sub(r'\b(INFO|ERROR|WARN|DEBUG|TRACE)\b', '', msg, flags=re.IGNORECASE)  # remove levels
    msg = re.sub(r'\s+', ' ', msg).strip()  # remove extra spaces
    return msg.lower()

def predict_log_level(log_line: str):
    normalized = normalize_message(log_line)
    vectorized = vectorizer.transform([normalized])
    predicted = classifier.predict(vectorized)
    label = label_encoder.inverse_transform(predicted)[0]
    return label

def main():
    test_logs = [
        "2025-05-16 12:00:00,123 INFO Application started successfully.",
        "2025-05-16 12:01:00,456 ERROR Failed to connect to database.",
        "2025-05-16 12:02:00,789 WARN Disk usage at 95%."
    ]

    print("🔍 Testing log level predictions:\n")
    for log in test_logs:
        level = predict_log_level(log)
        print(f"{log} → Predicted Level: {level}")

if __name__ == "__main__":
    main()


🔍 Testing log level predictions:

2025-05-16 12:00:00,123 INFO Application started successfully. → Predicted Level: WARN
2025-05-16 12:01:00,456 ERROR Failed to connect to database. → Predicted Level: ERROR
2025-05-16 12:02:00,789 WARN Disk usage at 95%. → Predicted Level: INFO


In [None]:
# Create a sample log file with 1000+ lines
log_lines = []

levels = ["INFO", "ERROR", "WARN", "DEBUG"]
import random, datetime

for i in range(1000):
    ts = (datetime.datetime(2025, 5, 17, 10, 0) + datetime.timedelta(seconds=i*30)).strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
    level = random.choice(levels)
    message = f"{level} Simulated log message number {i}"
    log_lines.append(f"{ts} {level} {message}")

with open("/content/SampleAIProject/log_files/spring_app.log", "w") as f:
    f.write("\n".join(log_lines))

print("✅ Sample log file created.")


✅ Sample log file created.


In [None]:
import re
import pandas as pd

def parse_log_line(line):
    pattern = r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) (?P<level>\w+) (?P<message>.+)"
    match = re.match(pattern, line)
    if match:
        return match.groupdict()
    return None

parsed_logs = []

with open("/content/SampleAIProject/log_files/spring_app.log", "r") as f:
    for line in f:
        parsed = parse_log_line(line.strip())
        if parsed:
            parsed_logs.append(parsed)

df_logs = pd.DataFrame(parsed_logs)
df_logs.head()


Unnamed: 0,timestamp,level,message
0,"2025-05-17 10:00:00,000",WARN,WARN Simulated log message number 0
1,"2025-05-17 10:00:30,000",ERROR,ERROR Simulated log message number 1
2,"2025-05-17 10:01:00,000",ERROR,ERROR Simulated log message number 2
3,"2025-05-17 10:01:30,000",WARN,WARN Simulated log message number 3
4,"2025-05-17 10:02:00,000",ERROR,ERROR Simulated log message number 4


In [None]:
# Save to JSON and CSV
df_logs.to_json("/content/SampleAIProject/log_files/parsed_logs.json", orient="records", lines=True)
df_logs.to_csv("/content/SampleAIProject/log_files/parsed_logs.csv", index=False)

print("✅ Logs saved as parsed_logs.json and parsed_logs.csv")

✅ Logs saved as parsed_logs.json and parsed_logs.csv


In [None]:
import re

def normalize_message(msg: str) -> str:
    # Remove numbers
    msg = re.sub(r'\d+', '', msg)
    # Remove extra spaces and lowercase
    msg = re.sub(r'\s+', ' ', msg).strip().lower()
    return msg

# Apply normalization to the DataFrame
df_logs["normalized_message"] = df_logs["message"].apply(normalize_message)

# Preview
df_logs[["message", "normalized_message"]].head()



Unnamed: 0,message,normalized_message
0,WARN Simulated log message number 0,warn simulated log message number
1,ERROR Simulated log message number 1,error simulated log message number
2,ERROR Simulated log message number 2,error simulated log message number
3,WARN Simulated log message number 3,warn simulated log message number
4,ERROR Simulated log message number 4,error simulated log message number


In [None]:
df_logs.to_json("/content/SampleAIProject/log_files/normalized_logs.json", orient="records", lines=True)
df_logs.to_csv("/content/SampleAIProject/log_files/normalized_logs.csv", index=False)
print("✅ Normalized logs saved as JSON and CSV.")


✅ Normalized logs saved as JSON and CSV.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load normalized messages
messages = df_logs["normalized_message"]

# Create and fit vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(messages)

print("✅ TF-IDF shape:", X.shape)


✅ TF-IDF shape: (1000, 8)


In [None]:
# Show top 20 feature names
feature_names = vectorizer.get_feature_names_out()
print("Sample terms:", feature_names[:20])

Sample terms: ['debug' 'error' 'info' 'log' 'message' 'number' 'simulated' 'warn']


In [None]:
# Save using joblib
joblib.dump(vectorizer, "/content/SampleAIProject/log_files/tfidf_vectorizer.pkl")
print("✅ TF-IDF vectorizer saved as tfidf_vectorizer.pkl")

✅ TF-IDF vectorizer saved as tfidf_vectorizer.pkl


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Encode log levels to numeric labels
le = LabelEncoder()
y = le.fit_transform(df_logs["level"])  # e.g., INFO → 1, ERROR → 0

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train classifier
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate performance
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Save model and label encoder for later
import joblib
joblib.dump(clf, "/content/SampleAIProject/log_files/log_level_classifier.pkl")
joblib.dump(le, "/content/SampleAIProject/log_files/label_encoder.pkl")

print("✅ Classifier and label encoder saved!")


Classification Report:
              precision    recall  f1-score   support

       DEBUG       1.00      1.00      1.00        47
       ERROR       1.00      1.00      1.00        50
        INFO       1.00      1.00      1.00        47
        WARN       1.00      1.00      1.00        56

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

✅ Classifier and label encoder saved!


In [None]:
import joblib
import re

# Load saved artifacts
vectorizer = joblib.load("/content/SampleAIProject/log_files/tfidf_vectorizer.pkl")
classifier = joblib.load("/content/SampleAIProject/log_files/log_level_classifier.pkl")
label_encoder = joblib.load("/content/SampleAIProject/log_files/label_encoder.pkl")

def normalize_message(msg: str) -> str:
    # Same normalization used in training
    msg = re.sub(r'\d+', '', msg)
    msg = re.sub(r'\s+', ' ', msg).strip().lower()
    return msg

def predict_log_level(log_line: str) -> str:
    normalized = normalize_message(log_line)
    vectorized = vectorizer.transform([normalized])
    pred_label_num = classifier.predict(vectorized)[0]
    pred_label = label_encoder.inverse_transform([pred_label_num])[0]
    return pred_label

# Example usage:
new_logs = [
    "2025-05-17 11:00:00,123 User logged in successfully",
    "2025-05-17 11:01:00,456 Failed to save data",
    "2025-05-17 11:02:00,789 Low disk space"
]

for log in new_logs:
    pred = predict_log_level(log)
    print(f"Log: {log}\nPredicted Level: {pred}\n")


Log: 2025-05-17 11:00:00,123 User logged in successfully
Predicted Level: DEBUG

Log: 2025-05-17 11:01:00,456 Failed to save data
Predicted Level: DEBUG

Log: 2025-05-17 11:02:00,789 Low disk space
Predicted Level: DEBUG



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tqdm
class GPTConfig:
    def __init__(self, vocab_size, block_size, n_layer=4, n_head=4, n_embd=128):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*[
            nn.TransformerEncoderLayer(d_model=config.n_embd, nhead=config.n_head)
            for _ in range(config.n_layer)
        ])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        self.block_size = config.block_size

    def forward(self, idx, targets=None):
        B, T = idx.size()
        token_embeddings = self.token_embedding_table(idx)
        position_embeddings = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = token_embeddings + position_embeddings
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.size()
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return idx


In [None]:
text = "hello how are you doing today? I am a chatbot. hello how are you?"
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

def encode(s):
    return [stoi[c] for c in s]

def decode(l):
    return ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

block_size = 8  # context size

def get_batch():
    start = torch.randint(len(data) - block_size - 1, (1,)).item()
    x = data[start:start+block_size].unsqueeze(0)
    y = data[start+1:start+block_size+1].unsqueeze(0)
    return x, y


In [None]:
mconf = GPTConfig(vocab_size=vocab_size, block_size=block_size)
model = GPT(mconf)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for step in range(500):  # Train for 500 steps
    x, y = get_batch()
    logits, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step}: Loss {loss.item():.4f}")


Step 0: Loss 3.1265
Step 100: Loss 1.5610
Step 200: Loss 1.6271
Step 300: Loss 1.5662
Step 400: Loss 1.1871


In [None]:
context = torch.tensor(encode("hello "), dtype=torch.long).unsqueeze(0)
output = model.generate(context, max_new_tokens=20)[0].tolist()
print(decode(output))


hello arelow at. tbo a dow


In [None]:
text = """hello how are you doing today? I am a chatbot. hello how are you?"""
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"Vocab size: {vocab_size}")

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

def encode(s):
    return [stoi[c] for c in s]

def decode(l):
    return ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)


Vocab size: 21


In [None]:
class GPTConfig:
    def __init__(self, vocab_size, block_size, n_layer=4, n_head=4, n_embd=128):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*[
            nn.TransformerEncoderLayer(d_model=config.n_embd, nhead=config.n_head)
            for _ in range(config.n_layer)
        ])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        self.block_size = config.block_size

    def forward(self, idx, targets=None):
        B, T = idx.size()
        token_embeddings = self.token_embedding_table(idx)
        position_embeddings = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = token_embeddings + position_embeddings
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.size()
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return idx


In [None]:
block_size = 8  # context length

def get_batch():
    start = torch.randint(len(data) - block_size - 1, (1,)).item()
    x = data[start:start+block_size].unsqueeze(0)
    y = data[start+1:start+block_size+1].unsqueeze(0)
    return x, y


In [None]:
mconf = GPTConfig(vocab_size=vocab_size, block_size=block_size)
model = GPT(mconf)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)


In [None]:
for step in range(500):  # you can increase steps later
    x, y = get_batch()
    logits, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step}: Loss {loss.item():.4f}")


Step 0: Loss 3.3160
Step 100: Loss 1.3868
Step 200: Loss 1.2029
Step 300: Loss 1.0713
Step 400: Loss 1.4741


In [None]:
context = torch.tensor(encode("hello "), dtype=torch.long).unsqueeze(0)
output = model.generate(context, max_new_tokens=20)[0].tolist()
print(decode(output))


hello cam ham atbow y? are


In [None]:
torch.save(model.state_dict(), "/content/ChatBot/mini_gpt.pth")

In [None]:
# Recreate model config and model instance
mconf = GPTConfig(vocab_size=vocab_size, block_size=block_size)
model = GPT(mconf)

# Load saved weights
model.load_state_dict(torch.load("/content/ChatBot/mini_gpt.pth"))

# Set model to evaluation mode
model.eval()


GPT(
  (token_embedding_table): Embedding(21, 128)
  (position_embedding_table): Embedding(8, 128)
  (blocks): Sequential(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dr

In [None]:
def generate_text(model, prompt, max_new_tokens=50):
    context = torch.tensor(encode(prompt), dtype=torch.long).unsqueeze(0)  # batch size 1
    generated = model.generate(context, max_new_tokens=max_new_tokens)[0].tolist()
    return decode(generated)

# Example usage:
print(generate_text(model, "hello how are you"))


hello how are you are hot.lllloa he ho at. ay? I chacy? I hou am I 


In [None]:
import re

# Path to your Spring Boot log file
log_file_path = "/content/Sample_Log_AI/spring_boot_logs.log"

log_entry_pattern = re.compile(
    r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+'
    r'(?P<level>[A-Z]+)\s+'
    r'(?P<pid>\d+)\s+---\s+'
    r'\[(?P<thread>[^\]]+)\]\s+'
    r'(?P<logger>[\w\.\$]+)\s+:\s+'
    r'(?P<message>.*)'
)

logs = []
current_log = None

with open(log_file_path, 'r') as f:
    for line in f:
        line = line.rstrip('\n')
        match = log_entry_pattern.match(line)
        if match:
            # Save previous log entry if exists
            if current_log:
                logs.append(current_log)
            # Start a new log entry
            current_log = match.groupdict()
            current_log['stacktrace'] = []
        else:
            # Line is part of a stacktrace or multiline message
            if current_log is not None:
                current_log['stacktrace'].append(line)

# Append the last log entry
if current_log:
    logs.append(current_log)

# Clean stacktrace field
for log in logs:
    if log['stacktrace']:
        log['stacktrace'] = "\n".join(log['stacktrace']).strip()
    else:
        log['stacktrace'] = None

# Example: print parsed logs
for log in logs:
    print(f"{log['timestamp']} [{log['level']}] {log['logger']}: {log['message']}")
    if log['stacktrace']:
        print(f"Stacktrace:\n{log['stacktrace']}")
    print("-----")


In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_log_entry(log):
    text = log['message']
    if log['stacktrace']:
        text += "\nStacktrace:\n" + log['stacktrace']
    embedding = model.encode(text)
    return embedding

# Embed all logs
for log in logs:
    log['embedding'] = embed_log_entry(log)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import faiss
import numpy as np

dimension = len(logs[0]['embedding'])  # e.g. 384 for all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

embedding_matrix = np.array([log['embedding'] for log in logs]).astype('float32')
index.add(embedding_matrix)


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# --- 1. Load the embedding model ---
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- 2. Assume 'logs' list already has embeddings and FAISS index is built ---

# Example logs list with 'embedding' key:
# logs = [
#     {'timestamp': '2025-05-18 10:00:00', 'level': 'INFO', 'message': 'Log message number 0', 'embedding': [...], ...},
#     ...
# ]

# And the FAISS index built as:
# dimension = len(logs[0]['embedding'])
# index = faiss.IndexFlatL2(dimension)
# embedding_matrix = np.array([log['embedding'] for log in logs]).astype('float32')
# index.add(embedding_matrix)


def search_logs(query: str, model, index, logs, top_k=5):
    """
    Search logs for the top_k most semantically similar entries to the query.

    Args:
      query (str): User question or search string.
      model: SentenceTransformer model.
      index: FAISS index containing log embeddings.
      logs (list): List of log dicts with embeddings.
      top_k (int): Number of results to return.

    Returns:
      List of top_k logs most relevant to query.
    """
    # 1. Embed the query text
    query_embedding = model.encode(query)

    # 2. Convert to numpy array and correct dtype for FAISS
    query_vector = np.array([query_embedding]).astype('float32')

    # 3. Search the FAISS index for top_k closest logs
    distances, indices = index.search(query_vector, top_k)

    # 4. Retrieve logs by index
    results = [logs[i] for i in indices[0]]

    return results


# --- Usage example ---

user_query = "only Null Pointers"

top_logs = search_logs(user_query, model, index, logs, top_k=3)

print(f"Top {len(top_logs)} log entries matching the query:\n")

for log in top_logs:
    print(f"{log['timestamp']} [{log['level']}] {log['message']}")
    if log['stacktrace']:
        print(f"Stacktrace:\n{log['stacktrace']}")
    print("-" * 40)


Top 3 log entries matching the query:

2025-05-18 10:06:13.000 [ERROR] Log message number 373
Stacktrace:
java.lang.NullPointerException: Cannot read field "name" because "user" is null
    at com.example.controller.UserController.getUser(UserController.java:45)
    at com.example.service.UserService.findUserById(UserService.java:23)
----------------------------------------
2025-05-18 10:09:41.000 [ERROR] Log message number 581
Stacktrace:
java.lang.NullPointerException: Cannot read field "name" because "user" is null
    at com.example.controller.UserController.getUser(UserController.java:45)
    at com.example.service.UserService.findUserById(UserService.java:23)
----------------------------------------
2025-05-18 10:00:03.000 [ERROR] Log message number 3
Stacktrace:
java.lang.NullPointerException: Cannot read field "name" because "user" is null
    at com.example.controller.UserController.getUser(UserController.java:45)
    at com.example.service.UserService.findUserById(UserService

In [None]:
!python /content/Sample_Log_AI/src/add_logs.py /content/Sample_Log_AI/spring_boot_logs.log

2025-05-18 15:34:01.171625: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747582441.196652   21191 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747582441.204234   21191 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Parsing /content/Sample_Log_AI/spring_boot_logs.log...
✅ Added 1000 new logs. Total: 1000


In [None]:
!python /content/Sample_Log_AI/src/chatbot.py

2025-05-18 15:41:13.370526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747582873.410331   22945 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747582873.422456   22945 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
🤖 LogBot: Ask me about your Spring Boot logs! (type 'exit' to quit)
You: show me nullpointers

🔍 Top Logs:
[2025-05-18 10:06:13.000] ERROR - Log message number 373
Stacktrace:
java.lang.NullPointerException: Cannot read field "name" because "user" is null
    at com.example.controller.UserController.getUser(UserController.java:45)
    at com.example.service.UserService.findUserById(UserService.java:23)
-------------------------------

In [None]:
import re
def parse_spring_boot_logs(log_file_path):
    logs = []
    current_log = None
    stacktrace_lines = []

    # Looser regex pattern with optional spacing and better logger matching
    log_line_re = re.compile(
        r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+(\w+)\s+\d+\s+---\s+\[.*?\]\s+(.*?)\s+:\s+(.*)$'
    )

    with open(log_file_path, 'r') as f:
        for line in f:
            print("asdads\n")
            line = line.rstrip()
            match = log_line_re.match(line)

            if match:
                # Save the previous log entry with its stacktrace if any
                if current_log and stacktrace_lines:
                    print('Inside One')
                    current_log['stacktrace'] = '\n'.join(stacktrace_lines)
                    exception_data = extract_exception(stacktrace_lines)
                    if exception_data:
                        current_log['exception'] = exception_data
                    stacktrace_lines = []
                print('Inside two')
                timestamp, level, logger, message = match.groups()
                current_log = {
                    'timestamp': timestamp,
                    'level': level,
                    'logger': logger.strip(),
                    'message': message,
                    'stacktrace': None,
                    'exception': None,
                    'component': extract_component(logger)
                }
                logs.append(current_log)
            else:
                print('Inside three')
                if current_log:
                    stacktrace_lines.append(line)

        # Handle final log entry
        if current_log and stacktrace_lines:
            print('Inside four')
            current_log['stacktrace'] = '\n'.join(stacktrace_lines)
            exception_data = extract_exception(stacktrace_lines)
            if exception_data:
                current_log['exception'] = exception_data

    return logs


def extract_component(logger):
    logger_lower = logger.lower()
    if 'controller' in logger_lower:
        return 'controller'
    elif 'service' in logger_lower:
        return 'service'
    elif 'repository' in logger_lower:
        return 'repository'
    return 'other'


def extract_exception(stacktrace_lines):
    print("I am here")
    exception_pattern = re.compile(r'([a-zA-Z0-9_.]+(?:Exception|Error))\s*:\s*(.*)')
    for line in stacktrace_lines:
        match = exception_pattern.match(line.strip())
        if match:
            return {
                "type": match.group(1),
                "message": match.group(2)
            }
    return None
if __name__ == "__main__":
    test = parse_spring_boot_logs("/content/Sample_Log_AI/spring_boot_logs.log")
    print(test)


In [93]:
!python Sample_Log_AI/src/add_logs.py Sample_Log_AI/spring_boot_logs.log
!python Sample_Log_AI/src/add_logs.py Sample_Log_AI/sample_500_logs.log

2025-05-18 18:57:23.213235: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747594643.255697   70386 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747594643.268529   70386 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Parsing Sample_Log_AI/spring_boot_logs.log...
new logs 
[{'timestamp': '2025-05-18 10:00:00.000', 'level': 'INFO', 'logger': 'com.example.repository.UserRepository', 'message': 'Log message number 0', 'stacktrace': None, 'exception': None, 'component': 'repository'}, {'timestamp': '2025-05-18 10:00:01.000', 'level': 'TRACE', 'logger': 'com.example.service.UserService', 'message': 'Log message number 1', 'stacktrace': None, 'exception

In [124]:
!python /content/Sample_Log_AI/src/chatbot.py

2025-05-18 20:10:43.687147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747599043.752469   88312 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747599043.771255   88312 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Device set to use cpu
🤖 LogBot: Ask me about your Spring Boot logs! (type 'exit' to quit)
You: get me count of controller?
👤 User typed : get me count of controller?
💬 Detected intent: count_controllers (confidence: 0.55)
🧠 Controller logs count: 270
You: explain unknown exceptions
👤 User typed : explain unknown exceptions
💬 Detected intent: summary (confidence: 0.35)
📦 Total logs: 1500
🧠 Controller calls: 270
❌ Error logs: 356
⚠️ Ex