In [1]:
import dspy
import polars as pl
import pandas as pd
import time
from dotenv import load_dotenv
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


The goal of this notebook is to try to see which models available in OpenRouter's API (or some other API) could be beneficial for generating explanation labels for the anomalies. The assumption here is that **certain LLM's** are better than others in making inference about (supercomputer generaated) log based data, either due to their **architecture or the data seen during training.**

In [25]:
# Labeling from https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10254958
# Repo: https://github.com/Pranjal-Gupta2/learning-representations-on-logs-for-aiops

# Read with pandas
pd_train = pd.read_excel("./data/fc_public_train_30.xlsx", sheet_name=0)
pd_test = pd.read_excel("./data/fc_public_test.xlsx", sheet_name=0)

# Convert to Polars
#df_train = pl.from_pandas(pd_df)
#df_test = pl.from_pandas(pd_df_test)

print(pd_train)


    Unnamed: 0                                               text       labels
0            0  373746 node-121 action error 1085979750 1 halt...  application
1            1  373800 node-147 action error 1085979770 1 halt...  application
2            2  - 1131576210 2005.11.09 bn549 nov 9 14:43:30 b...  application
3            3  366588 node-198 unix.hw state_change.unavailab...  application
4            4  12-18 18:31:06.771 1795 1808 v activity manage...  application
..         ...                                                ...          ...
92          92  - 1131577116 2005.11.09 cn10 nov 9 14:58:36 cn...        other
93          93  - 1131579082 2005.11.09 dn211 nov 9 15:31:22 d...        other
94          94  - 1131577356 2005.11.09 an532 nov 9 15:02:36 a...        other
95          95  - 1131576074 2005.11.09 cn4 nov 9 14:41:14 cn4...        other
96          96  2015-10-18 18:20:19,151 info [ thread-111] org...        other

[97 rows x 3 columns]


In [None]:
# Access the API key
load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
assert api_key != None

In [None]:
class AnomalyLabeler(dspy.Signature):
    """Label the anomaly based on the text."""
    text: str = dspy.InputField()
    label: Literal['application', 'authentication', 'io', 'memory', 'network', 'other', ] = dspy.OutputField()

In [None]:
def create_examples(df_arg):
    examples = []
    for _, row in df_arg.iterrows():
        content = row['text']
        label = row['labels']
        example = dspy.Example(text=content, label=label).with_inputs("text", "label")
        examples.append(example)

    return examples

examples = create_examples(pd_train)
for ex in examples[:5]:
    print(ex)

In [None]:
metric = (lambda x, y, trace=None: x.label == y.label)

In [7]:
converted_trainset = [ex.with_inputs("text") for ex in create_examples(pd_train)]
converted_testset = [ex.with_inputs("text") for ex in create_examples(pd_test)]

In [8]:
anomaly_labeler = dspy.ChainOfThought(AnomalyLabeler)

In [9]:
class CoT(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = anomaly_labeler

    def forward(self, content):
        return self.prog(content=content)

In [10]:
# Chosen based on Openrouter's models and https://livebench.ai/#/
models = [
        "openrouter/meta-llama/llama-3-8b-instruct:extended",
        "openrouter/microsoft/wizardlm-2-8x22b:nitro",
        "openrouter/deepseek/deepseek-r1:free",
        "openrouter/openai/o1",
        "openrouter/openai/o3-mini-high",
        "openrouter/qwen/qwen-2.5-coder-32b-instruct",
        "openrouter/mistralai/mistral-7b-instruct:free",
        "openrouter/anthropic/claude-3.7-sonnet:thinking",
        "openrouter/aion-labs/aion-1.0",
        "openrouter/google/gemini-2.0-flash-lite-001"
        ]   

In [11]:
model = models[2]
lm = dspy.LM(model, api_key=api_key)
print(lm("hi"))

['Hello! How can I assist you today?']


In [12]:
dspy.configure(lm=lm)

In [13]:
evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 65.00 / 97 (67.0%): 100%|██████████| 97/97 [04:49<00:00,  2.99s/it]2025/03/10 01:10:13 INFO dspy.evaluate.evaluate: Average Metric: 65 / 97 (67.0%)

cost: 0.12780902000000002 dollars


In [37]:
print(score)
print(outputs[0][1].reasoning)

67.01
The error message indicates a connection refusal when trying to connect to the console. Connection refusal (state = refused) typically points to a network-related issue, such as a service not listening on the expected port or network accessibility problems. This aligns with the 'network' category.


Using the CoT-module is quite costly, let's see how regular `Predict` compares.

In [12]:
anomaly_labeler_pred = dspy.Predict(AnomalyLabeler) 

In [39]:
evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler_pred, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 64.00 / 97 (66.0%): 100%|██████████| 97/97 [02:59<00:00,  1.85s/it]2025/03/10 01:22:10 INFO dspy.evaluate.evaluate: Average Metric: 64 / 97 (66.0%)

cost: 0.20056298000000009 dollars


OK seems somewhat the same and slightly cheaper.

In [13]:
results_df = pd.DataFrame(columns=['model', 'score', 'cost', 'runtime'])

for model in models:
    try:
        print(f"\n\nEvaluating model: {model}")
        lm = dspy.LM(model, api_key=api_key)
        dspy.configure(lm=lm)
        
        start_time = time.time()
        
        # Evaluate model
        evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
        score, outputs = evaluate(anomaly_labeler_pred, metric=metric)
        
        runtime = time.time() - start_time
        cost = sum([x['cost'] for x in lm.history if 'cost' in x and x['cost']])
        
        print(f"Score: {score}")
        print(f"Cost: ${cost:.4f}")
        print(f"Runtime: {runtime:.2f} seconds")
        
        results_df = pd.concat([results_df, pd.DataFrame({
            'model': [model],
            'score': [score],
            'cost': [cost],
            'runtime': [runtime]
        })], ignore_index=True)
        
    except Exception as e:
        print(f"Error evaluating model {model}: {str(e)}")
        results_df = pd.concat([results_df, pd.DataFrame({
            'model': [model],
            'score': [None],
            'cost': [None],
            'runtime': [None],
            'error': [str(e)]
        })], ignore_index=True)

# Save results to CSV
results_df.to_csv('model_evaluation_results.csv', index=False)

print("\n\nFinal Evaluation Results:")
print(results_df.sort_values(by='score', ascending=False))



Evaluating model: openrouter/meta-llama/llama-3-8b-instruct:extended
Average Metric: 61.00 / 97 (62.9%): 100%|██████████| 97/97 [00:06<00:00, 15.91it/s]2025/03/10 01:26:55 INFO dspy.evaluate.evaluate: Average Metric: 61 / 97 (62.9%)
  results_df = pd.concat([results_df, pd.DataFrame({

Score: 62.89
Cost: $0.0000
Runtime: 6.19 seconds


Evaluating model: openrouter/microsoft/wizardlm-2-8x22b:nitro
Average Metric: 64.00 / 88 (72.7%):  91%|█████████ | 88/97 [00:11<00:01,  5.95it/s]2025/03/10 01:27:07 ERROR dspy.utils.parallelizer: Error processing item Example({'text': '- 1131579160 2005.11.09 cn6 nov 9 15:32:40 cn6/cn6 kernel: uhci_hcd 0000:00:1d.0: uhci host controller', 'label': 'other'}) (input_keys={'text'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.
Average Metric: 66.00 / 95 (69.5%):  99%|█████████▉| 96/97 [00:15<00:00,  3.38it/s]2025/03/10 01:27:12 ERROR dspy.utils.parallelizer: Error processing item Example({'text': '- 11315771

Missing params for OpenAI + Gemini models, let's fix.

In [15]:
lm = dspy.LM("openrouter/openai/o1", api_key=api_key, temperature=1.0, max_tokens=5000)
dspy.configure(lm=lm)

evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler_pred, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 63.00 / 97 (64.9%): 100%|██████████| 97/97 [00:40<00:00,  2.39it/s]2025/03/10 01:33:52 INFO dspy.evaluate.evaluate: Average Metric: 63 / 97 (64.9%)

cost: 1.6330950000000009 dollars


In [17]:
lm = dspy.LM("openrouter/openai/o3-mini-high", api_key=api_key, temperature=1.0, max_tokens=5000)
dspy.configure(lm=lm)

evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler_pred, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 65.00 / 97 (67.0%): 100%|██████████| 97/97 [01:11<00:00,  1.36it/s]2025/03/10 01:36:09 INFO dspy.evaluate.evaluate: Average Metric: 65 / 97 (67.0%)

cost: 0 dollars


In [18]:
lm = dspy.LM("openrouter/google/gemini-2.0-flash-lite-001", api_key=api_key)
dspy.configure(lm=lm)

evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler_pred, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 55.00 / 97 (56.7%): 100%|██████████| 97/97 [00:08<00:00, 11.09it/s]2025/03/10 01:43:32 INFO dspy.evaluate.evaluate: Average Metric: 55 / 97 (56.7%)

cost: 0 dollars


In [30]:
lm = dspy.LM("openrouter/amazon/nova-micro-v1", api_key=api_key)
dspy.configure(lm=lm)

evaluate = dspy.Evaluate(devset=converted_trainset, num_threads=8, display_progress=True, display_table=False, return_outputs=True)
score, outputs = evaluate(anomaly_labeler_pred, metric=metric)

cost = [x['cost'] for x in lm.history if x['cost']]
print(f"cost: {sum(cost)} dollars")

Average Metric: 70.00 / 97 (72.2%): 100%|██████████| 97/97 [00:00<00:00, 849.74it/s]2025/03/10 02:36:57 INFO dspy.evaluate.evaluate: Average Metric: 70 / 97 (72.2%)

cost: 0 dollars
