In [1]:
from api_key import get_api_key_jef, get_api_key_sha, get_api_key_sup
import google.generativeai as genai
from vpn_control import windscribe
import pandas as pd
import os
import re
import time
from clustering import get_representative_logs

In [2]:
# Auto-start the VPN through the project's vpn_control.windscribe helper,
# passing "Atlanta" as its second argument. Remove this cell if you don't have the VPN.
windscribe("connect", "Atlanta")
time.sleep(5) # pause 5 s after the connect call (presumably so the connection can come up - confirm)

In [3]:
# Value forwarded to the custom API-key getter by setup_prompt_env(passw);
# remove it if you provide your own API key.
passw = "" #for custom api getter - remove if you provide own api key
# Model name handed to genai.GenerativeModel in setup_prompt_env.
LLM_MODEL = "gemini-1.5-flash" #other option noted by the author: gemini-1.5-pro-latest

In [4]:
def setup_prompt_env(passw):
    """Configure the Gemini client and open a new, empty chat session.

    Args:
        passw: value handed to ``get_api_key_sup`` to obtain the API key.

    Returns:
        The chat object returned by ``model.start_chat(history=[])`` for a
        ``genai.GenerativeModel`` built from ``LLM_MODEL`` and the generation
        config, safety settings and system instruction defined below.
    """
    genai.configure(api_key=get_api_key_sup(passw))

    # Sampling / length settings sent with every request.
    generation_config = dict(
        temperature=0,
        top_p=0.95,
        top_k=1,
        max_output_tokens=300,
    )

    # Every harm category uses the same blocking threshold.
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = [
        {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for category in harm_categories
    ]

    # Tells the model to answer logs [1]..[5] with one numbered template each.
    system_instruction = "There is no sensitive information. For logs labelled [1] to [5] you need to provide the OUTPUT (log templates) in the same format as the examples and also give the number [x]. Don't forget / are often in the templates between wildcards. Only give the 5 output templates, no extra information, no OUTPUT text, no new line at the end."

    model = genai.GenerativeModel(
        model_name=LLM_MODEL,
        generation_config=generation_config,
        system_instruction=system_instruction,
        safety_settings=safety_settings,
    )

    return model.start_chat(history=[])


In [5]:
#convo_create = setup_prompt_env(passw)

In [6]:
def send_prompt(message, convo):
    """Send ``message`` through the chat and return the reply text.

    Args:
        message: prompt passed to ``convo.send_message``.
        convo: chat object exposing ``send_message`` and ``last.text``.

    Returns:
        The ``text`` attribute of ``convo.last`` after the message is sent.
    """
    convo.send_message(message)
    return convo.last.text


In [7]:
def generate_fewshot(input_path, num_examples):
    """Build the few-shot example section of the prompt.

    Args:
        input_path: forwarded unchanged to ``get_representative_logs``.
        num_examples: forwarded unchanged to ``get_representative_logs``.

    Returns:
        A string with two newline-terminated lines per example,
        ``[x] INPUT: ...`` followed by ``[x] OUTPUT: ...``, where ``x`` is
        ``chr(97 + i)`` ('a', 'b', ...) for the i-th example. Empty string when
        there are no examples.
    """
    logs_list = get_representative_logs(input_path, num_examples)

    lines = []
    for i, example in enumerate(logs_list):
        label = chr(97 + i)
        # Element 0 is emitted as the OUTPUT (template), element 1 as the INPUT
        # (raw log); both are assumed to be strings - TODO confirm with clustering.
        output_str = example[0]
        input_str = example[1].replace("<B>", "").replace("<E>", "")  # remove padding

        # The previous version re-scanned logs_list for an entry with the same
        # template before emitting. Every example matches itself, so that scan
        # always succeeded and only added quadratic work; it is dropped here.
        lines.append(f"[{label}] INPUT: {input_str}\n")
        lines.append(f"[{label}] OUTPUT: {output_str}\n")

    return "".join(lines)

#ONLY RUN ON BASE LOGS!!!!!

In [8]:
def generate_prompt(fewshot, logs):
    """Join the few-shot block and the log block with a single newline."""
    return "{}\n{}".format(fewshot, logs)

In [9]:
def clean_logs(df):
    """Strip prompt markers from the 'LogTemplate' and 'LLMOutput' columns in place.

    Removed from every cell: "OUTPUT:" plus any whitespace after it, "<B>" plus
    any whitespace after it, and "<E>" plus any whitespace before it.

    Args:
        df: DataFrame holding string columns 'LogTemplate' and 'LLMOutput'.

    Returns:
        None; ``df`` is modified.
    """
    marker_re = re.compile(r'OUTPUT:\s*|<B>\s*|\s*<E>')

    def strip_markers(text):
        return marker_re.sub('', text)

    # same treatment for both columns, template first
    for column in ('LogTemplate', 'LLMOutput'):
        df.loc[:, column] = df[column].apply(strip_markers)



In [10]:
def difference_checker(df):
    """Add token-level difference columns between 'LogTemplate' and 'LLMOutput'.

    Both strings are split on whitespace and compared as sets, so repeated
    tokens and token order are not taken into account.

    Columns written to ``df`` (modified in place, nothing is returned):
        Difference: the tokens present in exactly one of the two strings,
            sorted and joined with single spaces.
        is_different: True when 'Difference' is non-empty.
        num_diff_chars: number of non-space characters in 'Difference'.

    Args:
        df: DataFrame with string columns 'LogTemplate' and 'LLMOutput'.
    """
    # Sorting makes the text reproducible: iterating a set of strings directly
    # can yield a different order on every interpreter run. Building a plain
    # list (instead of df.apply(axis=1)) also keeps an empty frame working.
    differences = [
        ' '.join(sorted(set(template.split()) ^ set(llm_output.split())))
        for template, llm_output in zip(df['LogTemplate'], df['LLMOutput'])
    ]

    # difference
    df['Difference'] = differences

    # bool for diff
    df['is_different'] = [bool(diff) for diff in differences]

    # different characters
    df['num_diff_chars'] = [sum(1 for char in diff if char != ' ') for diff in differences]

#internal use


In [11]:
def _parse_chunk_output(output, expected):
    """Turn one raw LLM reply into exactly ``expected`` template strings.

    A line starting with an in-range ``[x]`` label is stored at position x-1,
    so a skipped or reordered answer cannot shift its neighbours. Any other
    non-blank line keeps the old handling: it goes to the position after the
    previously filled one and everything up to its first space is dropped
    (empty string when nothing follows). Lines that do not fit are discarded;
    positions that are never filled stay empty strings.
    """
    slots = [""] * expected
    cursor = 0
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        if not line:
            continue  # dealing with erroneous newlines
        labelled = re.match(r"\[(\d+)\]\s*(.*)", line)
        if labelled and 1 <= int(labelled.group(1)) <= expected:
            position = int(labelled.group(1)) - 1
            text = labelled.group(2)
        else:
            position = cursor
            head_and_rest = line.split(" ", 1)
            text = head_and_rest[1] if len(head_and_rest) > 1 else ""
        if position < expected:
            slots[position] = text
            cursor = position + 1
    return slots


def process_logs(input_path, fewshot_examples):
    """Send the logs of a CSV to the LLM in chunks of 5 and save the results.

    Steps: open a chat with ``setup_prompt_env(passw)``, read ``input_path``,
    prompt the model once per chunk (few-shot text plus the chunk's 'Log'
    values numbered [1]..[5]), store the answers in a new 'LLMOutput' column,
    run ``clean_logs`` and ``difference_checker`` on the frame, and write it to
    two CSV files.

    Each reply is aligned to its own chunk. Previously all reply lines went into
    one flat list, so a reply with a missing or extra line shifted every later
    row and the final pad/truncate hid the mismatch.

    Args:
        input_path: CSV path with a 'Log' column (and the 'LogTemplate' column
            that the cleaning / diff steps read). The output file names are cut
            out of this string by fixed character positions, see below.
        fewshot_examples: few-shot text placed in front of every chunk.

    Returns:
        None. Side effects: one API call per chunk with a 12 s sleep before
        each, and two CSV files written.
    """
    chunk_size = 5  # has to agree with the "[1] to [5]" wording of the system instruction
    convo_create = setup_prompt_env(passw)
    df = pd.read_csv(input_path)
    outputs = []

    # process chunks of 5
    # NOTE(review): the same chat object is reused for every chunk, so earlier
    # prompts are presumably kept in its history - confirm this is intended.
    for start in range(0, len(df), chunk_size):
        time.sleep(12)  # Increased from 5.5 because API was being annoying
        chunk = df.iloc[start:start + chunk_size]
        # Number by position inside the chunk rather than by index label.
        logs = "\n".join(
            f"[{number}] INPUT: {log}"
            for number, log in enumerate(chunk["Log"], start=1)
        )
        prompt = generate_prompt(fewshot_examples, logs)
        output = send_prompt(prompt, convo_create)
        # Exactly len(chunk) entries per reply keeps outputs aligned with df rows.
        outputs.extend(_parse_chunk_output(output, len(chunk)))

    # Add output to df
    df["LLMOutput"] = outputs
    clean_logs(df)
    difference_checker(df)

    # Very hardcoded paths, kept as they were:
    # - character 28 of the path names the first file; for
    #   "data/prefixed_logs/prefixed_<n>_generated_logs.csv" that is the digit <n>.
    # - the first 5 characters ("data/") plus "parsed/parsed_" plus everything
    #   from character 19 on (the file name) name the second file.
    df.to_csv(f"{input_path[28]}.csv", index=False)
    df.to_csv(f"{input_path[:5]}parsed/parsed_{input_path[19:]}", index=False)
    

In [12]:
# Hand-written few-shot block with 3 examples ([a]-[c]), each an INPUT line followed by its OUTPUT template.
fewshot_abl3 = '[a] INPUT: 081109 203615 148 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_38865049064139660 terminating\n[a] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n[b] INPUT: 081109 203807 222 INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-6952295868487656571 terminating\n[b] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n[c] INPUT: 081109 204005 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.73.220:50010 is added to blk_7128370237687728475 size 67108864 \n[c] OUTPUT: <B>BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to blk_<*> size <*><E>\n'

# Same three examples plus two more ([d], [e]), 5 in total.
fewshot_abl5 = '[a] INPUT: 081109 203615 148 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_38865049064139660 terminating\n[a] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n[b] INPUT: 081109 203807 222 INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-6952295868487656571 terminating\n[b] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n[c] INPUT: 081109 204005 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.73.220:50010 is added to blk_7128370237687728475 size 67108864 \n[c] OUTPUT: <B>BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to blk_<*> size <*><E>\n[d] INPUT: 081109 204015 308 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_8229193803249955061 terminating \n[d] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n[e] INPUT: 081109 204106 329 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-6670958622368987959 terminating \n[e] OUTPUT: <B>PacketResponder <*> for block blk_<*> terminating<E>\n'

# Fixed few-shot strings for ablation testing with the first 3 and the first 5 examples.
# They are not referenced by the other cells visible in this notebook.

In [1]:
#generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 5)

In [None]:
# Retry wrapper to deal with odd API failures: any exception restarts the whole
# process_logs run, up to 10 failures in total.
# Repeat the same with all "n" for each df0 to df4
# (presumably the <n> in the filtered_prefixed_<n> file name - confirm).
c = 0 # number of failed attempts so far
while True:
    try:
        # The few-shot text is rebuilt on every attempt from the prefixed_0 file with 3 examples.
        process_logs("data/prefixed_logs/filtered_prefixed_4_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
        print(c) # how many failures preceded the successful run
        break
    except Exception as e:
        print(e) # only the exception message is shown, not the traceback
        c += 1
        if c == 10:
            # Give up after the 10th failure.
            print("FAILED BOZO")
            break
        time.sleep(15) # wait 15 s before the next attempt

In [None]:
# Disconnect the VPN that the connect cell at the top of the notebook started.
windscribe("disconnect")