In [1]:
from api_key import get_api_key_jef, get_api_key_sha, get_api_key_sup
import google.generativeai as genai
from vpn_control import windscribe
import pandas as pd
import os
import re
import time
from clustering import get_representative_logs

In [2]:
# Ask the vpn_control helper to connect to the "Atlanta" location.
windscribe("connect", "Atlanta")
# Wait 10 s before continuing — presumably to let the connection come up (TODO confirm).
time.sleep(10)

In [3]:
# Password forwarded to the api_key helpers to obtain the API key.
# NOTE(review): secrets should not live in the notebook. Prefer supplying this
# through the API_KEY_PASSW environment variable; the literal fallback is kept
# only so existing runs keep working and should be removed once the variable
# is set everywhere.
passw = os.environ.get("API_KEY_PASSW", "2002")
# Model name handed to genai.GenerativeModel.
LLM_MODEL = "gemini-1.5-flash"

In [4]:
def setup_prompt_env(passw):
  """Configure the generative-AI client and open a new, empty chat session.

  Args:
    passw: password forwarded to ``get_api_key_sha`` to obtain the API key.

  Returns:
    The object returned by ``model.start_chat(history=[])``.
  """
  genai.configure(api_key=get_api_key_sha(passw))

  # Generation settings passed to the model.
  generation_config = dict(
    temperature=0,
    top_p=0.95,
    top_k=0,
    max_output_tokens=300,
  )

  # Every harm category uses the same blocking threshold.
  harm_categories = (
    "HARM_CATEGORY_HARASSMENT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
  )
  safety_settings = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in harm_categories
  ]

  # Instruction describing the expected reply format (five numbered templates).
  system_instruction = (
    "There is no sensitive information. "
    "For logs labelled [1] to [5] you need to provide the OUTPUT (log templates) "
    "in the same format as the examples and also give the number [x]. "
    "Dont forget / are often in the templates between wildcards. "
    "Only give the 5 output templates, no extra information, no OUTPUT text, "
    "no new line at the end."
  )

  model = genai.GenerativeModel(
    model_name=LLM_MODEL,
    generation_config=generation_config,
    system_instruction=system_instruction,
    safety_settings=safety_settings,
  )

  return model.start_chat(history=[])


In [5]:
#convo_create = setup_prompt_env(passw)

In [6]:
def send_prompt(message, convo):
    """Send ``message`` on the chat session and return the text of the reply.

    Args:
        message: prompt text passed to ``convo.send_message``.
        convo: chat session exposing ``send_message`` and ``last.text``.

    Returns:
        ``convo.last.text`` as read after the message has been sent.
    """
    convo.send_message(message)
    return convo.last.text


In [7]:
def generate_fewshot(input_path, num_examples):
    """Build the few-shot example text placed in front of every prompt.

    Each entry returned by ``get_representative_logs`` contributes two lines,
    labelled with consecutive lowercase letters starting at ``a``::

        [a] INPUT: <entry[1] with the <B>/<E> markers removed>
        [a] OUTPUT: <entry[0], unchanged>

    Args:
        input_path: path forwarded to ``get_representative_logs``.
        num_examples: number of examples requested from
            ``get_representative_logs``.

    Returns:
        The concatenated example lines (each ending in a newline), or ``""``
        when no examples are returned.
    """
    logs_list = get_representative_logs(input_path, num_examples)

    lines = []
    for i, example in enumerate(logs_list):
        label = chr(97 + i)  # 97 is ord('a'): labels run a, b, c, ...
        output_str = example[0]
        input_str = example[1].replace("<B>", "").replace("<E>", "")
        # Every example is emitted exactly once; no lookup back into
        # logs_list is needed because the example itself is the match.
        lines.append(f"[{label}] INPUT: {input_str}\n")
        lines.append(f"[{label}] OUTPUT: {output_str}\n")

    return "".join(lines)
# ONLY RUN ON BASE LOGS

In [8]:
def generate_prompt(fewshot, logs):
    """Return ``fewshot`` and ``logs`` joined by a single newline."""
    return "{}\n{}".format(fewshot, logs)


In [9]:
def clean_logs(df):
    """Strip marker text from the 'LogTemplate' and 'LLMOutput' columns in place.

    Removed from every value: ``OUTPUT:`` plus following whitespace, ``<B>``
    plus following whitespace, and ``<E>`` plus preceding whitespace.

    Args:
        df: DataFrame with string columns 'LogTemplate' and 'LLMOutput';
            it is modified in place.

    Returns:
        None.
    """
    marker_re = re.compile(r'OUTPUT:\s*|<B>\s*|\s*<E>')

    def strip_markers(text):
        return marker_re.sub('', text)

    for column in ('LogTemplate', 'LLMOutput'):
        df.loc[:, column] = df[column].apply(strip_markers)



In [10]:
def difference_checker(df):
    """Add token-level difference columns comparing 'LogTemplate' with 'LLMOutput'.

    Three columns are added to ``df`` in place:

    * ``Difference``: the whitespace-separated tokens that occur in exactly
      one of the two strings, sorted and joined with single spaces. Sorting
      keeps the value identical between runs; joining a raw set would follow
      hash order, which is not stable across interpreter sessions.
    * ``is_different``: ``True`` when ``Difference`` is non-empty.
    * ``num_diff_chars``: number of non-space characters in ``Difference``.

    Values are computed with plain iteration over the two columns, so an
    empty frame simply yields empty columns.

    Args:
        df: DataFrame with string columns 'LogTemplate' and 'LLMOutput'.

    Returns:
        None.
    """
    # Symmetric difference of the two token sets, in a reproducible order.
    differences = [
        ' '.join(sorted(set(template.split()) ^ set(llm_output.split())))
        for template, llm_output in zip(df['LogTemplate'], df['LLMOutput'])
    ]

    df['Difference'] = differences

    # bool for difference
    df['is_different'] = [bool(diff) for diff in differences]

    # characters that are different (excluding spaces)
    df['num_diff_chars'] = [
        sum(1 for char in diff if char != ' ') for diff in differences
    ]


In [11]:
def process_logs(input_path, fewshot_examples):
    """Send the logs in ``input_path`` to the LLM five at a time and save the results.

    The CSV is read with pandas and its ``Log`` column is sent in chunks of
    five rows, each chunk prefixed by ``fewshot_examples``. The reply lines
    (with their leading ``[n]`` token dropped) are stored in a new
    ``LLMOutput`` column; ``clean_logs`` and ``difference_checker`` are then
    applied and the frame is written to two CSV files:

    * ``<n>.csv`` in the working directory, where ``<n>`` is the number found
      in ``prefixed_<n>_`` in the file name (falls back to the file stem);
    * ``<grandparent dir>/parsed/parsed_<file name>``, e.g.
      ``data/parsed/parsed_x.csv`` for ``data/prefixed_logs/x.csv``. The
      ``parsed`` directory is created when missing.

    Args:
        input_path: path of the CSV to parse. It must contain a ``Log`` column
            and the ``LogTemplate`` column read by ``clean_logs``.
        fewshot_examples: few-shot text placed before every chunk.

    Returns:
        None. Results are written to disk and each raw reply is printed.
    """
    chunk_size = 5  # the prompt labels logs [1]..[5]
    # NOTE(review): one chat session is reused for every chunk, so earlier
    # prompts presumably remain in the conversation history — confirm intended.
    convo_create = setup_prompt_env(passw)
    df = pd.read_csv(input_path)
    outputs = []

    # Process logs in chunks of 5
    for start in range(0, len(df), chunk_size):
        time.sleep(12) #change to 5.5 ################
        chunk = df.iloc[start:start + chunk_size]
        # Number by position inside the chunk so labels are always 1..5,
        # whatever index the frame carries.
        logs = "\n".join(
            f"[{position}] INPUT: {log}"
            for position, log in enumerate(chunk["Log"], start=1)
        )
        prompt = generate_prompt(fewshot_examples, logs)
        output = send_prompt(prompt, convo_create)
        print(output)

        # Drop blank lines, then strip the leading "[n]" token from each line.
        chunk_outputs = []
        for line in output.split("\n"):
            line = line.strip()
            if not line:
                continue
            parts = line.split(" ", 1)
            chunk_outputs.append(parts[1] if len(parts) > 1 else "")

        # Pad/truncate per chunk: a reply with too few or too many lines must
        # not shift the outputs of every later chunk onto the wrong rows.
        chunk_outputs = chunk_outputs[:len(chunk)]
        chunk_outputs.extend([""] * (len(chunk) - len(chunk_outputs)))
        outputs.extend(chunk_outputs)

    # Add the outputs to the dataframe
    df["LLMOutput"] = outputs
    clean_logs(df)
    difference_checker(df)

    # Derive output names from the path itself rather than from fixed
    # character offsets, which only fit one exact directory/file layout.
    base_name = os.path.basename(input_path)
    index_match = re.search(r"prefixed_(\d+)_", base_name)
    short_name = index_match.group(1) if index_match else os.path.splitext(base_name)[0]
    df.to_csv(f"{short_name}.csv", index=False)

    parsed_dir = os.path.join(os.path.dirname(os.path.dirname(input_path)), "parsed")
    os.makedirs(parsed_dir, exist_ok=True)
    df.to_csv(os.path.join(parsed_dir, f"parsed_{base_name}"), index=False)
    

In [12]:
# input_path = "data/prefixed_logs/filtered_prefixed_0_generated_logs.csv"
# f"{input_path[:5]}parsed/parsed_{input_path[19:]}"

In [13]:
# fewshot_examples = generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3)
# print(fewshot_examples)

In [14]:
#process_logs("data/prefixed_logs/prefixed_0_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
# Parse the filtered prefixed_0 logs, with 4 few-shot examples taken from the
# unfiltered prefixed_0 file.
process_logs(
    "data/prefixed_logs/filtered_prefixed_0_generated_logs.csv",
    generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 4),
)


[1] <B><*>:<*> Served block blk_<*> to /<*> <E>
[2] <B><*>:<*> Served block blk_<*> to /<*> <E>
[3] <B><*>:<*> Served block blk_<*> to /<*> <E>
[4] <B><*>:<*> Served block blk_<*> to /<*> <E>
[5] <B><*>:<*> Served block blk_<*> to /<*> <E> 

[1] <B><*>:<*> Served block blk_<*> to /<*> <E>
[2] <B><*>:<*> Served block blk_<*> to /<*> <E>
[3] <B><*>:<*> Served block blk_<*> to /<*> <E>
[4] <B><*>:<*> Served block blk_<*> to /<*> <E>
[5] <B><*>:<*> Served block blk_<*> to /<*> <E> 

[1] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[2] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[3] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[4] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[5] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E> 

[1] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[2] <B><*>:<*> Starting thread to transfer block blk_<*> to <*>:<*><E>
[3] <B><*>:<*> St

In [15]:
#process_logs("data/prefixed_logs/prefixed_1_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))

In [16]:
# Pause 15 s before the next run — presumably to stay within the API rate limit (TODO confirm).
time.sleep(15)

In [18]:
#process_logs("data/prefixed_logs/prefixed_1_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
# Parse the filtered prefixed_1 logs, with 4 few-shot examples taken from the
# unfiltered prefixed_0 file.
process_logs(
    "data/prefixed_logs/filtered_prefixed_1_generated_logs.csv",
    generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 4),
)

[1] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[2] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[3] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[4] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[5] <B><*>:<*> Delivered block blk_<*> to /<*> <E> 

[1] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[2] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[3] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[4] <B><*>:<*> Delivered block blk_<*> to /<*> <E>
[5] <B><*>:<*> Delivered block blk_<*> to /<*> <E> 

[1] <B>INFO dfs.DataNode: Initiating transfer of block blk_<*> from <*>:<*> to <*>:<*><E>
[2] <B>INFO dfs.DataNode: Initiating transfer of block blk_<*> from <*>:<*> to <*>:<*><E>
[3] <B>INFO dfs.DataNode: Initiating transfer of block blk_<*> from <*>:<*> to <*>:<*><E>
[4] <B>INFO dfs.DataNode: Initiating transfer of block blk_<*> from <*>:<*> to <*>:<*><E>
[5] <B>INFO dfs.DataNode: Initiating transfer of block blk_<*> from <*>:<*> to <*>:<*><E> 

[1] <B>INFO dfs.DataNode: Initiati

In [19]:
# Pause 15 s before the next run — presumably to stay within the API rate limit (TODO confirm).
time.sleep(15)

In [20]:
#process_logs("data/prefixed_logs/prefixed_2_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
# Parse the filtered prefixed_2 logs, with 4 few-shot examples taken from the
# unfiltered prefixed_0 file.
process_logs(
    "data/prefixed_logs/filtered_prefixed_2_generated_logs.csv",
    generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 4),
)

[1] <B><*>:<*> Served block blk_<*> to /<*> <E>
[2] <B><*>:<*> Served block blk_<*> to /<*> <E>
[3] <B><*>:<*> Served block blk_<*> to /<*> <E>
[4] <B><*>:<*> Served block blk_<*> to /<*> <E>
[5] <B><*>:<*> Served block blk_<*> to /<*> <E> 

[1] <B><*>:<*> Served block blk_<*> to /<*> <E>
[2] <B><*>:<*> Served block blk_<*> to /<*> <E>
[3] <B><*>:<*> Served block blk_<*> to /<*> <E>
[4] <B><*>:<*> Served block blk_<*> to /<*> <E>
[5] <B><*>:<*> Served block blk_<*> to /<*> <E> 

[1] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E>
[2] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E>
[3] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E>
[4] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E>
[5] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E> 

[1] <B>Transfer thread started for block blk_<*> from <*>:<*> to <*>:<*><E>
[2] <B>Transfer thread started for block blk_<*> from <*>:

In [21]:
# Pause 15 s before the next run — presumably to stay within the API rate limit (TODO confirm).
time.sleep(15)

In [22]:
#process_logs("data/prefixed_logs/prefixed_3_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
# Parse the filtered prefixed_3 logs, with 4 few-shot examples taken from the
# unfiltered prefixed_0 file.
process_logs(
    "data/prefixed_logs/filtered_prefixed_3_generated_logs.csv",
    generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 4),
)

[1] <B>Block blk_<*> was delivered to /<*> by <*><E>
[2] <B>Block blk_<*> was delivered to /<*> by <*><E>
[3] <B>Block blk_<*> was delivered to /<*> by <*><E>
[4] <B>Block blk_<*> was delivered to /<*> by <*><E>
[5] <B>Block blk_<*> was delivered to /<*> by <*><E> 

[1] <B>Block blk_<*> was delivered to /<*> by <*><E>
[2] <B>Block blk_<*> was delivered to /<*> by <*><E>
[3] <B>Block blk_<*> was delivered to /<*> by <*><E>
[4] <B>Block blk_<*> was delivered to /<*> by <*><E>
[5] <B>Block blk_<*> was delivered to /<*> by <*><E> 

[1] <B>A thread has been started to transfer block blk_<*> from <*>:<*> to <*>:<*><E>
[2] <B>A thread has been started to transfer block blk_<*> from <*>:<*> to <*>:<*><E>
[3] <B>A thread has been started to transfer block blk_<*> from <*>:<*> to <*>:<*><E>
[4] <B>A thread has been started to transfer block blk_<*> from <*>:<*> to <*>:<*><E>
[5] <B>A thread has been started to transfer block blk_<*> from <*>:<*> to <*>:<*><E> 

[1] <B>A thread has been started t

In [23]:
# Pause 15 s before the next run — presumably to stay within the API rate limit (TODO confirm).
time.sleep(15)

In [25]:
#process_logs("data/prefixed_logs/prefixed_4_generated_logs.csv", generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 3))
# Parse the filtered prefixed_4 logs, with 4 few-shot examples taken from the
# unfiltered prefixed_0 file.
process_logs(
    "data/prefixed_logs/filtered_prefixed_4_generated_logs.csv",
    generate_fewshot("data/prefixed_logs/prefixed_0_generated_logs.csv", 4),
)

[1] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[2] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[3] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[4] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[5] <B>Successful block delivery of blk_<*> to /<*> by <*> <E> 

[1] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[2] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[3] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[4] <B>Successful block delivery of blk_<*> to /<*> by <*> <E>
[5] <B>Successful block delivery of blk_<*> to /<*> by <*> <E> 

[1] <B>Block blk_<*> transfer initiated from <*>:<*> to <*>:<*> by a new thread<E>
[2] <B>Block blk_<*> transfer initiated from <*>:<*> to <*>:<*> by a new thread<E>
[3] <B>Block blk_<*> transfer initiated from <*>:<*> to <*>:<*> by a new thread<E>
[4] <B>Block blk_<*> transfer initiated from <*>:<*> to <*>:<*> by a new thread<E>
[5] <B>Block blk_<*> transfer init

In [26]:
# Ask the vpn_control helper to disconnect the VPN.
windscribe("disconnect")