In [9]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.chat_models import ChatOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureOpenAI 
import pandas as pd
import dice_ml
from pathlib import Path
from utils import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
from keras import optimizers, Sequential
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Dropout, Input
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.models import Model, load_model
from keras import regularizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, auc, roc_curve
import os
from tqdm import tqdm
import os

In [None]:


LABELS = ["Normal", "Anomaly"]


### 🟢 Step 4: Main Execution ###

# Row index at which the frame is split: rows before it form the training
# data, the remaining rows form the test data.
TRAIN_TEST_SPLIT_ROW = 276388

path = "/home/sathish/UEBA/data/data.csv"
df = read_data(path)
train_data, test_data = df.iloc[:TRAIN_TEST_SPLIT_ROW], df.iloc[TRAIN_TEST_SPLIT_ROW:]
lookback = 3
X_train, X_test, y_test, scaler = get_train_test_data(train_data, test_data, lookback)
n_features = X_train.shape[2]

# Load the saved model. This cell does not train one, so a missing file is
# reported right away instead of surfacing later as an AttributeError on None.
model_path = "lstm_autoencoder.h5"
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Trained model not found at '{model_path}'. "
        "Train and save the LSTM autoencoder before running this cell."
    )
lstm_model = load_model(model_path, custom_objects={'MeanSquaredError': tf.keras.losses.MeanSquaredError()})

# Mean squared reconstruction error per test sequence (averaged over axes 1 and 2).
mse = np.mean(np.power(X_test - lstm_model.predict(X_test), 2), axis=(1, 2))
threshold = 0.04
anomaly_indices = np.where(mse > threshold)[0]
if anomaly_indices.size == 0:
    raise ValueError(
        f"No test sequence has a reconstruction error above threshold={threshold}; "
        "there is no anomaly to explain."
    )
# Explain the first sequence whose error exceeds the detection threshold.
anomaly_idx = anomaly_indices[0]
anomalous_sequence = X_test[anomaly_idx].reshape(1, lookback, n_features)

feature_names = train_data.drop(columns=['class', 'type'], errors='ignore').columns.tolist()
immutable_features = ["user", "role", "O", "C", "E", "A", "N"]
# Counterfactual generation is given its own threshold (0.03), which is lower
# than the 0.04 detection threshold used above.
counterfactual_threshold = 0.03
counterfactual_examples = generate_diverse_counterfactuals(
    lstm_model,
    anomalous_sequence,
    scaler,
    feature_names,
    threshold=counterfactual_threshold,
    immutable_features=immutable_features
)
evaluate_model_on_sequences(lstm_model, anomalous_sequence, counterfactual_examples, feature_names, scaler)




In [None]:
# Build the LLM input from the anomalous window and its counterfactuals.
formatted_data = format_data_for_llm(
    anomalous_sequence,
    counterfactual_examples,
    feature_names,
    scaler,
)

# Show the formatted data under a heading (heading line, then the data).
print("\n🔹 **Formatted Data for LLM:**", formatted_data, sep="\n")


In [12]:
# Map the scaled anomalous window back through the scaler, one row per timestep.
co_original = scaler.inverse_transform(anomalous_sequence.reshape(-1, len(feature_names)))

# Use `lookback` for the row count instead of a hard-coded 3 so this cell keeps
# working when the window length configured earlier changes.
co_original_df = pd.DataFrame(
    co_original.reshape(lookback, -1),
    columns=counterfactual_examples[0].columns,
)
print(co_original_df.to_string())

    user  logon_on_own_pc_normal  logon_on_other_pc_normal  logon_on_own_pc_off_hour  logon_on_other_pc_off_hour  logon_hour  day_of_a_week  device_connects_on_own_pc  device_connects_on_other_pc  device_connects_on_own_pc_off_hour  device_connects_on_other_pc_off_hour  documents_copy_own_pc  documents_copy_other_pc  exe_files_copy_own_pc  exe_files_copy_other_pc  documents_copy_own_pc_off_hour  documents_copy_other_pc_off_hour  exe_files_copy_own_pc_off_hour  exe_files_copy_other_pc_off_hour  neutral_sites  job_search  hacking_sites  neutral_sites_off_hour  job_search_off_hour  hacking_sites_off_hour  total_emails  int_to_int_mails  int_to_out_mails  out_to_int_mails  out_to_out_mails  internal_recipients  external_recipients  distinct_bcc  mails_with_attachments  after_hour_mails  role  business_unit  functional_unit  department  team     O     C     E     A     N
0  675.0                     0.0                       0.0                       1.0                         0.0         

In [13]:
# `counterfactual_examples` is a list of counterfactual DataFrames; keep it
# under the shorter name used by the prompt-building cells below.
cf_example = counterfactual_examples
# Confirm the container type before indexing into it.
print(type(cf_example))
cf_example[4]   # display the DataFrame at index 4 as the cell output


<class 'list'>


Unnamed: 0,user,logon_on_own_pc_normal,logon_on_other_pc_normal,logon_on_own_pc_off_hour,logon_on_other_pc_off_hour,logon_hour,day_of_a_week,device_connects_on_own_pc,device_connects_on_other_pc,device_connects_on_own_pc_off_hour,...,role,business_unit,functional_unit,department,team,O,C,E,A,N
0,675.0,0.713983,0.007962,0.535136,0.0,7.441973,1.977825,7.3739,0.015285,0.29413,...,14.0,1.0,2.837465,2.099065,3.843745,39.0,40.0,47.0,50.0,26.0
1,675.0,0.854455,0.014584,0.401808,0.0,7.579709,2.011969,7.277054,0.001431,0.266381,...,14.0,1.0,2.961633,2.349903,3.085686,39.0,40.0,47.0,50.0,26.0
2,675.0,0.837595,0.0,0.403395,0.0,7.557063,1.978618,7.033497,0.010309,0.221397,...,14.0,1.0,2.922812,2.14336,2.789212,39.0,40.0,47.0,50.0,26.0


In [14]:
 # Initialize Azure OpenAI client
# The API key is read from the environment only; a credential must never be
# committed in a notebook. NOTE: the key that used to be hard-coded here as a
# fallback must be treated as compromised and rotated.
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
if not azure_openai_api_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export it before creating the Azure OpenAI client."
    )

llm = AzureChatOpenAI(
    deployment_name="gpt-4o",
    openai_api_base=os.getenv("ENDPOINT_URL", "https://g4266-m3cff6ws-swedencentral.openai.azure.com/"),
    openai_api_key=azure_openai_api_key,
    openai_api_version="2024-05-01-preview"
)


  llm = AzureChatOpenAI(


In [None]:
def ZeroShotExplain1():
    """Return the zero-shot prompt template that asks for observations.

    The template has three placeholders to be filled by the caller:
    ``{ML_system}``, ``{negative_outcome}`` and ``{positive_outcome}``.
    """
    observations_template = """
        Im providing a negative outcome from a {ML_system} and your task provide obeservations over the provided Insider Anomaly flagged by a LSTM model by comparing them to Counterfactuals generated from the Anomaly. 

        ----- Anomolous outcome -----
        {negative_outcome}

        
        ----- Positive couterfactual outcome -----
        {positive_outcome}


        ----- Observations -----
        <List of Observations>
        """
    return observations_template
def ZeroShotRules():
    """Return the zero-shot prompt template that asks for a list of rules.

    The template has four placeholders to be filled by the caller:
    ``{ML_system}``, ``{immutable_features}``, ``{negative_outcome}`` and
    ``{positive_outcome}``.
    """
    rules_template = """
        Im providing a negative outcome from a {ML_system} and your task is to extract the most important observed rules over the provided Insider Anomaly flagged by a LSTM model by comparing them to Counterfactuals generated from the Anomaly. 

        --------Immutable features across Anomaly and Counterfactuals ----
        {immutable_features}

        
        ----- Anomolous outcome -----
        {negative_outcome}

        ----- Positive couterfactual outcome -----
        {positive_outcome}

        ----- Rules -----
        <List of Rules>
        """
    return rules_template
def ZeroShotRulesCode():
    """Return the zero-shot prompt template that asks for rule-checking code.

    The template has six placeholders to be filled by the caller:
    ``{ML_system}``, ``{immutable_features}``, ``{negative_outcome}``,
    ``{positive_outcome}``, ``{rules}`` and ``{dataset_info}``. It ends with a
    fenced code stub for the model to complete.
    """
    rules_code_template = """
        Im providing a negative outcome from a {ML_system}, a set of counterfactual cases that flip the decision of the system and the main rules inferred from the counterfactuals. The Anomaly has three rows representing a lookback of 3 in a LSTM model. There are 5 Counterfactuals.
        You should generate python code to count how many of the counterfactuals are consistent with the rule. The code should create a df with the counterfactuals provided and then check for each rule how many of them follow the rules. Order the rules. Finally, you should print the results. 
        The counterfactual data is structured as sequences of 3 timesteps (lookback of 3) for each user. Each feature has values for t-2, t-1, and t. The rules should be applied to the most recent timestep (t) for each feature.

        --------Immutable features across Anomaly and Counterfactuals ----
        {immutable_features}
 
        ----- Anomolous outcome -----
        {negative_outcome}

        ----- Positive counterfactual outcome -----
        {positive_outcome}

        ----- Rules -----
        {rules}
        
        ----- Dataset info -----
        The following info about the dataset is available:
        {dataset_info}
        
        ----- Code -----
        ```
        import pandas as pd
        #complete this code
        ```
        """
    return rules_code_template

In [16]:
# Prompt templates consumed by the chains defined below
# (template0 -> rules chain, template2 -> rules-code chain).
# NOTE(review): template0 and template1 are both built from ZeroShotRules(),
# while ZeroShotExplain1() is not used in the visible cells -- confirm whether
# one of these was meant to be the observations template.
template0 = ZeroShotRules()
template1 = ZeroShotRules()
template2 = ZeroShotRulesCode()
# The builders referenced below are not defined in the visible cells.
#template3 = ZeroShotExplanation(user_input=False)
#template4 = ZeroShotExample()
#template5 = ZeroShotExampleCode()

In [17]:
# Short description of the ML system, substituted for {ML_system} in the prompts.
model_description = "ML_system that detects Anomolous behavior on a network"

In [34]:
# Chain that asks the LLM for rules using the ZeroShotRules template (template0).
# "immutable_features" is declared here so the variable list matches the
# {immutable_features} placeholder in that template and the inputs the later
# cells pass to this chain.
rules_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate(
        input_variables=["ML_system", "immutable_features", "negative_outcome", "positive_outcome"],
        template=template0
    )
)

In [None]:
# Text versions of the anomaly and of every counterfactual frame, passed through
# prep_data before being placed in the prompts. (A stray bare
# `immutable_features` expression that had no effect was removed from this cell.)
negative_outcome_str = prep_data(co_original_df.to_string(index=False))
positive_outcome_str = prep_data(
    "\n\n".join(cf_df.to_string(index=False) for cf_df in cf_example)
)
dataset_info = get_network_activity_info()

In [36]:
# Render the rules prompt with the same four inputs that are passed to
# rules_chain.run. `dataset_info` is not passed: the rules template has no
# {dataset_info} placeholder, so it was an unused argument here.
prompt_text = rules_chain.prompt.format(
    ML_system=model_description,
    immutable_features=immutable_features,
    negative_outcome=negative_outcome_str,
    positive_outcome=positive_outcome_str,
)

print("Prompt being sent to the LLM:")
print(prompt_text)


Prompt being sent to the LLM:

        Im providing a negative outcome from a ML_system that detects Anomolous behavior on a network and your task is to extract the most important observed rules over the provided Insider Anomaly flagged by a LSTM model by comparing them to Counterfactuals generated from the Anomaly. 

        --------Immutable features across Anomaly and Counterfactuals ----
        ['user', 'role', 'O', 'C', 'E', 'A', 'N']

        
        ----- Anomolous outcome -----
        {'header': ['user', 'logon_on_own_pc_normal', 'logon_on_other_pc_normal', 'logon_on_own_pc_off_hour', 'logon_on_other_pc_off_hour', 'logon_hour', 'day_of_a_week', 'device_connects_on_own_pc', 'device_connects_on_other_pc', 'device_connects_on_own_pc_off_hour', 'device_connects_on_other_pc_off_hour', 'documents_copy_own_pc', 'documents_copy_other_pc', 'exe_files_copy_own_pc', 'exe_files_copy_other_pc', 'documents_copy_own_pc_off_hour', 'documents_copy_other_pc_off_hour', 'exe_files_copy_own_pc_o

In [40]:
# Ask the LLM for the rules; the outcomes are passed as already-formatted strings.
rules_inputs = dict(
    ML_system=model_description,
    immutable_features=immutable_features,
    negative_outcome=negative_outcome_str,
    positive_outcome=positive_outcome_str,
)
response = rules_chain.run(rules_inputs)

In [43]:
# Split the LLM answer on blank lines and print each paragraph under a numbered heading.
sections = response.split("\n\n")

for section_number, section in enumerate(sections, start=1):
    print(f"\n🔹 Section {section_number}:\n{section}")


🔹 Section 1:
To identify the most important observed rules explaining the negative anomalous outcome in the provided data, we need to compare the anomalous outcome against its corresponding counterfactual outcomes. The goal is to identify the features that significantly differ between the negative anomalous outcome and the positive counterfactual outcomes, while keeping the immutable features constant. These differences will highlight the most critical factors contributing to the anomaly.

🔹 Section 2:
### Key Observations and Process:
1. **Immutable Features**:
   - These features (`['user', 'role', 'O', 'C', 'E', 'A', 'N']`) remain constant across both anomalous and counterfactual outcomes. Hence, they are not considered when identifying important rules.

🔹 Section 3:
2. **Comparison**:
   - Compare the values of each mutable feature from the anomalous outcome to the corresponding values in the counterfactual outcomes.
   - Identify features where there is a substantial difference b

In [44]:
# Keep the rules text under its own name: `response` is reassigned later when
# the code-generation chain is run.
rules = response
# Dataset description used as the {dataset_info} input of the code-generation prompt.
dataset_info = get_network_activity_info()

In [None]:
# Display the dataset description that is passed to the code-generation prompt.
dataset_info

In [45]:
# Chain that asks the LLM to write rule-checking code from the
# ZeroShotRulesCode template (template2).
rulescode_prompt = PromptTemplate(
    template=template2,
    input_variables=[
        "ML_system",
        "immutable_features",
        "negative_outcome",
        "positive_outcome",
        "rules",
        "dataset_info",
    ],
)
rulescode_chain = LLMChain(prompt=rulescode_prompt, llm=llm)

In [31]:
# Render the code-generation prompt with the notebook's current inputs and show it.
rulescode_prompt_inputs = dict(
    ML_system=model_description,
    immutable_features=immutable_features,
    negative_outcome=negative_outcome_str,
    positive_outcome=positive_outcome_str,
    rules=rules,
    dataset_info=dataset_info,
)
prompt_text = rulescode_chain.prompt.format(**rulescode_prompt_inputs)

print("Prompt being sent to the LLM:", prompt_text, sep="\n")


Prompt being sent to the LLM:

        Im providing a negative outcome from a ML_system that detects Anomolous behavior on a network, a set of counterfactual cases that flip the decision of the system and the main rules inferred from the counterfactuals. The Anomaly has three rows representing a lookback of 3 in a LSTM model. There are 5 Counterfactuals.
        You should generate python code to count how many of the counterfactuals are consistent with the rule. The code should create a df with the counterfactuals provided and then check for each rule how many of them follow the rules. Order the rules. Finally, you should print the results. 
        The counterfactual data is structured as sequences of 3 timesteps (lookback of 3) for each user. Each feature has values for t-2, t-1, and t. The rules should be applied to the most recent timestep (t) for each feature.

        --------Immutable features across Anomaly and Counterfactuals ----
        ['user', 'role', 'O', 'C', 'E', 'A', 

In [46]:
# Ask the LLM for the rule-checking code. This reassigns `response`, which
# previously held the rules text.
rulescode_inputs = dict(
    ML_system=model_description,
    immutable_features=immutable_features,
    negative_outcome=negative_outcome_str,
    positive_outcome=positive_outcome_str,
    rules=rules,
    dataset_info=dataset_info,
)
response = rulescode_chain.run(rulescode_inputs)

In [47]:

# Print the answer paragraph by paragraph, each preceded by a blank line.
sections = response.split("\n\n")

for section in sections:
    print("\n" + section)


Below is the Python code to count how many of the counterfactuals are consistent with the given rules and print the results.

```python
import pandas as pd

# Counterfactual data
counterfactual_data = [
    [0.918692, 0.035198, 0.418662, 0.233116, 0.0, 0.009844, 0.62919, 1.917232, 1.946101],
    [0.850021, 0.019444, 0.350859, 0.212401, 0.0, 0.052881, 0.565188, 1.332852, 1.721338],
    [0.874619, 0.012317, 0.395705, 0.231019, 0.0, 0.0, 0.542593, 2.037543, 2.159656],
    [0.852693, 0.085231, 0.352738, 0.26441, 0.0, 0.097775, 0.898456, 1.635325, 1.712093],
    [0.837595, 0.0, 0.403395, 0.221397, 0.0, 0.0, 0.742353, 1.835372, 2.028352]
]

# Anomalous data (for comparison against counterfactuals)
anomalous_data = {
    "logon_on_own_pc_normal": 0.0,
    "logon_on_own_pc_off_hour": 1.0,
    "documents_copy_own_pc_off_hour": 10.0,
    "device_connects_on_other_pc_off_hour": 1.0,
    "exe_files_copy_own_pc_off_hour": 2.0,
    "job_search": 1.0,
    "after_hour_mails": 2.0,
}

# Rules
def rule

In [48]:
def extract_code(text):
    """Return the code inside the first fenced code block found in ``text``.

    A fence tagged as Python (```python, ```python3 or ```py) is preferred. If
    there is none, the first fence of any kind is used, which covers answers
    that use a bare ``` fence. A block whose closing fence is missing runs to
    the end of ``text``. When ``text`` contains no fence at all, the whole text
    is treated as code.

    Args:
        text: The raw LLM answer.

    Returns:
        The extracted code with leading and trailing whitespace stripped.
    """
    import re  # local import keeps this helper self-contained in its cell

    # The body runs lazily up to the closing fence, or to the end of the text
    # when the answer was cut off before the fence was closed.
    fence_patterns = (
        r"```(?:python3?|py)[ \t]*\r?\n(.*?)(?:```|\Z)",
        r"```[^\n`]*\n(.*?)(?:```|\Z)",
    )
    for pattern in fence_patterns:
        match = re.search(pattern, text, flags=re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return text.strip()


In [49]:
# Pull the generated script out of the LLM answer held in `response`.
code = extract_code(response)
# Bare name: show the extracted source as the cell output.
code

'import pandas as pd\n\n# Counterfactual data\ncounterfactual_data = [\n    [0.918692, 0.035198, 0.418662, 0.233116, 0.0, 0.009844, 0.62919, 1.917232, 1.946101],\n    [0.850021, 0.019444, 0.350859, 0.212401, 0.0, 0.052881, 0.565188, 1.332852, 1.721338],\n    [0.874619, 0.012317, 0.395705, 0.231019, 0.0, 0.0, 0.542593, 2.037543, 2.159656],\n    [0.852693, 0.085231, 0.352738, 0.26441, 0.0, 0.097775, 0.898456, 1.635325, 1.712093],\n    [0.837595, 0.0, 0.403395, 0.221397, 0.0, 0.0, 0.742353, 1.835372, 2.028352]\n]\n\n# Anomalous data (for comparison against counterfactuals)\nanomalous_data = {\n    "logon_on_own_pc_normal": 0.0,\n    "logon_on_own_pc_off_hour": 1.0,\n    "documents_copy_own_pc_off_hour": 10.0,\n    "device_connects_on_other_pc_off_hour": 1.0,\n    "exe_files_copy_own_pc_off_hour": 2.0,\n    "job_search": 1.0,\n    "after_hour_mails": 2.0,\n}\n\n# Rules\ndef rule_1(logon_on_own_pc_normal):\n    return logon_on_own_pc_normal > 0.5\n\ndef rule_2(logon_on_own_pc_off_hour):\n  

In [50]:
import os
import subprocess
import sys
import tempfile
def execute_code(code, timeout=10):
    """Run ``code`` as a script in a child Python process and capture its output.

    SECURITY: ``code`` runs with the full privileges of the current user. In
    this notebook it is LLM-generated, so review it before executing and prefer
    a sandboxed environment.

    Args:
        code: Python source to execute.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        A ``(output, is_error)`` tuple: ``(stdout, False)`` on success,
        ``(stdout + stderr, True)`` when the script exits with a non-zero
        status, and ``("Execution timed out.", True)`` on timeout.
    """
    # Use a uniquely named script instead of a fixed "temp_code.py": nothing in
    # the working directory is overwritten and runs cannot clobber each other.
    # The file stays in the working directory, so the script's own directory
    # (and therefore its import path) is the same as before.
    script = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".py",
        prefix="temp_code_",
        dir=os.getcwd(),
        delete=False,
        encoding="utf-8",  # Python reads source files as UTF-8 by default
    )
    script_path = script.name
    try:
        # Close the file before running it so the child process can open it.
        with script:
            script.write(code)
        result = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
        return result.stdout, False
    except subprocess.CalledProcessError as e:
        return e.stdout + e.stderr, True
    except subprocess.TimeoutExpired:
        return "Execution timed out.", True
    finally:
        # Best-effort cleanup; a failure to delete must not mask the result.
        try:
            os.remove(script_path)
        except OSError:
            pass
   

In [51]:
# Run the generated script. The second return value reports a failure or
# timeout; it is surfaced so an error trace is not mistaken for results.
result, execution_failed = execute_code(code)

if execution_failed:
    print("Generated code did not run successfully; captured output follows:")
print(result)

Number of counterfactuals consistent with each rule (sorted):
Rule 1: 5
Rule 2: 5
Rule 3: 5
Rule 4: 5
Rule 5: 5
Rule 6: 5

