# Further Data Generation of the output of the Logs Extraction and Standardisation System

Adds synthetic values to the manual fields of the Logs Extraction and Standardisation System output so that the data looks more realistic.

NOTE: Refer to the 'Logs Standardisation System' folder first to understand how 'logs_system_output.csv' was generated before reading this section of the code.

## Manual Fields:
- Actual Test Case Outcome
- Reason for failure

In [None]:
import pandas as pd
import random
import os

In [None]:
# Path to the CSV produced by the logs system (change it if the file lives elsewhere)
csv_path = "logs_system_output.csv"

# Stop early with a clear message when the input file is missing
if not os.path.isfile(csv_path):
    raise FileNotFoundError(f"The file '{csv_path}' was not found. Please check the path.")

# Read the data and strip stray whitespace from the header names
df = pd.read_csv(csv_path)
df.columns = df.columns.str.strip()

# Discard manual columns that may already be present so they can be
# re-created empty below
manual_columns = ["Actual Test Case Outcome", "Reason for failure"]
df = df.drop(columns=manual_columns, errors="ignore")

# The manual columns are placed right after 'Error Message', so it must exist
if "Error Message" not in df.columns:
    raise ValueError("'Error Message' column not found in dataset.")

# Insert the empty manual columns, in order, immediately after 'Error Message'
error_msg_index = df.columns.get_loc("Error Message")
for offset, col in enumerate(manual_columns, start=1):
    df.insert(error_msg_index + offset, col, "")

# Keep only rows whose test case completed and was reported as 'Pass'
is_completed = df["Test Case Status"] == "Completed"
is_pass = df["Test Case Outcome Message"] == "Pass"
eligible = df.loc[is_completed & is_pass]

# Pick up to 20 eligible rows (fewer when not enough exist) to receive the
# synthetic manual-verification data; the fixed random_state makes the
# selection repeatable
n_samples = min(20, len(eligible))
sampled_indices = eligible.sample(n=n_samples, random_state=42).index

# Pool of realistic-sounding synthetic failure reasons; one entry is drawn
# at random for every sampled row further down.
reasons = [
    "Not supposed to access classified info but it was accessed",
    "Response was 200 OK but expected 403 Forbidden",
    "System exposed internal API endpoint",
    "Sensitive data visible in response payload",
    "User was not authenticated but access granted",
    "Unsecured file directory was browsable",
    "Expected error handling did not trigger",
    "Request method should not be allowed (e.g., PUT)",
    "Redirection bypassed login control",
    "Unexpected content leaked in response",
    "Authorization check was missing for this resource",
    "System responded to invalid query with sensitive data",
    "Page was accessible without token",
    "Request URL manipulation succeeded unexpectedly",
    "No logging for sensitive request",
    "Parameter tampering revealed hidden data",
    "Test expected denial but received full access",
    "Security header missing from response",
    "Session did not expire after expected duration",
    "Resource not properly encrypted in transit"
]

# Mark every sampled row as a manually verified failure and attach a reason.
# The rows are chosen with a fixed random_state, so the reasons are drawn from
# a dedicated seeded generator as well: this keeps the whole output file
# reproducible between runs without touching the global `random` state.
reason_rng = random.Random(42)
for idx in sampled_indices:
    df.at[idx, "Actual Test Case Outcome"] = "Fail"
    df.at[idx, "Reason for failure"] = reason_rng.choice(reasons)

# Replace missing values with empty strings, but only in object-dtype columns
object_cols = df.select_dtypes(include="object").columns
filled_text = df[object_cols].fillna("")
df[object_cols] = filled_text

# Write the result back to the same CSV path, leaving out the index column
df.to_csv(csv_path, index=False)
print(f"Updated file saved to: {csv_path}")