In [None]:
import pandas as pd
import os

class PreProcess:
    """Convert an event-log CSV into one descriptive sentence block per row.

    Attributes:
        file_path: path of the CSV that was loaded.
        df: working DataFrame; every pipeline step updates or replaces it.
        cleaned_text: the per-row paragraphs joined with newlines. Empty until
            `run_pipeline` (or `create_megastring`) has been called.
    """

    def __init__(self, file_path):
        """Read `file_path` as UTF-8 CSV (pandas' python parser) into `self.df`."""
        self.file_path = file_path
        self.df = pd.read_csv(file_path, encoding='utf-8', engine='python')
        self.cleaned_text = ""

    def handle_float_nans(self):
        """Fill missing identifiers with -1 and restore an integer dtype.

        Only the identifier columns that are actually present are touched.
        """
        float_columns = ['EventId', 'ProcessId', 'ParentProcessId']
        for col in float_columns:
            if col not in self.df.columns:
                continue
            filled = self.df[col].fillna(-1)
            # A float column would be rendered as "1234.0" in the paragraphs.
            # Cast back to integers, but only when every value is integral so
            # nothing is silently truncated (inf/NaN fail the check as well).
            if pd.api.types.is_float_dtype(filled) and (filled % 1 == 0).all():
                filled = filled.astype('int64')
            self.df[col] = filled

    def remove_nans(self):
        """Replace every remaining missing value with an empty string."""
        # Reassign instead of inplace=True so the step never operates on a view.
        self.df = self.df.fillna('')

    def normalize_text(self):
        """Flatten text cells: commas and newlines become spaces, CRs are dropped,
        and surrounding whitespace is stripped."""
        def normalize(value):
            return str(value).replace(',', ' ').replace('\n', ' ').replace('\r', '').strip()

        # Object columns are handled exactly as before; dedicated string dtypes
        # are included too so text is normalised regardless of how it is stored.
        text_columns = [
            col for col, dtype in self.df.dtypes.items()
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        ]
        for col in text_columns:
            self.df[col] = self.df[col].map(normalize)

    def select_columns(self):
        """Keep only the columns used to build the paragraphs.

        Raises:
            KeyError: if any required column is absent from the CSV.
        """
        columns_to_keep = [
            'TimeCreated', 'EventId', 'ProcessId', 'ParentProcessId',
            'Image', 'CommandLine', 'ParentImage', 'ParentCommandLine',
            'CurrentDirectory', 'User', 'IntegrityLevel'
        ]
        missing = [col for col in columns_to_keep if col not in self.df.columns]
        if missing:
            raise KeyError(f"{self.file_path}: missing required column(s): {missing}")
        # .copy() detaches the selection so later column assignments are
        # unambiguous writes to this frame rather than to a slice of the old one.
        self.df = self.df[columns_to_keep].copy()

    def rows_to_paragraphs(self):
        """Add a `log_paragraph` column holding one English description per row."""
        def row_to_paragraph(row):
            return (
                f"On {row['TimeCreated']}, Event ID {row['EventId']} occurred. "
                f"Process '{row['Image']}' (PID: {row['ProcessId']}) was launched using the command: {row['CommandLine']}. "
                f"It was spawned by '{row['ParentImage']}' (PID: {row['ParentProcessId']}) with command: {row['ParentCommandLine']}. "
                f"The process ran in directory '{row['CurrentDirectory']}' under user '{row['User']}' "
                f"with an integrity level of '{row['IntegrityLevel']}'."
            )
        self.df['log_paragraph'] = self.df.apply(row_to_paragraph, axis=1)

    def create_megastring(self):
        """Join all row paragraphs with newlines into `self.cleaned_text`."""
        self.cleaned_text = '\n'.join(self.df['log_paragraph'].tolist())

    def run_pipeline(self):
        """Run every step in order and leave the result in `self.cleaned_text`."""
        # Selecting first fails fast on a malformed file and avoids cleaning
        # columns that are discarded anyway; the kept columns end up identical.
        self.select_columns()
        self.handle_float_nans()
        self.remove_nans()
        self.normalize_text()
        self.rows_to_paragraphs()
        self.create_megastring()

In [None]:
import requests

class SyntheticData:
    """Pair the pre-processed text of one log file with a model-written analysis.

    Attributes:
        file_path: CSV file to pre-process.
        cleaned_text: text produced by `PreProcess`; empty until
            `generate_cleaned_text` runs.
        response_text: model reply; empty until `generate_response` succeeds.
    """

    # One instruction per line. The original adjacent string literals were
    # concatenated with no separator, which ran the bullets together and glued
    # the first log line onto "Begin analysis below:".
    PROMPT_PREFIX = "\n".join([
        "As a security analyst, you are presented with raw Windows Sysmon logs from a red team simulation.",
        "Instructions: ",
        "- Interpret each log as if investigating a potential compromise.",
        "- Call out behaviors suggestive of lateral movement, execution from temporary directories, PowerShell abuse, file obfuscation, etc.",
        "- Be assertive in flagging anything that seems out of place.",
        "- Prioritize security over ambiguity—it's okay to over-classify in favor of catching threats.",
        "List each entry as:",
        "- Summary",
        "- Suspicion Level: Benign / Suspicious / Malicious",
        "- Explanation",
        "",
        "Begin analysis below:",
    ]) + "\n"

    def __init__(self, file_path):
        """Remember `file_path`; no file or network I/O happens here."""
        self.file_path = file_path
        self.cleaned_text = ""
        self.response_text = ""

    def generate_cleaned_text(self):
        """Run the `PreProcess` pipeline on `file_path` and store its text."""
        log_processor = PreProcess(self.file_path)
        log_processor.run_pipeline()
        self.cleaned_text = log_processor.cleaned_text

    @staticmethod
    def _error_detail(response, body):
        """Best-effort, human-readable reason extracted from an unusable reply."""
        if isinstance(body, dict):
            error = body.get('error')
            if isinstance(error, dict):
                return f"{error.get('code')}: {error.get('message')}"
            if error is not None:
                return str(error)
            return str(body)[:500]
        return (getattr(response, 'text', '') or '')[:500]

    def generate_response(self, api_key, endpoint, deployment_name, timeout=120):
        """Send the prompt plus `cleaned_text` to the chat-completions deployment.

        Args:
            api_key: value sent in the `api-key` header.
            endpoint: base URL of the resource, with or without a trailing slash.
            deployment_name: deployment segment of the request URL.
            timeout: seconds to wait for the HTTP call before giving up.

        Raises:
            RuntimeError: if the service answers with an error status, a body
                that is not JSON, or a reply without usable message content.
                `response_text` keeps its previous value in that case.
        """
        full_prompt = self.PROMPT_PREFIX + self.cleaned_text

        headers = {
            "api-key": api_key,
            "Content-Type": "application/json"
        }
        # Normalise the slash so both "https://host" and "https://host/" work.
        url = f"{endpoint.rstrip('/')}/openai/deployments/{deployment_name}/chat/completions?api-version=2024-03-01-preview"
        payload = {
            "messages": [{"role": "user", "content": full_prompt}],
            "temperature": 0.6,
            "top_p": 0.95
        }
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)

        try:
            response_json = response.json()
        except ValueError:  # body was not JSON (e.g. a gateway error page)
            response_json = None

        # Error replies carry no 'choices' key; indexing it blindly surfaced as
        # an opaque KeyError. Report the status and the service's own message.
        choices = response_json.get('choices') if isinstance(response_json, dict) else None
        if not response.ok or not choices:
            raise RuntimeError(
                f"Chat completion request for {self.file_path!r} failed "
                f"(HTTP {response.status_code}): {self._error_detail(response, response_json)}"
            )

        first_choice = choices[0] if isinstance(choices[0], dict) else {}
        content = (first_choice.get('message') or {}).get('content')
        if content is None:
            raise RuntimeError(
                f"Chat completion for {self.file_path!r} returned no message content "
                f"(finish_reason={first_choice.get('finish_reason')!r})"
            )
        self.response_text = content

    def save_to_csv(self, output_path, append=False):
        """Write one (input_text, copilot_response) row to `output_path`.

        With `append=True` the row is added to an existing non-empty file
        without repeating the header; otherwise the file is (re)written with a
        header row.
        """
        import pandas as pd
        import os

        df = pd.DataFrame([{
            'input_text': self.cleaned_text,
            'copilot_response': self.response_text
        }])

        # An existing but empty file still needs its header row.
        has_rows = os.path.exists(output_path) and os.path.getsize(output_path) > 0
        if append and has_rows:
            df.to_csv(output_path, mode='a', header=False, index=False)
        else:
            df.to_csv(output_path, index=False)


In [None]:
import os

class SyntheticBatchProcessor:
    """Run the `SyntheticData` workflow over every CSV one level below a folder.

    Attributes:
        folder_path: parent folder whose immediate sub-folders hold the CSVs.
        output_path: CSV file that receives one appended row per input file.
        api_key, endpoint, deployment_name: forwarded unchanged to
            `SyntheticData.generate_response`.
    """

    def __init__(self, folder_path, output_path, api_key, endpoint, deployment_name):
        """Store the batch settings; nothing is read or sent yet."""
        self.folder_path = folder_path
        self.output_path = output_path
        self.api_key = api_key
        self.endpoint = endpoint
        self.deployment_name = deployment_name

    def process_all_files(self):
        """Process each `*.csv` file found in the sub-folders of `folder_path`.

        Files are visited in sorted order so runs are repeatable. A failure on
        one file is reported and recorded, then the batch continues, so a single
        bad reply no longer discards the remaining work.

        Rows are always appended to `output_path`; re-running the batch against
        the same output file adds duplicate rows.

        Returns:
            list[tuple[str, Exception]]: `(file_path, error)` for every file
            that could not be processed; empty when everything succeeded.
        """
        failures = []

        # Loop over all folders inside the parent folder
        for child_folder in sorted(os.listdir(self.folder_path)):
            child_path = os.path.join(self.folder_path, child_folder)

            # Proceed only if it's a directory
            if not os.path.isdir(child_path):
                continue

            for filename in sorted(os.listdir(child_path)):
                file_path = os.path.join(child_path, filename)
                # isfile() skips directories that merely end in ".csv".
                if not filename.endswith(".csv") or not os.path.isfile(file_path):
                    continue

                print(f"🔍 Processing: {filename} in {child_folder}")
                try:
                    sd = SyntheticData(file_path)
                    sd.generate_cleaned_text()
                    sd.generate_response(
                        api_key=self.api_key,
                        endpoint=self.endpoint,
                        deployment_name=self.deployment_name
                    )
                    sd.save_to_csv(self.output_path, append=True)
                except Exception as exc:  # isolate per-file errors; reported below
                    print(f"FAILED: {filename} in {child_folder}: {exc!r}")
                    failures.append((file_path, exc))

        if failures:
            print(f"{len(failures)} file(s) failed; see the returned list for details.")
        return failures


In [None]:
# SECURITY: the API key that used to be hardcoded in this cell has been exposed
# in the notebook and should be rotated. The key is now read from the
# environment so it never lands in the saved notebook or its outputs.
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before running this cell."
    )

# Machine-specific settings can be overridden through the environment; the
# defaults are the values this notebook was originally run with.
processor = SyntheticBatchProcessor(
    folder_path=os.environ.get(
        "SYSMON_LOG_DIR",
        "C:\\Users\\shrav\\Documents\\MTECH\\Sem IV\\SecureSLM\\Data\\new",
    ),
    output_path="copilot_output_3.csv",
    api_key=api_key,
    endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://shrav-mckii6fb-eastus2.cognitiveservices.azure.com/",
    ),
    deployment_name=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
)

processor.process_all_files()


🔍 Processing: T1001.002-1.csv
🔍 Processing: T1001.002-2.csv
🔍 Processing: T1003-1.csv
🔍 Processing: T1003-2.csv
🔍 Processing: T1003-3.csv
🔍 Processing: T1003-4.csv
🔍 Processing: T1003-5.csv
🔍 Processing: T1003-6.csv
🔍 Processing: T1003.001-1.csv
🔍 Processing: T1003.001-10.csv
🔍 Processing: T1003.001-11.csv
🔍 Processing: T1003.001-12.csv
🔍 Processing: T1003.001-13.csv
🔍 Processing: T1003.001-14.csv
🔍 Processing: T1003.001-2.csv
🔍 Processing: T1003.001-3.csv
🔍 Processing: T1003.001-4.csv
🔍 Processing: T1003.001-6.csv
🔍 Processing: T1003.001-8.csv
🔍 Processing: T1003.001-9.csv
🔍 Processing: T1003.002-1.csv
🔍 Processing: T1003.002-3.csv
🔍 Processing: T1003.002-4.csv
🔍 Processing: T1003.002-5.csv
🔍 Processing: T1003.002-6.csv
🔍 Processing: T1003.002-8.csv
🔍 Processing: T1003.003-2.csv
🔍 Processing: T1003.003-4.csv
🔍 Processing: T1003.003-5.csv
🔍 Processing: T1003.003-6.csv
🔍 Processing: T1003.003-7.csv
🔍 Processing: T1003.004-2.csv
🔍 Processing: T1003.005-1.csv
🔍 Processing: T1003.006-1.csv

KeyError: 'choices'