# Creating Synthetic Data


Load API key

In [1]:
from google.colab import userdata
google_api_key = userdata.get('GOOGLE_API_KEY')

# Mount drive and prepare csv

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
!ls /content/drive/MyDrive/SyntheticData/

grep_options.csv  sample_files3.csv  synthetic_data34.csv  synthetic_data3.csv	synthetic_data.csv


In [4]:
path_to_csv_folder = "/content/drive/MyDrive/SyntheticData/"
name_of_new_csv = path_to_csv_folder + "synthetic_data2.csv"
file_options = path_to_csv_folder + "grep_options.csv"
file_files = path_to_csv_folder + "sample_files3.csv"

1. Load csv files using dspy data loader
2. Create signature for DSPy module
3. Create module for generation
4. Use a loop to iterate through all file, option combinations

In [5]:
!pip install -q dspy-ai
!pip install -q python-dotenv
!pip install -q google-generativeai
!pip install -q pydantic

Collecting dspy-ai
  Downloading dspy_ai-2.4.9-py3-none-any.whl (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.4/220.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting backoff~=2.2.1 (from dspy-ai)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting joblib~=1.3.2 (from dspy-ai)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai<2.0.0,>=0.28.1 (from dspy-ai)
  Downloading openai-1.34.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting ujson (from dspy-ai)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting d

In [6]:
import dspy
from dspy.datasets import DataLoader

dl = DataLoader()

grep_options_dataset = dl.from_csv(file_options)
sample_files_dataset = dl.from_csv(file_files)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
print(grep_options_dataset[1])
print(sample_files_dataset[1])

Example({'Options': '-a', 'Description': 'Treats binary files as text'}) (input_keys=set())
Example({'File_Path': '/var/log/auth.lo?', 'Description': 'Authentication log file. It is useful for recording system authentication attempts, identifying intrusion attempts and capturing relevant log events.', 'File': '/var/log/auth.log\n\nJan  1 00:00:01 localhost sshd[1234]: Accepted password for user123 from 192.168.1.1 port 12345 ssh2'}) (input_keys=set())


In [8]:
gemini_flash = dspy.Google(model="gemini-1.5-flash-latest", temperature=0, api_key=google_api_key)


dspy.settings.configure(lm=gemini_flash)

### Create generation module

In [9]:
from pydantic import BaseModel, Field, conlist
from typing import List, NamedTuple

class GrepPair(NamedTuple):
    command: str
    description: str

class Output(BaseModel):
    commands: conlist(GrepPair, min_length=10, max_length=10) = Field(..., description="List of 10 pairs of grep commands and a natural language description ")

# class QuestionOutput(BaseModel):
#     summaries: conlist(str, min_length=10, max_length=10) = Field(..., description="List of 10 natural language descriptions of something to look for with grep")

# class GrepOutput(BaseModel):
#     commands: conlist(str, min_length=11, max_length=10) = Field(..., description="List of 10 grep commands that would be useful")

# class CommentInput(BaseModel):
#     commands: conlist(str, min_length=10, max_length=10) = Field(..., description="List of 10 grep commands")

# class CommentOutput(BaseModel):
#     descriptions: conlist(str, min_length=10, max_length=10) = Field(..., description="corresponding list of natural language descriptions for the grep commands")

class test_generation(dspy.Signature):
    """Given the contents of a file, and the file name as well as a grep option as input create a list of useful and interesting grep commands"""
    file = dspy.InputField(desc="The file to make grep commands for")
    filename = dspy.InputField(desc="The filename")
    option = dspy.InputField(desc="The grep option to use")
    option_description = dspy.InputField(desc="A description of the grep option")
    commands: Output = dspy.OutputField()

# class test_generation_simple(dspy.Signature):
#     """Given the contents of a file, and the file name as well as a grep option as input, create a useful and interesting grep command"""
#     file = dspy.InputField(desc="The file to make grep commands for")
#     filename = dspy.InputField(desc="The filename")
#     option = dspy.InputField(desc="The grep option to use")
#     option_description = dspy.InputField(desc="A description of the grep option")
#     commands: GrepOutput = dspy.OutputField()

generate_answer = dspy.TypedPredictor(test_generation)
# generate_simple_answer = dspy.Predict(test_generation_simple)

Simple test

In [10]:
grep_option = grep_options_dataset[0]
file = sample_files_dataset[0]
fileFile = "```"+file.File+"```"
print(fileFile)

```/var/log/syslog```


In [11]:
print(generate_answer)

TypedPredictor(test_generation(file, filename, option, option_description -> commands
    instructions='Given the contents of a file, and the file name as well as a grep option as input create a list of useful and interesting grep commands'
    file = Field(annotation=str required=True json_schema_extra={'desc': 'The file to make grep commands for', '__dspy_field_type': 'input', 'prefix': 'File:'})
    filename = Field(annotation=str required=True json_schema_extra={'desc': 'The filename', '__dspy_field_type': 'input', 'prefix': 'Filename:'})
    option = Field(annotation=str required=True json_schema_extra={'desc': 'The grep option to use', '__dspy_field_type': 'input', 'prefix': 'Option:'})
    option_description = Field(annotation=str required=True json_schema_extra={'desc': 'A description of the grep option', '__dspy_field_type': 'input', 'prefix': 'Option Description:'})
    commands = Field(annotation=Output required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix'

In [12]:
with dspy.context(lm=gemini_flash):
    resp = generate_answer(file=file.File, filename=file.File_Path, option=grep_option.Options, option_description=grep_option.Description)

the_command=resp.commands

print(resp.commands)


commands=[GrepPair(command="grep -A 5 'kernel: [A-Z]+' /var/log/syslog", description='Show 5 lines of context after lines containing kernel messages with uppercase letters.'), GrepPair(command="grep -A 10 'user.notice' /var/log/syslog", description='Show 10 lines of context after lines containing user.notice messages.'), GrepPair(command="grep -A 2 'sshd: Authentication failure' /var/log/syslog", description='Show 2 lines of context after lines containing sshd authentication failures.'), GrepPair(command="grep -A 3 'crontab: Reloading' /var/log/syslog", description='Show 3 lines of context after lines containing crontab reloading messages.'), GrepPair(command="grep -A 1 'audit: type=SYSCALL' /var/log/syslog", description='Show 1 line of context after lines containing audit syscall messages.'), GrepPair(command="grep -A 5 'pam_unix(sshd:auth): authentication failure' /var/log/syslog", description='Show 5 lines of context after lines containing pam_unix authentication failures for sshd.'

In [16]:
print(gemini_flash.inspect_history(n=1))




Given the contents of a file, and the file name as well as a grep option as input create a list of useful and interesting grep commands

---

Follow the following format.

File: The file to make grep commands for

Filename: The filename

Option: The grep option to use

Option Description: A description of the grep option

Commands: ${commands}. Respond with a single JSON object. JSON Schema: {"$defs": {"GrepPair": {"maxItems": 2, "minItems": 2, "prefixItems": [{"title": "Command", "type": "string"}, {"title": "Description", "type": "string"}], "type": "array"}}, "properties": {"commands": {"description": "List of 10 pairs of grep commands and a natural language description ", "items": {"$ref": "#/$defs/GrepPair"}, "maxItems": 10, "minItems": 10, "title": "Commands", "type": "array"}}, "required": ["commands"], "title": "Output", "type": "object"}

---

File: /var/log/syslog

Filename: /var/log/syslo?.log

Option: -A

Option Description: Prints NUM lines of trailing context after mat

In [64]:
pyd_format_string = """${commands}. Respond with a single JSON object. JSON Schema: {"$defs": {"GrepPair": {"maxItems": 2, "minItems": 2, "prefixItems": [{"title": "Command", "type": "string"}, {"title": "Description", "type": "string"}], "type": "array"}}, "properties": {"commands": {"description": "List of 10 pairs of grep commands and a natural language description ", "items": {"$ref": "#/$defs/GrepPair"}, "maxItems": 10, "minItems": 10, "title": "Commands", "type": "array"}}, "required": ["commands"], "title": "Output", "type": "object"}"""

## Define generation loop
Its just a fully connected pair-wise connection of options to example files

In [14]:
!pip install ratelimit

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py) ... [?25l[?25hdone
  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5894 sha256=f23fd7abad6a869f196b633caefc9c58126edf164bed739c643ad7dae5a8a329
  Stored in directory: /root/.cache/pip/wheels/27/5f/ba/e972a56dcbf5de9f2b7d2b2a710113970bd173c4dcd3d2c902
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1


### Test REST client query

In [63]:
from pydantic import BaseModel, Field
from typing import List

class GrepPair(BaseModel):
    command: str = Field(..., title="Command")
    description: str = Field(..., title="Description")

class Output(BaseModel):
    commands: List[GrepPair] = Field(
        ...,
        title="Commands",
        description="List of 10 pairs of grep commands and a natural language description"
    )

    class Config:
        schema_extra = {
            "$defs": {
                "GrepPair": {
                    "maxItems": 2,
                    "minItems": 2,
                    "prefixItems": [
                        {"title": "Command", "type": "string"},
                        {"title": "Description", "type": "string"}
                    ],
                    "type": "array"
                }
            },
            "properties": {
                "commands": {
                    "description": "List of 10 pairs of grep commands and a natural language description",
                    "items": {"$ref": "#/$defs/GrepPair"},
                    "maxItems": 10,
                    "minItems": 10,
                    "title": "Commands",
                    "type": "array"
                }
            },
            "required": ["commands"],
            "title": "Output",
            "type": "object"
        }

# Create an instance of the model with example data
example_output = Output(
    commands=[
        GrepPair(command="grep -A 5 'kernel' /var/log/syslog", description="Find lines containing 'kernel' in /var/log/syslog and display 5 lines of context after each match."),
        GrepPair(command="grep -A 10 'error' /var/log/syslog", description="Find lines containing 'error' in /var/log/syslog and display 10 lines of context after each match."),
        # Add more pairs to make up 10
    ]
)

# Print the JSON schema
print(example_output.schema_json(indent=2))

{
  "$defs": {
    "GrepPair": {
      "properties": {
        "command": {
          "title": "Command",
          "type": "string"
        },
        "description": {
          "title": "Description",
          "type": "string"
        }
      },
      "required": [
        "command",
        "description"
      ],
      "title": "GrepPair",
      "type": "object"
    }
  },
  "properties": {
    "commands": {
      "description": "List of 10 pairs of grep commands and a natural language description",
      "items": {
        "$ref": "#/$defs/GrepPair"
      },
      "title": "Commands",
      "type": "array"
    }
  },
  "required": [
    "commands"
  ],
  "title": "Output",
  "type": "object"
}


* 'schema_extra' has been renamed to 'json_schema_extra'


In [47]:
import aiohttp
import asyncio
import requests
import json

API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'
API_KEY = google_api_key


def chat_with_gemini(file_example, grep_option):
    smfp = file_example.File_Path
    smf = file_example.File
    grpo = grep_option.Options
    grpod = grep_option.Description

    user_prompt = f"""
    Given the following information:
    file={smf}
    file_path={smfp}
    grep_option={grpo}
    grep_option_description={grpod}

    Create 10 useful and interesting grep commands  and their natural language description.
    Make sure to use the supplied file name and the grep options provided
    Use the follwing JSON format:
    {{
      "answer": [
        {{
          "grep_command": "string",
          "description": "string"
        }}
      ]
    }}
    """

    headers = {
        'Content-Type': 'application/json',
    }
    data = {
        "contents": [
        {
          "parts": [
            {
              "text": user_prompt
            }
          ]
        }
      ],
      "generationConfig": {
            "response_mime_type": "application/json",
      }
    }
    response = requests.post(f"{API_URL}?key={API_KEY}", headers=headers, json=data)
    response.raise_for_status()  # Raises stored HTTPError, if one occurred.
    response_data = response.json()
    response_text = response_data['candidates'][0]['content']['parts'][0]['text']
    data = json.loads(response_text)

    return data

result = chat_with_gemini(sample_files_dataset[0],grep_options_dataset[0])
print(result)



In [21]:
import json

# The given JSON string
json_string = result
# Parse the JSON string into a Python dictionary
json_object = json.loads(json_string)

# Loop through each grep_command and description pair
for item in json_object['answer']:
    grep_command = item['grep_command']
    description = item['description']
    print(f"Grep Command: {grep_command}")
    print(f"Description: {description}")

Grep Command: grep -A 5 'ERROR' /var/log/syslog
Description: Show lines containing 'ERROR' and 5 lines of context after each match in /var/log/syslog
Grep Command: grep -A 2 'kernel' /var/log/syslog
Description: Show lines containing 'kernel' and 2 lines of context after each match in /var/log/syslog
Grep Command: grep -A 3 'permission denied' /var/log/syslog
Description: Show lines containing 'permission denied' and 3 lines of context after each match in /var/log/syslog
Grep Command: grep -A 1 'network' /var/log/syslog
Description: Show lines containing 'network' and 1 line of context after each match in /var/log/syslog
Grep Command: grep -A 5 'failed to start' /var/log/syslog
Description: Show lines containing 'failed to start' and 5 lines of context after each match in /var/log/syslog
Grep Command: grep -A 10 'authentication' /var/log/syslog
Description: Show lines containing 'authentication' and 10 lines of context after each match in /var/log/syslog
Grep Command: grep -A 2 'disk s

### Using Async

In [22]:
class RateLimiter:
    def __init__(self, max=1, period=1):
        self.period = period
        self.max = max
        self.signal = asyncio.Event()
        self.lock = asyncio.Lock()
        self._tasks = [asyncio.create_task(self.ticker())]
        self.signal.set()

    # This signals the event period/max times/second (so if
    # max=4 and period=1, this fires the signal ever 0.25 seconds).
    async def ticker(self):
        while True:
            await asyncio.sleep(self.period / self.max)
            self.signal.set()

    # When entering the context,
    async def __aenter__(self):
        async with self.lock:
            await self.signal.wait()
            self.signal.clear()
        return self

    async def __aexit__(self, *args):
        pass

In [34]:
API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent'
API_KEY = google_api_key
class RateLimiter:
    def __init__(self, max=1, period=1):
        self.period = period
        self.max = max
        self.signal = asyncio.Event()
        self.lock = asyncio.Lock()
        self._tasks = [asyncio.create_task(self.ticker())]
        self.signal.set()

    async def ticker(self):
        while True:
            await asyncio.sleep(self.period / self.max)
            self.signal.set()

    async def __aenter__(self):
        async with self.lock:
            await self.signal.wait()
            self.signal.clear()
        return self

    async def __aexit__(self, *args):
        pass

async def chat_with_gemini_async(g, grep_option, file_example, session):
    smfp = file_example['File_Path']
    smf = file_example['File']
    grpo = grep_option['Options']
    grpod = grep_option['Description']

    user_prompt = f"""
    Given the following information:
    file={smf}
    file_path={smfp}
    grep_option={grpo}
    grep_option_description={grpod}

    Create 10 useful and interesting grep commands and their natural language description.
    Make sure to use the supplied file name, file, and the grep options provided
    Use the following JSON format:
    {{
      "answer": [
        {{
          "grep_command": "string",
          "description": "string"
        }}
      ]
    }}
    """

    headers = {
        'Content-Type': 'application/json',
    }
    data = {
        "contents": [
            {
                "parts": [
                    {
                        "text": user_prompt
                    }
                ]
            }
        ],
        "generationConfig": {
            "response_mime_type": "application/json"
        }
    }

    async with g:
        try:
            async with session.post(f"{API_URL}?key={API_KEY}", headers=headers, json=data) as response:
                response.raise_for_status()  # Raises stored HTTPError, if one occurred.
                response_data = await response.json()
                response_text = response_data['candidates'][0]['content']['parts'][0]['text']
                response = json.loads(response_text)
                return (smfp, smf, grpo, response)
        except Exception as e:
            print(f"Error processing file_path '{smfp} file_option {grpo}': {e}")
            return (smfp, smf, grpo, None)  # Returning None or a custom message could indicate a failed request

async def main(grep_options_dataset, sample_files_dataset):
    rate_limiter = RateLimiter(max=90, period=15)

    async with aiohttp.ClientSession() as session:
        for grep_file in sample_files_dataset:
            tasks = [chat_with_gemini_async(rate_limiter, grep_option, grep_file, session) for grep_option in grep_options_dataset]
            for future in asyncio.as_completed(tasks):
                smfp, smf, grpo, response = await future
                if response:
                    # Save to CSV FILE HERE
                    print(response_text)
                else:
                    print(f"Failed to get a response for '{grpo} : {smfp}'")
            tasks.clear()
            # print(len(tasks))

In [89]:
# def write_to_csv(filename, option, csvwriter, response_json):
# # Loop through each grep_command and description pair
#   # print(response_json)
#   for item in response_json['answer']:
#       grep_command = item['grep_command']
#       description = item['description']
#       csvwriter.writerow([filename, option, grep_command, description])

def write_to_csv(filename, option, csvwriter, response_json):
    # Loop through each grep_command and description pair
    for item in response_json.get('answer', []):
        grep_command = item.get('grep_command')
        description = item.get('description')

        if grep_command and description:
            csvwriter.writerow([filename, option, grep_command, description])

In [79]:
pip install -q tqdm

# Async script Used

In [95]:
import asyncio
import aiohttp
import csv
import re

import json

API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent'
API_KEY = google_api_key

class RateLimiter:
    def __init__(self, max=1, period=1):
        self.period = period
        self.max = max
        self.signal = asyncio.Event()
        self.lock = asyncio.Lock()
        self._tasks = [asyncio.create_task(self.ticker())]
        self.signal.set()

    async def ticker(self):
        while True:
            await asyncio.sleep(self.period / self.max)
            self.signal.set()

    async def __aenter__(self):
        async with self.lock:
            await self.signal.wait()
            self.signal.clear()
        return self

    async def __aexit__(self, *args):
        pass

def sanitize_json_string(json_string):
    # Replace invalid escape sequences
    json_string = re.sub(r'\\(?![\"\\/bfnrt])', r'\\\\', json_string)
    return json_string

async def chat_with_gemini_async(g, grep_option, file_example, session):
    smfp = file_example['File_Path']
    smf = file_example['File']
    grpo = grep_option['Options']
    grpod = grep_option['Description']

    user_prompt = f"""
    Given the following information:
    file={json.dumps(smf)}
    file_path={json.dumps(smfp)}
    grep_option={json.dumps(grpo)}
    grep_option_description={json.dumps(grpod)}

    Create 10 useful and interesting grep commands and their natural language description.
    Make sure to base the grep command on the file name, the file contents, and the grep options provided.
    Use the following JSON format:
    {{
      "answer": [
        {{
          "grep_command": "string",
          "description": "string"
        }}
      ]
    }}
    """

    headers = {
        'Content-Type': 'application/json',
    }
    data = {
        "contents": [
            {
                "parts": [
                    {
                        "text": user_prompt
                    }
                ]
            }
        ],
        "generationConfig": {
            "temperature": "0.6",
            "response_mime_type": "application/json"
        }
    }

    async with g:
        try:
            async with session.post(f"{API_URL}?key={API_KEY}", headers=headers, json=data) as response:
                response.raise_for_status()  # Raises stored HTTPError, if one occurred.
                response_data = await response.json()
                response_text = response_data['candidates'][0]['content']['parts'][0]['text']
                # response_text = sanitize_json_string(response_text)
                response_json = json.loads(response_text)
                return (smfp, smf, grpo, response_json)
        except Exception as e:
            # print(f"Error processing file_path '{smfp} file_option {grpo}': {e}")
            return (smfp, smf, grpo, None)  # Returning None or a custom message could indicate a failed request

async def main(grep_options_dataset, sample_files_dataset, csv_file_name):
    rate_limiter = RateLimiter(max=90, period=15)

    # Define the CSV file header
    csv_header = ['path', 'options','command','description']

    # Open the CSV file in append mode using a with statement
    with open(csv_file_name, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        # Write the header only if the file is empty
        csvfile.seek(0, 2)  # Move to the end of the file
        if csvfile.tell() == 0:  # Check if the file is empty
            csvwriter.writerow(csv_header)

        async with aiohttp.ClientSession() as session:
            for grep_file in sample_files_dataset:
                tasks = [chat_with_gemini_async(rate_limiter, grep_option, grep_file, session) for grep_option in grep_options_dataset]
                for future in asyncio.as_completed(tasks):
                    smfp, smf, grpo, response_json = await future
                    if response_json:
                        # Write the response data to the CSV file
                        write_to_csv(smfp, grpo, csvwriter, response_json)
                tasks.clear()

In [96]:
csv_file_name = path_to_csv_folder + 'synthetic_data007.csv'
result = await main(grep_options_dataset, sample_files_dataset, csv_file_name)

### Simpler 1 request at a time

In [38]:
import os
import csv
import time
from ratelimit import limits, sleep_and_retry
import random  # For adding jitter to the retry sleep time

# Constants
REQUESTS_PER_MINUTE = 500
SECONDS_PER_MINUTE = 60
MAX_RETRIES = 3  # Maximum number of retry attempts
TIMEOUT_SECONDS = 60  # Maximum time allowed for each call

# Rate limiting decorator
@sleep_and_retry
@limits(calls=REQUESTS_PER_MINUTE, period=SECONDS_PER_MINUTE)
def rate_limited_function(option_idx, grep_option, file_idx, sample_file):
    with dspy.context(lm=gemini_flash):
        smfp = sample_file.File_Path
        smf = sample_file.File
        grpo = grep_option.Options
        grpod = grep_option.Description
        resp = generate_answer(file=smf, filename=smfp, option=grpo, option_description=grpod)
    return (resp, smfp, grpo)

def rate_limited_function_with_retries(option_idx, grep_option, file_idx, sample_file, max_retries=MAX_RETRIES):
    attempts = 0
    while attempts < max_retries:
        try:
            start_time = time.time()
            result = rate_limited_function(option_idx, grep_option, file_idx, sample_file)
            end_time = time.time()
            # print(f"Processed option {option_idx}, file {file_idx} in {end_time - start_time:.2f} seconds")
            return result
        except Exception as e:
            attempts += 1
            if attempts >= max_retries:
                print(f"Failed to process (option_idx={option_idx}, file_idx={file_idx}) after {max_retries} attempts: {e}")
                return None  # or some other placeholder indicating failure
            backoff_time = min(2 ** attempts, 60)  # Exponential backoff with a max of 60 seconds
            print(f"Retrying (option_idx={option_idx}, file_idx={file_idx}), attempt {attempts}")
            time.sleep(backoff_time + random.uniform(0, 1))  # Adding jitter

def save_result_to_csv(filename, result, mode='a'):
    fieldnames = ['command', 'description', 'file_path', 'option']
    with open(filename, mode, newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if mode == 'w':
            writer.writeheader()

        resp, file_path, option = result
        for intermediate_grep in resp.commands:
            grep_pairs = intermediate_grep[1]
            for grep_pair in grep_pairs:
                writer.writerow({
                    'command': grep_pair.command,
                    'description': grep_pair.description,
                    'file_path': file_path,
                    'option': option
                })

def main(grep_options_dataset, sample_files_dataset):
    filename = './data/synthetic_data.csv'

    # Initialize the CSV file with headers if it doesn't already exist
    if not os.path.exists(filename):
        save_result_to_csv(filename, [], mode='w')

    for option_idx, grep_option in enumerate(grep_options_dataset):
        for file_idx, sample_file in enumerate(sample_files_dataset):
            # print(f"Processing option {option_idx}, file {file_idx}")  # Print progress
            try:
                result = rate_limited_function_with_retries(option_idx, grep_option, file_idx, sample_file)
                if result:
                    save_result_to_csv(filename, result, mode='a')
                    # print(f"Saved result for option {option_idx}, file {file_idx}")  # Print confirmation
            except Exception as e:
                print(f"An error occurred while processing option {option_idx}, file {file_idx}: {e}")

ModuleNotFoundError: No module named 'ratelimit'

In [23]:
csv_file_name = path_to_csv_folder + 'synthetic_data65.csv'
results = main_complex(grep_options_dataset[:3], sample_files_dataset[:3])

Processing option 0, file 0
Processing option 0, file 0
Processed option 0, file 0 in 6.63 seconds
Saved result for option 0, file 0
Processing option 0, file 1
Processing option 0, file 1
Processed option 0, file 1 in 22.17 seconds
Saved result for option 0, file 1
Processing option 0, file 2
Processing option 0, file 2
Processed option 0, file 2 in 5.64 seconds
Saved result for option 0, file 2
Processing option 1, file 0
Processing option 1, file 0
Processed option 1, file 0 in 4.82 seconds
Saved result for option 1, file 0
Processing option 1, file 1
Processing option 1, file 1


KeyboardInterrupt: 

In [None]:
with dspy.context(lm=gpt35_instruct):
    resp = generate_simple_answer(file=file.File, filename=file.File_Path, option="-Eln", option_description=grep_option.Description)

the_command=resp.commands

print(resp.commands)

grep -Eln "error" /var/log/syslo?.log


In [None]:
print(resp)

Prediction(
    command='---\n\nFile: /var/log/syslog\n\nFilename: /var/log/syslo?.log\n\nOption: -A\n\nOption Description: Prints NUM lines of trailing context after matching lines\n\nCommand: `grep -A 5 "kernel: [A-Z]" /var/log/syslo?.log`'
)
