In [5]:
# imports
import openai
import git
import yaml
import pathlib
import json


In [17]:
# for each directory under certified-operators/operators, go through its subdirectories, read every manifests/*.clusterserviceversion.yaml file, and collect the value of .spec.install.spec.deployments (together with the operator's metadata.name) into a list
def _dig(mapping, *keys):
  """Follow `keys` through nested dicts; return None if any level is missing or is not a dict."""
  current = mapping
  for key in keys:
    if not isinstance(current, dict) or key not in current:
      return None
    current = current[key]
  return current


def get_operator_deployments(operator_dir: str) -> list[dict]:
  """Collect the deployments declared by every ClusterServiceVersion manifest under `operator_dir`.

  The tree is scanned two directory levels deep: for every subdirectory of every
  directory in `operator_dir`, each `manifests/*.clusterserviceversion.yaml` file
  is parsed and its `spec.install.spec.deployments` value is recorded.

  Args:
    operator_dir: path of the directory whose child directories are scanned.

  Returns:
    A list with one dict per usable manifest, in sorted path order:
    `{'operatorName': <metadata.name>, 'deployments': <spec.install.spec.deployments>}`.
    Manifests that are empty, are not a mapping, or lack either `metadata.name`
    or `spec.install.spec.deployments` are skipped instead of aborting the scan.

  Raises:
    FileNotFoundError: if `operator_dir` does not exist.
  """
  deployments = []
  # sorted(): iterdir()/glob() yield entries in filesystem order, which is not
  # stable across machines; sorting makes the result (and any dump of it) reproducible
  for top_dir in sorted(pathlib.Path(operator_dir).iterdir()):
    if not top_dir.is_dir():
      continue
    for sub_dir in sorted(top_dir.iterdir()):
      if not sub_dir.is_dir():
        continue
      for manifest_path in sorted(sub_dir.glob('manifests/*.clusterserviceversion.yaml')):
        with open(manifest_path, 'r', encoding='utf-8') as f:
          # NOTE(review): FullLoader is kept to preserve behavior; if these manifests
          # are untrusted input, yaml.safe_load is the more conservative choice —
          # confirm no manifest relies on FullLoader-only tags before switching
          yaml_data = yaml.load(f, Loader=yaml.FullLoader)
        # an empty file parses to None and a key with no value parses to None,
        # so every level is checked before it is indexed
        operator_name = _dig(yaml_data, 'metadata', 'name')
        operator_deployments = _dig(yaml_data, 'spec', 'install', 'spec', 'deployments')
        if operator_name is None or operator_deployments is None:
          continue
        deployments.append({
          'operatorName': operator_name,
          'deployments': operator_deployments
        })
  return deployments


# scan the operators tree and collect every operator's deployments
deployments = get_operator_deployments('certified-operators/operators')

# persist the collected deployments as YAML in deployments.yaml
pathlib.Path('deployments.yaml').write_text(yaml.dump(deployments))

In [12]:
# hand-written sample of prompt/completion training pairs for OpenAI
# (each dict below corresponds to one line of a JSONL training file)
# NOTE(review): the completion strings indent with tab characters, which YAML
# does not allow for indentation — confirm whether these are meant to be parseable YAML
completions = [
  dict(
    prompt="# deploy a ConfigMap named my-config-map\nKind: ConfigMap",
    completion="\nmetadata:\n\tname: my-config-map\nspec:\n\tmy-data: this is my data\n",
  ),
  dict(
    prompt="# deploy a Service named my-service\nKind: Service",
    completion="\nmetadata:\n\tname: my-service\nspec:\n\tports:\n\t- port: 80\n\t- port: 443\n",
  ),
]



[]

In [6]:
training_data = []
# each subdirectory of completions/ holds one training sample: prompt.yml and completion.yml
# sorted(): iterdir() order is filesystem-dependent; sorting keeps the output file reproducible
for sample_dir in sorted(pathlib.Path('completions/').iterdir()):
	# skip stray files that sit next to the sample directories
	if not sample_dir.is_dir():
		continue
	# both files are kept as raw text; they are not parsed as YAML
	completion_text = (sample_dir / 'completion.yml').read_text()
	prompt_text = (sample_dir / 'prompt.yml').read_text()
	training_data.append({
		'prompt': prompt_text,
		'completion': completion_text
	})

# write the samples as JSON Lines: one JSON object per line
# (json.dump(training_data, f) would write a single JSON array, which is not JSONL)
with open('training_data.jsonl', 'w') as f:
	for sample in training_data:
		f.write(json.dumps(sample) + '\n')

In [14]:
# build OpenAI training samples from the deployments previously saved in deployments.yaml
with open('deployments.yaml', 'r') as deployments_file:
	deployments = yaml.load(deployments_file, Loader=yaml.FullLoader)

# one sample per entry: the prompt names the operator, the completion is every
# deployment of that entry rendered as block-style YAML and joined with '---' separators
training_data = [
	{
		'prompt': '# deploy a Deployment named ' + entry['operatorName'] + '\nKind: Deployment',
		'completion': '\n---\n'.join(
			yaml.dump(spec, default_flow_style=False) for spec in entry['deployments']
		)
	}
	for entry in deployments
]

# store the whole training set as a single JSON array
with open('training_data.json', 'w') as out_file:
	json.dump(training_data, out_file)
