In [4]:
import difflib
import json
import os
import re
import textwrap

import openai

In [2]:
# Credentials are read from the environment so that neither value lives in this notebook.
# According to the documentation the organization string is probably safe to commit,
# but the API key is secret and must never appear in source.
openai.organization = os.environ.get("openapi_org")
openai.api_key = os.environ.get("openapi_key")

In [3]:
# Probably the simplest call is to list the available models.
openai_models = openai.Model.list()
# The response's str() form is parsed as JSON to obtain plain dicts and lists.
openai_models_json = json.loads(str(openai_models))
# Keep only the id of each model record.
available_models = [rec['id'] for rec in openai_models_json['data']]
print("Here are all of the models:\n", available_models)

# Any id containing the substring "code" is treated as a candidate for code-related tasks.
code_regex = re.compile("code")
code_models = list(filter(code_regex.search, available_models))
print("\nThese models may be useful for tasks with code:\n", code_models)


Here are all of the models:
 ['babbage', 'ada', 'davinci', 'text-embedding-ada-002', 'babbage-code-search-code', 'text-similarity-babbage-001', 'text-davinci-001', 'curie-instruct-beta', 'babbage-code-search-text', 'babbage-similarity', 'curie-search-query', 'code-search-babbage-text-001', 'code-cushman-001', 'code-search-babbage-code-001', 'text-ada-001', 'code-davinci-002', 'text-similarity-ada-001', 'text-davinci-insert-002', 'ada-code-search-code', 'text-davinci-002', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'text-curie-001', 'text-davinci-edit-001', 'davinci-search-document', 'ada-code-search-text', 'text-search-ada-doc-001', 'code-davinci-edit-001', 'davinci-instruct-beta', 'text-babbage-001', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-similarity', 'davinci-search-query', 'text-davinci-insert-001', 'babbage-search-document', 'ada-search-document', 'curie', 'text-search-babb

In [11]:
# The model ids could easily be mistyped. Here is a helper function to find the closest to a user input.
def closest_model_id(models, user_input, cutoff=0.8):
    """Return the known model id that most closely matches ``user_input``.

    Args:
        models: Sequence of valid model id strings to search.
        user_input: The (possibly mistyped) model id to look up.
        cutoff: Minimum similarity score in [0, 1] a candidate must reach to
            count as a match. Passed straight to ``difflib.get_close_matches``.

    Returns:
        A list containing the single best match, or an empty list when no
        candidate reaches ``cutoff``. A list (not a bare string) is returned
        so callers can distinguish "no match" without catching an exception.
    """
    return difflib.get_close_matches(user_input, models, n=1, cutoff=cutoff)

# Demo: the id is deliberately misspelled ("03" instead of "003") to show the helper recovering it.
closest_model_id(available_models, "text-davinci-03")

['text-davinci-003']

In [18]:
# Let's see if the ai knows the history of the bioinformatics field.
bioinformatics_history_prompt = "Write a paragraph explaining the history of the bioinformatics field."
# closest_model_id returns a list holding at most one id, so check it before indexing.
bioinformatics_history_model = closest_model_id(available_models, "text-davinci-003")
if not bioinformatics_history_model:
    raise ValueError("No available model id is close to 'text-davinci-003'.")
print("Prompt:\n", bioinformatics_history_prompt, "\nModel:\n", bioinformatics_history_model)

# Pass the variables defined above into the request instead of repeating the literals.
bioinformatics_history_response = openai.Completion.create(
    model=bioinformatics_history_model[0],
    prompt=bioinformatics_history_prompt,
    temperature=0.7,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

print("\nResponse:\n", bioinformatics_history_response)

Prompt:
 Write a paragraph explaining the history of the bioinformatics field. 
Model:
 ['text-davinci-003']

Response:
 {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nBioinformatics is a relatively young field that has come about as the result of the convergence of several disciplines, including biology, computer science, mathematics, and engineering. It began in the 1970s when scientists first started to use computers to store and analyze biological data. Since then, the field has grown rapidly, as new technologies and techniques have been developed to store and analyze ever larger and more complex data sets. Today, bioinformatics is used in many fields of research, from basic research in genetics and molecular biology, to medical research, to agriculture and food production, to the development of new drugs and therapies. It is becoming increasingly important for research and development in many areas, as our underst

In [27]:
# Print only the completion text, re-wrapped so lines break at whitespace instead of mid-word.
bioinformatics_history_text = json.loads(str(bioinformatics_history_response))['choices'][0]['text'].lstrip('\n')
# Wrap each original line separately so paragraph breaks in the response are preserved;
# 80 columns is a conventional readable width.
print("\n".join(textwrap.fill(line, width=80) for line in bioinformatics_history_text.splitlines()))

Bioinformatics is a relatively young field that has come about as the result of the convergence of several disciplines, including biology, computer science, mathematics, and engineering. It began in the 1970s when scientists first started to use computers to store and analyze biological data. Since then, the field has grown rapidly, as new technologies and techniques have been developed to store and analyze ever larger and more complex data sets. Today, bioinformatics is used in many fields of research, from basic research in genetics and molecular biology, to medical research, to agriculture and food production, to the development of new drugs and therapies. It is becoming increasingly important for research and development in many areas, as our understanding of the complexity of life grows.


In [30]:
# Let's do some real bioinformatics: request a Biopython DNA-to-protein script from a code model.
# The prompt text is wrapped in triple double-quotes, i.e. it is formatted like a Python docstring.
translate_to_protein_response = openai.Completion.create(
    prompt="\"\"\"\nWrite a python script that uses biopython to translate a DNA sequence to protein.\n\"\"\"",
    model="code-davinci-002",
    max_tokens=256,
    temperature=0,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
)

print(translate_to_protein_response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nfrom Bio.Seq import Seq\nfrom Bio.Alphabet import IUPAC\n\ndna = Seq(\"ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG\", IUPAC.unambiguous_dna)\n\nprint(dna.translate())"
    }
  ],
  "created": 1674253048,
  "id": "cmpl-6atzM5mgBQPOxL0AP8jQqO9D53VNA",
  "model": "code-davinci-002",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 71,
    "prompt_tokens": 21,
    "total_tokens": 92
  }
}


In [31]:
# Show just the generated script: first choice's text with the leading newlines removed.
print(
    json.loads(str(translate_to_protein_response))["choices"][0]["text"].lstrip("\n")
)

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)

print(dna.translate())


In [37]:
# The code doesn't appear to be dangerous, so let's try executing it.
# WARNING(review): exec() runs model-generated text as Python with this kernel's full privileges.
# Only run it after reading the generated code. The separate namespace below merely keeps the
# generated script from overwriting notebook variables -- it is NOT a sandbox.
generated_script = json.loads(str(translate_to_protein_response))['choices'][0]['text'].lstrip('\n')
try:
    exec(generated_script, {})
except Exception as err:
    # A failure here is part of the demonstration: report it without aborting Restart & Run All.
    print(f"{type(err).__name__}: {err}")

ImportError: Bio.Alphabet has been removed from Biopython. In many cases, the alphabet can simply be ignored and removed from scripts. In a few cases, you may need to specify the ``molecule_type`` as an annotation on a SeqRecord for your script to work correctly. Please see https://biopython.org/wiki/Alphabet for more information.

In [38]:
# Looks like this code recommendation is obsolete. Maybe we can copy + paste and remove the alphabet bit.
# Same script as the generated one, minus the Bio.Alphabet import and the IUPAC argument to Seq.
from Bio.Seq import Seq

dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

# Each "*" in the printed protein marks a stop codon (here TGA mid-sequence and TAG at the end).
print(dna.translate())

MAIVMGR*KGAR*


Now that is interesting: with a very minor simplification, the code runs properly. One open question is why the AI chose to include a stop codon (the first `*` in `MAIVMGR*KGAR*`) in the middle of the random sequence it generated.