In [1]:
import os
import openai
# Read the OpenAI credentials from environment variables so that no secret is
# stored in the notebook itself. A variable that is not set yields None.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.organization = os.environ.get("OPENAI_ORG_ID")


In [2]:
import pandas as pd

# Remove pandas' display limits so that large data can be viewed in full.
# None means "no limit"; use a finite value instead (e.g. 1000 for rows/columns,
# 199 for the column width) if the output becomes too large to handle.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [3]:
# Load the promed dataset from JSON and convert its 'header' column to str in one step.
promed = pd.read_json("promed_1.json").astype({'header': str})

In [10]:
#GPT3.5
# Testing on the first 10 rows.
# The input command is quite long. GPT3.5 tends to stop following all of the
# instructions as the number of instructions keeps increasing.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        # System message: describes the expected semicolon-separated output.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must be only the name of the disease and no other characters are permitted"},
        # User message: the text rendering of the selected headers.
        {"role": "user", "content": promed['header'].head(10).to_string()}
    ]
)

<OpenAIObject chat.completion id=chatcmpl-8DZhDWmtuiBRi4X4WHuh93dlwFOd2 at 0x19267f7deb0> JSON: {
  "id": "chatcmpl-8DZhDWmtuiBRi4X4WHuh93dlwFOd2",
  "object": "chat.completion",
  "created": 1698246171,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "disease outbreak name;country name\nAvian influenza;Kuwait, Myanmar\nTularemia;Taiwan\nAmerican foulbrood, apis;Fiji\nCOVID-19;USA\nLumpy skin disease;Bulgaria\nAvian cholera;USA\nNorovirus;Spain\nFoot & mouth disease, bovine;South Korea\nCharcoal rot, soybean;USA\nDengue;USA"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 650,
    "completion_tokens": 89,
    "total_tokens": 739
  }
}

In [33]:
# Extract the text of the GPT reply and format it for ease of use.
# The reply is expected to hold one record per line with fields separated by
# semicolons, and a header line first.
# Blank lines (for example a trailing newline) are dropped and surrounding
# whitespace is stripped, so they cannot produce empty rows or padded values.

# r: the non-empty lines of the reply.
r = [line.strip() for line in response.choices[0].message.content.split('\n') if line.strip()]
# pro: each line split into its fields.
pro = [[field.strip() for field in line.split(';')] for line in r]

print(pro)        # all rows, header row included
print(pro[1])     # first row after the header
print(pro[1][1])  # second field of that row (the country column)

[['disease outbreak name', 'country name'], ['Avian influenza', 'Kuwait, Myanmar'], ['Tularemia', 'Taiwan'], ['American foulbrood, apis', 'Fiji'], ['COVID-19', 'USA'], ['Lumpy skin disease', 'Bulgaria'], ['Avian cholera', 'USA'], ['Norovirus', 'Spain'], ['Foot & mouth disease, bovine', 'South Korea'], ['Charcoal rot, soybean', 'USA'], ['Dengue', 'USA']]
['Avian influenza', 'Kuwait, Myanmar']
Kuwait, Myanmar


In [44]:
#GPT3.5
# Testing on the last 10 rows.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        # System message: describes the expected semicolon-separated output.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must be only the name of the disease and no other characters are permitted"},
        # User message: the text rendering of the selected headers.
        {"role": "user", "content": promed['header'].tail(10).to_string()}
    ]
)

# Split the reply into rows of fields. Blank lines are dropped and whitespace is
# stripped so that a trailing newline or padding cannot produce empty rows or
# padded values.
r = [line.strip() for line in response.choices[0].message.content.split('\n') if line.strip()]
pro = [[field.strip() for field in line.split(';')] for line in r]

print(pro)        # all rows, header row included
print(pro[1])     # first row after the header
print(pro[1][1])  # second field of that row (the country column)

[['disease outbreak name', 'country name'], ['Avian influenza', 'USA'], ['Ebola', 'Congo DR'], ['Jaundice', 'Bangladesh'], ['Cholera', 'South Sudan'], ['Ricin', 'USA'], ['Rift Valley fever', 'Mauritania'], ['Herpes B virus infection', 'USA'], ['West Nile virus', 'USA'], ['E. coli EHEC', 'Australia'], ['BSE', 'USA']]
['Avian influenza', 'USA']
USA


In [53]:
#GPT3.5
# Testing on promed['header'].iloc[50:60], i.e. positions 50 through 59.
# At the default sampling temperature, repeatedly running this instruction
# sometimes led to different results, especially when run on a single row at a
# time. temperature=0 is set to make repeated runs as consistent as the API allows.

# Results are not as expected: countries are extracted properly, but disease
# names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        # System message: describes the expected semicolon-separated output.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must contain only the name of the disease and no other characters are permitted"},
        # User message: the text rendering of the selected headers (explicit positional slice).
        {"role": "user", "content": promed['header'].iloc[50:60].to_string()}
    ]
)

# Split the reply into rows of fields. Blank lines are dropped and whitespace is
# stripped so that a trailing newline or padding cannot produce empty rows or
# padded values.
r = [line.strip() for line in response.choices[0].message.content.split('\n') if line.strip()]
pro = [[field.strip() for field in line.split(';')] for line in r]

print(pro)        # all rows, header row included
print(pro[1])     # first row after the header
print(pro[1][1])  # second field of that row (the country column)

[['disease outbreak name', 'country name'], ['Late blight, potato', 'Papua New Guinea'], ['Vesicular stomatitis', 'USA'], ['E. coli EHEC', 'USA'], ['E. coli O157', 'USA'], ['Cholera, diarrhea & dysentery update', 'USA'], ['Hantavirus update 2011 - Americas', 'Chile'], ['Drugs, fish, contaminated waterways', 'USA'], ['Anthrax, human, equine', 'Kyrgyzstan'], ['Cryptosporidiosis', 'Sweden'], ['Meningitis advisory', 'Burundi & Congo, Dem.Rep.']]
['Late blight, potato', 'Papua New Guinea']
Papua New Guinea


In [54]:
#GPT3.5
# Tested on the promed['header'][50:60] slice.
# Repeatedly running the previous cell led to erroneous results.
# A country was extracted where there was nothing to extract: see the fifth entry,
# where 'Cholera, diarrhea & dysentery update' is paired with 'USA'.

pro

[['disease outbreak name', 'country name'],
 ['Late blight, potato', 'Papua New Guinea'],
 ['Vesicular stomatitis', 'USA'],
 ['E. coli EHEC', 'USA'],
 ['E. coli O157', 'USA'],
 ['Cholera, diarrhea & dysentery update', 'USA'],
 ['Hantavirus update 2011 - Americas', 'Chile'],
 ['Drugs, fish, contaminated waterways', 'USA'],
 ['Anthrax, human, equine', 'Kyrgyzstan'],
 ['Cryptosporidiosis', 'Sweden'],
 ['Meningitis advisory', 'Burundi & Congo, Dem.Rep.']]

In [66]:
#GPT3.5
# Testing on promed['header'].iloc[50:60], i.e. positions 50 through 59.
# The input command was modified to ask the model to rewrite the country name.
# At the default sampling temperature, repeatedly running this instruction
# sometimes led to different results; temperature=0 is set to make repeated runs
# as consistent as the API allows.

# Results are not as expected: countries are rewritten properly, but disease
# names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        # System message: asks for disease and full country name, semicolon-separated.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format. The disease name should be only the medical name of all the diseases mentioned and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data. The values for each column must be separated by a semicolon. The values for each row must begin on a new line."},
        # User message: the text rendering of the selected headers (explicit positional slice).
        {"role": "user", "content": promed['header'].iloc[50:60].to_string()}
    ]
)
# Keep the raw reply text; it is split into lines in a later cell.
r=response.choices[0].message.content



'disease;country\nLate blight, potato;Papua New Guinea\nVesicular stomatitis;USA\nE. coli EHEC;USA\nE. coli O157;USA\nCholera, diarrhea & dysentery update;unknown\nHantavirus update;unknown\nDrugs, fish, contaminated waterways;USA\nAnthrax, human, equine;Kyrgyzstan\nCryptosporidiosis;Sweden\nMeningitis advisory;Burundi & Congo, Dem.Rep.'

In [67]:
#GPT3.5
# Tested on the promed['header'][50:60] slice.
# Repeatedly running the previous cell led to erroneous results.
# Nothing was extracted where there was a country to extract: see the sixth entry
# ('Hantavirus update;unknown').
# r holds the raw reply text here; splitting on newlines shows one record per element.
r.split('\n')

['disease;country',
 'Late blight, potato;Papua New Guinea',
 'Vesicular stomatitis;USA',
 'E. coli EHEC;USA',
 'E. coli O157;USA',
 'Cholera, diarrhea & dysentery update;unknown',
 'Hantavirus update;unknown',
 'Drugs, fish, contaminated waterways;USA',
 'Anthrax, human, equine;Kyrgyzstan',
 'Cryptosporidiosis;Sweden',
 'Meningitis advisory;Burundi & Congo, Dem.Rep.']

In [69]:
#GPT3.5
# Testing on promed['header'].iloc[50:60], i.e. positions 50 through 59.
# The input command was modified to ask the model to rewrite the country name
# and to not format the answer as CSV.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.

# Results are not as expected: countries are rewritten or extracted properly,
# but disease names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        # System message: asks for disease and full country name, with no output format imposed.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should be only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        # User message: the text rendering of the selected headers (explicit positional slice).
        {"role": "user", "content": promed['header'].iloc[50:60].to_string()}
    ]
)
# Show the reply one line per list element.
r=response.choices[0].message.content.split('\n')
r

['Disease: Late blight, potato',
 'Country: Papua New Guinea',
 '',
 'Disease: Vesicular stomatitis',
 'Country: USA',
 '',
 'Disease: E. coli EHEC',
 'Country: USA',
 '',
 'Disease: E. coli O157',
 'Country: USA',
 '',
 'Disease: Cholera, diarrhea & dysentery',
 'Country: Not mentioned',
 '',
 'Disease: Hantavirus',
 'Country: Americas',
 '',
 'Disease: Drugs, fish, contaminated waterways',
 'Country: USA',
 '',
 'Disease: Anthrax, human, equine',
 'Country: Kyrgyzstan',
 '',
 'Disease: Cryptosporidiosis',
 'Country: Sweden',
 '',
 'Disease: Meningitis advisory',
 'Country: Burundi & Congo, Dem.Rep.']

In [59]:
# Show the raw header stored under label 54, to compare it with what the models
# extracted for the 'Cholera, diarrhea & dysentery update' entry.
promed['header'][54]

"['Published Date: 2007-05-11 11:00:02 EDT\\nSubject: PRO/EDR> Cholera, diarrhea & dysentery update 2007 (20)\\nArchive Number: 20070511.1509']"

In [81]:
#GPT4
# Testing on row 59 only.
# Works really well.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.

response = openai.ChatCompletion.create(
    model="gpt-4",
    temperature=0,
    messages=[
        # System message: asks for disease and full country name, with no output format imposed.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        # User message: the single header stored under label 59.
        {"role": "user", "content": promed['header'][59]}
    ]
)
# Show the reply one line per list element.
r=response.choices[0].message.content.split('\n')
r

['Disease Name: Meningitis',
 'Country Name: Burundi, Democratic Republic of Congo']

In [82]:
#GPT4
# Testing on promed['header'].iloc[50:60], i.e. positions 50 through 59.
# Continues to work really well.
# The input command doesn't ask to parse into CSV format.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.

response = openai.ChatCompletion.create(
    model="gpt-4",
    temperature=0,
    messages=[
        # System message: asks for disease and full country name, with no output format imposed.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        # User message: the text rendering of the selected headers (explicit positional slice).
        {"role": "user", "content": promed['header'].iloc[50:60].to_string()}
    ]
)
# Show the reply one line per list element.
r=response.choices[0].message.content.split('\n')
r

['1. Disease: Late blight, potato; Country: Papua New Guinea',
 '2. Disease: Vesicular stomatitis; Country: United States of America',
 '3. Disease: E. coli EHEC; Country: United States of America',
 '4. Disease: E. coli O157; Country: United States of America',
 '5. Disease: Cholera, diarrhea & dysentery; Country: No specific country mentioned',
 '6. Disease: Hantavirus; Countries: Chile, United States of America',
 "7. Not a disease, it's about Drugs, fish, contaminated waterways; Country: United States of America",
 '8. Disease: Anthrax, human, equine; Country: Kyrgyzstan',
 '9. Disease: Cryptosporidiosis; Country: Sweden',
 '10. Disease: Meningitis; Countries: Burundi, Democratic Republic of Congo']

In [83]:
#GPT4
# Testing on promed['header'].iloc[50:60], i.e. positions 50 through 59.
# Continues to work really well.
# The input command asks to parse into CSV format to reduce the number of tokens used.
# temperature=0 makes sampling as deterministic as the API allows, so that
# re-running the cell gives comparable extractions.

response = openai.ChatCompletion.create(
    model="gpt-4",
    temperature=0,
    messages=[
        # System message: asks for disease and full country name in a CSV format.
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        # User message: the text rendering of the selected headers (explicit positional slice).
        {"role": "user", "content": promed['header'].iloc[50:60].to_string()}
    ]
)
# Show the reply one line per list element.
r=response.choices[0].message.content.split('\n')
r

['"Late blight, potato","Papua New Guinea"',
 '"Vesicular stomatitis","United States of America"',
 '"E. coli EHEC","United States of America"',
 '"E. coli O157","United States of America"',
 '"Cholera","No country information"',
 '"Hantavirus","Chile, United States of America"',
 '"Drugs, fish, contaminated waterways","United States of America"',
 '"Anthrax, human, equine","Kyrgyzstan"',
 '"Cryptosporidiosis","Sweden"',
 '"Meningitis advisory","Burundi, Democratic Republic of the Congo"']

In [84]:
# Show the raw header stored under label 56, to compare it with what the models
# extracted for the 'Drugs, fish, contaminated waterways' entry.
promed['header'][56]

"['Published Date: 2009-04-01 18:00:52 EDT\\nSubject: PRO/AH/EDR> Drugs, fish, contaminated waterways - USA (02)\\nArchive Number: 20090401.1256']"