In [1]:
import os
import openai
openai.organization = os.getenv("OPENAI_ORG_ID")
openai.api_key = os.getenv("OPENAI_API_KEY")


In [2]:
import pandas as pd

#Allows for viewing large data. Can be a boon when data is too large
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [6]:
#Import promed dataset and convert it to string
promed = pd.read_json("C:\\Users\\Rushali\\Documents\\Code\\Lab\\CHAIN\\Data\\ProMED\\promed_1.json")
promed['header'] = promed['header'].astype(str)
# promed['header'].count()

In [10]:
#GPT3.5
# Input command is quite long. GPT3.5 tends to not follow all instructions as the number of intructions keep increasing. 
# Testing on first 10 rows.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must be only the name of the disease and no other characters are permitted"},
        {"role": "user", "content": promed['header'].head(10).to_string()}
    ]
)

<OpenAIObject chat.completion id=chatcmpl-8DZhDWmtuiBRi4X4WHuh93dlwFOd2 at 0x19267f7deb0> JSON: {
  "id": "chatcmpl-8DZhDWmtuiBRi4X4WHuh93dlwFOd2",
  "object": "chat.completion",
  "created": 1698246171,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "disease outbreak name;country name\nAvian influenza;Kuwait, Myanmar\nTularemia;Taiwan\nAmerican foulbrood, apis;Fiji\nCOVID-19;USA\nLumpy skin disease;Bulgaria\nAvian cholera;USA\nNorovirus;Spain\nFoot & mouth disease, bovine;South Korea\nCharcoal rot, soybean;USA\nDengue;USA"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 650,
    "completion_tokens": 89,
    "total_tokens": 739
  }
}

In [33]:
# Extracting the output from GPT response and formatting it for ease of use.

# response.choices[0].message.content.split('\n')
# response.choices[0].message.content.split('\n')[1]

r=response.choices[0].message.content.split('\n')
pro = list(map(lambda x: x.split(';'), r))

print(pro)
print(pro[1])
print(pro[1][1])

[['disease outbreak name', 'country name'], ['Avian influenza', 'Kuwait, Myanmar'], ['Tularemia', 'Taiwan'], ['American foulbrood, apis', 'Fiji'], ['COVID-19', 'USA'], ['Lumpy skin disease', 'Bulgaria'], ['Avian cholera', 'USA'], ['Norovirus', 'Spain'], ['Foot & mouth disease, bovine', 'South Korea'], ['Charcoal rot, soybean', 'USA'], ['Dengue', 'USA']]
['Avian influenza', 'Kuwait, Myanmar']
Kuwait, Myanmar


In [44]:
#GPT3.5
# Testing on last 10 rows.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must be only the name of the disease and no other characters are permitted"},
        {"role": "user", "content": promed['header'].tail(10).to_string()}
    ]
)

r=response.choices[0].message.content.split('\n')
pro = list(map(lambda x: x.split(';'), r))

print(pro)
print(pro[1])
print(pro[1][1])

[['disease outbreak name', 'country name'], ['Avian influenza', 'USA'], ['Ebola', 'Congo DR'], ['Jaundice', 'Bangladesh'], ['Cholera', 'South Sudan'], ['Ricin', 'USA'], ['Rift Valley fever', 'Mauritania'], ['Herpes B virus infection', 'USA'], ['West Nile virus', 'USA'], ['E. coli EHEC', 'Australia'], ['BSE', 'USA']]
['Avian influenza', 'USA']
USA


In [53]:
#GPT3.5
# Testing on rows 50-60.
# Repeatedly running this instruction can sometimes lead to different results.
# Especially if you run it on a single row at a time.

# Results are not as expected. Countries are extracted properly and disease names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to parse it into CSV format. The columns are disease outbreak name and country name. The values for each column must be separated by a semicolon. The values for each row must begin on a new line. The value for country must be only the name of the country and no other characters are permitted. The value for disease must be only contain the name of the disease and no other characters are permitted"},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ]
)

r=response.choices[0].message.content.split('\n')
pro = list(map(lambda x: x.split(';'), r))

print(pro)
print(pro[1])
print(pro[1][1])

[['disease outbreak name', 'country name'], ['Late blight, potato', 'Papua New Guinea'], ['Vesicular stomatitis', 'USA'], ['E. coli EHEC', 'USA'], ['E. coli O157', 'USA'], ['Cholera, diarrhea & dysentery update', 'USA'], ['Hantavirus update 2011 - Americas', 'Chile'], ['Drugs, fish, contaminated waterways', 'USA'], ['Anthrax, human, equine', 'Kyrgyzstan'], ['Cryptosporidiosis', 'Sweden'], ['Meningitis advisory', 'Burundi & Congo, Dem.Rep.']]
['Late blight, potato', 'Papua New Guinea']
Papua New Guinea


In [54]:
#GPT3.5
# Tested on rows 50-60.
# Repeatedly running the previous cell led to erreneous results.
# Extracted country where there was nothing to extract. Fifth entry 

pro

[['disease outbreak name', 'country name'],
 ['Late blight, potato', 'Papua New Guinea'],
 ['Vesicular stomatitis', 'USA'],
 ['E. coli EHEC', 'USA'],
 ['E. coli O157', 'USA'],
 ['Cholera, diarrhea & dysentery update', 'USA'],
 ['Hantavirus update 2011 - Americas', 'Chile'],
 ['Drugs, fish, contaminated waterways', 'USA'],
 ['Anthrax, human, equine', 'Kyrgyzstan'],
 ['Cryptosporidiosis', 'Sweden'],
 ['Meningitis advisory', 'Burundi & Congo, Dem.Rep.']]

In [66]:
#GPT3.5
# Testing on rows 50-60.
# Repeatedly running this instruction can sometimes lead to different results.
# Input command was modified to ask to rewrite country name

# Results are not as expected. Countries are rewritten properly and disease names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format. The disease name should be only the medical name of all the diseases mentioned and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data. The values for each column must be separated by a semicolon. The values for each row must begin on a new line."},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ]
)
r=response.choices[0].message.content



'disease;country\nLate blight, potato;Papua New Guinea\nVesicular stomatitis;USA\nE. coli EHEC;USA\nE. coli O157;USA\nCholera, diarrhea & dysentery update;unknown\nHantavirus update;unknown\nDrugs, fish, contaminated waterways;USA\nAnthrax, human, equine;Kyrgyzstan\nCryptosporidiosis;Sweden\nMeningitis advisory;Burundi & Congo, Dem.Rep.'

In [67]:
#GPT3.5
# Tested on rows 50-60.
# Repeatedly running the previous cell led to erreneous results.
# Extracted nothing where there was a country to extract. Sixth entry 
r.split('\n')

['disease;country',
 'Late blight, potato;Papua New Guinea',
 'Vesicular stomatitis;USA',
 'E. coli EHEC;USA',
 'E. coli O157;USA',
 'Cholera, diarrhea & dysentery update;unknown',
 'Hantavirus update;unknown',
 'Drugs, fish, contaminated waterways;USA',
 'Anthrax, human, equine;Kyrgyzstan',
 'Cryptosporidiosis;Sweden',
 'Meningitis advisory;Burundi & Congo, Dem.Rep.']

In [69]:
#GPT3.5
# Testing on rows 50-60.
# Input command was modified to ask to rewrite country name and not format as CSV

# Results are not as expected. Countries are rewritten or extracted properly and disease names aren't extracted very well.

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should be only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ]
)
r=response.choices[0].message.content.split('\n')
r

['Disease: Late blight, potato',
 'Country: Papua New Guinea',
 '',
 'Disease: Vesicular stomatitis',
 'Country: USA',
 '',
 'Disease: E. coli EHEC',
 'Country: USA',
 '',
 'Disease: E. coli O157',
 'Country: USA',
 '',
 'Disease: Cholera, diarrhea & dysentery',
 'Country: Not mentioned',
 '',
 'Disease: Hantavirus',
 'Country: Americas',
 '',
 'Disease: Drugs, fish, contaminated waterways',
 'Country: USA',
 '',
 'Disease: Anthrax, human, equine',
 'Country: Kyrgyzstan',
 '',
 'Disease: Cryptosporidiosis',
 'Country: Sweden',
 '',
 'Disease: Meningitis advisory',
 'Country: Burundi & Congo, Dem.Rep.']

In [59]:
promed['header'][54]

"['Published Date: 2007-05-11 11:00:02 EDT\\nSubject: PRO/EDR> Cholera, diarrhea & dysentery update 2007 (20)\\nArchive Number: 20070511.1509']"

In [81]:
#GPT4
# Testing on rows 59
# Works really well

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        {"role": "user", "content": promed['header'][59]}
    ]
)
r=response.choices[0].message.content.split('\n')
r

['Disease Name: Meningitis',
 'Country Name: Burundi, Democratic Republic of Congo']

In [82]:
#GPT4
# Testing on rows 50-60
# Continues to work really well
# Input command doesn't ask to parse into CSV format

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ]
)
r=response.choices[0].message.content.split('\n')
r

['1. Disease: Late blight, potato; Country: Papua New Guinea',
 '2. Disease: Vesicular stomatitis; Country: United States of America',
 '3. Disease: E. coli EHEC; Country: United States of America',
 '4. Disease: E. coli O157; Country: United States of America',
 '5. Disease: Cholera, diarrhea & dysentery; Country: No specific country mentioned',
 '6. Disease: Hantavirus; Countries: Chile, United States of America',
 "7. Not a disease, it's about Drugs, fish, contaminated waterways; Country: United States of America",
 '8. Disease: Anthrax, human, equine; Country: Kyrgyzstan',
 '9. Disease: Cryptosporidiosis; Country: Sweden',
 '10. Disease: Meningitis; Countries: Burundi, Democratic Republic of Congo']

In [13]:
#GPT4
# Testing on rows 50-60
# Continues to work really well
# Input command asks to parse into CSV format to reduce number of token used.

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ]
)
r=response.choices[0].message.content.split('\n')
r

['"Late blight, potato","Papua New Guinea"',
 '"Vesicular stomatitis","United States of America"',
 '"E. coli EHEC","United States of America"',
 '"E. coli O157","United States of America"',
 '"Cholera, diarrhea & dysentery","Not specified"',
 '"Hantavirus","Chile, United States of America"',
 '"Drugs, fish, contaminated waterways","United States of America"',
 '"Anthrax, human, equine","Kyrgyzstan"',
 '"Cryptosporidiosis","Sweden"',
 '"Meningitis advisory","Burundi, Democratic Republic of the Congo"']

In [84]:
promed['header'][56]

"['Published Date: 2009-04-01 18:00:52 EDT\\nSubject: PRO/AH/EDR> Drugs, fish, contaminated waterways - USA (02)\\nArchive Number: 20090401.1256']"

In [45]:
# Testing extraction

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format where the columns are seperated by a semicolon. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data."},
        {"role": "user", "content": promed['header'][50:60].to_string()}
    ],
    seed = 1
)
r=response.choices[0].message.content.split('\n')
r

['"Late blight, potato";"Papua New Guinea"',
 '"Vesicular stomatitis";"United States of America"',
 '"E. coli EHEC";"United States of America"',
 '"E. coli O157";"United States of America"',
 '"Cholera, diarrhea & dysentery";"Not specified"',
 '"Hantavirus";"Chile, United States of America"',
 '"Drugs, fish, contaminated waterways";"United States of America"',
 '"Anthrax, human, equine";"Kyrgyzstan"',
 '"Cryptosporidiosis";"Sweden"',
 '"Meningitis";"Burundi, Democratic Republic of the Congo"']

In [31]:
# Disregard this cell. This code worked only before Dev Day. After that, this prompt generated extremely poor results.

import os
import openai
openai.organization = os.getenv("OPENAI_ORG_ID")
openai.api_key = os.getenv("OPENAI_API_KEY")

import pandas as pd

#Allows for viewing large data. Can be a boon when data is too large
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
) 
import backoff
import numpy as np



def backoff_hdlr(details):
    global end_index
    print("Too many tokens")
    end_index = end_index-10

def backoff_hdlr2(details):
    global flag
    print("RateLimitError, too many requests")
    flag = True

def backoff_hdlr3(details):
    global flag
    flag = True
    print("APIError or Timeout, either way something is wrong with openai \n", details)

@backoff.on_exception(backoff.expo, openai.error.Timeout, on_backoff=backoff_hdlr3, max_tries = 2, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.APIError,on_backoff=backoff_hdlr3, max_tries = 2, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError,on_backoff=backoff_hdlr2, max_tries = 3, max_time=120)
@backoff.on_exception(backoff.expo, openai.error.InvalidRequestError,  on_backoff=backoff_hdlr, max_tries = 2)
def completion_with_backoff(promed):
        global end_index, start_index, flag
        print(start_index)
    # try: 
        response = openai.ChatCompletion.create(
            # model="gpt-4",
            model="gpt-4-1106-preview",
            # model="gpt-3.5-turbo-1106",
            messages=[
                # {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format where the columns are seperated by a semicolon. The disease name should contain only the medical name of all the diseases and return no extra data. All the counries mentioned must be extracted and seperated by commas.The country name should be rewritten with the full name of the country with no acronyms and no extra data. Return None when there is no country or disease mentioned. Do not return any text excepted the extracted values. "},
                {"role": "system", "content": "You will be provided with unstructured data, extract only the disease name and country name in a CSV format where the columns are seperated by a semicolon. All the countries mentioned must be extracted and seperated by commas. All diseases mentioned must be extracted and seperated by commas. The disease name should contain only the medical name of all the diseases and return no extra data. The country name should be rewritten with the full name of the country with no acronyms and no extra data. Return 'Nil' when there is no country or disease to be extracted. Do not return any text/characters excepted the extracted values."},
                {"role": "user", "content": promed['header'][start_index:end_index].to_string()}
            ],
            seed = 1
        )
    # except openai.error.Timeout as e:
    #     # Handle timeout error, e.g. retry or log
    #     print(f"OpenAI API request timed out: {e}")
    #     pass
    # except openai.error.APIError as e:
    #     # Handle API error, e.g. retry or log
    #     print(f"OpenAI API returned an API Error: {e}")
    #     pass
    # except openai.error.APIConnectionError as e:
    #     # Handle connection error, e.g. check network or log
    #     print(f"OpenAI API request failed to connect: {e}")
    #     pass
    # except openai.error.InvalidRequestError as e:
    #     # Handle invalid request error, e.g. validate parameters or log
    #     print(f"OpenAI API request was invalid: {e}")
    #     pass
    # except openai.error.AuthenticationError as e:
    #     # Handle authentication error, e.g. check credentials or log
    #     print(f"OpenAI API request was not authorized: {e}")
    #     pass
    # except openai.error.PermissionError as e:
    #     # Handle permission error, e.g. check scope or log
    #     print(f"OpenAI API request was not permitted: {e}")
    #     pass
    # except openai.error.RateLimitError as e:
    #     # Handle rate limit error, e.g. wait or log
    #     print(f"OpenAI API request exceeded rate limit: {e}")
    #     pass
    # except Exception as e:
    #     # Handle rate limit error, e.g. wait or log
    #     print(e)
    #     pass

        flag = False
        return response





# promed_test['header'].count()
# for i in range(1,14,1):
i=3
if i==3:
    #Import promed dataset and convert it to string
    print("File ", i)
    filename = "promed_"+str(i)+".json"
    promed_test = pd.read_json("C:\\Users\\Rushali\\Documents\\Code\\Lab\\CHAIN\\Data\\ProMED\\"+filename)
    promed_test['header'] = promed_test['header'].astype(str)

    start_index = 0
    end_index = 50
    flag = False

    file = open("extract"+str(i)+".csv", 'ab')

    while start_index < promed_test['header'].count():
        try:
            if flag:
                break
            if end_index > promed_test['header'].count():
                end_index = promed_test['header'].count()

            resp = completion_with_backoff(promed_test)
            np.savetxt(file, resp.choices[0].message.content.split('\n')[:], delimiter=";", fmt ='% s')
            start_index = end_index
            end_index = end_index + 50
        except Exception as e:
            print("IN LOOP")
            print(e)
            break
        # print(end_index)
    print("END")
    file.close()

File  3
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
END


In [1]:
# Main code to extract country and disease from promed alerts using gpt

import os
import json 

import openai
openai.organization = os.getenv("OPENAI_ORG_ID")
openai.api_key = os.getenv("OPENAI_API_KEY")

import pandas as pd
#Allows for viewing large data. Can be a boon when data is too large
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
) 
import backoff
import numpy as np


def backoff_hdlr(details):
    global end_index
    print("Too many tokens")
    end_index = end_index-10

def backoff_hdlr2(details):
    global flag
    print("RateLimitError, too many requests")
    flag = True

def backoff_hdlr3(details):
    global flag
    flag = True
    print("APIError or Timeout, either way something is wrong with openai \n", details)

@backoff.on_exception(backoff.expo, openai.error.Timeout, on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.APIError,on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError,on_backoff=backoff_hdlr2, max_tries = 5, max_time=120)
@backoff.on_exception(backoff.expo, openai.error.InvalidRequestError,  on_backoff=backoff_hdlr, max_tries = 5)
def completion_with_backoff(promed):
        global end_index, start_index, flag
        print(start_index)
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            # model="gpt-3.5-turbo-1106",
            messages=[
                # {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format where the columns are seperated by a semicolon. The disease name should contain only the medical name of all the diseases and return no extra data. All the counries mentioned must be extracted and seperated by commas.The country name should be rewritten with the full name of the country with no acronyms and no extra data. Return None when there is no country or disease mentioned. Do not return any text excepted the extracted values. "},
                # {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format where the columns are seperated by a semicolon. "},
                # {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where every disease or country mentioned is extracted and is comma seperated. The diseases and countries are seperated by a semicolon. The disease name should be rewtitten with the complete medical name of that disease. The country name should be rewritten with the full name of the country with no acronyms. Return 'Nil' when there is no country or disease to be extracted. Do not return any text/characters that do not corespond to the name of a country or a disease."},
                # {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where each line has at least one entry for disease and at least one for country. Every disease or country mentioned must be extracted and comma seperated. The diseases and countries are seperated by a semicolon. The disease name should be rewtitten with the complete medical name of that disease. The country name should be rewritten with the full name of the country with no acronyms. Return 'Nil' when there is no country or disease to be extracted. Do not return any extra data. 'Published Date: 2007-03-30 10:00:01 EDT\\nSubject: PRO/EDR> Poliomyelitis - Worldwide (02): Nigeria, DR Congo, Somalia, Pakistan\\nArchive Number: 20070330.1090' is a sample datapoint. The output for this should be 'Poliomyelitis;Nigeria, Democratic Republic of the Congo, Somalia, Pakistan'. " },
                {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where each line has at least one entry for disease and at least one for country. Every disease or country mentioned must be extracted and comma seperated. The diseases and countries are seperated by a semicolon. Return 'Nil' when there is no country or disease to be extracted. Do not return any extra data. '['Published Date: 2007-03-30 10:00:01 EDT\\nSubject: PRO/EDR> Poliomyelitis - Worldwide (02): Nigeria, DR Congo, Somalia, Pakistan\\nArchive Number: 20070330.1090' , 'Published Date: 2005-06-24 19:50:00 EDT\\nSubject: PRO/EDR> Cholera, diarrhea & dysentery update 2005 (24)\\nArchive Number: 20050624.1775' , 'Published Date: 2000-01-31 18:50:00 EST\\nSubject: PRO> updates about situation- Bangladesh (02)\\nArchive Number: 20000131.0147' , 'Published Date: 2010-05-06 14:00:03 EDT\\nSubject: PRO/AH/EDR> about about vaccines \\nArchive Number: 20100506.1477'] is a sample with 4 datapoints. The output for this should be 'Poliomyelitis;Nigeria, Democratic Republic of the Congo, Somalia, Pakistan \n Cholera, diarrhea & dysentery;Nil \n Nil; Bangladesh \n Nil;Nil'" },
                {"role": "user", "content": promed['header'][start_index:end_index].to_string()}
            ],
            seed = 1
        )
        # print(promed['header'][start_index:end_index].to_string())
        flag = False
        return response


# Redo was necessary as some blocks of alerts would continuously throw errors with the previous seed.

@backoff.on_exception(backoff.expo, openai.error.Timeout, on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.APIError,on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError,on_backoff=backoff_hdlr2, max_tries = 5, max_time=120)
@backoff.on_exception(backoff.expo, openai.error.InvalidRequestError,  on_backoff=backoff_hdlr, max_tries = 5)
def completion_changed_seed(promed):
        global end_index, start_index, flag
        print("Redo ",start_index)
        response = openai.ChatCompletion.create(
            model="gpt-4",
            # model="gpt-3.5-turbo-1106",
            messages=[
                # {"role": "system", "content": "You will be provided with unstructured data, and your task is to extract only the disease name and the name of the country in a CSV format where the columns are seperated by a semicolon. The disease name should contain only the medical name of all the diseases and return no extra data. All the counries mentioned must be extracted and seperated by commas.The country name should be rewritten with the full name of the country with no acronyms and no extra data. Return None when there is no country or disease mentioned. Do not return any text excepted the extracted values. "},
                # {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where each line has at least one entry for disease and at least one for country. Every disease or country mentioned must be extracted and comma seperated. The diseases and countries are seperated by a semicolon. The disease name should be rewtitten with the complete medical name of that disease. The country name should be rewritten with the full name of the country with no acronyms. Return 'Nil' when there is no country or disease to be extracted. Do not return any extra data."},
                # {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where each line has at least one entry for disease and at least one for country. Every disease or country mentioned must be extracted and comma seperated. The diseases and countries are seperated by a semicolon. The disease name should be rewtitten with the complete medical name of that disease. The country name should be rewritten with the full name of the country with no acronyms. Return 'Nil' when there is no country or disease to be extracted. Do not return any extra data. 'Published Date: 2007-03-30 10:00:01 EDT\\nSubject: PRO/EDR> Poliomyelitis - Worldwide (02): Nigeria, DR Congo, Somalia, Pakistan\\nArchive Number: 20070330.1090' is a sample datapoint. The output for this should be 'Poliomyelitis;Nigeria, Democratic Republic of the Congo, Somalia, Pakistan'. " },
                {"role": "system", "content": "You will be provided with 50 data points. Your task is to extract the diseases and countries mentioned in each data point. The output you provide should contain exactly 50 lines where each line has at least one entry for disease and at least one for country. Every disease or country mentioned must be extracted and comma seperated. The diseases and countries are seperated by a semicolon. The disease name should be rewtitten with the complete medical name of that disease. The country name should be rewritten with the full name of the country with no acronyms. Return 'Nil' when there is no country or disease to be extracted. Do not return any extra data. '['Published Date: 2007-03-30 10:00:01 EDT\\nSubject: PRO/EDR> Poliomyelitis - Worldwide (02): Nigeria, DR Congo, Somalia, Pakistan\\nArchive Number: 20070330.1090' , 'Published Date: 2005-06-24 19:50:00 EDT\\nSubject: PRO/EDR> Cholera, diarrhea & dysentery update 2005 (24)\\nArchive Number: 20050624.1775' , 'Published Date: 2000-01-31 18:50:00 EST\\nSubject: PRO> updates about situation- Bangladesh (02)\\nArchive Number: 20000131.0147' , 'Published Date: 2010-05-06 14:00:03 EDT\\nSubject: PRO/AH/EDR> about about vaccines \\nArchive Number: 20100506.1477'] is a sample datapoint. The output for this should be 'Poliomyelitis;Nigeria, Democratic Republic of the Congo, Somalia, Pakistan \n Cholera, diarrhea & dysentery;Nil \n Nil; Bangladesh \n Nil;Nil'" },
                {"role": "user", "content": promed['header'][start_index:end_index].to_string()}
            ],
            seed = 1
        )
        flag = False
        return response



# Ideally perform only one or two files in series as chatgpt tends to randomly throw API errors which can not be resolved.

for i in range(1,14,1):

# i=1
# if i==1:
    
    #Import promed dataset and convert it to string
    print("File ", i)
    filename = "promed_"+str(i)+".json"
    promed_test = pd.read_json("C:\\Users\\Rushali\\Documents\\Code\\Lab\\CHAIN\\Data\\ProMED\\"+filename)
    promed_test['header'] = promed_test['header'].astype(str)

    start_index = 0
    end_index = 50
    flag = False

    while start_index < promed_test['header'].count():

        try:

            if end_index > promed_test['header'].count():
                end_index = promed_test['header'].count()

            resp = completion_with_backoff(promed_test)

            if len(resp.choices[0].message.content.split('\n')[:]) != 50:
                resp = completion_changed_seed(promed_test)

            if resp.choices[0].message.content.count(";") != 50:
                resp = completion_changed_seed(promed_test)

            print(len(resp.choices[0].message.content.split('\n')[:]) , "  ", resp.choices[0].message.content.count(";"), " lines and ;" )

            filename = str(i) + '_' + str(start_index)
            out_file = open("bin_responses\\"+filename+".json", "w") 
            json.dump(resp, out_file,ensure_ascii=False) 
            out_file.close()

            file = open("extraction\\extract"+str(i)+".csv", 'a')
            np.savetxt(file, resp.choices[0].message.content.split('\n')[:], delimiter=";", fmt ='% s')
            file.close()

            start_index = end_index
            end_index = end_index + 50
        except Exception as e:
            print("IN LOOP")
            print(e)
            break
        # print(end_index)
    print("END")
    

File  1
4950
50    50  lines and ;
END


In [34]:
import numpy as np

promed_csv = pd.read_csv("C:\\Users\\Rushali\\Documents\\Code\\Lab\\outbreak\\extractEntities\\extraction\\extract4.csv", sep=";",names=["Disease","Country"])
filename = "promed_"+str(13)+".json"
promed_test = pd.read_json("C:\\Users\\Rushali\\Documents\\Code\\Lab\\CHAIN\\Data\\ProMED\\"+filename)

In [30]:
# Prelim count of number of diseases extracted
len(pd.unique(promed_csv["Disease"]))

4730

In [4]:
# Random testing to checck if GPT extractions were successful
index = np.random.randint(0,5000)
print(index)
print(promed_csv.iloc[index])
print(promed_test['header'][index])

4264
Disease    Gastroenteritis, Vibrio, E. coli
Country                                 USA
Name: 4264, dtype: object
['Published Date: 2021-12-22 13:47:18 EST\nSubject: PRO/EDR> Gastroenteritis - USA (03): cruise ship, Vibrio, E. coli\nArchive Number: 20211222.8700371']
header    4544
body      4544
dtype: int64


In [2]:
# Code to extract countries and diseases from WHO data.

import os
import json 

import openai
openai.organization = os.getenv("OPENAI_ORG_ID")
openai.api_key = os.getenv("OPENAI_API_KEY")

import pandas as pd
#Allows for viewing large data. Can be a boon when data is too large
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
) 
import backoff
import numpy as np


def backoff_hdlr(details):
    global end_index
    print("Too many tokens")
    end_index = end_index-10

def backoff_hdlr2(details):
    global flag
    print("RateLimitError, too many requests")
    flag = True

def backoff_hdlr3(details):
    global flag
    flag = True
    print("APIError or Timeout, either way something is wrong with openai \n", details)

@backoff.on_exception(backoff.expo, openai.error.Timeout, on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.APIError,on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError,on_backoff=backoff_hdlr2, max_tries = 5, max_time=120)
@backoff.on_exception(backoff.expo, openai.error.InvalidRequestError,  on_backoff=backoff_hdlr, max_tries = 5)
def completion_with_backoff(who):
        global start_index, flag
        print(start_index)
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            response_format={ "type": "json_object" },
            # model="gpt-3.5-turbo-1106",
            messages=[
                {"role": "system", "content": "You will be shown snippets of news articles. You need to extract only the country and disease mentioned in JSON. Use Nil if there is nothing to extract." },
                {"role": "user", "content": who}
            ],
            seed = 1
        )
        # print(promed['header'][start_index:end_index].to_string())
        flag = False
        return response



@backoff.on_exception(backoff.expo, openai.error.Timeout, on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.APIError,on_backoff=backoff_hdlr3, max_tries = 5, max_time=60)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError,on_backoff=backoff_hdlr2, max_tries = 5, max_time=120)
@backoff.on_exception(backoff.expo, openai.error.InvalidRequestError,  on_backoff=backoff_hdlr, max_tries = 5)
def completion_changed_seed(who):
        global start_index, flag
        print("Redo ",start_index)
        response = openai.ChatCompletion.create(
            model="gpt-4",
            response_format={ "type": "json_object" },
            # model="gpt-3.5-turbo-1106",
            messages=[
                {"role": "system", "content": "You will be shown snippets of news articles. You need to extract only the country and disease mentioned in JSON. Use Nil if there is nothing to extract." },
                {"role": "user", "content": who}
            ],
            seed = 1
        )
        flag = False
        return response



# Replace with correct location of WHO file
who = pd.read_json("C:\\Users\\Rushali\\Documents\\Code\\Lab\\outbreak\\data\\WHO DON\\1695995527.14841_whodonreports.json")


i=1
if i==1:
    start_index = 0
    flag = False

    while start_index < 716:
        try:
            resp = completion_with_backoff(who['Report_text'][start_index][:500])

            filename = 'who_' + str(start_index)
            out_file = open("bin_responses\\"+filename+".json", "w") 
            json.dump(resp, out_file,ensure_ascii=False) 
            out_file.close()

            start_index = start_index + 1

        except Exception as e:
            print("IN LOOP")
            print(e)
            break
        # print(end_index)
    print("END")
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27