<a href="https://colab.research.google.com/github/schickery/Step1/blob/master/Copy_of_VictozaTrainingFinal1000_0_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install openai==0.28
import pandas as pd
import openai
import numpy as np
import time
from openai.error import RateLimitError



# Read the input CSV into a DataFrame.
df = pd.read_csv('Victoza_Clean_80.csv')

# Echo the parsed header so it can be compared by eye with ner_columns below.
print("Actual columns in the DataFrame:")
print(df.columns)

# Columns that the NER pass iterates over, in output order.
ner_columns = [
    'Case ID',
    'Suspect Product Names',
    'Suspect Product Active Ingredients',
    'Reason for Use',
    'Reactions',
    'Serious',
    'Outcomes',
    'Sex',
    'Event Date',
    'Latest FDA Received Date',
    'Case Priority',
    'Patient Age',
    'Patient Weight',
    'Sender',
    'Reporter Type',
    'Report Source',
    'Concomitant Product Names',
    'Latest Manufacturer Received Date',
    'Initial FDA Received Date',
    'Country where Event occurred',
    'Reported to Manufacturer?',
    'Manufacturer Control Number',
    'Literature Reference',
    'Compounded Flag',
]

# Report (but do not fail on) any expected column absent from the CSV header.
missing_columns = [name for name in ner_columns if name not in df.columns]
if not missing_columns:
    print("All columns are present.")
else:
    print("Missing columns in the DataFrame:")
    print(missing_columns)

# Placeholder credential: replace with a real key before running, and avoid
# committing the real value alongside the notebook.
openai.api_key = 'Enter Data Key'

# Extraction instruction for each column. Columns without an entry (for
# example 'Reported to Manufacturer?') use _DEFAULT_INSTRUCTION instead.
_COLUMN_INSTRUCTIONS = {
    'Case ID': "Extract the case ID.",
    'Suspect Product Names': "Extract drug names.",
    'Suspect Product Active Ingredients': "Extract active ingredient names.",
    'Reason for Use': "Extract medical conditions or diseases.",
    'Reactions': "Extract adverse reactions or side effects.",
    'Serious': "Extract 'serious' or 'non-serious'.",
    'Outcomes': "Extract outcomes or results of the adverse event. These will be 'hospitalized', 'other outcomes', or 'non-serious'.",
    'Sex': "Extract 'male' or 'female'.",
    'Event Date': "Extract event date year.",
    'Latest FDA Received Date': "Extract latest FDA received date day-month-year.",
    'Case Priority': "Extract case priority expedited or non-expedited.",
    'Patient Age': "Extract patient age in years or not specified.",
    'Patient Weight': "Extract patient weight in kg or not specified.",
    'Sender': "Extract sender name.",
    'Reporter Type': "Extract reporter type as consumer or healthcare professional.",
    'Report Source': "Extract report source as text.",
    'Concomitant Product Names': "Extract names of concomitant drugs. If blank, report 'none'.",
    'Latest Manufacturer Received Date': "Extract last manufacturer received date day-month-year.",
    'Initial FDA Received Date': "Extract initial FDA received date day-month-year.",
    'Country where Event occurred': "Extract the country name where the event occurred. If blank, report 'not specified'.",
    'Manufacturer Control Number': "Extract manufacturer control number.",
    'Literature Reference': "Extract literature reference. If blank extract 'none'.",
    'Compounded Flag': "Extract 'yes' or 'no'.",
}

_DEFAULT_INSTRUCTION = "Extract named entities related to medical terms."

# Few-shot examples shared by every request.
# Every fragment must end in "\n" and must NOT be followed by a comma: a comma
# between fragments turns the parenthesised expression into a tuple instead of
# one concatenated string.
_FEW_SHOT_EXAMPLES = (
    "For example:\n"
    "1. Text: 'Patient was treated with Metformin for Diabetes and experienced nausea.'\n"
    "   Entities: ['Metformin', 'Diabetes', 'nausea']\n"
    "2. Text: 'The patient, a 65-year-old female, was administered Victoza for managing Type 2 Diabetes.'\n"
    "   Entities: ['Victoza', 'Type 2 Diabetes']\n"
    "3. Text: 'After starting on Liraglutide, the patient reported severe headache and dizziness.'\n"
    "   Entities: ['Liraglutide', 'headache', 'dizziness']\n"
    "4. Treated with Victoza for managing Type 2 Diabetes and had a life threatening outcome in the US and subsequently recovered.'\n"
    "   Entities: ['Victoza', 'Type 2 Diabetes', 'life threatening', 'US', 'recovered']\n"
    "5. The reason for use for Victoza was Diabetes, specifically Type 2 Diabetes.\n"
    "   Entities: ['Victoza', 'Diabetes', 'Type 2 Diabetes']\n"
    "6. The adverse reactions to victoza included nausea, vomiting, and headache.\n"
    "   Entities: ['Victoza', 'nausea', 'vomiting', 'headache']\n"
    "7. A patient with case id 23324208 had an adverse event with Victoza, active ingredient liraglutide, for Type 2 Diabetes Mellitus had a serious adverse event causing gastritis. The female patient was hospitalized and had other outcomes. The event occurred in Dec 2023 and was received by the FDA on 21 Dec 2023. \n"
    "   Entities: ['23324208', 'Victoza', 'liraglutide', 'Type 2 Diabetes Mellitus', 'serious', 'gastritis', 'hospitalized', 'other outcomes', '2023', '21 Dec 2023']\n"
    "8. The 69 yr old male patient's weight was 63.8 kg and the case was expedited. The sender of Victoza was Novo Nordisk and the unspecified source was reported by a healthcare professional.\n"
    "   Entities: ['69', 'male', '63.8', 'expedited', 'Novo Nordisk', 'healthcare professional', 'victoza']\n"
    "9. The adverse event for case id 22982248 was a female. The concomitant products were Empagliflozin, Metformin Hydrochloride, Duloxetine, and Alpha Lipoic Acid. She had reactions that included Hypoaesthesia, Inflammation, Limb Injury, and an Inappropriate Schedule Of Product Administration\n"
    "   Entities: ['22982248', 'female', 'Empagliflozin', 'Metformin Hydrochloride', 'Duloxetine', 'Alpha Lipoic Acid', 'Hypoaesthesia', 'Inflammation', 'Limb Injury', 'Inappropriate Schedule Of Product Administration']\n"
    "10. The suspect product names included Victoza, Ozempic, Levemir, and Toujeo.\n"
    "   Entities: ['Victoza', 'Ozempic', 'Levemir', 'Toujeo']\n"
    "11. The adverse events occurred in patients with conditions including Type 2 Diabetes Mellitus, Diabetes Mellitus, Obesity, and for weight control \n"
    "   Entities: ['Type 2 Diabetes Mellitus', 'Diabetes Mellitus', 'Obesity', 'weight control']\n"
    "12. Adverse reactions to Victoza from both male and female patients included Nausea, Asthenia, Retching, Decreased Appetite, diplegia, Cataract, Staphylococcal Infection, Muscular Weakness, Hypoaesthesia, and Amputation.\n"
    "   Entities: ['Victoza', 'male', 'female', 'Nausea', 'Asthenia', 'Retching', 'Decreased Appetite', 'diplegia', 'Cataract', 'Staphylococcal Infection', 'Muscular Weakness', 'Hypoaesthesia', 'Amputation']\n"
    "13. The countries reporting adverse drug events with Victoza include the United States, China, France, Ireland, Sweden, Netherlands, Latvia, Kazakhstan, Spain, Columbia, and Canada. Other reports had a country listed as not specified.\n"
    "   Entities: ['Victoza', 'United States', 'China', 'France', 'Ireland', 'Sweden', 'Netherlands', 'Latvia', 'Kazakhstan', 'Spain', 'Columbia', 'Canada', 'not specified']\n"
    # NOTE(review): examples 14-16 normalise US variants inconsistently
    # ('United States' -> 'USA' but 'US' -> 'United States'). Left unchanged
    # until a canonical form is chosen.
    "14. Text: 'USA'\n"
    "   Entities: ['USA']\n"
    "15. Text: 'United States'\n"
    "   Entities: ['USA']\n"
    "16. Text: 'US'\n"
    "   Entities: ['United States']\n"
    "17. Text: ['CN']\n"
    "   Entities: ['China']\n"
    "18. Text: ['FR']\n"
    "   Entities: ['France']\n"
    # NOTE(review): in ISO 3166-1 alpha-2, 'IR' is Iran (Ireland is 'IE') and
    # 'CO' is Colombia (spelled 'Columbia' here). Confirm against the codes
    # used in the source data before changing these examples.
    "19. Text: ['IR']\n"
    "   Entities: ['Ireland']\n"
    "20. Text: ['SE']\n"
    "   Entities: ['Sweden']\n"
    "21. Text: ['NL']\n"
    "   Entities: ['Netherlands']\n"
    "22. Text: ['LV']\n"
    "   Entities: ['Latvia']\n"
    "23. Text: ['KZ']\n"
    "   Entities: ['Kazakhstan']\n"
    "24. Text: ['ES']\n"
    "   Entities: ['Spain']\n"
    "25. Text: ['CO']\n"
    "   Entities: ['Columbia']\n"
    "26. Text: ['CA']\n"
    "   Entities: ['Canada']\n"
)


def _build_ner_prompt(text, column_name):
    """Return the complete prompt string for one cell of one column."""
    instruction = _COLUMN_INSTRUCTIONS.get(column_name, _DEFAULT_INSTRUCTION)
    return (
        f"You are an expert in extracting named entities from medical text. {instruction}\n"
        f"{_FEW_SHOT_EXAMPLES}"
        f"Extract entities from the following text based on the instructions for the column '{column_name}':\n"
        f"Text: {text}"
    )


def perform_ner(text, column_name):
    """Ask the chat model to extract entities from ``text`` for one column.

    The prompt combines a column-specific instruction, a fixed set of
    few-shot examples and the text itself, and is sent as a single user
    message to ``gpt-3.5-turbo``.

    Args:
        text: The cell content to extract entities from.
        column_name: Name of the column the text came from; selects the
            instruction and is quoted in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        Exception: If the request is rate limited on all five attempts.
    """
    prompt = _build_ner_prompt(text, column_name)

    max_attempts = 5
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000,
                temperature=0.8,
            )
        except RateLimitError as err:
            last_error = err
            # Only wait when another attempt will follow; sleeping after the
            # final failure would just delay the error.
            if attempt < max_attempts:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit reached. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        else:
            return response['choices'][0]['message']['content'].strip()

    raise Exception("Rate limit reached too many times. Please try again later.") from last_error

# Define a function to apply NER on a DataFrame row
def perform_ner_on_row(row):
    """Build the ``<column>_ner`` values for a single DataFrame row.

    For every name in ``ner_columns`` the result contains one entry keyed
    ``name + '_ner'``:

    * non-blank strings are sent to ``perform_ner``;
    * blank strings, missing values and absent columns yield ``""``;
    * lists / arrays are joined with ``', '`` (elements are stringified);
    * any other value is converted with ``str``.

    Args:
        row: A row of the DataFrame, as passed by ``DataFrame.apply`` with
            ``axis=1``.

    Returns:
        A ``pd.Series`` with one ``*_ner`` entry per name in ``ner_columns``.
    """
    ner_results = {}
    for col in ner_columns:
        key = col + '_ner'
        value = row[col] if col in row else None

        # List-likes are handled before the missing-value test: pd.isna /
        # pd.notna return an element-wise array for them, and using that
        # array in a boolean context raises for more than one element.
        if isinstance(value, (np.ndarray, list)):
            # Stringify elements so non-string items do not break the join.
            ner_results[key] = ', '.join(str(item) for item in value)
        elif isinstance(value, str):
            # Whitespace-only text is not worth an API call.
            ner_results[key] = perform_ner(value, col) if value.strip() else ""
        elif pd.isna(value):
            ner_results[key] = ""
        else:
            ner_results[key] = str(value)
    return pd.Series(ner_results)

# Run the per-row NER step over every row; each row yields the *_ner values.
ner_results_df = df.apply(perform_ner_on_row, axis="columns")

# Place the NER columns alongside the original columns.
df = pd.concat(objs=[df, ner_results_df], axis="columns")

# Write the combined table to a new CSV, without the index column.
df.to_csv('Victoza_Clean_80_1000_with_ner.csv', index=False)






Actual columns in the DataFrame:
Index(['Case ID', 'Suspect Product Names',
       'Suspect Product Active Ingredients', 'Reason for Use', 'Reactions',
       'Serious', 'Outcomes', 'Sex', 'Event Date', 'Latest FDA Received Date',
       'Case Priority', 'Patient Age', 'Patient Weight', 'Sender',
       'Reporter Type', 'Report Source', 'Concomitant Product Names',
       'Latest Manufacturer Received Date', 'Initial FDA Received Date',
       'Country where Event occurred', 'Reported to Manufacturer?',
       'Manufacturer Control Number', 'Literature Reference',
       'Compounded Flag'],
      dtype='object')
All columns are present.
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 seconds...
Rate limit reached. Retrying in 2 second