In [15]:
# !pip install datasets
# !pip install openai
# !pip install weaviate-client
# !pip install python-dotenv

In [1]:
import os
import json
import time
import openai
import weaviate
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv

In [2]:
# Helper for pretty-printing JSON-serialisable data
def json_print(data):
    """Print `data` serialised as JSON with a two-space indent."""
    formatted = json.dumps(data, indent=2)
    print(formatted)

In [3]:
# Load environment variables (API keys, endpoints) from the local "Requirements.env" file
env_file = find_dotenv("Requirements.env")
_ = load_dotenv(env_file)

In [4]:
# Read the OpenAI API key from the environment; raises KeyError if it is not set
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# Loading the FDA pharmaceuticals FAQ dataset from Hugging Face Datasets
dataset = load_dataset("Jaymax/FDA_Pharmaceuticals_FAQ")

In [6]:
# Storing the train, validation and test splits in separate pandas dataframes
df1 = pd.DataFrame.from_dict(dataset['train'])
df2 = pd.DataFrame.from_dict(dataset['validation'])
df3 = pd.DataFrame.from_dict(dataset['test'])

# List with the dataframes to be concatenated
frames = [df1, df2, df3]

# Concatenate the splits into one dataframe. ignore_index=True renumbers the
# rows 0..n-1 in the same call, so no in-place reset_index() is needed afterwards.
df = pd.concat(frames, ignore_index=True)

In [7]:
# Preview the first 5 rows of the combined dataframe
df.head(n=5)

Unnamed: 0,Question,Answer
0,Taking into account the content of Q7 Good Man...,Any change in the API starting material should...
1,Drawing from the insights of Labeling OTC Huma...,"Fractions (e.g., 1/2) can be expressed in math..."
2,According to the E14 Clinical Evaluation of QT...,A drug with low TdP risk would be expected to ...
3,As stated in Guidance for Industry- FDA's Poli...,In the Nutrition Facts label final rule (81 FR...
4,With reference to Assay Migration Studies for ...,"Application which, when approved, allows the d..."


## Connect to Weaviate and load OpenAI embeddings model

In [9]:
# Connect to your Weaviate instance.
# The URL and API key are read with os.environ[...] so that a missing variable
# fails immediately with a KeyError naming it, rather than handing None to the client.
# NOTE: weaviate.Client is the v3 client API; the library prints a notice pointing
# to the v4 `weaviate.WeaviateClient` API and its migration guide.
client = weaviate.Client(
    url=os.environ["WEAVIATE_API_URL"],
    # url="http://localhost:8080/",
    auth_client_secret=weaviate.auth.AuthApiKey(
        api_key=os.environ["WEAVIATE_API_KEY"]
    ),
    additional_headers={
        # OpenAI key sent along with requests to the Weaviate instance
        "X-OpenAI-Api-Key": openai.api_key
    },
)

# Check if your instance is live and ready
# This should return `True`
client.is_ready()

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


True

In [11]:
# Setting/resetting the schema. CAUTION: This will delete the existing "FAQ" class
if client.schema.exists("FAQ"):
    client.schema.delete_class("FAQ")

# Definition of the "FAQ" class: vectorizer, module settings and stored properties
class_obj = {
    "class": "FAQ",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Embedding model settings
        "text2vec-openai": {
            "model": "text-embedding-3-large",
            "dimensions": 3072,
            "type": "text",
        },
        # Question-answering model settings
        "qna-openai": {
            "model": "gpt-3.5-turbo-instruct",
            "maxTokens": 2000,
            "temperature": 0.0,
            "topP": 1,
            "frequencyPenalty": 0.0,
            "presencePenalty": 0.0,
        },
    },
    "properties": [
        {
            "name": "Question",
            "description": "Questions in the FAQs",
            "dataType": ["text"],
        },
        {
            "name": "Answer",
            # Previously carried the copy-pasted "Questions in the FAQs" description
            "description": "Answers in the FAQs",
            "dataType": ["text"],
        },
    ],
}
client.schema.create_class(class_obj)

# get the schema to make sure it worked
client.schema.get()

In [12]:
# Number of objects per batch passed to client.batch.configure()
batch_size = 5

# Print a progress line every `progress_every` rows instead of once per row,
# so the cell output stays readable for a dataset with well over a thousand rows
progress_every = 100
total_rows = len(df)

# Uploading data to weaviate in batches
with client.batch.configure(batch_size=batch_size) as batch:
    # enumerate() counts rows from 1 independently of the dataframe's index labels
    for position, (_index, row) in enumerate(df.iterrows(), start=1):

        if position % progress_every == 0 or position == total_rows:
            print(f"importing question: {position}/{total_rows}")

        # One FAQ object per dataframe row
        faq_object = {
            "Question": row["Question"],
            "Answer": row["Answer"],
        }

        batch.add_data_object(
            data_object=faq_object,
            class_name="FAQ"
        )

In [13]:
# Printing the count of objects uploaded to the FAQ class in the Weaviate cluster
faq_aggregate = client.query.aggregate("FAQ")
count = faq_aggregate.with_meta_count().do()
json_print(count)

{
  "data": {
    "Aggregate": {
      "FAQ": [
        {
          "meta": {
            "count": 1681
          }
        }
      ]
    }
  }
}


In [17]:
# Fetch a single FAQ object (question and answer) together with its vector
vector_query = client.query.get("FAQ", ["question", "answer"])
vector_query = vector_query.with_additional("vector").with_limit(1)
result = vector_query.do()

json_print(result)

In [15]:
# Top 3 question/answer pairs most similar to the PMA question below,
# each returned with its distance to the query
near_text_filter = {"concepts": "What is a Premarket Approval Application (PMA)"}

response = (
    client.query
    .get("FAQ", ["question", "answer"])
    .with_near_text(near_text_filter)
    .with_additional("distance")
    .with_limit(3)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "FAQ": [
        {
          "_additional": {
            "distance": 0.26266968
          },
          "answer": "A premarket approval (PMA) application is the most stringent type of device marketing application for medical devices. FDA approves a PMA if it determines that the application contains sufficient valid scientific evidence to provide reasonable assurance that the device is safe and effective for its intended use(s).",
          "question": "As explained in FDA Decisions for Investigational Device Exemption Clinical Investigations Guidance for Sponsors, Clinical Investigators, Institutional Review Boards, and Food and Drug Administration Staff, What is a premarket approval (PMA) application?"
        },
        {
          "_additional": {
            "distance": 0.3389851
          },
          "answer": "The application for approval required prior to the marketing of most Class III medical devices (section 515 of the Act, 21 U.S.C.\n\n360e)

# Question Answering Generation

In [19]:
# Question put to the question-answering module.
# Built from adjacent string literals so the text is one clean line; the earlier
# triple-quoted form embedded a newline and the source indentation in the prompt.
generate_prompt = (
    "How do FDA regulations for protection of human subjects differ from HHS "
    "regulations? Give me any associated links"
)

# Output properties: the stored question/answer plus, under `_additional`,
# the extracted answer (hasAnswer, result) and the distance of the match
properties = [
        "question", "answer",
        "_additional { answer { hasAnswer result } distance }"
    ]

# Dictionary containing the question and the object properties handed to the
# `ask` operator (here only the stored "answer" text)
ask = {
        "question": generate_prompt,
        "properties": ["answer"]
    }

# Question answering via the `ask` operator, limited to a single FAQ object
result = (
  client.query
  .get("FAQ", properties)
  .with_ask(ask)
  .with_limit(1)
).do()

json_print(result)

{
  "data": {
    "Get": {
      "FAQ": [
        {
          "_additional": {
            "answer": {
              "hasAnswer": true,
              "result": " According to the comparison on the FDA's Good Clinical Practice program website, the FDA regulations for protection of human subjects differ from HHS regulations in several ways. Some key differences include the scope of coverage, the definition of \"human subject,\" and the requirements for informed consent. For more information on these differences, please refer to the Office for Human Research Protections at http://www.hhs.gov/ohrp/humansubjects/guidance/45cfr46.htm."
            },
            "distance": 0.29862797
          },
          "answer": "A general comparison appears in the FDA's Good Clinical Practice program website, Comparison of FDA and HHS Human Subject Protection Regulations found at\n\nhttp://www.fda.gov/ScienceResearch/SpecialTopics/RunningClinicalTrials/EducaionalMaterials/ucm112910.htm. For additional 