In [1]:
%%capture
# %pip install scrapegraphai
%pip install scrapegraphai==1.20.0b1
%apt install chromium-chromedriver
%pip install nest_asyncio
%pip install playwright
%playwright install

In [30]:
import nest_asyncio
nest_asyncio.apply()

In [1]:
# sanity
import json
from openai import OpenAI
from scrapegraphai.graphs import SmartScraperGraph

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from .graph_iterator_node import GraphIteratorNode


In [2]:
open_ai_key = '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'

In [5]:
# import pandas as pd

# # Load the CSV file
# df = pd.read_csv("categorized_content_links.csv")

# column_name = "url"  # replace with the actual column name
# urls = df[column_name].dropna().unique().tolist()

# print(len(urls))

In [6]:
# model schema
from pydantic import BaseModel, Field
from typing import Literal, List, Any

In [19]:
class PubHealthSchema(BaseModel):
    supporting_claim: str
    contradictory_claim: str
    ambiguous_claim: str # 0: +ve, 1: -ve, 2: ambiguous 
    
    def to_json(self):
        return {"supporting_claim": self.supporting_claim, "contradictory_claim": self.contradictory_claim, "ambiguous_claim": self.ambiguous_claim}
    
class Schema(BaseModel):
    claims: List[PubHealthSchema]
    
    def to_json(self):
        return [data.to_json() for data in self.claims]

In [None]:
prompt = """
Use the data from this url to generate a group of claims and evidences according to the description below:

Claim: A claim is an assertion or statement that proposes an idea, fact, or opinion. It is often the subject of verification, as it can contain elements that may be true, false, exaggerated, or misleading. Claims may require evidence or further investigation to determine their accuracy.

Evidence: The evidence provides context, evidence, and clarification regarding the claim. It may include a breakdown of factual elements, highlight any inaccuracies, and explain nuances that help the reader understand the validity or implications of the claim. Explanations aim to inform by distinguishing between what is supported by evidence and what may be incorrect or misleading.

Label: The label is a categorization of the claim after it has been evaluated. It often indicates the nature or degree of accuracy of the claim, such as whether it is true, false, partially true, misleading, or speculative. Labels help in classifying claims for easier identification and understanding of their reliability.
 
Ensure that each claim is only one sentence long. Ensure that each evidence is NOT more than 4 sentences long. The values of the labels can be 0 or 1. 0 is when the evidence SUPPORTS the claim and 1 is when the evidence disproves the claim. Only generate claims which have the label of 1 for now.
"""

In [40]:
graph_config = {
    "llm": {
        "api_key": open_ai_key,
        "model": "openai/gpt-4o-mini",
    },
    "verbose": True,
    "headless": False,
}

schema = Schema(src=urls[0])

smart_scraper_graph = SmartScraperGraph(
    prompt="prompt",
    source=urls[0],
    schema=schema,
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

ValidationError: 1 validation error for Schema
data
  Field required [type=missing, input_value={'src': 'https://sniv3r2....ions-to-seminal-plasma'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

## Output to CSV

In [28]:
def export_schema_to_csv(schema, file_name: str):    
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(schema['data'])
    
    # Export DataFrame to CSV
    df.to_csv(file_name, index=False)

In [29]:
export_schema_to_csv(result, "test-1.csv")

In [20]:
def get_prompt(num_samples):
    prompt = f"""You are a medical expert. You will be given a medical document, generate three types of claims:\n\n1) A supporting claim that paraphrases a key assertion.\n2) A contradictory claim that directly contradicts a key evidence provided in the summary.\n3) An ambiguous claim that either partially supports or contradicts, or presents elements that are neither clearly supported nor contradicted.\n\nEach claim should be one or two sentences long. Ideally, the claims should be generated from different key assertions or sections of the summary.\n\nReturn ONLY the claims in this exact JSON format below. DO NOT include any extra text or explanations. DO NOT add ```json``` formatting. Just output the exact JSON as a string.\n[{{\n  \"supporting_claim\": '...',\n  \"contradictory_claim\": '...',\n  \"ambiguous_claim\": '...'\n}}].\n\n Output {num_samples} triplets of supporting, contradictory, and ambiguous claims from the provided summary. Give all {num_samples} triplets in the same JSON array."""
    return prompt

In [21]:
client = OpenAI(api_key=open_ai_key)

In [22]:
text = open("./uptodate/1.txt", "r").read()

In [26]:
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": get_prompt(20)},
        {"role": "user", "content": f"Document:\n\n{text}"}
    ],
    response_format=Schema,
)

In [27]:
completion.choices[0].message.parsed.to_json()

[{'supporting_claim': 'UpToDate content is not meant to replace professional medical advice and should only be used as a supplementary source of information.',
  'contradictory_claim': 'UpToDate is intended to provide definitive medical advice that can be solely relied upon for treatment decisions.',
  'ambiguous_claim': 'While UpToDate offers extensive information on birth control methods, the effectiveness of these methods can vary significantly depending on individual circumstances.'},
 {'supporting_claim': 'IUDs and implants are among the most effective birth control methods due to their low risk of failure.',
  'contradictory_claim': 'Contraceptives such as the condom and diaphragm are just as effective as long-term methods like IUDs.',
  'ambiguous_claim': 'The effectiveness of birth control methods can greatly depend on how consistently and correctly they are used, leaving some uncertainty regarding their overall reliability.'},
 {'supporting_claim': 'Emergency contraception can