In [1]:
%%capture
# %pip install scrapegraphai
%pip install scrapegraphai==1.20.0b1
%apt install chromium-chromedriver
%pip install nest_asyncio
%pip install playwright
%playwright install

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
# sanity
import json
from openai import OpenAI
from scrapegraphai.graphs import SmartScraperGraph
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
open_ai_key = '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'

In [5]:
# Load the CSV file
df = pd.read_csv("categorized_content_links.csv")

column_name = "url"  # replace with the actual column name
urls = df[column_name].dropna().unique().tolist()

print(len(urls))

In [6]:
# model schema
from pydantic import BaseModel, Field
from typing import Literal, List, Any

In [19]:
class PubHealthSchema(BaseModel):
    supporting_claim: str
    contradictory_claim: str
    ambiguous_claim: str # 0: +ve, 1: -ve, 2: ambiguous 
    
    def to_json(self):
        return {"supporting_claim": self.supporting_claim, "contradictory_claim": self.contradictory_claim, "ambiguous_claim": self.ambiguous_claim}
    
class Schema(BaseModel):
    claims: List[PubHealthSchema]
    
    def to_json(self):
        return [data.to_json() for data in self.claims]

In [None]:
prompt = """
Use the data from this url to generate a group of claims and evidences according to the description below:

Claim: A claim is an assertion or statement that proposes an idea, fact, or opinion. It is often the subject of verification, as it can contain elements that may be true, false, exaggerated, or misleading. Claims may require evidence or further investigation to determine their accuracy.

Evidence: The evidence provides context, evidence, and clarification regarding the claim. It may include a breakdown of factual elements, highlight any inaccuracies, and explain nuances that help the reader understand the validity or implications of the claim. Explanations aim to inform by distinguishing between what is supported by evidence and what may be incorrect or misleading.

Label: The label is a categorization of the claim after it has been evaluated. It often indicates the nature or degree of accuracy of the claim, such as whether it is true, false, partially true, misleading, or speculative. Labels help in classifying claims for easier identification and understanding of their reliability.
 
Ensure that each claim is only one sentence long. Ensure that each evidence is NOT more than 4 sentences long. The values of the labels can be 0 or 1. 0 is when the evidence SUPPORTS the claim and 1 is when the evidence disproves the claim. Only generate claims which have the label of 1 for now.
"""

In [40]:
graph_config = {
    "llm": {
        "api_key": open_ai_key,
        "model": "openai/gpt-4o-mini",
    },
    "verbose": True,
    "headless": False,
}

schema = Schema(src=urls[0])

smart_scraper_graph = SmartScraperGraph(
    prompt="prompt",
    source=urls[0],
    schema=schema,
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

ValidationError: 1 validation error for Schema
data
  Field required [type=missing, input_value={'src': 'https://sniv3r2....ions-to-seminal-plasma'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

## Output to CSV

In [28]:
def export_schema_to_csv(schema, file_name: str):    
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(schema['data'])
    
    # Export DataFrame to CSV
    df.to_csv(file_name, index=False)

In [29]:
export_schema_to_csv(result, "test-1.csv")

In [20]:
def get_prompt(num_samples):
    prompt = f"""You are a medical expert. You will be given a medical document, generate three types of claims:\n\n1) A supporting claim that paraphrases a key assertion.\n2) A contradictory claim that directly contradicts a key evidence provided in the summary.\n3) An ambiguous claim that either partially supports or contradicts, or presents elements that are neither clearly supported nor contradicted.\n\nEach claim should be one or two sentences long. Ideally, the claims should be generated from different key assertions or sections of the summary.\n\nReturn ONLY the claims in this exact JSON format below. DO NOT include any extra text or explanations. DO NOT add ```json``` formatting. Just output the exact JSON as a string.\n[{{\n  \"supporting_claim\": '...',\n  \"contradictory_claim\": '...',\n  \"ambiguous_claim\": '...'\n}}].\n\n Output {num_samples} triplets of supporting, contradictory, and ambiguous claims from the provided summary. Give all {num_samples} triplets in the same JSON array."""
    return prompt

In [21]:
client = OpenAI(api_key=open_ai_key)

In [22]:
text = open("./uptodate/1.txt", "r").read()

In [26]:
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": get_prompt(20)},
        {"role": "user", "content": f"Document:\n\n{text}"}
    ],
    response_format=Schema,
)

In [27]:
completion.choices[0].message.parsed.to_json()

[{'supporting_claim': 'UpToDate content is not meant to replace professional medical advice and should only be used as a supplementary source of information.',
  'contradictory_claim': 'UpToDate is intended to provide definitive medical advice that can be solely relied upon for treatment decisions.',
  'ambiguous_claim': 'While UpToDate offers extensive information on birth control methods, the effectiveness of these methods can vary significantly depending on individual circumstances.'},
 {'supporting_claim': 'IUDs and implants are among the most effective birth control methods due to their low risk of failure.',
  'contradictory_claim': 'Contraceptives such as the condom and diaphragm are just as effective as long-term methods like IUDs.',
  'ambiguous_claim': 'The effectiveness of birth control methods can greatly depend on how consistently and correctly they are used, leaving some uncertainty regarding their overall reliability.'},
 {'supporting_claim': 'Emergency contraception can

## Semantic Chunking

In [14]:
import re
import numpy as np
import csv
import json
import logging
import os
import sys
import time
import traceback
from functools import partial
from multiprocessing import Lock, Manager, Pool


In [17]:
import pandas as pd
from langchain.agents import AgentExecutor, Tool, initialize_agent
from langchain.chains import LLMChain
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import JsonToolkit, create_json_agent
from langchain_community.llms import OpenAI
from langchain_community.tools.json.tool import JsonSpec
from langchain_openai import ChatOpenAI
from tqdm import tqdm

In [3]:
from langchain.embeddings import OpenAIEmbeddings
oaiembeds = OpenAIEmbeddings(openai_api_key="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")

  oaiembeds = OpenAIEmbeddings(openai_api_key="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")


In [5]:
def combine_sentences(sentences, buffer_size=1):
    # Go through each sentence dict
    for i in range(len(sentences)):

        # Create a string that will hold the sentences which are joined
        combined_sentence = ''

        # Add sentences before the current one, based on the buffer size.
        for j in range(i - buffer_size, i):
            # Check if the index j is not negative (to avoid index out of range like on the first one)
            if j >= 0:
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += sentences[j]['sentence'] + ' '

        # Add the current sentence
        combined_sentence += sentences[i]['sentence']

        # Add sentences after the current one, based on the buffer size
        for j in range(i + 1, i + 1 + buffer_size):
            # Check if the index j is within the range of the sentences list
            if j < len(sentences):
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += ' ' + sentences[j]['sentence']

        # Then add the whole thing to your dict
        # Store the combined sentence in the current sentence dict
        sentences[i]['combined_sentence'] = combined_sentence

    return sentences

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        
        # Calculate cosine similarity
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        
        # Convert to cosine distance
        distance = 1 - similarity

        # Append cosine distance to the list
        distances.append(distance)

        # Store distance in the dictionary
        sentences[i]['distance_to_next'] = distance

    # Optionally handle the last sentence
    # sentences[-1]['distance_to_next'] = None  # or a default value

    return distances, sentences

In [9]:
def generate_chunks(filepath):
    with open(filepath) as f:
        text = f.read()
    single_sentences_list = re.split(r'(?<=[.?!])\s+', text)
    # print (f"{len(single_sentences_list)} senteneces were found")
    sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(single_sentences_list)]
    sentences = combine_sentences(sentences)
    embeddings = oaiembeds.embed_documents([x['combined_sentence'] for x in sentences])
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    distances, sentences = calculate_cosine_distances(sentences)
    breakpoint_percentile_threshold = 90
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold) # If you want more chunks, lower the percentile cutoff

    # Then we'll see how many distances are actually above this one
    num_distances_above_theshold = len([x for x in distances if x > breakpoint_distance_threshold]) # The amount of distances above your threshold
    # plt.text(x=(len(distances)*.01), y=y_upper_bound/50, s=f"{num_distances_above_theshold + 1} Chunks");

    # Then we'll get the index of the distances that are above the threshold. This will tell us where we should split our text
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold] # The indices of those breakpoints on your list
    # Initialize the start index
    start_index = 0

    # Create a list to hold the grouped sentences
    chunks = []

    # Iterate through the breakpoints to slice the sentences
    for index in indices_above_thresh:
        # The end index is the current breakpoint
        end_index = index

        # Slice the sentence_dicts from the current start index to the end index
        group = sentences[start_index:end_index + 1]
        combined_text = ' '.join([d['sentence'] for d in group])
        chunks.append(combined_text)
        
        # Update the start index for the next group
        start_index = index + 1

    # The last group, if any sentences remain
    if start_index < len(sentences):
        combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
        chunks.append(combined_text)
        
    return chunks


In [10]:
chunks = generate_chunks("./UTD2txt/10.txt")

In [21]:
def init_files(output_file, log_file):
    """Initialize output and log files with headers if they don't exist."""
    if not os.path.exists(output_file):
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Topic', 'Evidence', 'Supports', 'Contradicts', 'Ambiguous'])
            writer.writeheader()
    
    if not os.path.exists(log_file):
        with open(log_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Processed_Topic'])
            writer.writeheader()

# %%
# File paths
log_file = os.path.join('.', 'csv', 'uptodate_process_log.csv')
output_file = os.path.join('.', 'csv', 'claim_triplets_uptodate.csv')
input_file = os.path.join('.', 'csv', 'categorized_content_links_unique.csv')

# OpenAI API key
# openai_api_key = "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"
# openai_api_key = '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'
openai_api_key = "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"
# API key for OpenAI (Harshini)
# openai_api_key = "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"
# openai_api_key = os.environ['OPENAI_API_KEY']

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize files
init_files(output_file, log_file)

In [20]:
def write_row(file_path, row_dict, file_lock):
    """Write a single row to the CSV file in a thread-safe manner."""
    with file_lock:
        file_exists = os.path.exists(file_path)
        mode = 'a' if file_exists else 'w'
        
        with open(file_path, mode, newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(row_dict.keys()))
            if not file_exists:
                writer.writeheader()
            writer.writerow(row_dict)

In [35]:
def generate_claim_triplet(summary):
    """Generate claim triplets from a summary."""
    try:
        prompt = """
        Using the following detailed summary, generate three types of claims:\n\n1) A supporting claim that paraphrases a key assertion.\n2) A contradictory claim that directly contradicts a key evidence provided in the summary.\n3) An ambiguous claim that either partially supports or contradicts, or presents elements that are neither clearly supported nor contradicted.\n\nEach claim should be one or two sentences long. Ideally, the claims should be generated from different key assertions or sections of the summary.\n\nReturn ONLY the claims in this exact JSON format below. DO NOT include any extra text or explanations. DO NOT add ```json``` formatting. Just output the exact JSON as a string.\n[{{\n  \"supporting_claim\": '...',\n  \"contradictory_claim\": '...',\n  \"ambiguous_claim\": '...'\n}}].\n\n Output {num_samples} triplets of supporting, contradictory, and ambiguous claims from the provided summary. Give all {num_samples} triplets in the same JSON array.\n\n\n\n

        Summary:
        {summary}
        """
        # response = agent.run(prompt)
        llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o-mini")
        prompt = PromptTemplate(
            input_variables=["summary", "additional_details"],
            template=prompt
        )
        # structured = llm.with_structured_output(Claims)
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        response = llm_chain.run({"summary": summary, "num_samples": 4})
        
        print("JSON Response", response)
        
        response = json.loads(response)
        
        return response
    
    except Exception as e:
        logging.error(f"Error generating claims for summary: {e}")
        traceback.print_exc()
        return None

In [26]:
out = generate_claim_triplet(chunks[5])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


JSON Response [{
  "supporting_claim": "The timing of antiretroviral therapy (ART) is crucial to reduce the incidence of immune reconstitution inflammatory syndrome (IRIS) among patients with tuberculosis (TB).",
  "contradictory_claim": "IRIS occurs in fewer than 8 percent of patients with TB, contradicting the reported prevalence of 8 to 43 percent.",
  "ambiguous_claim": "While management of TB-IRIS is similar to that of IRIS from other conditions, the specific details of treatment differences remain unclear."
}, {
  "supporting_claim": "Patients with an initial CD4 count below 100/microL face a higher risk of developing IRIS.",
  "contradictory_claim": "Patients with a significant reduction in viral load and an increase in CD4 count do not experience an increased risk of IRIS, which contradicts the evidence presented.",
  "ambiguous_claim": "The clinical manifestations of IRIS can vary widely, leading to some uncertainty about whether all symptoms are directly linked to TB."
}, {
 

In [33]:
out[0].get("supporting_claim", "na")

'The timing of antiretroviral therapy (ART) is crucial to reduce the incidence of immune reconstitution inflammatory syndrome (IRIS) among patients with tuberculosis (TB).'

In [37]:
logging.basicConfig(level=logging.INFO)

In [1]:
import os

manager = Manager()
file_lock = manager.Lock()

log_df = pd.read_csv(log_file)
try:
    processed_files = set(log_df["processed_file"])
except:
    processed_files = []

folder_path = './UTD2txt'
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        if file_path in processed_files:
            continue
        print(f"Processing {file_path}")
        chunks = generate_chunks(file_path)
        for chunk in chunks:
            responses = generate_claim_triplet(chunk)
            if not responses:
                continue
            for response in responses:
                # response = json.loads(response)
                
                # print("Writing to file", response)
                supports = response.get("supporting_claim", "N/A")
                contradicts = response.get("contradictory_claim", "N/A")
                ambiguous = response.get("ambiguous_claim", "N/A")
            
                # Prepare result row
                result = {
                    'Topic': file_path,
                    'Evidence': chunk,
                    'Supports': supports,
                    'Contradicts': contradicts,
                    'Ambiguous': ambiguous
                }
                
                # Write to output file immediately
                write_row(output_file, result, file_lock)
                
                # Write to log file
        write_row(log_file, {'processed_file': file_path}, file_lock)

NameError: name 'Manager' is not defined