In [30]:
!pip install langchain langchain_community duckduckgo-search google-serp-api wikipedia



In [31]:
!pip install --upgrade langchain

^C
Traceback (most recent call last):
  File "/Users/rateria/Code/cs-5787-final-project/.conda/bin/pip", line 7, in <module>
    from pip._internal.cli.main import main
  File "<frozen importlib._bootstrap>", line 1002, in _find_and_load
KeyboardInterrupt


In [None]:
!pip install -U langchain-openai



In [None]:
import pandas as pd
import os
import openai
from langchain_openai import ChatOpenAI
from langchain.tools import DuckDuckGoSearchRun
from langchain.agents import initialize_agent, Tool
import logging
from multiprocessing import Pool, Manager, Lock
from functools import partial
import time
from tqdm import tqdm
import csv

In [None]:
def setup_agent(openai_api_key):
    """Initialize and return a LangChain agent with DuckDuckGo search capabilities."""
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)
    duckduckgo_search = DuckDuckGoSearchRun()
    
    tools = [
        Tool(
            name="DuckDuckGo Search",
            func=duckduckgo_search.run,
            description="Useful for retrieving detailed information based on a given topic."
        )
    ]
    
    return initialize_agent(
        tools=tools,
        llm=llm,
        agent="zero-shot-react-description",
        verbose=True,
        handle_parsing_errors=True
    )


In [None]:
def search_and_summarize(agent, topic):
    """Perform detailed search and generate summary."""
    try:
        prompt = (
            f"Search for detailed and technical information on the topic: '{topic}' "
            "from a medical standpoint. Provide a comprehensive, detailed summary in paragraph form. "
            "Avoid using bullet points or lists. The summary should focus on technical and medical aspects "
            "with a high level of detail, suitable for a medical professional audience."
        )
        summary = agent.run(prompt)
        return summary
    except Exception as e:
        logging.error(f"Error summarizing topic '{topic}': {e}")
        return None

In [None]:
def generate_claim_triplet(agent, summary):
    """Generate claim triplets from a summary."""
    try:
        prompt = (
            f"Using the following detailed summary, generate three types of claims:\n\n"
            f"Summary:\n{summary}\n\n"
            "1) A supporting claim that paraphrases a key assertion.\n"
            "2) A contradictory claim that directly contradicts a key evidence provided in the summary.\n"
            "3) An ambiguous claim that either partially supports or contradicts, or presents elements that are neither clearly supported nor contradicted.\n\n"
            "Each claim should be one or two sentences long. Ideally, the claims should be generated from different key assertions or sections of the summary."
        )
        response = agent.run(prompt)

        supports = response.split("1) Supporting Claim:")[1].split("2) Contradictory Claim:")[0].strip() if "1) Supporting Claim:" in response else "N/A"
        contradicts = response.split("2) Contradictory Claim:")[1].split("3) Ambiguous Claim:")[0].strip() if "2) Contradictory Claim:" in response else "N/A"
        ambiguous = response.split("3) Ambiguous Claim:")[1].strip() if "3) Ambiguous Claim:" in response else "N/A"

        return supports, contradicts, ambiguous
    except Exception as e:
        logging.error(f"Error generating claims for summary: {e}")
        return "N/A", "N/A", "N/A"


In [None]:
def write_row(file_path, row_dict, file_lock):
    """Write a single row to the CSV file in a thread-safe manner."""
    with file_lock:
        file_exists = os.path.exists(file_path)
        mode = 'a' if file_exists else 'w'
        
        with open(file_path, mode, newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(row_dict.keys()))
            if not file_exists:
                writer.writeheader()
            writer.writerow(row_dict)

In [None]:
def process_topic(args):
    """Process a single topic and write results immediately."""
    topic, openai_api_key, processed_topics, file_lock, output_file, log_file = args
    
    if topic in processed_topics:
        return None
    
    try:
        agent = setup_agent(openai_api_key)
        summary = search_and_summarize(agent, topic)
        
        if summary:
            supports, contradicts, ambiguous = generate_claim_triplet(agent, summary)
            
            # Prepare result row
            result = {
                'Topic': topic,
                'Evidence': summary,
                'Supports': supports,
                'Contradicts': contradicts,
                'Ambiguous': ambiguous
            }
            
            # Write to output file immediately
            write_row(output_file, result, file_lock)
            
            # Write to log file
            write_row(log_file, {'Processed_Topic': topic}, file_lock)
            
            return result
            
    except Exception as e:
        logging.error(f"Error processing topic '{topic}': {e}")
    
    return None

In [None]:
def init_files(output_file, log_file):
    """Initialize output and log files with headers if they don't exist."""
    if not os.path.exists(output_file):
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Topic', 'Evidence', 'Supports', 'Contradicts', 'Ambiguous'])
            writer.writeheader()
    
    if not os.path.exists(log_file):
        with open(log_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Processed_Topic'])
            writer.writeheader()

In [None]:
# File paths
log_file = './process_log.csv'
output_file = './generated_claim_triplets_with_topics.csv'
input_file = "./categorized_content_links_unique.csv"

# OpenAI API key
# openai_api_key = "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"
openai_api_key = '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'

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize files
init_files(output_file, log_file)

# Load data
df = pd.read_csv(input_file)
topics = df.iloc[:, 1].astype(str) + " " + df.iloc[:, 2].astype(str)[:5]

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
topics

0         Anaphylaxis Allergic reactions to seminal plasma
1                       Anaphylaxis Anaphylaxis in infants
2        Anaphylaxis Anaphylaxis in pregnant and breast...
3        Anaphylaxis Anaphylaxis induced by subcutaneou...
4                 Anaphylaxis Anaphylaxis: Acute diagnosis
                               ...                        
31858                                                  NaN
31859                                                  NaN
31860                                                  NaN
31861                                                  NaN
31862                                                  NaN
Length: 31863, dtype: object

In [None]:
def go_for_it():
    # Load processed topics
    processed_topics = set()
    if os.path.exists(log_file):
        log_df = pd.read_csv(log_file)
        processed_topics = set(log_df["Processed_Topic"])
    
    # Initialize multiprocessing resources
    manager = Manager()
    file_lock = manager.Lock()
    num_processes = min(os.cpu_count() - 1, 4)  # Use up to 4 processes or CPU count - 1
    
    # Prepare arguments for each topic
    process_args = [
        (topic, openai_api_key, processed_topics, file_lock, output_file, log_file)
        for topic in topics
    ]
    
    # Process topics in parallel with immediate output
    with Pool(num_processes) as pool:
        for _ in tqdm(pool.imap_unordered(process_topic, process_args), total=len(topics)):
            pass

In [None]:
go_for_it()

  0%|          | 0/31863 [00:00<?, ?it/s]Process SpawnPoolWorker-2:
Traceback (most recent call last):
  File "/Users/rateria/Code/cs-5787-final-project/.conda/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/rateria/Code/cs-5787-final-project/.conda/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rateria/Code/cs-5787-final-project/.conda/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/rateria/Code/cs-5787-final-project/.conda/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_topic' on <module '__main__' (built-in)>
Process SpawnPoolWorker-3:
Traceback (most recent call last):
  File "/Users/rateria/Code/cs-5787-final-project/.conda/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/rater

KeyboardInterrupt: 