# Assignment 3

# In [23]:
from dotenv import load_dotenv
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import re
from google import genai
import pandas as pd
import time
from tqdm import tqdm

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a readable message: assigning None into os.environ raises a
# bare TypeError ("str expected, not NoneType") when the key is missing.
# NOTE(review): the chat model in this notebook is built without an explicit
# key, so it presumably reads GOOGLE_API_KEY from the environment - confirm.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if not _google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["GOOGLE_API_KEY"] = _google_api_key

# In [None]:
# Load the dataset; only its 'Text' column is used below.
data = pd.read_csv('adverse_drug_effects_pos.csv')
# Drop missing rows and coerce to str: the batches are glued together with
# str.join further down, which raises TypeError on NaN (float) cells.
texts = data['Text'].dropna().astype(str).tolist()

# Initialize the LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

# One "DrugName: Count" reply line, optionally led by a list bullet or number.
_DRUG_COUNT_LINE = re.compile(
    r"^\s*(?:[-•]\s*|\d+[.)]\s+)?(?P<drug>.+?)\s*:\s*(?P<count>\d+)"
)


def _response_text(response):
    """Return the reply carried by *response* as a single string."""
    content = getattr(response, "content", response)
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): LangChain documents message content as either a string
        # or a list of str/dict parts; dict parts are assumed to keep their
        # text under the "text" key - confirm against the installed version.
        pieces = []
        for part in content:
            if isinstance(part, dict):
                pieces.append(str(part.get("text", "")))
            else:
                pieces.append(str(part))
        return "\n".join(pieces)
    return str(content)


def _parse_drug_counts(result_text):
    """Parse ``DrugName: Count`` lines into a dict, summing repeated names."""
    mentions = {}
    for raw_line in result_text.strip().split('\n'):
        # Models often decorate the list with markdown (**bold**, `code`);
        # remove it so the name and the count are not polluted or missed.
        line = raw_line.replace("*", "").replace("`", "")
        match = _DRUG_COUNT_LINE.match(line)
        if not match:
            continue
        drug = match.group("drug").strip()
        if not drug:
            continue
        # A combined text can make the model list the same drug several
        # times; add the counts instead of keeping only the last one.
        mentions[drug] = mentions.get(drug, 0) + int(match.group("count"))
    return mentions


#Function to extract drug mentions using the LLM
def extract_drug_mentions(text):
    """Ask the LLM which drugs appear in *text* and parse its per-drug counts.

    The model is prompted to answer with one ``DrugName: Count`` line per
    drug. Reply lines that do not fit that shape are ignored.

    Args:
        text: Text to scan. The batch loop passes several records joined by
            blank lines.

    Returns:
        dict mapping each drug name to its summed count. Empty when the LLM
        call fails (the error is printed, not raised) or when no reply line
        could be parsed.
    """
    prompt = f"""
Extract all drug names mentioned in the following text and count their occurrences.
Return results in the format: DrugName: Count

Text:
{text}
"""
    try:
        response = llm.invoke(prompt)
    except Exception as e:
        # Best effort by design: one failed request must not abort the run.
        print(f"Error processing batch: {e}")
        return {}

    return _parse_drug_counts(_response_text(response))

#setting up batch processing
batch_size = 10
delay_between_batches = 5  # seconds
save_every = 5  # Save every 5 batches
drug_mentions = {}


def _save_mentions(path):
    """Write the current drug_mentions totals to *path* as CSV; return the frame."""
    frame = pd.DataFrame(list(drug_mentions.items()), columns=['Drug', 'Mentions'])
    frame.to_csv(path, index=False)
    return frame


#progress bar for batch processing
num_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division
for batch_num in tqdm(range(num_batches), desc="Processing batches"):
    start_idx = batch_num * batch_size
    end_idx = start_idx + batch_size
    batch = texts[start_idx:end_idx]
    # Each batch is sent to the model as one text, records separated by a
    # blank line.
    combined_text = "\n\n".join(batch)

    # An empty dict comes back both for "no drugs found" and for a failed
    # request, so a failed batch simply contributes nothing to the totals.
    extracted = extract_drug_mentions(combined_text)
    for drug, count in extracted.items():
        drug_mentions[drug] = drug_mentions.get(drug, 0) + count

    is_last_batch = batch_num == num_batches - 1

    #save progress every few batches
    if (batch_num + 1) % save_every == 0 or is_last_batch:
        temp_df = _save_mentions("drug_mentions_progress.csv")

    # Pause between requests (presumably to stay under the API rate limit);
    # nothing follows the last batch, so do not wait after it.
    if not is_last_batch:
        time.sleep(delay_between_batches)

# Final save
final_df = _save_mentions("drug_mentions_final.csv")
print("✅ All done. Final results saved to 'drug_mentions_final.csv'.")

# Output: Processing batches:  82%|████████▏ | 349/428 [38:01<07:44,  5.88s/it]