# Part 1 - NER and Drug Count - BioNER

In [None]:
import pandas as pd
from transformers import AutoTokenizer, BertForTokenClassification
import torch
import torch.nn.functional as F

#bioner model and tokenizer
model_name = 'MilosKosRad/BioNER'
tokenizer = AutoTokenizer.from_pretrained(model_name)   
model = BertForTokenClassification.from_pretrained(model_name)

df = pd.read_csv('adverse_drug_effects_pos.csv', sep=',')

drug_mentions = []

for text in df['Text']:
    inputs = tokenizer('Drug', text, return_tensors='pt', padding=True, truncation=True)
    
    #model inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1).squeeze().tolist()

    tokens = tokenizer.tokenize(text)
    words = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())  # Convert all tokens in the sequence
    decoded = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())

    # Extract drug tokens
    drug_tokens = [decoded[i] for i, label in enumerate(predictions) if label == 1]
    drug_mentions.extend(drug_tokens)

#convert subword tokens to full entities
from collections import Counter

#merge subword tokens into full words
def merge_tokens(tokens):
    full_tokens = []
    current = ""
    for token in tokens:
        if token.startswith("##"):
            current += token[2:]
        else:
            if current:
                full_tokens.append(current) #append the last token if it exists
            current = token
    if current:
        full_tokens.append(current)
    return full_tokens

drug_counts = Counter(merge_tokens(drug_mentions))

# # Final result
# print("Drug Mentions Count:")
# for drug, count in drug_counts.items():
#     print(f"{drug}: {count}")

output_df = pd.DataFrame(drug_counts.items(), columns=['Drug', 'Count'])
output_df.to_csv('drug_mentions_count.csv', index=False)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Part 2 - Adverse Symptoms Associated with Each Drug - Bart Cause-Effect

In [None]:
from transformers import pipeline
from collections import defaultdict
import pandas as pd

# Load the BART cause-effect pipeline
cause_effect = pipeline("summarization", model="taskload/bart-cause-effect")

# Load your CSV
df = pd.read_csv('adverse_drug_effects_pos.csv', sep=',')

# Storage
causal_pairs_bart = defaultdict(int)

# Your existing NER helpers:
# extract_entities(label, text), merge_tokens(tokens) – keep these the same

def extract_entities(label, text):
    inputs = tokenizer(label, text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1).squeeze().tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
    entity_tokens = [tokens[i] for i, pred in enumerate(predictions) if pred == 1]
    return merge_tokens(entity_tokens)

# Merge subword tokens
def merge_tokens(tokens):
    full_tokens = []
    current = ""
    for token in tokens:
        if token.startswith("##"):
            current += token[2:]
        else:
            if current:
                full_tokens.append(current)
            current = token
    if current:
        full_tokens.append(current)
    return full_tokens

#processes each sentence
for text in df['Text']:
    drugs = extract_entities('Drug', text)
    symptoms = extract_entities('Symptom', text)

    #if either entity not found then skip
    if not drugs or not symptoms:
        continue

    try:
        result = cause_effect("text: " + text, max_length=300, min_length=30, do_sample=False)
        cause_effect_text = result[0]['summary_text'].lower()

        #match extracted cause-effect pairs with known drug/symptom entities
        for drug in drugs:
            for symptom in symptoms:
                if drug.lower() in cause_effect_text and symptom.lower() in cause_effect_text:
                    causal_pairs_bart[(drug.lower(), symptom.lower())] += 1
    except Exception as e:
        print(f"Error processing: {text}\n{e}")

causal_df_bart = pd.DataFrame([
    {'Drug': drug, 'Symptom': symptom, 'Count': count}
    for (drug, symptom), count in causal_pairs_bart.items()
])
causal_df_bart.sort_values(by='Count', ascending=False, inplace=True)
causal_df_bart.to_csv('drug_symptom_pairs_bart.csv', index=False)

Device set to use mps:0
Your max_length is set to 300, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 300, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 300, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 300, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('.

# Part 3 - Symptoms Cured by a Drug (Positive Relationships) 
Filtered with cause-effect outputs for contexts like “treats”, “relieves”, “cures”, “used for”, “administered to manage”, etc.
Also uses a keyword match in the summarized cause-effect text to check if the drug is being used in a therapeutic context.

In [5]:
#storage for therapeutic pairs
therapeutic_pairs_bart = defaultdict(int)

#'therapeutic' keywords
therapeutic_keywords = ['treat', 'relieve', 'cure', 'manage', 'used for', 'administered for']

for text in df['Text']:
    drugs = extract_entities('Drug', text)
    symptoms = extract_entities('Symptom', text)

    if not drugs or not symptoms:
        continue

    try:
        result = cause_effect("text: " + text, max_length=300, min_length=30, do_sample=False)
        cause_effect_text = result[0]['summary_text'].lower()

        #only proceed if the text seems to describe a therapeutic (non adverse) relation
        if any(keyword in cause_effect_text for keyword in therapeutic_keywords):
            for drug in drugs:
                for symptom in symptoms:
                    if drug.lower() in cause_effect_text and symptom.lower() in cause_effect_text:
                        therapeutic_pairs_bart[(drug.lower(), symptom.lower())] += 1
    except Exception as e:
        print(f"Error processing: {text}\n{e}")

#save therapeutic pairs
therapeutic_df_bart = pd.DataFrame([
    {'Drug': drug, 'Symptom': symptom, 'Count': count}
    for (drug, symptom), count in therapeutic_pairs_bart.items()
])
therapeutic_df_bart.sort_values(by='Count', ascending=False, inplace=True)
therapeutic_df_bart.to_csv('drug_symptom_pairs_therapeutic.csv', index=False)

Your max_length is set to 300, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 300, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 300, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 300, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your

# Observations about correctness

## Entity Recognition Accuracy (BioNER)
The BioNER model performs reasonably well in identifying drug and symptom entities, but some limitations are a bit apparent, such as missed entities in longer and more complex sentences (probably due to token missclassification). Subword issues might also be a cause. While the token merging does help to address some fragmented outputs, but some edge cases might end up splitting entities incorrectly. 

## Cause-Effect Detection Reliability (BART)
The general cause-effect summaries produced by BART were fine, but the summarization occasionally ommitted key terms, particularly in longer text inputs. Matching extracted entities with summary text was case insensitive and keyword based, and though this effective, it also quite simple and not necessarily the most accurate. 

## Causality Filtering - therapeutic vs adverse
For adverse effects, I did not use any filtering beyond BART summaries, under the assumption that the data was pre-labeled as adverse (POS). I added simple keyword rules which improved the precision but might have missed legitimate therapeutic relationships described in other words (like improved, bettered), and inlcuded false positives where keywords appeared but were not clearly linked causally. 

Finally, the frequency counts are dependent on both NER success and summarization matching, so each pairs count is only as good as the above pipelines.