In [1]:
import pandas as pd
from transformers import AutoTokenizer, BertForTokenClassification
import torch
import torch.nn.functional as F
from collections import defaultdict

# Load model and tokenizer
model_name = 'MilosKosRad/BioNER'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Load your CSV
df = pd.read_csv('adverse_drug_effects_pos.csv', sep=',')

# Helper function to extract named entities based on label
def extract_entities(label, text):
    inputs = tokenizer(label, text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1).squeeze().tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
    entity_tokens = [tokens[i] for i, pred in enumerate(predictions) if pred == 1]
    return merge_tokens(entity_tokens)

# Merge subword tokens
def merge_tokens(tokens):
    full_tokens = []
    current = ""
    for token in tokens:
        if token.startswith("##"):
            current += token[2:]
        else:
            if current:
                full_tokens.append(current)
            current = token
    if current:
        full_tokens.append(current)
    return full_tokens

# Store frequency of (drug, symptom) pairs
drug_symptom_pairs = defaultdict(int)

# Process each sentence
for text in df['Text']:
    drugs = extract_entities('Drug', text)
    symptoms = extract_entities('Symptom', text)
    
    for drug in drugs:
        for symptom in symptoms:
            pair = (drug.lower(), symptom.lower())
            drug_symptom_pairs[pair] += 1

# Convert to DataFrame
pair_df = pd.DataFrame([
    {'Drug': drug, 'Symptom': symptom, 'Count': count}
    for (drug, symptom), count in drug_symptom_pairs.items()
])
pair_df = pair_df.sort_values(by='Count', ascending=False)

# Save to CSV
pair_df.to_csv('drug_symptom_pairs.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
