###########################################
### Sentiment Analysis Demonstration
### Description:
###   This script uses a pre-trained DistilBERT model
###   fine-tuned for sentiment classification on the SST-2 dataset.
###   It predicts whether the given text is POSITIVE or NEGATIVE.
###########################################


In [1]:
from transformers import pipeline
import pandas as pd
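# The commented-out code below lists the tasks supported by Transformers:
# requesting an unknown task raises an error, and printing it shows the supported task names.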
# try:
#     dummy_pipeline = pipeline(task="dummy")
# except Exception as e:
#     print(e)


In [2]:
# classifier = pipeline(task='text-classification')
# Build the general-purpose sentiment pipeline used in the first demo.
# Model: pretrained DistilBERT fine-tuned on the Stanford Sentiment Treebank (SST-2) dataset.
# Both the model name and the revision are passed explicitly rather than relying on the task default.
classifier = pipeline(task='text-classification',
                      model='distilbert-base-uncased-finetuned-sst-2-english',
                      revision='af0f99b')
# Other BERT models include bert-base-uncased and bert-large-cased. RoBERTa (roberta-base, roberta-large)
# has a similar architecture to BERT but is pre-trained differently, on more data. The GPT series
# (gpt2, gpt2-medium, gpt3) is built for text generation but can be adapted for classification with
# prompt engineering or fine-tuning.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [3]:
# Three short patient-style statements used as the running example in this notebook.
texts = [
    "The new treatment reduced my symptoms significantly!",
    "I experienced no improvement in my condition.",
    "I have mild side effects but overall I feel better."
]

# Classify all texts in one call; the result is shown as a table with "label" and "score" columns.
result = classifier(texts)
pd.DataFrame(result)


Unnamed: 0,label,score
0,NEGATIVE,0.988233
1,NEGATIVE,0.999614
2,POSITIVE,0.982663


In [4]:
# The example above classifies the first text as NEGATIVE, probably because a general-purpose model
# can misinterpret it - it may treat "symptoms" as negative even though the overall message is positive.
# Possible remedies:
# 1. Use a domain-specific model
# 2. Incorporate negation handling
# 3. Emphasize negation during fine-tuning, or
# 4. Use advanced LLMs (GPTs) with prompting.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Remedy 1: try a biomedical-domain checkpoint (BioBERT).
model_name = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 requests a two-class classification head. The load-time warning reports that
# 'classifier.weight' / 'classifier.bias' are newly initialized, i.e. this head is untrained.
bio_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Wrap the model and tokenizer objects (rather than a model name) in a text-classification pipeline.
bio_classifier = pipeline(
    task="text-classification",
    model=bio_model,
    tokenizer=tokenizer
)

# Same three example sentences as in the DistilBERT demo, for a side-by-side comparison.
texts = [
    "The new treatment reduced my symptoms significantly!",
    "I experienced no improvement in my condition.",
    "I have mild side effects but overall I feel better."
]

# Labels come back as generic LABEL_0 / LABEL_1 because no label names are configured here.
result = bio_classifier(texts)
pd.DataFrame(result)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Unnamed: 0,label,score
0,LABEL_1,0.621528
1,LABEL_1,0.5974
2,LABEL_1,0.611964


In [5]:
# BioBERT is not trained or fine-tuned for sentiment classification; it is only pre-trained on biomedical corpora to capture domain language.
# To get accurate sentiment predictions, fine-tune it on a relevant labeled sentiment dataset.

# 2. Negation handling

In [6]:
import re

# Negation triggers. Besides common English negators, the list also holds
# "reduced" / "decrease", which are treated as triggers for the words that follow.
# "n't" is matched as a contraction suffix (e.g. "don't", "isn't").
NEGATION_WORDS = ["not", "no", "never", "none", "without", "n't", "reduced", "decrease"]

# Punctuation stripped from both ends of a token before it is compared against
# NEGATION_WORDS, so that "No," or "(never" are still recognised as triggers.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"


def _is_negation_trigger(token):
    """Return True when *token* is a negation trigger, ignoring case and edge punctuation."""
    # Normalise the typographic apostrophe so "don’t" behaves like "don't".
    core = token.lower().replace("\u2019", "'").strip(_EDGE_PUNCTUATION)
    if core in NEGATION_WORDS:
        return True
    # Whitespace tokenisation never yields a bare "n't", so treat it as a suffix.
    return "n't" in NEGATION_WORDS and core.endswith("n't")


def apply_negation_tagging(text, window_size=3):
    """
    Add a "_NEG" suffix to 'window_size' words following a negation term.
    This is a crude approach but can help the model see negation more explicitly.

    Args:
        text: Input sentence; it is tokenised on whitespace.
        window_size: Number of tokens after each trigger that receive the suffix.

    Returns:
        The tokens re-joined with single spaces, with "_NEG" appended to every
        token inside a negation window. Trigger tokens themselves are never
        tagged; a trigger met inside an active window restarts the window.
    """
    tokens = text.split()
    remaining = 0  # how many upcoming tokens still need the _NEG suffix

    for i, token in enumerate(tokens):
        if _is_negation_trigger(token):
            # (Re)open the negation scope; the trigger itself stays untouched.
            remaining = window_size
        elif remaining > 0:
            tokens[i] = token + "_NEG"
            remaining -= 1

    return " ".join(tokens)

def negation_preprocessing(texts):
    """
    Return a new list holding the negation-tagged version of every text in ``texts``.
    """
    return [apply_negation_tagging(text) for text in texts]

# Example usage (same three sentences as in the earlier demos):
texts = [
    "The new treatment reduced my symptoms significantly!",
    "I experienced no improvement in my condition.",
    "I have mild side effects but overall I feel better."
]

# 1. Pre-process text with negation tagging
tagged_texts = negation_preprocessing(texts)

# 2. Feed to your pipeline
# NOTE(review): `bio_classifier` is the BioBERT pipeline created earlier, whose classification
# head was reported as newly initialized at load time, so these scores are not meaningful
# sentiment predictions. Consider running the SST-2 `classifier` here for a real comparison.
results = bio_classifier(tagged_texts)
print(tagged_texts)
pd.DataFrame(results)

['The new treatment reduced my_NEG symptoms_NEG significantly!_NEG', 'I experienced no improvement_NEG in_NEG my_NEG condition.', 'I have mild side effects but overall I feel better.']


Unnamed: 0,label,score
0,LABEL_1,0.606981
1,LABEL_1,0.58972
2,LABEL_1,0.611964


# Clinical specific models

1.   BioBERT : Pretrained on PubMed abstracts and PMC articles. Good for classification, NER
2.   PubMedBERT : Trained exclusively on PubMed abstracts; excellent baseline for biomed tasks
3.   SciBERT : Pretrained on scientific texts from Semantic Scholar. Good for broad scientific domains.
4.   BioGPT : Focused on biomedical text generation and advanced tasks like summarization, question answering.
5.   ClinicalBERT : Tailored for clinical notes. Useful for tasks involving EHR data or clinical text.



In [None]:
#############################################
# Fine-Tuning SciBERT on scicite Dataset
# Description:
#   - Loads the scicite dataset from Hugging Face
#   - Fine-tunes SciBERT on citation classification
#   - Evaluates classification accuracy
#
# Potential Biotech Relevance:
#   - Similar pipelines can classify biological/medical
#     literature, e.g., distinguishing background vs. methods vs. results
#     in biotech publications or clinical trial reports
#############################################
# !pip install datasets
import os
import numpy as np

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Load the scicite dataset from Hugging Face
dataset = load_dataset("scicite")

# The dataset has three splits: train, validation, test.
# "train" is used for fitting and "validation" for evaluation during training;
# "test" is held out for the final evaluation.
print(dataset)

# 2. Pick a model & tokenizer
#    We'll use "allenai/scibert_scivocab_uncased" for scientific text.
#    Note: this rebinds `model_name` and `tokenizer`, which earlier pointed at BioBERT.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# 3. Preprocess the data
#    We'll tokenize the input text, and keep the "label" as is.
def tokenize_function(examples):
    """Tokenize the "string" field of ``examples``, padding/truncating to 128 tokens."""
    tokenizer_options = {
        "padding": "max_length",  # or "longest"
        "truncation": True,
        "max_length": 128,        # set a max length suitable for your data
    }
    return tokenizer(examples["string"], **tokenizer_options)

# Tokenize the dataset; batched=True hands tokenize_function many examples per call.
encoded_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/8194 [00:00<?, ? examples/s]

In [10]:
# The dataset has a "label" column with values [0, 1, 2]
# scicite dataset also has "cites" and "label2". We'll focus on "label".
# Drop the raw text (already tokenized) and two columns that are not used for training.
encoded_dataset = encoded_dataset.remove_columns(["string", "label2", "excerpt_index"])
# Rename the target column to "labels" (presumably the name the model/Trainer expects - confirm).
encoded_dataset = encoded_dataset.rename_column("label", "labels")
# Return PyTorch tensors when the dataset is indexed.
# NOTE(review): any remaining non-numeric columns are presumably dropped by the Trainer's
# default unused-column removal - confirm against the dataset schema.
encoded_dataset.set_format("torch")

# Split handles used by the Trainer and by the final evaluation.
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]
test_dataset = encoded_dataset["test"]

# 4. Load model for sequence classification
#    The classification head is newly initialized (see the load-time warning) and is trained below.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # scicite has 3 classes
)

# 5. Define the metrics function for evaluation
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and weighted precision/recall/F1."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)
    # Returns (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(labels, predicted, average="weighted")
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Set training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
# confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results_scicite",      # checkpoints are written here
    evaluation_strategy="epoch",         # evaluate on the eval dataset once per epoch
    save_strategy="epoch",               # checkpoint once per epoch (same cadence as evaluation)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,         # reload the best checkpoint once training finishes
    metric_for_best_model="accuracy"     # "best" is judged by the accuracy from compute_metrics
)
# For a quick run, use the lighter settings below (fp16=True assumes a GPU with half-precision support, so drop it on CPU)
# training_args = TrainingArguments(
#     output_dir="./results_scicite_quick",
#     num_train_epochs=1,             # Lower epochs => faster training
#     per_device_train_batch_size=16, # Larger batch on GPU => fewer steps per epoch
#     per_device_eval_batch_size=16,
#     evaluation_strategy="no",       # Turn off frequent evaluation => faster
#     save_strategy="no",             # Disable saving multiple checkpoints
#     learning_rate=5e-5,             # Slightly higher LR => converges faster (less stable though)
#     logging_steps=5000,             # Log less often => less overhead
#     disable_tqdm=True,              # Reduces console output overhead
#     fp16=True,                      # If using GPU w/ half-precision support => faster
#     # load_best_model_at_end=False, # No best model => skip overhead of tracking best
# )

# 7. Create the Trainer
#    It trains on the train split, evaluates on the validation split,
#    and reports the metrics returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 8. Train the model (fine-tunes SciBERT for the configured number of epochs)
trainer.train()

In [14]:
# 9. Evaluate on the test set
#    The returned dict holds the loss and the compute_metrics values under "eval_"-prefixed keys,
#    plus runtime statistics. Note: this rebinds `results`, used earlier for the pipeline output.
results = trainer.evaluate(test_dataset)
print("Test set evaluation:",results)
# pd.DataFrame(results)

#############################################
# End of script
#############################################

Test set evaluation: {'eval_loss': 0.39392197132110596, 'eval_accuracy': 0.8687466379774073, 'eval_f1': 0.870561971878682, 'eval_precision': 0.8778352259065095, 'eval_recall': 0.8687466379774073, 'eval_runtime': 4.5428, 'eval_samples_per_second': 409.218, 'eval_steps_per_second': 51.29, 'epoch': 3.0}


## Next, try ChemProt (available in HF datasets): classify protein-chemical interactions from biomedical literature, labelling text as “interaction/no-interaction” or “disease mention/no mention”.
