In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files_here in os.walk('/kaggle/input'):
    for fname in files_here:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cancerfinetuning/val.csv
/kaggle/input/cancerfinetuning/train.csv
/kaggle/input/cancerfinetuning/test.csv


In [2]:
pip install transformers accelerate peft datasets bitsandbytes trl


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.

In [3]:
# Hyperparameters for the LoRA fine-tuning run, collected in one place.
# NOTE(review): none of the cells visible in this notebook read `config`; the
# later cells hard-code their own values, and some differ from the ones here
# (batch size 8, learning rate 1e-4, 5 epochs) — TODO reconcile.
config = dict(
    base_model="bert-base-uncased",
    # LoRA adapter settings
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    # Tokenisation / batching
    max_length=512,
    train_batch_size=4,
    eval_batch_size=4,
    # Optimisation
    learning_rate=2e-5,
    num_train_epochs=3,
    output_dir="/kaggle/working/lora_bert-base-uncased",
)


In [32]:
import pandas as pd
import torch
import torch.nn.functional as F  # ✅ this line is critical
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tqdm import tqdm
import json
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows


In [24]:
# Load the train / validation / test splits from the Kaggle input directory.
# NOTE(review): `load_data` is neither defined nor imported in any cell visible
# here, so unless it is defined in a cell not shown this raises NameError on a
# fresh kernel (Restart & Run All) — TODO restore its definition above this cell.
# Later cells read the "abstract", "label" and "id" columns from these frames.
train_df = load_data("/kaggle/input/cancerfinetuning/train.csv")
val_df = load_data("/kaggle/input/cancerfinetuning/val.csv")
test_df =load_data("/kaggle/input/cancerfinetuning/test.csv")


In [34]:
# ========== SETUP ==========
# Seed PyTorch before anything random happens: the classification head is newly
# initialised (see the warning this cell prints) and the training loader
# shuffles, so without a fixed seed every run gives different numbers.
torch.manual_seed(42)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "bert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA config (the same r / alpha / dropout values as listed in `config`)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Load the base model with a 2-class head, wrap it with the LoRA adapters and
# move the result to the selected device.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = get_peft_model(base_model, lora_config).to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
 # Tokenization helper
def tokenize_data(df):
    """Turn a split DataFrame into a ``TensorDataset`` ready for a DataLoader.

    The ``abstract`` column is encoded with the module-level ``tokenizer``,
    padded/truncated to 512 tokens, and paired with the ``label`` column.

    Args:
        df: DataFrame with an ``abstract`` (text) and a ``label`` column.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, labels)`` in row order,
        with ``labels`` stored as ``torch.long``.
    """
    abstracts = list(df["abstract"])
    encoded = tokenizer(
        abstracts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(df["label"].values, dtype=torch.long)
    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], label_tensor)

# Dataset loaders
# Only the training loader shuffles. Validation and test batches are served in
# dataset order so that predictions collected from them line up row-for-row
# with val_df / test_df when per-row results are written out.
train_loader = DataLoader(tokenize_data(train_df), batch_size=8, shuffle=True)
val_loader = DataLoader(tokenize_data(val_df), batch_size=8, shuffle=False)
test_loader = DataLoader(tokenize_data(test_df), batch_size=8, shuffle=False)

In [42]:
# AdamW over the model's parameters; the learning rate was raised from 2e-5.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-4,
)


In [43]:
# ========== TRAIN + VAL ==========
# Five passes over the training loader. After each pass the mean per-batch loss
# is reported for both the training loader and the validation loader.
for epoch in range(5):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f"[Train] Epoch {epoch+1}"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        ).loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            val_loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            total_val_loss += val_loss.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"📘 Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

[Train] Epoch 1: 100%|██████████| 75/75 [00:49<00:00,  1.52it/s]


📘 Epoch 1 - Train Loss: 0.7047 | Val Loss: 0.6923


[Train] Epoch 2: 100%|██████████| 75/75 [00:48<00:00,  1.55it/s]


📘 Epoch 2 - Train Loss: 0.6999 | Val Loss: 0.6837


[Train] Epoch 3: 100%|██████████| 75/75 [00:48<00:00,  1.54it/s]


📘 Epoch 3 - Train Loss: 0.6853 | Val Loss: 0.6671


[Train] Epoch 4: 100%|██████████| 75/75 [00:48<00:00,  1.53it/s]


📘 Epoch 4 - Train Loss: 0.6131 | Val Loss: 0.4313


[Train] Epoch 5: 100%|██████████| 75/75 [00:48<00:00,  1.54it/s]


📘 Epoch 5 - Train Loss: 0.2830 | Val Loss: 0.1922


In [44]:
# ========== EVALUATION ==========
# Integer class id -> human-readable class name.
label_map_rev = dict(enumerate(("Non-Cancer", "Cancer")))
# Accumulators filled by evaluate_and_collect: per-row prediction records,
# per-split metric rows, and one confusion-matrix frame per split name.
results, summary = [], []
confusion_blocks = dict()

def evaluate_and_collect(name, df, loader):
    """Evaluate the global ``model`` on one split and record metrics and predictions.

    Side effects: prints accuracy/F1, appends one row to the module-level
    ``summary`` list, stores a labelled confusion matrix in
    ``confusion_blocks[name]`` and appends one record per row of ``df`` to
    ``results``.

    Args:
        name: Split name used in the printout and in the recorded outputs.
        df: DataFrame with ``id`` and ``label`` columns, in the same row order
            as ``loader.dataset``.
        loader: DataLoader yielding ``(input_ids, attention_mask, labels)``
            batches. Its own iteration order is ignored (see below).

    Raises:
        ValueError: If the number of predictions does not match ``len(df)``.
    """
    model.eval()

    # The per-row records below pair prediction k with row k of `df`, so samples
    # must be visited in dataset order. The incoming loader may shuffle, so walk
    # its dataset through a fresh sequential loader instead.
    ordered_loader = DataLoader(
        loader.dataset,
        batch_size=loader.batch_size or 1,
        shuffle=False,
    )

    all_preds, all_probs, all_labels = [], [], []
    with torch.no_grad():
        for batch in ordered_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = F.softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if len(all_preds) != len(df):
        raise ValueError(
            f"{name}: got {len(all_preds)} predictions for {len(df)} rows; "
            "df and loader.dataset must describe the same samples"
        )

    # Metrics. Passing the class ids explicitly keeps the confusion matrix at a
    # fixed shape and order even when a class is missing from this split.
    class_ids = sorted(label_map_rev)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print(f"✅ {name} Accuracy: {acc:.2%} | F1: {f1:.2f}")

    summary.append({
        "Split": name,
        "Accuracy": f"{acc:.2%}",
        "F1-score": f"{f1:.2f}"
    })

    # Confusion matrix block. Rows are true classes and columns are predicted
    # classes, both in class-id order (0 = Non-Cancer, 1 = Cancer), so the axis
    # names are derived from label_map_rev in that same order.
    class_names = [label_map_rev[class_id] for class_id in class_ids]
    cm_df = pd.DataFrame(
        cm,
        index=[f"Actual {class_name}" for class_name in class_names],
        columns=[f"Predicted {class_name}" for class_name in class_names],
    )
    confusion_blocks[name] = cm_df

    # Predictions JSON style. Index by position rather than by df's index labels
    # so a non-default index cannot misalign the lookup.
    for pos, (_, row) in enumerate(df.iterrows()):
        results.append({
            "id": str(row['id']),
            "split": name,
            "true_label": label_map_rev[row['label']],
            "predicted_label": label_map_rev[all_preds[pos]],
            "confidence_scores": {
                "Cancer": round(float(all_probs[pos][1]), 3),
                "Non-Cancer": round(float(all_probs[pos][0]), 3)
            }
        })

# Evaluate all 3 splits, in train -> val -> test order. Each frame gets a fresh
# 0..n-1 index before it is handed to evaluate_and_collect.
for split_name, split_df, split_loader in (
    ("train", train_df, train_loader),
    ("val", val_df, val_loader),
    ("test", test_df, test_loader),
):
    evaluate_and_collect(split_name, split_df.reset_index(drop=True), split_loader)

# ========== EXPORT RESULTS ==========
# Per-row prediction records -> JSON file.
with open("/kaggle/working/bert_lora_all_predictions.json", "w") as predictions_file:
    predictions_file.write(json.dumps(results, indent=2))

# Metrics and confusion matrices -> one Excel workbook with two sheets.
excel_path = "/kaggle/working/bert_lora_report.xlsx"
wb = openpyxl.Workbook()

# Sheet 1: one row per split with its accuracy and F1.
ws_summary = wb.active
ws_summary.title = "Metrics"
summary_frame = pd.DataFrame(summary)
for sheet_row in dataframe_to_rows(summary_frame, index=False, header=True):
    ws_summary.append(sheet_row)

# Sheet 2: for each split, a title row, the confusion matrix, then a blank row.
ws_cm = wb.create_sheet("Confusion_Matrix")
for split_name, cm_frame in confusion_blocks.items():
    ws_cm.append([split_name])
    for sheet_row in dataframe_to_rows(cm_frame, index=True, header=True):
        ws_cm.append(sheet_row)
    ws_cm.append([])

wb.save(excel_path)
print(f"📄 Saved all results to: {excel_path}")

✅ train Accuracy: 96.17% | F1: 0.96
✅ val Accuracy: 95.00% | F1: 0.95
✅ test Accuracy: 95.00% | F1: 0.95
📄 Saved all results to: /kaggle/working/bert_lora_report.xlsx
