# In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BioGptTokenizer, BioGptForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score, classification_report
from scipy import stats
import numpy as np
import spacy

# Read the report/label table from disk.
DATA_PATH = "D:/Downloads/newData.csv"
df = pd.read_csv(DATA_PATH)

# Optional spaCy preprocessing. It is disabled by being wrapped in a string
# literal, so nothing between the triple quotes is executed.
'''
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

df['preprocessed_txt'] = df['Report'].apply(preprocess)
'''
# Alternative kept for reference: copy the raw text into 'preprocessed_txt'.
#df['preprocessed_txt'] = df['Report']  # Use the original text without extra preprocessing

# Hold out 20% of the rows for testing. Stratifying on the BIRADS column keeps
# the class proportions the same in both partitions; the fixed seed makes the
# split reproducible.
report_texts = df['Report']
birads_labels = df['BIRADS']
X_train, X_test, y_train, y_test = train_test_split(
    report_texts,
    birads_labels,
    test_size=0.2,
    random_state=2022,
    stratify=birads_labels,
)

# Hyperparameters
BATCH_SIZE = 16    # samples per DataLoader batch
EPOCHS = 30        # full passes over the training set
MAX_LEN = 128      # token length every report is padded/truncated to
NUM_CLASSES = 5    # number of output classes of the classifier head

# Pretrained BioGPT tokenizer used to encode the reports.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

# Prepare custom Dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of raw text strings.
        labels: Indexable collection of labels, parallel to ``texts``. Each
            label is shifted down by one when an item is fetched.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            whose result is indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and shifted label stored at ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'label'`` (a ``torch.long``
            scalar tensor holding ``labels[idx] - 1``).
        """
        report = self.texts[idx]
        # Labels are stored 1-based; the loss expects 0-based class indices.
        zero_based_label = self.labels[idx] - 1

        encoded = self.tokenizer(
            report,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each sample is a 1-D tensor.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(zero_based_label, dtype=torch.long)
        return sample

# Wrap the pandas splits as torch Datasets (plain lists, so that integer
# positions index them directly).
train_dataset = TextDataset(
    texts=X_train.tolist(),
    labels=y_train.tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
test_dataset = TextDataset(
    texts=X_test.tolist(),
    labels=y_test.tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Batch iterators: training batches are reshuffled every epoch, test batches
# keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the BioGPT-based model
class BioGPTClassifier(nn.Module):
    """Thin wrapper around ``BioGptForSequenceClassification`` returning logits.

    Args:
        model_name: Name or path handed to ``from_pretrained``.
        num_classes: Passed to ``from_pretrained`` as ``num_labels``.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.biogpt = BioGptForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only its ``logits`` attribute."""
        model_output = self.biogpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return model_output.logits

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Build the classifier and move its parameters to the selected device.
model = BioGPTClassifier(model_name="microsoft/biogpt", num_classes=NUM_CLASSES).to(device)

# Weighted cross-entropy: one weight per class index (0-4), created directly
# on the training device. AdamW updates every model parameter.
class_weights = torch.tensor(
    [1.0, 0.5, 1.5, 2.0, 3.0],
    dtype=torch.float,
    device=device,
)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tune the model for EPOCHS passes over the training data, printing the
# mean batch loss after each pass.
for epoch in range(EPOCHS):
    model.train()  # training mode (enables dropout etc.)
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()  # clear gradients left over from the last step
        logits = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        batch_loss = loss_fn(logits, batch['label'])
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_dataloader)}")

# Evaluation: collect the predicted and true class for every test sample.
model.eval()  # inference mode (disables dropout etc.)
predictions, true_labels = [], []

with torch.no_grad():  # gradients are not needed for scoring
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # The predicted class is the index of the largest logit per row.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(batch['label'].cpu().numpy())

# Shift the 0-based class indices back to the original 1-5 label range.
predictions = [predicted + 1 for predicted in predictions]
true_labels = [actual + 1 for actual in true_labels]

# Classification report
# Pass the full label set explicitly: without `labels`, sklearn infers the
# classes from the data and raises ValueError when fewer than NUM_CLASSES
# distinct labels occur in true_labels/predictions, because target_names
# would then have the wrong length.
class_ids = list(range(1, NUM_CLASSES + 1))
print(classification_report(
    true_labels,
    predictions,
    labels=class_ids,
    target_names=[str(class_id) for class_id in class_ids],
))

# Calculate Accuracy and Macro Average Recall with 95% Confidence Interval
def compute_confidence_interval(metric_func, true_labels, predictions, confidence=0.95):
    """Return a metric and its normal-approximation confidence interval.

    The standard error is ``sqrt(m * (1 - m) / n)`` where ``m`` is the metric
    value and ``n = len(true_labels)``. This is the binomial formula for a
    proportion, so it is only meaningful for metrics that are a fraction of
    samples in [0, 1] (e.g. accuracy).

    Args:
        metric_func: Callable invoked as ``metric_func(true_labels,
            predictions)`` that returns the metric value.
        true_labels: Sized collection of ground-truth labels.
        predictions: Predicted labels, parallel to ``true_labels``.
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        Tuple ``(metric, ci_lower, ci_upper)``. Both bounds are NaN when
        fewer than two samples are given. When the standard error is zero
        (metric exactly 0 or 1) the interval collapses to the metric itself.
        Otherwise the bounds are clipped to [0, 1].
    """
    metric = metric_func(true_labels, predictions)
    n_samples = len(true_labels)

    # An interval cannot be estimated from fewer than two samples.
    if n_samples <= 1:
        return metric, np.nan, np.nan

    se_metric = np.sqrt(metric * (1 - metric) / n_samples)

    # scipy returns NaN bounds for scale=0, so report the degenerate
    # interval explicitly instead.
    if se_metric == 0:
        return metric, metric, metric

    ci_lower, ci_upper = stats.norm.interval(confidence, loc=metric, scale=se_metric)

    # The normal approximation can overshoot the valid range of a
    # proportion; np.clip keeps NaN as NaN if the metric was out of range.
    return metric, np.clip(ci_lower, 0.0, 1.0), np.clip(ci_upper, 0.0, 1.0)

# Accuracy with its confidence interval
accuracy, ci_acc_lower, ci_acc_upper = compute_confidence_interval(accuracy_score, true_labels, predictions)

# Macro average recall (unweighted mean of the per-class recalls)
macro_recall = recall_score(true_labels, predictions, average='macro')

# Macro recall confidence interval, based on the spread of the per-class
# recalls. The bounds stay NaN unless an interval can actually be computed.
# NOTE(review): this treats the classes as the sample; a bootstrap over the
# test samples may be a better-founded interval. Confirm the intended method.
recall_ci_lower, recall_ci_upper = (np.nan, np.nan)
if len(true_labels) > 1:
    recalls = recall_score(true_labels, predictions, average=None)
    # A sample standard deviation (ddof=1) needs at least two classes.
    if len(recalls) > 1:
        se_recall = np.std(recalls, ddof=1) / np.sqrt(len(recalls))  # Standard error
        if se_recall > 0:
            raw_lower, raw_upper = stats.norm.interval(0.95, loc=macro_recall, scale=se_recall)
            # Recall is a proportion, so keep the interval inside [0, 1].
            recall_ci_lower, recall_ci_upper = np.clip([raw_lower, raw_upper], 0.0, 1.0)
        elif se_recall == 0:
            # All classes have the same recall: scipy would return NaN for
            # scale=0, so report the degenerate interval explicitly.
            recall_ci_lower = recall_ci_upper = macro_recall

# Output results
print(f"Accuracy: {accuracy:.4f}, 95% CI: ({ci_acc_lower:.4f}, {ci_acc_upper:.4f})")
print(f"Macro Average Recall: {macro_recall:.4f}, 95% CI: ({recall_ci_lower:.4f}, {recall_ci_upper:.4f})")
