# In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from scipy import stats
import spacy

# spaCy pipeline used by preprocess() for lemmas and stop-word/punctuation flags
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Raw dataset; the 'Report' and 'BIRADS' columns are read further down
DATASET_PATH = "D:/Downloads/newData.csv"
df = pd.read_csv(DATASET_PATH)


def preprocess(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or punctuation are dropped and every remaining
    token contributes its ``lemma_``.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

# Clean every report once and keep the result next to the raw text
df['preprocessed_txt'] = [preprocess(raw_report) for raw_report in df['Report']]


# Stratified 80/20 train/test split on the BIRADS label (fixed seed)
features = df['preprocessed_txt']
targets = df['BIRADS']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=2022,
    stratify=targets,
)

# Hyperparameters
EMBEDDING_DIM = 100  # width of each token embedding
HIDDEN_DIM = 128     # LSTM hidden state size
NUM_CLASSES = 5      # number of output classes
BATCH_SIZE = 16      # samples per DataLoader batch
EPOCHS = 10          # passes over the training set
MAX_LEN = 128        # token sequences are truncated/padded to this length

# Prepare custom Dataset
class TextDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors and labels.

    Each item is a dict with:
      * ``'text'``  - long tensor of ``max_len`` ids (truncated, then
        right-padded with 0),
      * ``'label'`` - long scalar tensor holding the stored label minus one.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = self.texts[idx]
        raw_label = self.labels[idx]

        # Cut to max_len first, then fill whatever is missing with zeros
        token_ids = self.tokenizer(raw_text)[:self.max_len]
        missing = self.max_len - len(token_ids)
        token_ids = token_ids + [0] * missing

        text_tensor = torch.tensor(token_ids, dtype=torch.long)
        # Adjust labels from 1-5 to 0-4
        label_tensor = torch.tensor(raw_label - 1, dtype=torch.long)
        return {'text': text_tensor, 'label': label_tensor}

# Tokenizer: a basic word tokenizer
def tokenizer(text):
    """Split ``text`` on whitespace and map each word to its id in ``vocab``.

    Words missing from the module-level ``vocab`` get the id stored under
    ``'<unk>'``.
    """
    token_ids = []
    for piece in text.split():
        token_ids.append(vocab.get(piece, vocab['<unk>']))
    return token_ids

# Build Vocabulary: distinct words get ids 1, 2, ... in order of first appearance
distinct_words = dict.fromkeys(
    token
    for document in df['preprocessed_txt']
    for token in document.split()
)
vocab = {token: index for index, token in enumerate(distinct_words, start=1)}
vocab['<unk>'] = 0  # Unknown token

# Wrap the train/test splits as torch Datasets
train_dataset = TextDataset(
    texts=X_train.tolist(),
    labels=y_train.tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
test_dataset = TextDataset(
    texts=X_test.tolist(),
    labels=y_test.tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Batch iterators: training batches are reshuffled, test batches keep their order
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the LSTM model
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token ids.

    The class scores are computed from the LSTM output at the last
    *non-padding* position of each sequence. Reading the final timestep
    instead would, for right-padded batches, classify every short sequence
    from the state reached after a run of padding tokens.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        padding_idx: Token id treated as padding when locating the last real
            token (default 0). Pass ``None`` to always use the final
            timestep. Any trailing token equal to ``padding_idx`` is
            considered padding.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for ids ``x`` of shape ``(batch, seq_len)``."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        if self.padding_idx is None:
            last_hidden = lstm_out[:, -1, :]
        else:
            # Position of the last non-padding token per row; rows that are
            # entirely padding fall back to position 0.
            positions = torch.arange(x.size(1), device=x.device).expand_as(x)
            last_index = positions.masked_fill(x == self.padding_idx, 0).max(dim=1).values
            batch_index = torch.arange(x.size(0), device=x.device)
            last_hidden = lstm_out[batch_index, last_index]

        return self.fc(last_hidden)

# Pick the compute device once so the model, loss weights and batches all agree
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move it to the device before building the optimizer
model = LSTMClassifier(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=NUM_CLASSES,
)
model.to(device)

# Original label values (1..NUM_CLASSES), derived from the hyperparameter
# instead of being hard-coded in several places
label_values = list(range(1, NUM_CLASSES + 1))

# Compute class weights for imbalanced data; weight i applies to model class i
# (original label i + 1)
class_weights = compute_class_weight('balanced', classes=np.array(label_values), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

# Training the model
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        inputs, labels = batch['text'].to(device), batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_dataloader)}")

# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs, labels = batch['text'].to(device), batch['label'].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Adjust predicted and true labels back to the original range (1 to 5)
predictions = [p+1 for p in predictions]
true_labels = [t+1 for t in true_labels]

# Classification report. `labels` is passed explicitly so the report always has
# one row per class: without it, a class missing from both the test labels and
# the predictions makes the number of target_names mismatch and sklearn raises.
report = classification_report(
    true_labels,
    predictions,
    labels=label_values,
    target_names=[str(i) for i in label_values],
    output_dict=True,
)
print(report)

# Bootstrapping for 95% confidence intervals
def bootstrap_metric(y_true, y_pred, metric_func, n_bootstrap=1000, seed=None):
    """Estimate a metric and its 95% percentile bootstrap confidence interval.

    Each replicate resamples the (true, predicted) pairs with replacement,
    keeping the original sample size, and evaluates ``metric_func`` on it.

    Args:
        y_true: Sequence or array of ground-truth labels.
        y_pred: Sequence or array of predicted labels, same length as ``y_true``.
        metric_func: Callable ``metric_func(y_true, y_pred)`` returning a scalar.
        n_bootstrap: Number of bootstrap replicates (must be >= 1).
        seed: Optional seed for ``np.random.default_rng``; pass a value to
            make the result reproducible.

    Returns:
        Tuple ``(mean, ci_lower, ci_upper)`` of floats: the mean of the
        replicate scores and their 2.5th / 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty or differ in length, or if
            ``n_bootstrap`` is smaller than 1.
    """
    # Accept plain lists too: integer-array indexing below needs ndarrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)

    if n_samples != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")
    if n_samples == 0:
        raise ValueError("cannot bootstrap an empty sample")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    rng = np.random.default_rng(seed)
    bootstrapped_scores = []
    for _ in range(n_bootstrap):
        indices = rng.choice(n_samples, size=n_samples, replace=True)
        bootstrapped_scores.append(metric_func(y_true[indices], y_pred[indices]))

    ci_lower, ci_upper = np.percentile(bootstrapped_scores, [2.5, 97.5])
    return float(np.mean(bootstrapped_scores)), float(ci_lower), float(ci_upper)

# bootstrap_metric indexes its inputs with integer arrays, so convert the
# label lists to ndarrays once and reuse them for both metrics
y_true_arr = np.array(true_labels)
y_pred_arr = np.array(predictions)


def _macro_recall(y_true, y_pred):
    """Recall averaged over classes with equal weight per class."""
    return recall_score(y_true, y_pred, average='macro')


# Accuracy with its 95% bootstrap interval
accuracy_mean, accuracy_lower, accuracy_upper = bootstrap_metric(
    y_true_arr, y_pred_arr, accuracy_score
)
print(f"Accuracy: {accuracy_mean:.4f} (95% CI: {accuracy_lower:.4f} - {accuracy_upper:.4f})")

# Macro recall with its 95% bootstrap interval
macro_recall_mean, macro_recall_lower, macro_recall_upper = bootstrap_metric(
    y_true_arr, y_pred_arr, _macro_recall
)
print(f"Macro Recall: {macro_recall_mean:.4f} (95% CI: {macro_recall_lower:.4f} - {macro_recall_upper:.4f})")
