In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pandas as pd
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Load Train, Validation, Test Dataset
train_df = pd.read_csv('/kaggle/input/mimic-3/MIMIC III/MP_IN_adm_train.csv')
val_df = pd.read_csv('/kaggle/input/mimic-3/MIMIC III/MP_IN_adm_val.csv')
test_df = pd.read_csv('/kaggle/input/mimic-3/MIMIC III/MP_IN_adm_test.csv')

In [3]:
train_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,107384,"CHIEF COMPLAINT: AMS, concern for toxic alcoho...",0
1,101061,CHIEF COMPLAINT: abdominal pain\n\nPRESENT ILL...,0
2,127180,CHIEF COMPLAINT: Bilateral Sub Dural Hematoma\...,0
3,168339,CHIEF COMPLAINT: Intracranial bleed\n\nPRESENT...,0
4,154044,CHIEF COMPLAINT: ischemic left foot\n\nPRESENT...,0


In [4]:
val_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,176763,CHIEF COMPLAINT: # Lethargy # Confusion # Hypo...,0
1,173211,"CHIEF COMPLAINT: Dyspnea, LE edema\n\nPRESENT ...",0
2,116333,CHIEF COMPLAINT: upper GI bleed\n\nPRESENT ILL...,1
3,161102,CHIEF COMPLAINT: increased lethargy\n\nPRESENT...,0
4,116799,CHIEF COMPLAINT: s/p 18 ft fall\n\nPRESENT ILL...,0


In [5]:
test_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,100058,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,0
1,124871,CHIEF COMPLAINT: shortness of breath\n\nPRESEN...,0
2,109159,CHIEF COMPLAINT: s/p mechanical fall\n\nPRESEN...,0
3,159161,CHIEF COMPLAINT: nausea and vomiting\n\nPRESEN...,0
4,109863,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,1


In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

# Download NLTK resources (run only once)
nltk.download('punkt')

# Initialize the PorterStemmer
porter = PorterStemmer()
# Function to stem text
def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [porter.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

# Apply stemming to the 'text' column
train_df['text'] = train_df['text'].apply(stem_text)
val_df['text'] = val_df['text'].apply(stem_text)
test_df['text'] = test_df['text'].apply(stem_text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
pip install clean-text

Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting emoji<2.0.0,>=1.0.0 (from clean-text)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting ftfy<7.0,>=6.0 (from clean-text)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l- done
[?25h  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171033 sha256=0d6a470470053d479b31b55df10f56ccf4206e2b365d24912600eb851bae0e19
  Stored in directory: /root/.cache

In [8]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

# Apply the cleaning function to the 'text' column
train_df['text'] = train_df['text'].apply(remove_stopwords)
val_df['text'] = val_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
train_df['hospital_expire_flag'].nunique()

2

In [10]:
val_df['hospital_expire_flag'].nunique()

2

In [11]:
test_df['hospital_expire_flag'].nunique()

2

In [12]:
train_df.drop(columns=['id'], inplace=True)
val_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

In [13]:
train_df.head()

Unnamed: 0,text,hospital_expire_flag
0,"chief complaint : , concern toxic alcohol inge...",0
1,chief complaint : abdomin pain present ill : 7...,0
2,chief complaint : bilater sub dural hematoma p...,0
3,chief complaint : intracrani bleed present ill...,0
4,chief complaint : ischem left foot present ill...,0


In [14]:
test_df.head()

Unnamed: 0,text,hospital_expire_flag
0,chief complaint : present ill : patient 57 yea...,0
1,chief complaint : short breath present ill : 7...,0
2,chief complaint : s/p mechan fall present ill ...,0
3,chief complaint : nausea vomit present ill : p...,0
4,chief complaint : present ill : patient 48 yea...,1


In [15]:
train_df.shape

(33954, 2)

In [16]:
val_df.shape

(4908, 2)

In [17]:
test_df.shape

(9822, 2)

In [18]:
train_texts = train_df['text'].tolist()
train_labels = train_df['hospital_expire_flag'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['hospital_expire_flag'].tolist()
test_texts = test_df['text'].tolist()
test_labels = test_df['hospital_expire_flag'].tolist()

In [19]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = [str(text) for text in texts]
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [20]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [21]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 512
batch_size = 8

In [22]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [24]:
epochs = 20
best_roc_auc = 0.0
min_delta = 0.0001
early_stopping_count = 0
early_stopping_patience = 3
gradient_accumulation_steps = 10

# Set the optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

# Set the scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=50, 
    num_training_steps=len(train_dataloader) * epochs // gradient_accumulation_steps
)


In [25]:
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad() if step % gradient_accumulation_steps == 0 else None
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        (loss / gradient_accumulation_steps).backward()
        train_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
            optimizer.step()
            scheduler.step()
            
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            val_loss += loss.item()
            val_preds.append(F.softmax(outputs, dim=1).cpu().numpy())
            val_labels.append(labels.cpu().numpy())
            
    val_preds = np.concatenate(val_preds)
    val_labels = np.concatenate(val_labels)
    val_loss /= len(val_dataloader)
    train_loss /= len(train_dataloader)
    print(f'Epoch: {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    # Calculate metrics
    val_preds_class = np.argmax(val_preds, axis=1)
    accuracy = accuracy_score(val_labels, val_preds_class)
    recall = recall_score(val_labels, val_preds_class, average='weighted')
    precision = precision_score(val_labels, val_preds_class, average='weighted')
    f1 = f1_score(val_labels, val_preds_class, average='weighted')
    micro_f1 = f1_score(val_labels, val_preds_class, average='micro')
    macro_roc_auc = roc_auc_score(val_labels, val_preds[:, 1], multi_class='ovo', average='macro')  # Changed val_preds to val_preds[:, 1]
    
    print(f'Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1}, Micro F1: {micro_f1:.4f}, Macro Roc Auc: {macro_roc_auc:.4f}')
            
    # Implement early stopping
    if epoch > 0 and macro_roc_auc - best_roc_auc < min_delta:
        early_stopping_count += 1
        print(f'EarlyStopping counter: {early_stopping_count} out of {early_stopping_patience}')
        if early_stopping_count >= early_stopping_patience:
            print('Early stopping')
            break
    else:
        best_roc_auc = macro_roc_auc
        early_stopping_count = 0  # Reset early stopping counter

Epoch: 1/20, Training Loss: 0.3522, Validation Loss: 0.3080
Accuracy: 0.8945, Recall: 0.8945, Precision: 0.8001, F1: 0.8446269609908136, Micro F1: 0.8945, Macro Roc Auc: 0.7226


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2/20, Training Loss: 0.2949, Validation Loss: 0.2820
Accuracy: 0.8965, Recall: 0.8965, Precision: 0.8759, F1: 0.8524363101139121, Micro F1: 0.8965, Macro Roc Auc: 0.7909
Epoch: 3/20, Training Loss: 0.2680, Validation Loss: 0.2872
Accuracy: 0.8991, Recall: 0.8991, Precision: 0.8776, F1: 0.8630250224796862, Micro F1: 0.8991, Macro Roc Auc: 0.7971
Epoch: 4/20, Training Loss: 0.2499, Validation Loss: 0.2782
Accuracy: 0.8998, Recall: 0.8998, Precision: 0.8799, F1: 0.8640004505717763, Micro F1: 0.8998, Macro Roc Auc: 0.8051
Epoch: 5/20, Training Loss: 0.2259, Validation Loss: 0.3140
Accuracy: 0.9004, Recall: 0.9004, Precision: 0.8882, F1: 0.8623036919510163, Micro F1: 0.9004, Macro Roc Auc: 0.8046
EarlyStopping counter: 1 out of 3
Epoch: 6/20, Training Loss: 0.2019, Validation Loss: 0.3063
Accuracy: 0.8979, Recall: 0.8979, Precision: 0.8714, F1: 0.871516532955614, Micro F1: 0.8979, Macro Roc Auc: 0.7898
EarlyStopping counter: 2 out of 3
Epoch: 7/20, Training Loss: 0.1766, Validation L

In [26]:
model.eval()

test_preds = []
test_labels = []

# Iterate over test data
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        test_preds.append(F.softmax(outputs, dim=1).cpu().numpy())
        test_labels.append(labels.cpu().numpy())


In [27]:
test_preds = np.concatenate(test_preds)
test_labels = np.concatenate(test_labels)

test_preds_class = np.argmax(test_preds, axis=1)

report = classification_report(test_labels, test_preds_class, digits = 4)

print(report)

              precision    recall  f1-score   support

           0     0.9139    0.9795    0.9456      8797
           1     0.5420    0.2078    0.3004      1025

    accuracy                         0.8990      9822
   macro avg     0.7279    0.5937    0.6230      9822
weighted avg     0.8751    0.8990    0.8782      9822

