In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pandas as pd
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Load Train, Validation, Test Dataset
train_df = pd.read_csv('/kaggle/input/mp-ctc/MP_train.csv')
val_df = pd.read_csv('/kaggle/input/mp-ctc/MP_val.csv')
test_df = pd.read_csv('/kaggle/input/mp-ctc/MP_test.csv')

In [3]:
train_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,107384,"CHIEF COMPLAINT: AMS, concern for toxic alcoho...",0
1,101061,CHIEF COMPLAINT: abdominal pain\n\nPRESENT ILL...,0
2,127180,CHIEF COMPLAINT: Bilateral Sub Dural Hematoma\...,0
3,168339,CHIEF COMPLAINT: Intracranial bleed\n\nPRESENT...,0
4,154044,CHIEF COMPLAINT: ischemic left foot\n\nPRESENT...,0


In [4]:
val_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,176763,CHIEF COMPLAINT: # Lethargy # Confusion # Hypo...,0
1,173211,"CHIEF COMPLAINT: Dyspnea, LE edema\n\nPRESENT ...",0
2,116333,CHIEF COMPLAINT: upper GI bleed\n\nPRESENT ILL...,1
3,161102,CHIEF COMPLAINT: increased lethargy\n\nPRESENT...,0
4,116799,CHIEF COMPLAINT: s/p 18 ft fall\n\nPRESENT ILL...,0


In [5]:
val_df["hospital_expire_flag"].value_counts()

hospital_expire_flag
0    4390
1     518
Name: count, dtype: int64

In [6]:
test_df.head()

Unnamed: 0,id,text,hospital_expire_flag
0,100058,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,0
1,124871,CHIEF COMPLAINT: shortness of breath\n\nPRESEN...,0
2,109159,CHIEF COMPLAINT: s/p mechanical fall\n\nPRESEN...,0
3,159161,CHIEF COMPLAINT: nausea and vomiting\n\nPRESEN...,0
4,109863,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,1


In [7]:
train_df['hospital_expire_flag'].nunique()

2

In [8]:
val_df['hospital_expire_flag'].nunique()

2

In [9]:
test_df['hospital_expire_flag'].nunique()

2

In [10]:
train_df.drop(columns=['id'], inplace=True)
val_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

In [11]:
train_df.head()

Unnamed: 0,text,hospital_expire_flag
0,"CHIEF COMPLAINT: AMS, concern for toxic alcoho...",0
1,CHIEF COMPLAINT: abdominal pain\n\nPRESENT ILL...,0
2,CHIEF COMPLAINT: Bilateral Sub Dural Hematoma\...,0
3,CHIEF COMPLAINT: Intracranial bleed\n\nPRESENT...,0
4,CHIEF COMPLAINT: ischemic left foot\n\nPRESENT...,0


In [12]:
test_df.head()

Unnamed: 0,text,hospital_expire_flag
0,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,0
1,CHIEF COMPLAINT: shortness of breath\n\nPRESEN...,0
2,CHIEF COMPLAINT: s/p mechanical fall\n\nPRESEN...,0
3,CHIEF COMPLAINT: nausea and vomiting\n\nPRESEN...,0
4,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,1


In [13]:
train_df.shape

(33954, 2)

In [14]:
val_df.shape

(4908, 2)

In [15]:
test_df.shape

(9822, 2)

## ST (Stemming)

In [16]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

# Download NLTK resources 
nltk.download('punkt')

# Initialize the PorterStemmer
porter = PorterStemmer()
# Function to stem text
def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [porter.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Apply stemming to the 'text' column
train_df['text'] = train_df['text'].apply(stem_text)
test_df['text'] = test_df['text'].apply(stem_text)
val_df['text'] = val_df['text'].apply(stem_text)
train_df.head()

Unnamed: 0,text,hospital_expire_flag
0,"chief complaint : am , concern for toxic alcoh...",0
1,chief complaint : abdomin pain present ill : 7...,0
2,chief complaint : bilater sub dural hematoma p...,0
3,chief complaint : intracrani bleed present ill...,0
4,chief complaint : ischem left foot present ill...,0


## RSW (Remove Stop Words)

In [18]:
# NLTK library to remove Stopwords.
from nltk.corpus import stopwords
stopword = stopwords.words('english')
# Function
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [19]:
# Apply swr to the 'text' column
train_df['text'] = train_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)
val_df['text'] = val_df['text'].apply(remove_stopwords)
train_df.head()

Unnamed: 0,text,hospital_expire_flag
0,"chief complaint : , concern toxic alcohol in...",0
1,chief complaint : abdomin pain present ill : 7...,0
2,chief complaint : bilater sub dural hematoma p...,0
3,chief complaint : intracrani bleed present ill...,0
4,chief complaint : ischem left foot present ill...,0


## CT (Clean Text)

In [20]:
!pip install clean-text
from cleantext import clean

Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting emoji<2.0.0,>=1.0.0 (from clean-text)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting ftfy<7.0,>=6.0 (from clean-text)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l- done
[?25h  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171033 sha256=c5e7436e46a767ceaa8e1c80d78e3186de09629140753346507a5cf66757d4ee
  Stored in directory: /root/.cache

In [21]:
# Apply text cleaning function
train_df['text'] = train_df['text'].apply(lambda x: clean(x,
                                                        fix_unicode=True,
                                                        to_ascii=True,
                                                        lower=True,
                                                        no_line_breaks=False,
                                                        no_urls=True,
                                                        no_emails=True,
                                                        no_phone_numbers=True,
                                                        no_numbers=True,
                                                        no_currency_symbols=True,
                                                        no_punct=True,
                                                        replace_with_punct="",
                                                        replace_with_url="<URL>",
                                                        replace_with_email="<EMAIL>",
                                                        replace_with_phone_number="<PHONE>",
                                                        replace_with_number="<NUMBER>",
                                                        replace_with_currency_symbol="<CUR>",
                                                        lang="en"
                                                        ))

train_df.head()

Unnamed: 0,text,hospital_expire_flag
0,chief complaint concern toxic alcohol ingest p...,0
1,chief complaint abdomin pain present ill <numb...,0
2,chief complaint bilater sub dural hematoma pre...,0
3,chief complaint intracrani bleed present ill <...,0
4,chief complaint ischem left foot present ill 7...,0


In [22]:
# Apply text cleaning function
test_df['text'] = test_df['text'].apply(lambda x: clean(x,
                                                        fix_unicode=True,
                                                        to_ascii=True,
                                                        lower=True,
                                                        no_line_breaks=False,
                                                        no_urls=True,
                                                        no_emails=True,
                                                        no_phone_numbers=True,
                                                        no_numbers=True,
                                                        no_currency_symbols=True,
                                                        no_punct=True,
                                                        replace_with_punct="",
                                                        replace_with_url="<URL>",
                                                        replace_with_email="<EMAIL>",
                                                        replace_with_phone_number="<PHONE>",
                                                        replace_with_number="<NUMBER>",
                                                        replace_with_currency_symbol="<CUR>",
                                                        lang="en"
                                                        ))

In [23]:
# Apply text cleaning function
val_df['text'] = val_df['text'].apply(lambda x: clean(x,
                                                        fix_unicode=True,
                                                        to_ascii=True,
                                                        lower=True,
                                                        no_line_breaks=False,
                                                        no_urls=True,
                                                        no_emails=True,
                                                        no_phone_numbers=True,
                                                        no_numbers=True,
                                                        no_currency_symbols=True,
                                                        no_punct=True,
                                                        replace_with_punct="",
                                                        replace_with_url="<URL>",
                                                        replace_with_email="<EMAIL>",
                                                        replace_with_phone_number="<PHONE>",
                                                        replace_with_number="<NUMBER>",
                                                        replace_with_currency_symbol="<CUR>",
                                                        lang="en"
                                                        ))

In [24]:
train_texts = train_df['text'].tolist()
train_labels = train_df['hospital_expire_flag'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['hospital_expire_flag'].tolist()
test_texts = test_df['text'].tolist()
test_labels = test_df['hospital_expire_flag'].tolist()

In [25]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = [str(text) for text in texts]
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [26]:
class RobertaClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(RobertaClassifier, self).__init__()
        self.bert =  RobertaModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [27]:
# Set up parameters
bert_model_name = 'FacebookAI/roberta-base'
num_classes = 2
max_length = 512
batch_size = 8

In [28]:
tokenizer = RobertaTokenizer.from_pretrained(bert_model_name)

train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
epochs = 20
best_roc_auc = 0.0
min_delta = 0.0001
early_stopping_count = 0
early_stopping_patience = 3
gradient_accumulation_steps = 10

# Set the optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

# Set the scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=50, 
    num_training_steps=len(train_dataloader) * epochs // gradient_accumulation_steps
)


In [31]:
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad() if step % gradient_accumulation_steps == 0 else None
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        (loss / gradient_accumulation_steps).backward()
        train_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
            optimizer.step()
            scheduler.step()
            
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            val_loss += loss.item()
            val_preds.append(F.softmax(outputs, dim=1).cpu().numpy())
            val_labels.append(labels.cpu().numpy())
            
    val_preds = np.concatenate(val_preds)
    val_labels = np.concatenate(val_labels)
    val_loss /= len(val_dataloader)
    train_loss /= len(train_dataloader)
    print(f'Epoch: {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    # Calculate metrics
    val_preds_class = np.argmax(val_preds, axis=1)
    accuracy = accuracy_score(val_labels, val_preds_class)
    recall = recall_score(val_labels, val_preds_class, average='weighted')
    precision = precision_score(val_labels, val_preds_class, average='weighted')
    f1 = f1_score(val_labels, val_preds_class, average='weighted')
    micro_f1 = f1_score(val_labels, val_preds_class, average='micro')
    macro_roc_auc = roc_auc_score(val_labels, val_preds[:, 1], multi_class='ovo', average='macro')  # Changed val_preds to val_preds[:, 1]
    
    print(f'Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1}, Micro F1: {micro_f1:.4f}, Macro Roc Auc: {macro_roc_auc:.4f}')
            
    # Implement early stopping
    if epoch > 0 and macro_roc_auc - best_roc_auc < min_delta:
        early_stopping_count += 1
        print(f'EarlyStopping counter: {early_stopping_count} out of {early_stopping_patience}')
        if early_stopping_count >= early_stopping_patience:
            print('Early stopping')
            break
    else:
        best_roc_auc = macro_roc_auc
        early_stopping_count = 0  # Reset early stopping counter

Epoch: 1/20, Training Loss: 0.3482, Validation Loss: 0.3165
Accuracy: 0.8945, Recall: 0.8945, Precision: 0.8001, F1: 0.8446269609908136, Micro F1: 0.8945, Macro Roc Auc: 0.7126


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2/20, Training Loss: 0.3057, Validation Loss: 0.3036
Accuracy: 0.8945, Recall: 0.8945, Precision: 0.8001, F1: 0.8446269609908136, Micro F1: 0.8945, Macro Roc Auc: 0.7729


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3/20, Training Loss: 0.2814, Validation Loss: 0.2806
Accuracy: 0.8943, Recall: 0.8943, Precision: 0.8568, F1: 0.8560756817808703, Micro F1: 0.8943, Macro Roc Auc: 0.7967
Epoch: 4/20, Training Loss: 0.2628, Validation Loss: 0.2850
Accuracy: 0.8979, Recall: 0.8979, Precision: 0.8710, F1: 0.870119465698785, Micro F1: 0.8979, Macro Roc Auc: 0.8014
Epoch: 5/20, Training Loss: 0.2488, Validation Loss: 0.2812
Accuracy: 0.8959, Recall: 0.8959, Precision: 0.8668, F1: 0.8687182589849196, Micro F1: 0.8959, Macro Roc Auc: 0.8012
EarlyStopping counter: 1 out of 3
Epoch: 6/20, Training Loss: 0.2308, Validation Loss: 0.3145
Accuracy: 0.8969, Recall: 0.8969, Precision: 0.8676, F1: 0.8641096169519152, Micro F1: 0.8969, Macro Roc Auc: 0.8047
Epoch: 7/20, Training Loss: 0.2141, Validation Loss: 0.3100
Accuracy: 0.8934, Recall: 0.8934, Precision: 0.8659, F1: 0.8716022986972992, Micro F1: 0.8934, Macro Roc Auc: 0.7977
EarlyStopping counter: 1 out of 3
Epoch: 8/20, Training Loss: 0.1963, Validation L

In [32]:
model.eval()

test_preds = []
test_labels = []

# Iterate over test data
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        test_preds.append(F.softmax(outputs, dim=1).cpu().numpy())
        test_labels.append(labels.cpu().numpy())


In [33]:
test_preds = np.concatenate(test_preds)
test_labels = np.concatenate(test_labels)

test_preds_class = np.argmax(test_preds, axis=1)

report = classification_report(test_labels, test_preds_class, digits=4)

print(report)

              precision    recall  f1-score   support

           0     0.9202    0.9573    0.9384      8797
           1     0.4396    0.2878    0.3479      1025

    accuracy                         0.8874      9822
   macro avg     0.6799    0.6225    0.6431      9822
weighted avg     0.8701    0.8874    0.8768      9822

