In [1]:
# importing the required modules

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
#Loading the heart_disease_uci dataset
train_data = pd.read_csv('..\data\heart_disease\data_train.csv')
test_data = pd.read_csv('..\data\heart_disease\data_test.csv')
train_data.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,881,62,1.0,3.0,0.0,146.12,170.0,0.0,2.0,120.0,1.0,3.0,0.0,0.0,2.0,1
1,458,54,1.0,1.0,2.0,150.0,216.98,0.0,1.0,122.0,0.0,0.0,2.0,0.0,2.0,0
2,798,51,1.0,3.0,2.0,134.86,339.0,0.0,1.0,132.41,1.0,2.943,1.0,0.0,2.0,1
3,26,50,0.0,0.0,2.0,120.0,219.0,0.0,1.0,158.0,0.0,1.6,1.0,0.0,1.0,0
4,85,52,1.0,0.0,1.0,120.0,325.0,0.0,1.0,172.0,0.0,0.2,2.0,0.0,1.0,0


In [3]:
#Splitting the data into features and target
X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]

X_test = test_data.iloc[:,:-1]
y_test = test_data.iloc[:,-1]
X_train.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,881,62,1.0,3.0,0.0,146.12,170.0,0.0,2.0,120.0,1.0,3.0,0.0,0.0,2.0
1,458,54,1.0,1.0,2.0,150.0,216.98,0.0,1.0,122.0,0.0,0.0,2.0,0.0,2.0
2,798,51,1.0,3.0,2.0,134.86,339.0,0.0,1.0,132.41,1.0,2.943,1.0,0.0,2.0
3,26,50,0.0,0.0,2.0,120.0,219.0,0.0,1.0,158.0,0.0,1.6,1.0,0.0,1.0
4,85,52,1.0,0.0,1.0,120.0,325.0,0.0,1.0,172.0,0.0,0.2,2.0,0.0,1.0


In [4]:
#Scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
# Convert to pandas DataFrame for easier handling
train_data = pd.DataFrame(X_train)
train_data['target'] = y_train.values

test_data = pd.DataFrame(X_test)
test_data['target'] = y_test.values

In [6]:
#getting dummies for the train and test data categorical columns

categorical_columns = train_data.select_dtypes(include='object').columns
train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)




The code initializes a BERT tokenizer using the `BertTokenizer` class from the `transformers` library. The tokenizer is set up with the pre-trained model `bert-base-uncased`, which converts all text to lowercase before tokenizing. A function named `tokenize_row` is defined to tokenize each row of a DataFrame individually. This function converts the row to a single string by joining all its values with spaces, as BERT expects textual input. The tokenizer processes the string, adding padding to the maximum length, truncating if necessary, and returning the result as PyTorch tensors. The training and testing data are then tokenized by iterating over each row in the `train_data` and `test_data` DataFrames, applying the `tokenize_row` function, and storing the tokenized inputs in the `train_inputs` and `test_inputs` lists, respectively.

In [8]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize each row individually
def tokenize_row(row, tokenizer):
    # Convert the row to string, as BERT expects textual input
    row_str = " ".join(map(str, row.values))
    inputs = tokenizer(row_str, padding='max_length', truncation=True, return_tensors="pt")
    return inputs

# Tokenize the training and testing data
train_inputs = [tokenize_row(row, tokenizer) for _, row in train_data.iterrows()]
test_inputs = [tokenize_row(row, tokenizer) for _, row in test_data.iterrows()]
print(train_inputs)

[{'input_ids': tensor([[  101,  1015,  1012,  5388, 19317, 12376, 19841, 19961, 16576, 12740,
          2575,  2620,  1014,  1012,  3938, 18827, 12521, 27009,  2575,  2581,
         16576, 10790,  2683,  2620,  1014,  1012, 28952,  2683, 24434, 22932,
         14142, 16048, 16048, 19481,  1015,  1012,  5179, 22610,  2575, 17465,
         11387, 19481, 19317,  2692,  2581,  1011,  1014,  1012,  6282,  2620,
         22407,  2581, 26224, 19317,  2683, 12376, 21619,  2581,  1014,  1012,
          6356, 24087, 25746, 27009,  8889, 28154,  2581, 26224,  2475,  1011,
          1014,  1012, 25491,  2692,  2620,  2620,  2581, 18827, 21057,  2683,
          2620, 24434, 16068,  1011,  1014,  1012, 22649, 10790, 17788,  2620,
         17134, 23499, 19481,  2683,  2683,  1015,  1012,  3438, 17465,  2575,
          2683, 17134,  2683, 17134, 16576, 16576,  2549,  1011,  1014,  1012,
          3515, 12376, 12376, 17134, 28311,  2683,  2683, 16068,  2575,  2581,
          1015,  1012, 21035, 17134, 

The code imports necessary modules from the `torch` library and defines a custom dataset class `HeartDiseaseDataset` that inherits from `torch.utils.data.Dataset`. The class is initialized with `inputs` and `labels`, storing them as instance variables. The `__len__` method returns the number of samples in the dataset, while the `__getitem__` method retrieves the input IDs, attention mask, and label for a given index, removing the batch dimension from the input IDs and attention mask. The dataset and dataloader are then created for both training and testing data. `train_dataset` and `test_dataset` are instances of `HeartDiseaseDataset`, initialized with `train_inputs` and `y_train` for training, and `test_inputs` and `y_test` for testing. The `DataLoader` class is used to create `train_loader` and `test_loader` with a batch size of 16, shuffling the training data.

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class HeartDiseaseDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.inputs[idx]['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = self.inputs[idx]['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels.iloc[idx])
        return input_ids, attention_mask, label

# Create dataset and dataloader
train_dataset = HeartDiseaseDataset(train_inputs, y_train)
test_dataset = HeartDiseaseDataset(test_inputs, y_test)
print(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
print(train_loader)


<__main__.HeartDiseaseDataset object at 0x000001D78FA30820>
<torch.utils.data.dataloader.DataLoader object at 0x000001D78FA30670>


The code imports `BertForSequenceClassification` and `AdamW` from the `transformers` library. It then loads a pre-trained BERT model for sequence classification with two labels using `BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)`. The optimizer is set up using `AdamW` with the model parameters and a learning rate of `5e-5`. A training function `train` is defined, which sets the model to training mode, initializes the total loss, and iterates over batches in the `train_loader`. For each batch, it zeroes the gradients, performs a forward pass to compute the loss, accumulates the loss, performs backpropagation, and updates the model parameters using the optimizer. The average loss for the epoch is returned. The training loop runs for three epochs, calling the `train` function for each epoch and printing the average loss.

In [9]:
from transformers import BertForSequenceClassification, AdamW
import torch

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training function with gradient accumulation
def train(model, train_loader, optimizer, accumulation_steps=4):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    return total_loss / len(train_loader)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    avg_loss = train(model, train_loader, optimizer)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.7021
Epoch 2/3, Loss: 0.6771
Epoch 3/3, Loss: 0.5825


The code defines a function `get_predictions` to obtain predictions from a model using a data loader. The function sets the model to evaluation mode with `model.eval()` and initializes an empty list for predictions. It uses `torch.no_grad()` to disable gradient calculation, iterating over batches in the `data_loader`. For each batch, it extracts `input_ids` and `attention_mask`, performs a forward pass through the model, and computes the predicted class labels using `torch.max` on the model's logits. The predictions are converted to a NumPy array and appended to the predictions list. Finally, the function returns the list of predictions. The function is then called to get predictions on the test set, storing the results in `test_predictions`.

In [10]:
# Get Predictions

# Function to get predictions
def get_predictions(model, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, _ = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, pred = torch.max(outputs.logits, dim=1)
            predictions.extend(pred.cpu().numpy())
    return predictions

# Get predictions on test set
test_predictions = get_predictions(model, test_loader)

# save predictions to results folder names bert.csv
pd.DataFrame(test_predictions).to_csv('results/bert.csv', index=False)


In [11]:
def evaluate(model, test_loader):
    model.eval()
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct += (predictions == labels).sum().item()
            total_examples += labels.size(0)
    accuracy = total_correct / total_examples
    return accuracy

# Evaluate the model
accuracy = evaluate(model, test_loader)
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.7446


The code imports `confusion_matrix` from `sklearn.metrics` and `numpy` as `np`. It defines a function `evaluate_confusion_matrix` to create a confusion matrix for evaluating the model. The function sets the model to evaluation mode with `model.eval()` and initializes empty lists for predictions and actual labels. Using `torch.no_grad()` to disable gradient calculation, it iterates over batches in the `test_loader`. For each batch, it extracts `input_ids`, `attention_mask`, and `labels`, performs a forward pass through the model, and appends the predicted class labels (obtained using `torch.argmax` on the model's logits) and actual labels to their respective lists. The function returns a confusion matrix computed from the actual and predicted labels using `confusion_matrix(actuals, predictions)`. The confusion matrix is then evaluated and printed by calling `evaluate_confusion_matrix(model, test_loader)` and storing the result in `conf_matrix`.

In [12]:
# create a confusion matrix to evaluate the model
from sklearn.metrics import confusion_matrix
import numpy as np

def evaluate_confusion_matrix(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            actuals.extend(labels.cpu().numpy())
    return confusion_matrix(actuals, predictions)

conf_matrix = evaluate_confusion_matrix(model, test_loader)
print(conf_matrix)


[[63 12]
 [35 74]]
