In [12]:
# importing the required modules

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [13]:
# Load the data
train_dta = pd.read_csv('../../data/breast_cancer/breast_cancer_data_train.csv')
test_dta = pd.read_csv('../../data/breast_cancer/breast_cancer_data_test.csv')
train_dta.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,859471,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,...,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175,0
1,873593,21.09,26.57,142.7,1311.0,0.1141,0.2832,0.2487,0.1496,0.2395,...,33.48,176.5,2089.0,0.1491,0.7584,0.678,0.2903,0.4098,0.1284,1
2,859196,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,0.2341,...,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849,0
3,88466802,10.65,25.22,68.01,347.0,0.09657,0.07234,0.02379,0.01615,0.1897,...,35.19,77.98,455.7,0.1499,0.1398,0.1125,0.06136,0.3409,0.08147,0
4,858970,10.17,14.88,64.55,311.9,0.1134,0.08061,0.01084,0.0129,0.2743,...,17.45,69.86,368.6,0.1275,0.09866,0.02168,0.02579,0.3557,0.0802,0


In [14]:
# splitting the data into features and target
X_train = train_dta.drop(['diagnosis'], axis=1)
y_train = train_dta['diagnosis']

X_test = test_dta.drop(['diagnosis'], axis=1)
y_test = test_dta['diagnosis']

In [15]:
# splitting the data into features and target
X_train = train_dta.drop(['diagnosis'], axis=1)
y_train = train_dta['diagnosis']

X_test = test_dta.drop(['diagnosis'], axis=1)
y_test = test_dta['diagnosis']

In [16]:
# scaling the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize each row individually
def tokenize_row(row, tokenizer):
    # Convert the row to string, as BERT expects textual input
    row_str = " ".join(map(str, row.values))
    inputs = tokenizer(row_str, padding='max_length', truncation=True, return_tensors="pt")
    return inputs

# Tokenize the training and testing data
train_inputs = [tokenize_row(row, tokenizer) for _, row in train_dta.iterrows()]
test_inputs = [tokenize_row(row, tokenizer) for _, row in test_dta.iterrows()]



In [18]:
class BreastCancerDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.inputs[idx]['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = self.inputs[idx]['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels[idx])
        return input_ids, attention_mask, label

# Create dataset and dataloader
train_dataset = BreastCancerDataset(train_inputs, y_train)
test_dataset = BreastCancerDataset(test_inputs, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [19]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training function with gradient accumulation
def train(model, train_loader, optimizer, accumulation_steps=4):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    return total_loss / len(train_loader)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    avg_loss = train(model, train_loader, optimizer)ö
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6558
Epoch 2/5, Loss: 0.4657
Epoch 3/5, Loss: 0.0936
Epoch 4/5, Loss: 0.0143
Epoch 5/5, Loss: 0.0042


In [20]:
# Get Predictions

# Function to get predictions
def get_predictions(model, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, _ = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, pred = torch.max(outputs.logits, dim=1)
            predictions.extend(pred.cpu().numpy())
    return predictions

# Get predictions on test set
test_predictions = get_predictions(model, test_loader)

# save predictions to results folder names bert.csv
pd.DataFrame(test_predictions).to_csv('results/bert.csv', index=False)


In [21]:
def evaluate(model, test_loader):
    model.eval()
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct += (predictions == labels).sum().item()
            total_examples += labels.size(0)
    accuracy = total_correct / total_examples
    return accuracy

# Evaluate the model
accuracy = evaluate(model, test_loader)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 1.0000


In [22]:
# create a confusion matrix to evaluate the model
from sklearn.metrics import confusion_matrix
import numpy as np

def evaluate_confusion_matrix(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            actuals.extend(labels.cpu().numpy())
    return confusion_matrix(actuals, predictions)

conf_matrix = evaluate_confusion_matrix(model, test_loader)
print(conf_matrix)

[[71  0]
 [ 0 43]]
