In [42]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from tqdm import tqdm
from ucimlrepo import fetch_ucirepo 

In [23]:
# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports an available GPU, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

'cuda'

In [2]:
# Download the UCI repository dataset with id 17
# (Breast Cancer Wisconsin (Diagnostic)).
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Split the payload into the feature table and the target table
# (both are pandas DataFrames).
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)


In [3]:
# Display the dataset-level metadata (name, size, target/index columns,
# provenance) that ships with the fetched dataset object.
breast_cancer_wisconsin_diagnostic.metadata

{'uci_id': 17,
 'name': 'Breast Cancer Wisconsin (Diagnostic)',
 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic',
 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv',
 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.',
 'area': 'Health and Medicine',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 569,
 'num_features': 30,
 'feature_types': ['Real'],
 'demographics': [],
 'target_col': ['Diagnosis'],
 'index_col': ['ID'],
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 1993,
 'last_updated': 'Fri Nov 03 2023',
 'dataset_doi': '10.24432/C5DW2B',
 'creators': ['William Wolberg',
  'Olvi Mangasarian',
  'Nick Street',
  'W. Street'],
 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis',
  'authors': 'W. Street, W. Wolberg, O. Mangasarian',
  'published_in': 'Electronic imaging',
  'year': 1993,
  'url': 'https:

In [4]:
# Encode the diagnosis column as a binary target: "M" -> 1, "B" -> 0
# (malignant / benign per the UCI dataset description).
#
# The mapping always starts from the raw targets stored on the fetched
# dataset object rather than from `y` itself. That keeps this cell idempotent:
# mapping an already-encoded 0/1 column through {"M", "B"} would silently turn
# every label into NaN when the cell is re-run.
y = pd.DataFrame(
    breast_cancer_wisconsin_diagnostic.data.targets.Diagnosis.map({"M": 1, "B": 0})
)

# Fail loudly if any label was not one of the two expected codes.
assert y.Diagnosis.notna().all(), "unexpected Diagnosis value (expected 'M' or 'B')"
y

Unnamed: 0,Diagnosis
0,1
1,1
2,1
3,1
4,1
...,...
564,1
565,1
566,1
567,1


In [5]:
# Class balance of the encoded target (0 was mapped from "B", 1 from "M").
y.value_counts()

Diagnosis
0            357
1            212
Name: count, dtype: int64

In [6]:
# Min-max scale each feature column: subtract the column minimum and divide
# by the column range (max - min).
# NOTE(review): the minimum and range are computed over the full dataset,
# before the train/test split performed later in the notebook, so test-set
# statistics leak into the scaling. Consider fitting the scaling on the
# training rows only.
column_min = X.min()
column_range = X.max() - column_min
X = (X - column_min) / column_range
X.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [7]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.2,
)

# Print the class counts of each split (training first, then test).
for split_targets in (y_train, y_test):
    print(split_targets.value_counts())

Diagnosis
0            291
1            164
Name: count, dtype: int64
Diagnosis
0            66
1            48
Name: count, dtype: int64


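Note from the counts above that both splits are class-imbalanced: class 0 outnumbers class 1 by roughly 64% to 36% in the training set (291 vs. 164) and 58% to 42% in the test set (66 vs. 48).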

In [8]:
class Model(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    The network is a stack of four hidden blocks, each
    ``Linear -> Dropout(p=0.2) -> ReLU``, with widths 36, 16, 8 and 4,
    followed by a final ``Linear(4, 1)`` layer. No sigmoid is applied here.

    Args:
        features: Number of input features per sample.
    """

    def __init__(self, features):
        super().__init__()
        self.features = features

        # Consecutive pairs of widths define the hidden Linear layers.
        widths = [features, 36, 16, 8, 4]

        stack = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(n_in, n_out),
                nn.Dropout(p=0.2),
                nn.ReLU(),
            ])
        # Output head: a single logit.
        stack.append(nn.Linear(widths[-1], 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits produced by passing ``x`` through the layer stack."""
        return self.layers(x)

In [9]:
class CustomDataset(Dataset):
    """In-memory dataset pairing a feature array with a target array.

    Both inputs are converted to ``float32`` tensors once, at construction.

    Args:
        x: Array-like of samples (NumPy array, nested list or tensor). The
            first dimension indexes samples.
        y: Array-like of targets with the same number of rows as ``x``.

    Attributes:
        x: ``float32`` tensor of features.
        y: ``float32`` tensor of targets.
        n_samples: Number of samples (``len(x)``).
        features: Size of the last dimension of ``x``, or 0 when ``x`` has
            fewer than two dimensions.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, x, y):
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.Tensor(...) constructor. It accepts lists, arrays and tensors
        # alike. Inputs that are already float32 may share memory with the
        # resulting tensor.
        self.x = torch.as_tensor(x, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)
        self.n_samples = len(self.x)

        if len(self.y) != self.n_samples:
            raise ValueError(
                f"x and y must have the same number of samples, "
                f"got {self.n_samples} and {len(self.y)}"
            )

        # Read the feature count from the tensor so inputs without a
        # `.shape` attribute (e.g. plain lists) are supported too.
        self.features = 0 if self.x.dim() < 2 else self.x.shape[-1]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples

In [17]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        loader: Iterable yielding ``(data, targets)`` tensor pairs.
        model: The ``nn.Module`` to optimise. It is switched to training mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``loss_fn(pred, targets)`` returning a scalar loss tensor.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        The arithmetic mean of the per-batch loss values, or ``nan`` when
        ``loader`` yields no batches.
    """
    # Explicitly enable training mode so dropout is active even if an earlier
    # evaluation step left the model in eval mode.
    model.train()

    loop = tqdm(loader)

    running_loss = 0.0
    num_batches = 0

    for data, targets in loop:
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Call the module itself rather than .forward() so that any
        # registered hooks are executed.
        pred = model(data)
        loss = loss_fn(pred, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() synchronises with the device, so read the value only once.
        batch_loss = loss.item()
        loop.set_postfix(loss=batch_loss)
        running_loss += batch_loss
        num_batches += 1

    # An empty loader has no meaningful average; avoid a ZeroDivisionError.
    if num_batches == 0:
        return float("nan")
    return running_loss / num_batches

In [19]:
# Seed PyTorch so weight initialisation, dropout masks and DataLoader
# shuffling are repeatable under Restart & Run All.
# NOTE(review): some GPU kernels may still be non-deterministic.
TORCH_SEED = 5
torch.manual_seed(TORCH_SEED)

# One sample per optimisation step, as in the original experiment.
TRAIN_BATCH_SIZE = 1

dataset = CustomDataset(X_train.values, y_train.values)
loader = DataLoader(
    dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)

# Weight the positive class by (#negatives / #positives) in the training
# split to counter the class imbalance. The counts come from explicit label
# comparisons rather than integer indexing into value_counts(), whose result
# for a DataFrame carries a MultiIndex and is easy to mis-index.
train_labels = y_train["Diagnosis"]
n_negative = int((train_labels == 0).sum())
n_positive = int((train_labels == 1).sum())
pos_weight = torch.tensor(
    [n_negative / n_positive], dtype=torch.float32, device=DEVICE
)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
model = Model(dataset.features).to(device=DEVICE)
optimizer = torch.optim.Adam(model.parameters())

# Original author's note: about 30 epochs should already be enough;
# 50 leaves some headroom.
NUM_EPOCHS = 50

for epoch in range(1, NUM_EPOCHS + 1):
    ave_loss = train_fn(loader, model, optimizer, criterion, device=DEVICE)
    print(f'Epoch {epoch}: {ave_loss}')

100%|██████████| 455/455 [00:04<00:00, 96.47it/s, loss=0.971] 


Epoch 1: 0.8867958765763503


100%|██████████| 455/455 [00:04<00:00, 106.45it/s, loss=0.115]  


Epoch 2: 0.5652732591564348


100%|██████████| 455/455 [00:04<00:00, 111.46it/s, loss=0.00437] 


Epoch 3: 0.3567843697693976


100%|██████████| 455/455 [00:03<00:00, 143.46it/s, loss=9.54e-6] 


Epoch 4: 0.29513096143747425


100%|██████████| 455/455 [00:03<00:00, 142.87it/s, loss=0.0406]  


Epoch 5: 0.3139108216755091


100%|██████████| 455/455 [00:03<00:00, 142.12it/s, loss=0.325]   


Epoch 6: 0.2630222142856118


100%|██████████| 455/455 [00:03<00:00, 142.28it/s, loss=0.00242] 


Epoch 7: 0.28096430236840675


100%|██████████| 455/455 [00:03<00:00, 138.12it/s, loss=0.000605]


Epoch 8: 0.2046786964853649


100%|██████████| 455/455 [00:03<00:00, 141.22it/s, loss=0.783]   


Epoch 9: 0.2265641656096888


100%|██████████| 455/455 [00:03<00:00, 140.19it/s, loss=0.000178]


Epoch 10: 0.2340362729564941


100%|██████████| 455/455 [00:03<00:00, 140.56it/s, loss=0.00442] 


Epoch 11: 0.20547149277031954


100%|██████████| 455/455 [00:03<00:00, 141.85it/s, loss=8.58e-6] 


Epoch 12: 0.2110996446855297


100%|██████████| 455/455 [00:03<00:00, 136.42it/s, loss=0.142]   


Epoch 13: 0.18562944828559685


100%|██████████| 455/455 [00:03<00:00, 140.95it/s, loss=0.042]   


Epoch 14: 0.193556460834537


100%|██████████| 455/455 [00:03<00:00, 139.83it/s, loss=0.00651] 


Epoch 15: 0.2078582278475037


100%|██████████| 455/455 [00:03<00:00, 139.63it/s, loss=0]       


Epoch 16: 0.15742899121505913


100%|██████████| 455/455 [00:03<00:00, 133.59it/s, loss=0]       


Epoch 17: 0.20332149769776317


100%|██████████| 455/455 [00:03<00:00, 138.84it/s, loss=0.082]   


Epoch 18: 0.2019595593912327


100%|██████████| 455/455 [00:03<00:00, 142.48it/s, loss=0.00151] 


Epoch 19: 0.1944008335245591


100%|██████████| 455/455 [00:03<00:00, 139.44it/s, loss=0.0421]  


Epoch 20: 0.18595439190721255


100%|██████████| 455/455 [00:02<00:00, 152.62it/s, loss=0.00206] 


Epoch 21: 0.17809713800288116


100%|██████████| 455/455 [00:03<00:00, 144.37it/s, loss=0.0478]  


Epoch 22: 0.13653880942257773


100%|██████████| 455/455 [00:03<00:00, 142.95it/s, loss=0.0677]  


Epoch 23: 0.21367289120088878


100%|██████████| 455/455 [00:03<00:00, 150.95it/s, loss=3.91e-5] 


Epoch 24: 0.19466707319830615


100%|██████████| 455/455 [00:03<00:00, 134.09it/s, loss=0.000444]


Epoch 25: 0.13328589077958664


100%|██████████| 455/455 [00:03<00:00, 149.11it/s, loss=4.77e-6] 


Epoch 26: 0.1604026399905606


100%|██████████| 455/455 [00:02<00:00, 152.23it/s, loss=0.0769]  


Epoch 27: 0.16963402770768843


100%|██████████| 455/455 [00:02<00:00, 159.27it/s, loss=1.69e-6] 


Epoch 28: 0.1499278279853095


100%|██████████| 455/455 [00:02<00:00, 160.52it/s, loss=0.00301] 


Epoch 29: 0.1669287873241121


100%|██████████| 455/455 [00:02<00:00, 166.10it/s, loss=0.835]   


Epoch 30: 0.12727452941920217


100%|██████████| 455/455 [00:03<00:00, 126.52it/s, loss=3.09e-5] 


Epoch 31: 0.183467406117785


100%|██████████| 455/455 [00:03<00:00, 117.75it/s, loss=0]       


Epoch 32: 0.1415920072211747


100%|██████████| 455/455 [00:03<00:00, 123.10it/s, loss=0.0016]  


Epoch 33: 0.15059857530902113


100%|██████████| 455/455 [00:02<00:00, 152.57it/s, loss=0]       


Epoch 34: 0.1257462913128454


100%|██████████| 455/455 [00:02<00:00, 155.04it/s, loss=0]       


Epoch 35: 0.14929245753993053


100%|██████████| 455/455 [00:02<00:00, 170.66it/s, loss=0.002]   


Epoch 36: 0.13925325539690178


100%|██████████| 455/455 [00:02<00:00, 169.76it/s, loss=1.03]    


Epoch 37: 0.1508363658867251


100%|██████████| 455/455 [00:02<00:00, 177.45it/s, loss=1.02]    


Epoch 38: 0.11823789136038278


100%|██████████| 455/455 [00:02<00:00, 167.01it/s, loss=0.822]   


Epoch 39: 0.15021099246414135


100%|██████████| 455/455 [00:02<00:00, 168.40it/s, loss=2.86]    


Epoch 40: 0.12712829696661088


100%|██████████| 455/455 [00:02<00:00, 166.50it/s, loss=0.817]   


Epoch 41: 0.11652790943976526


100%|██████████| 455/455 [00:02<00:00, 154.99it/s, loss=0.000203]


Epoch 42: 0.1599344162551993


100%|██████████| 455/455 [00:02<00:00, 167.06it/s, loss=0.00478] 


Epoch 43: 0.13186289460633266


100%|██████████| 455/455 [00:03<00:00, 123.95it/s, loss=0.00306] 


Epoch 44: 0.15470077896476692


100%|██████████| 455/455 [00:03<00:00, 118.60it/s, loss=0.00809] 


Epoch 45: 0.13516912229396272


100%|██████████| 455/455 [00:03<00:00, 119.42it/s, loss=0.0142]  


Epoch 46: 0.15428892008064443


100%|██████████| 455/455 [00:03<00:00, 143.50it/s, loss=0.186]   


Epoch 47: 0.14048643458400403


100%|██████████| 455/455 [00:03<00:00, 141.86it/s, loss=1.91e-6] 


Epoch 48: 0.11539679243828684


100%|██████████| 455/455 [00:03<00:00, 141.97it/s, loss=0.000303]


Epoch 49: 0.10703248491899175


100%|██████████| 455/455 [00:03<00:00, 151.07it/s, loss=5.54e-5] 

Epoch 50: 0.13379262197126016





In [47]:
# Evaluation needs no gradient steps, so score several samples per batch.
EVAL_BATCH_SIZE = 64

test_loader = DataLoader(
    CustomDataset(X_test.values, y_test.values),
    batch_size=EVAL_BATCH_SIZE,
    shuffle=False,  # sample order is irrelevant to the metrics below
)

y_pred = []
y_true = []

# Disable dropout for inference.
model.eval()

with torch.no_grad():
    for data, targets in test_loader:
        logits = model(data.to(DEVICE))

        # Convert logits to probabilities and threshold at 0.5.
        batch_pred = (torch.sigmoid(logits) >= 0.5).int()

        # Flatten so the loop works for any batch size, not just 1.
        y_pred.extend(batch_pred.view(-1).cpu().tolist())
        y_true.extend(targets.int().view(-1).tolist())

# Pin the label order so the matrix is always 2x2, even if a class is absent
# from the predictions or the targets; otherwise the 4-way unpack would fail.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')
print(f'Balanced accuracy score: {balanced_accuracy_score(y_true, y_pred)}')

True negatives: 66
False positives: 0
False negatives: 3
True positives: 45
Balanced accuracy score: 0.96875
