In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from ucimlrepo import fetch_ucirepo 

In [2]:
# Download UCI repository dataset id=17 and keep the handle around, since
# later cells read its metadata.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Split the payload into the feature table (X) and the target table (y);
# both are pandas DataFrames.
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)


In [3]:
# Show the dataset's metadata record (name, task, instance/feature counts,
# target and index columns, provenance) as the cell's rich output.
breast_cancer_wisconsin_diagnostic.metadata

{'uci_id': 17,
 'name': 'Breast Cancer Wisconsin (Diagnostic)',
 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic',
 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv',
 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.',
 'area': 'Health and Medicine',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 569,
 'num_features': 30,
 'feature_types': ['Real'],
 'demographics': [],
 'target_col': ['Diagnosis'],
 'index_col': ['ID'],
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 1993,
 'last_updated': 'Fri Nov 03 2023',
 'dataset_doi': '10.24432/C5DW2B',
 'creators': ['William Wolberg',
  'Olvi Mangasarian',
  'Nick Street',
  'W. Street'],
 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis',
  'authors': 'W. Street, W. Wolberg, O. Mangasarian',
  'published_in': 'Electronic imaging',
  'year': 1993,
  'url': 'https:

In [4]:
# Encode the diagnosis label as a binary target: "M" -> 1, "B" -> 0
# (presumably malignant / benign -- confirm against the dataset description).
#
# The encoding always starts from the raw targets held by the fetched dataset
# instead of from `y` itself, so re-running this cell is idempotent. Mapping an
# already-encoded column would silently turn every label into NaN.
diagnosis_raw = breast_cancer_wisconsin_diagnostic.data.targets["Diagnosis"]
y = pd.DataFrame(diagnosis_raw.map({"M": 1, "B": 0}))

# Any label other than "M"/"B" maps to NaN; fail here rather than during training.
assert y["Diagnosis"].notna().all(), "unexpected value in the Diagnosis column"
y

Unnamed: 0,Diagnosis
0,1
1,1
2,1
3,1
4,1
...,...
564,1
565,1
566,1
567,1


In [5]:
# Class balance check: number of rows per encoded Diagnosis value.
y.value_counts()

Diagnosis
0            357
1            212
Name: count, dtype: int64

In [6]:
# Min-max scale every feature column into [0, 1]: subtract the column minimum,
# then divide by the column range (max - min).
# NOTE(review): the minima/maxima are taken over the full table, i.e. before
# the train/test split performed in a later cell -- confirm this is intended.
X = X.sub(X.min()).div(X.max() - X.min())
X.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

# Report the label distribution of the training split, then of the test split.
print(y_train.value_counts(), y_test.value_counts(), sep="\n")

Diagnosis
0            291
1            164
Name: count, dtype: int64
Diagnosis
0            66
1            48
Name: count, dtype: int64


In [8]:
class Model(nn.Module):
    """Small fully connected network producing one raw score per sample.

    The hidden stack narrows ``features -> 16 -> 8 -> 4`` with a ReLU after
    each hidden layer and ends in a linear layer of width 1. No activation is
    applied to the final output.
    """

    def __init__(self, features):
        """Build the layer stack for inputs with ``features`` columns."""
        super().__init__()
        self.features = features

        # Assemble Linear/ReLU pairs in order so the Sequential indices
        # (and therefore the state_dict keys) stay 0..6.
        stack = []
        fan_in = features
        for fan_out in (16, 8, 4):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
            fan_in = fan_out
        stack.append(nn.Linear(fan_in, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        output = self.layers(x)
        return output

In [9]:
class CustomDataset(Dataset):
    """In-memory dataset pairing each feature row with its target.

    Attributes:
        x: float32 tensor holding the features, first axis indexes samples.
        y: float32 tensor holding the targets, first axis indexes samples.
        n_samples: number of samples (size of the first axis of ``x``).
        features: size of the last axis of ``x``, or 0 when ``x`` is 1-D.
    """

    def __init__(self, x, y):
        """Convert ``x`` and ``y`` to float32 tensors and record their sizes.

        Args:
            x: features as an array, nested sequence or tensor.
            y: targets in any of the same forms, with one entry per row of ``x``.

        Raises:
            ValueError: if either input is a scalar, or if ``x`` and ``y``
                do not contain the same number of samples.
        """
        # torch.as_tensor handles arrays, nested lists and tensors uniformly and
        # lets the dtype be explicit, unlike the legacy torch.Tensor constructor.
        # It may share memory with an input that is already float32.
        self.x = torch.as_tensor(x, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)

        if self.x.dim() == 0 or self.y.dim() == 0:
            raise ValueError("x and y must be array-like, not scalars")
        if self.x.shape[0] != self.y.shape[0]:
            # Fail at construction time instead of with an IndexError mid-epoch.
            raise ValueError(
                f"x and y must have the same number of samples, "
                f"got {self.x.shape[0]} and {self.y.shape[0]}"
            )

        # Sizes are read from the converted tensor so inputs without a
        # ``.shape`` attribute (e.g. nested lists) work too.
        self.n_samples = self.x.shape[0]
        self.features = 0 if self.x.dim() < 2 else self.x.shape[-1]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

In [10]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    For every ``(data, targets)`` batch the model output is scored with
    ``loss_fn``, gradients are back-propagated and ``optimizer`` takes a step.
    The current batch loss is shown on the tqdm progress bar.

    Args:
        loader: iterable yielding ``(data, targets)`` tensor pairs.
        model: module to train; it is switched to training mode. The model is
            not moved to ``device`` here -- the caller must place it there.
        optimizer: optimizer bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(pred, targets)`` returning a scalar tensor.
        device: device each batch is moved to before the forward pass.

    Returns:
        The unweighted mean of the per-batch losses as a float, or ``nan``
        when ``loader`` yields no batches.
    """
    # Make sure mode-dependent layers (dropout, batch norm, ...) behave as in training.
    model.train()

    progress = tqdm(loader)

    total_loss = 0.0
    num_batches = 0

    for data, targets in progress:
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Call the module itself rather than .forward() so registered hooks run.
        pred = model(data)
        loss = loss_fn(pred, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        progress.set_postfix(loss=batch_loss)
        total_loss += batch_loss
        num_batches += 1

    if num_batches == 0:
        # Nothing was processed; avoid a ZeroDivisionError and signal "no value".
        return float("nan")

    return total_loss / num_batches

In [11]:
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# repeat across "Restart & Run All". The value mirrors the random_state used
# for the train/test split.
SEED = 5
torch.manual_seed(SEED)

# Wrap the training split; batches of one sample, reshuffled every epoch.
dataset = CustomDataset(X_train.values, y_train.values)
loader = DataLoader(
    dataset,
    batch_size = 1,
    shuffle = True
)

# Weight the positive class by (#negative / #positive) in the training split to
# offset the class imbalance. The counts are taken per label explicitly rather
# than via value_counts()[0] / [1], which leaves it ambiguous whether the
# integers are labels or positions in the frequency-sorted result.
train_labels = y_train["Diagnosis"]
num_negative = int((train_labels == 0).sum())
num_positive = int((train_labels == 1).sum())
pos_weight = torch.tensor([num_negative / num_positive], dtype=torch.float32)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
model = Model(dataset.features)
optimizer = torch.optim.Adam(model.parameters())

NUM_EPOCHS = 30

# One pass over the training data per epoch; report the mean batch loss.
for i in range(NUM_EPOCHS):
    ave_loss = train_fn(loader, model, optimizer, criterion)
    print(f'Epoch {i+1}: {ave_loss}')

100%|██████████| 455/455 [00:03<00:00, 149.93it/s, loss=0.318]


Epoch 1: 0.788866334433084


100%|██████████| 455/455 [00:02<00:00, 164.56it/s, loss=0.225]  


Epoch 2: 0.380468709003385


100%|██████████| 455/455 [00:02<00:00, 154.29it/s, loss=0.0551]  


Epoch 3: 0.2303998459874752


100%|██████████| 455/455 [00:02<00:00, 170.74it/s, loss=0.0147]  


Epoch 4: 0.1845297388488089


100%|██████████| 455/455 [00:02<00:00, 165.70it/s, loss=0.00839] 


Epoch 5: 0.1578970211824296


100%|██████████| 455/455 [00:03<00:00, 118.82it/s, loss=0.023]   


Epoch 6: 0.14226528620141196


100%|██████████| 455/455 [00:03<00:00, 144.30it/s, loss=0.188]   


Epoch 7: 0.12854556199135314


100%|██████████| 455/455 [00:03<00:00, 139.25it/s, loss=1.46]    


Epoch 8: 0.12282298343642986


100%|██████████| 455/455 [00:03<00:00, 141.75it/s, loss=1.3]     


Epoch 9: 0.11088654255961723


100%|██████████| 455/455 [00:02<00:00, 173.62it/s, loss=0.00624] 


Epoch 10: 0.10505175049757037


100%|██████████| 455/455 [00:02<00:00, 158.42it/s, loss=0.00174] 


Epoch 11: 0.09773251037763817


100%|██████████| 455/455 [00:02<00:00, 170.63it/s, loss=0.0239]  


Epoch 12: 0.09207670795488188


100%|██████████| 455/455 [00:02<00:00, 158.85it/s, loss=0.00119] 


Epoch 13: 0.09359412429241663


100%|██████████| 455/455 [00:03<00:00, 150.04it/s, loss=2.09]    


Epoch 14: 0.0869520059776095


100%|██████████| 455/455 [00:02<00:00, 163.82it/s, loss=0.0231]  


Epoch 15: 0.10274931702039183


100%|██████████| 455/455 [00:02<00:00, 153.64it/s, loss=0.205]   


Epoch 16: 0.08731692260561214


100%|██████████| 455/455 [00:02<00:00, 164.53it/s, loss=0.00182] 


Epoch 17: 0.06994059470388996


100%|██████████| 455/455 [00:02<00:00, 153.45it/s, loss=0.0213]  


Epoch 18: 0.07124230191869958


100%|██████████| 455/455 [00:02<00:00, 163.47it/s, loss=0.00873] 


Epoch 19: 0.08231027338953475


100%|██████████| 455/455 [00:02<00:00, 153.23it/s, loss=0.116]   


Epoch 20: 0.07401068576370053


100%|██████████| 455/455 [00:02<00:00, 163.71it/s, loss=4.93e-5] 


Epoch 21: 0.07935518567687104


100%|██████████| 455/455 [00:02<00:00, 169.16it/s, loss=0.0511]  


Epoch 22: 0.07768628310896442


100%|██████████| 455/455 [00:02<00:00, 154.08it/s, loss=0.00176] 


Epoch 23: 0.07598079352029469


100%|██████████| 455/455 [00:02<00:00, 158.92it/s, loss=0.00337] 


Epoch 24: 0.08288803798649463


100%|██████████| 455/455 [00:02<00:00, 166.43it/s, loss=0.00199] 


Epoch 25: 0.0707851212906578


100%|██████████| 455/455 [00:02<00:00, 170.70it/s, loss=0.000298]


Epoch 26: 0.07842977335977368


100%|██████████| 455/455 [00:02<00:00, 161.44it/s, loss=0.00409] 


Epoch 27: 0.08821960733724656


100%|██████████| 455/455 [00:02<00:00, 162.41it/s, loss=0.149]   


Epoch 28: 0.0799101669938187


100%|██████████| 455/455 [00:02<00:00, 172.07it/s, loss=1.92e-5] 


Epoch 29: 0.06302175828620282


100%|██████████| 455/455 [00:02<00:00, 160.80it/s, loss=7.85e-5] 

Epoch 30: 0.07135760995709006



