In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

In [None]:
# Change this to the location of your wdbc.csv file.
# NOTE(review): this is an absolute, machine-specific path; it has to be edited
# before the notebook can run on another machine.
file_raw_data = "/home/ralampay/workspace/pattern-recognition-course/data/wdbc.csv"

# header=None: the first line of the file is read as data, so the columns are
# labelled by position (0, 1, 2, ...).
raw_data = pd.read_csv(file_raw_data, header=None)

raw_data

In [None]:
# Feature matrix: all rows, columns at positions 2 through 31
# (the slice end 32 is exclusive).
x = raw_data.iloc[:,2:32]

x

In [None]:
# Number of feature columns = second entry of the frame's (rows, columns) shape.
num_features = x.shape[1]

print("Number of Features: {}".format(num_features))

In [None]:
# Rename the positional column labels to x0, x1, ..., x{num_features - 1}.
columns = ["x{}".format(i) for i in range(num_features)]

x.columns = columns

x

In [None]:
# Z-score standardization, column by column: subtract the column mean and
# divide by the column standard deviation.
x_mean, x_std = x.mean(), x.std()
x_standardized = x.sub(x_mean).div(x_std)

x_standardized

In [None]:
# Min-max scaling, column by column: (value - column min) / (column max - column min).
col_min = x.min()
col_range = x.max() - col_min
x_normalized = (x - col_min) / col_range

x_normalized

In [None]:
# Encode the diagnosis column (column 1) as integers: 'B' (benign) -> 0,
# 'M' (malignant) -> 1.
# An explicit mapping produces an integer result directly instead of relying on
# `replace` to downcast an object column (deprecated in recent pandas). Any label
# other than 'B'/'M' maps to NaN, which makes the integer cast raise instead of
# letting a stray string slip into the targets.
y = raw_data[1].map({'B': 0, 'M': 1}).astype('int64').to_numpy()

y

In [None]:
# Count rows per diagnosis label by summing the boolean masks on column 1.
diagnosis = raw_data.iloc[:, 1]
num_benign = int((diagnosis == 'B').sum())
num_malignant = int((diagnosis == 'M').sum())

print("num_benign: {}".format(num_benign))
print("num_malignant: {}".format(num_malignant))

In [None]:
# New frame holding the min-max scaled features plus the integer label as the
# last column 'y'; x_normalized itself is left untouched.
df = x_normalized.assign(y=y)

df

In [None]:
def partition_dataset(df, num_a=20, num_b=20, val_a=1, val_b=0, random_state=None):
    """Split ``df`` into a training frame and a class-balanced validation frame.

    The label is read from the last column of ``df``. ``num_a`` rows whose label
    equals ``val_a`` and ``num_b`` rows whose label equals ``val_b`` are sampled
    without replacement to form the validation set; every other row goes to the
    training set.

    ``df`` itself is not modified, so calling this repeatedly on the same frame
    always splits the full dataset.

    Args:
        df: DataFrame whose last column holds the class label.
        num_a: Number of validation rows to draw with label ``val_a``.
        num_b: Number of validation rows to draw with label ``val_b``.
        val_a: Label value of the first class.
        val_b: Label value of the second class.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make the
            split reproducible. ``None`` keeps the draw random.

    Returns:
        tuple: ``(df_training, df_validation)``.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a class has fewer rows
            than requested.
    """
    labels = df.iloc[:, -1]

    df_a = df[labels == val_a].sample(num_a, random_state=random_state)
    df_b = df[labels == val_b].sample(num_b, random_state=random_state)

    df_validation = pd.concat([df_a, df_b])

    # Build the training set as a new frame instead of dropping rows in place,
    # so the caller's DataFrame keeps all of its rows.
    df_training = df.drop(index=df_validation.index)

    return df_training, df_validation

# Hold out 20 rows per class (labels 1 and 0, the function's default val_a/val_b)
# as the validation set; the first returned frame is used as the training set.
training, validation = partition_dataset(df, num_a=20, num_b=20)

In [None]:
training

In [None]:
validation

In [None]:
# Feature matrices: every column except the trailing label column.
x_training = training.iloc[:, :-1].to_numpy()
x_validation = validation.iloc[:, :-1].to_numpy()


def _one_hot(labels):
    """Encode label 1 as [1, 0] and every other label as [0, 1]."""
    encoded = []
    for label in labels:
        encoded.append([1, 0] if label == 1 else [0, 1])
    return encoded


# From: scalar labels, e.g. [1, 0]
# To:   two-column targets, e.g. [[1, 0], [0, 1]]
y_training = _one_hot(training['y'].values)
y_validation = _one_hot(validation['y'].values)

y_training

In [None]:
class MultiLayerPerceptron(nn.Module):
    """Fully connected network: input_dim -> 25 -> 20 -> 10 -> output_dim.

    The three hidden layers use ReLU; the output layer is passed through an
    element-wise sigmoid, so each output unit lies in (0, 1) and the units are
    not normalized to sum to one.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Hidden stack (attribute names are part of the state_dict keys).
        self.input_1 = nn.Linear(input_dim, 25)
        self.input_2 = nn.Linear(25, 20)
        self.input_3 = nn.Linear(20, 10)
        # Output projection.
        self.output = nn.Linear(10, output_dim)

    def forward(self, x):
        """Return the sigmoid-activated output for a batch of feature rows."""
        hidden = x
        for layer in (self.input_1, self.input_2, self.input_3):
            hidden = F.relu(layer(hidden))

        return torch.sigmoid(self.output(hidden))

In [None]:
# Size the input layer from the training matrix instead of a hard-coded 30, so the
# model follows the feature selection made earlier. Two outputs match the
# two-column targets ([1, 0] / [0, 1]).
model = MultiLayerPerceptron(input_dim=x_training.shape[1], output_dim=2)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total

# Report the trainable-parameter count; the `:,` format adds thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Adam over every parameter of `model`, with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loss.
# NOTE(review): nn.CrossEntropyLoss is documented to expect unnormalized logits
# (it applies log-softmax itself), while MultiLayerPerceptron.forward ends in a
# sigmoid — confirm that feeding it squashed outputs is intended.
criterion = nn.CrossEntropyLoss()

In [None]:
# Device that the loss and the data batches are moved to. It is pinned to the CPU;
# the commented-out line below is the CUDA auto-detecting alternative (the model
# would then have to live on that device too).
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [None]:
# Move the loss module to the selected device.
# NOTE(review): no matching `model.to(device)` appears in the visible cells; that
# is harmless while device is 'cpu' but needs adding before switching to CUDA.
criterion = criterion.to(device)

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Map-style dataset pairing ``x[i]`` with ``y[i]``.

    Args:
        x: Indexable, sized container of inputs (e.g. a tensor or list).
        y: Indexable, sized container of targets with the same length as ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length. Without this check a
            mismatch only shows up later, as an IndexError in the middle of an
            epoch or as silently unused targets.
    """

    def __init__(self, x, y):
        if len(x) != len(y):
            raise ValueError(
                "x and y must have the same number of samples "
                "(got {} and {})".format(len(x), len(y))
            )

        self.x = x
        self.y = y

        # Cached so __len__ does not depend on later changes to x.
        self.n_samples = len(x)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

In [None]:
# The training tensors get their own names so the notebook-level `x` (feature
# DataFrame) and `y` (label array) are not overwritten; re-running the earlier
# preprocessing cells after this one therefore keeps working.
x_train_tensor = torch.tensor(x_training).float().to(device)
y_train_tensor = torch.tensor(y_training).float().to(device)

training_ds = CustomDataset(x_train_tensor, y_train_tensor)

In [None]:
from torch.utils.data import DataLoader

batch_size = 5

# Mini-batches are served in dataset order (no shuffling), and a final batch
# smaller than batch_size is kept rather than dropped.
train_loader = DataLoader(dataset=training_ds, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
def train_fn(loader, model, optimizer, loss_fn, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        loader: Iterable yielding ``(data, targets)`` tensor batches.
        model: Network being trained; it is put in train mode and called as
            ``model(data)``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Average of the per-batch loss values, or ``nan`` when the loader
        yields no batches (the average is undefined in that case).
    """
    model.train()

    loop = tqdm(loader)

    total_loss = 0.0
    num_batches = 0
    for data, targets in loop:
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Forward. Call the module itself (not .forward) so registered hooks run.
        # The output goes to the loss unchanged: the criterion used in this
        # notebook, nn.CrossEntropyLoss, applies log-softmax internally, so an
        # extra softmax here would normalize twice and flatten the gradients.
        predictions = model(data)
        loss = loss_fn(predictions, targets)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update tqdm
        batch_loss = loss.item()
        loop.set_postfix(loss=batch_loss)

        total_loss += batch_loss
        num_batches += 1

    if num_batches == 0:
        return float("nan")

    return total_loss / num_batches

# Number of full passes over train_loader.
epochs = 10

# Train for `epochs` passes, printing the epoch index and the average batch loss
# returned by train_fn after each pass.
for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(train_loader, model, optimizer, criterion, device)
    print("Ave Loss: {}".format(ave_loss))

In [None]:
# Inference only: switch the model to eval mode and disable autograd so no graph
# is built for the validation pass. The input is moved to `device`, matching what
# train_fn does with its batches.
model.eval()
with torch.no_grad():
    predictions = model(torch.tensor(x_validation).float().to(device))
print(predictions)

# Turn each pair of output scores into the same two-column layout as
# y_validation: [1, 0] when the first score is larger, otherwise [0, 1].
predictions = predictions.cpu().numpy()
predictions = [[1, 0] if _y[0] > _y[1] else [0, 1] for _y in predictions]
predictions

In [None]:
y_validation