Here we have tried modelling a CNN on a dataset of drug-side effect pair. But we only consider side effect groups which is 23 labels. And this is a multi-label classification problem where dataset is highly skewed.

In [766]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np

## For getting one-hot encoding for the smile strings (manual encoding)

In [None]:
df_smiles = pd.read_csv('STITCH_extended_isoSMILES.tsv', sep='\t')

In [5]:


# define SMILES characters ----------------------------------------------------
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@', ':','^', '|',
                'A', 'B', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'd','e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                't', 'u']
                
# define encoder and decoder --------------------------------------------------
smi2index = dict( (c,i) for i,c in enumerate( SMILES_CHARS ) )
index2smi = dict( (i,c) for i,c in enumerate( SMILES_CHARS ) )

def smiles_encoder( smiles, maxlen=400 ): 
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X # X will be the size of max sequence length * vocabulary size

def smiles_decoder( X ):
    smi = ''
    X = X.argmax( axis=-1 )
    for i in X:
        smi += index2smi[ i ]
    return smi

# get a taste of caffeine -----------------------------------------------------
caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'

caffeine_encoding = smiles_encoder(caffeine_smiles)

print(caffeine_encoding.shape)
print(caffeine_encoding)
print(smiles_decoder(caffeine_encoding))

(400, 61)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
CN1C=NC2=C1C(=O)N(C(=O)N2C)C                                                                                                                                                                                                                                                                                                                                                                                    


# Split_data

In [22]:
import collections
from sklearn.model_selection import train_test_split

In [147]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [154]:
def train_val_test_split(X, y, train_size):
    """Split dataset into data splits."""
#     X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y) # because of multilable classification, stratification is difficult as it needs more sample under each class
#     X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test

p.s.: The stratify parameter set it to split data in a way to allocate test_size amount of data to each class. In this case, you don't have sufficient class labels of one of your classes to keep the data splitting ratio equal to test_size.

In [875]:
df_data = pd.read_csv('SIDER_with_isoSMILES.tsv', sep='\t')
df_data

Unnamed: 0.1,Unnamed: 0,STITCH,SMILES,Blood and lymphatic system disorders,Cardiac disorders,"Congenital, familial and genetic disorders",Ear and labyrinth disorders,Endocrine disorders,Eye disorders,Gastrointestinal disorders,...,Musculoskeletal and connective tissue disorders,"Neoplasms benign, malignant and unspecified (incl cysts and polyps)",Nervous system disorders,"Pregnancy, puerperium and perinatal conditions",Psychiatric disorders,Renal and urinary disorders,Reproductive system and breast disorders,"Respiratory, thoracic and mediastinal disorders",Skin and subcutaneous tissue disorders,Vascular disorders
0,2,CID000000143,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc1)N2C=O,1,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,25,CID000000772,CC(=O)NC1C(O)OC(COS(=O)(=O)O)C(OC2OC(C(=O)O)C(OC3OC(CO)C(OC4OC(C(=O)O)C(O)C(O)C4OS(=O)(=O)O)C(OS(=O)(=O)O)C3NS(=O)(=O)O)C(O)C2OS(=O)(=O)O)C1O,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,50,CID000002019,Cc1c2oc3c(C)ccc(C(=O)NC4C(=O)NC(C(C)C)C(=O)N5CCCC5C(=O)N(C)CC(=O)N(C)C(C(C)C)C(=O)OC4C)c3nc-2c(C(=O)NC2C(=O)NC(C(C)C)C(=O)N3CCCC3C(=O)N(C)CC(=O)N(C)C(C(C)C)C(=O)OC2C)c(N)c1=O,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,66,CID000002156,CCCCc1oc2ccccc2c1C(=O)c1cc(I)c(OCC[NH+](CC)CC)c(I)c1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,70,CID000002162,CCOC(=O)C1=C(COCCN)NC(C)=C(C(=O)OC)C1c1ccccc1Cl,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,1549,CID070683024,COCCOC(=O)NCC(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)O[C@@](OC[C@H]2O[C@H](O[C@H](C)[C@H](N)C(=O)O)[C@H](NC(C)=O)[C@@H](O)[C@H]2O)(C(=O)O)C[C@@H]1O,1,1,1,1,0,1,1,...,1,0,1,1,0,0,0,1,1,1
821,1550,CID070685014,C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC(N)=O)NC(=O)[C@@H]2CSSC[C@@H](N)C(=O)N[C@H]3CSSC[C@H](NC1=O)C(=O)N[C@@H]([C@@H](C)O)C(=O)NCC(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)CSSC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCC(=O)O)NC3=O)C(=O)N2,1,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
822,1552,CID070788982,NCCCC[C@@H]1NC(=O)[C@@H](Cc2c[nH]c3ccccc23)NC(=O)[C@H](c2ccccc2)NC(=O)[C@@H]2C[C@@H](OC(=O)NCCN)CN2C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](Cc2ccc(OCc3ccccc3)cc2)NC1=O.N[C@@H](CC(=O)O)C(=O)O.N[C@@H](CC(=O)O)C(=O)O,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
823,1553,CID071306410,CC(C)(C)NCC(O)COc1nsnc1N1CCOCC1.CCNC1CN(CCCOC)S(=O)(=O)c2sc(S(N)(=O)=O)cc21,1,1,1,1,1,1,1,...,1,0,1,1,1,1,0,1,1,1


In [874]:
# random_sample_df = df_data.sample(n=400, random_state=42) # for pilot dataset
# random_sample_df = df_data.sample(n=400) # for pilot dataset
# random_sample_df

In [876]:
# Data
# X = random_sample_df["SMILES"].values
# y = random_sample_df.iloc[:, 3:26].values
X = df_data["SMILES"].values
y = df_data.iloc[:, 3:26].values
# X = df_data["SMILES"].values
# y = df_data.iloc[:, 4:27].values
print(y.shape)
print(X.shape)
# print(y[:,1].shape)

(825, 23)
(825,)


In [877]:
max_seq_len = max(1, max(len(sequence) for sequence in X))
print(max_seq_len)

398


In [620]:
# c = 0
# df_test = pd.read_csv('SIDER_with_isoSMILES.tsv', sep='\t')
# X_t = df_test["SMILES"].values
# y_t = df_test.iloc[:, 4:27].values
# for i in range(y_t.shape[0]):
#     if np.all(y_t[i] == 0):
#         c += 1
#         print(i)
# print(c)

In [625]:
##-- Calculate class occurance in the sampled data --##
class_vectors = np.array(y)
class_count = np.sum(class_vectors, axis=0)
print(len(class_count),  class_count, np.sum(class_count))


23 [378 380 369 336 345 395 398 399 354 396 397 399 372 391 342 399 384 372
 368 379 392 394 398] 8737


In [690]:
## -- one way of calculating class weights --##
from sklearn.utils.class_weight import compute_class_weight
from torch.autograd import Variable
def class_weights(dataset):
    Y = []
    for i in range(len(dataset)):
        Y.append(dataset[i][1])
    Y = np.vstack(Y)

    neg_weights = []
    pos_weights = []
    for i in range(Y.shape[1]):
        w = compute_class_weight(class_weight ="balanced", classes = np.unique(Y[:, i]), y=Y[:, i]) # Estimate class weights for unbalanced datasets.
        neg_weights.append(w[0]) # corresponds to negative of a label/class
        pos_weights.append(w[1]) # corresponds to positive of a label/class
    return torch.from_numpy(np.array([neg_weights, pos_weights])).type(torch.FloatTensor)
#     return Variable(torch.from_numpy(np.array([neg_weights, pos_weights])).type(torch.FloatTensor),
#                     requires_grad=False)

a = class_weights(y)
print(a)

tensor([[10.0000],
        [ 0.5263]])


In [626]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")

X_train: (280,), y_train: (280, 23)
X_val: (60,), y_val: (60, 23)
X_test: (60,), y_test: (60, 23)
Sample point: CC1=C(CC(=O)[O-])c2cc(F)ccc2/C1=C\c1ccc(S(C)=O... → [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


# Tokenizer

Convert our text input data into token indices. This means that every token (we can decide what a token is char, word, sub-word, etc.) is mapped to a unique index which allows us to represent our text as an array of indices.

In [627]:
import json
from collections import Counter
from more_itertools import take

In [628]:
class Tokenizer(object):
    def __init__(self, char_level, num_tokens=None,
                 pad_token="<PAD>", oov_token="<UNK>",
                 token_to_index=None):
        self.char_level = char_level
        self.separator = "" if self.char_level else " "
        if num_tokens: num_tokens -= 2 # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        self.min_token_freq = counts[-1][1]
        for token, count in counts:
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequence = []
            for token in text:
                sequence.append(self.token_to_index.get(
                    token, self.token_to_index[self.oov_token]))
            sequences.append(np.asarray(sequence))
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = []
            for index in sequence:
                text.append(self.index_to_token.get(index, self.oov_token))
            texts.append(self.separator.join([token for token in text]))
        return texts

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {
                "char_level": self.char_level,
                "oov_token": self.oov_token,
                "token_to_index": self.token_to_index
            }
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [629]:
# -- test --#
smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C']
tokenizer = Tokenizer(char_level=True, num_tokens=62)
tokenizer.fit_on_texts(texts=smiles)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") 
test = tokenizer.texts_to_sequences(smiles)
print(test)

<Tokenizer(num_tokens=10)>
[('<PAD>', 0), ('<UNK>', 1), ('C', 2), ('N', 3), ('=', 4)]
least freq token's freq: 2
[array([2, 3, 7, 2, 4, 3, 2, 8, 4, 2, 7, 2, 5, 4, 9, 6, 3, 5, 2, 5, 4, 9,
       6, 3, 8, 2, 6, 2])]


In [878]:
tokenizer = Tokenizer(char_level=True, num_tokens=62)
# tokenizer.fit_on_texts(texts=X_train)
tokenizer.fit_on_texts(texts=X)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") 

<Tokenizer(num_tokens=47)>
[('<PAD>', 0), ('<UNK>', 1), ('C', 2), ('(', 3), (')', 4)]
least freq token's freq: 1


In [879]:
# Convert texts to sequences of indices
X = tokenizer.texts_to_sequences(X)
# X_train = tokenizer.texts_to_sequences(X_train)
# X_val = tokenizer.texts_to_sequences(X_val)
# X_test = tokenizer.texts_to_sequences(X_test)
# preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
preprocessed_text = tokenizer.sequences_to_texts([X[0]])[0]
print ("Text to indices:\n"
    f"  (preprocessed) → {preprocessed_text}\n"
    f"  (tokenized) → {X[0]}")

Text to indices:
  (preprocessed) → Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc1)N2C=O
  (tokenized) → [13  5 12 16  5  3 10  6  4  5 14  5  3  8 16 11  9 12  4 13  2  2  3  2
 13  5 12  5  5  5  3  2  3 10  6  4 13  2  3  2  2  2  3 10  6  4  6  4
  2  3 10  6  4  6  4  5  5 12  4 13 14  2 10  6]


# One-hot encoding

OHE of the tokens will create matrix with binary values where each vocabulary or tokens will be indicated by 0 or 1.

In [880]:
def to_categorical(seq, num_classes):
    """One-hot encode a sequence of tokens."""
    one_hot = np.zeros((len(seq), num_classes)) # num_classes is the vocabulary size
    for i, item in enumerate(seq):
        one_hot[i, item] = 1.
    return one_hot

In [633]:
## -- test --##
print (X_train[0]) # token for the first smile in the data
print (len(X_train[0]))
cat = to_categorical(seq=X_train[0], num_classes=len(tokenizer))
print (cat)
print (cat.shape)

[ 2  2 12 10  2  4  2  2  4 10  6  5  9  6 20 11  5  3 15  3  3  4 23  5
  3  3  3 15 18  2 12 10  2 24  3 12  3  3  3  4 21  4  2  5 10  6  7  7
  7]
49
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(49, 38)


In [881]:
# Convert tokens to one-hot
vocab_size = len(tokenizer)
X = [to_categorical(seq, num_classes=vocab_size) for seq in X]
# X_train = [to_categorical(seq, num_classes=vocab_size) for seq in X_train]
# X_val = [to_categorical(seq, num_classes=vocab_size) for seq in X_val]
# X_test = [to_categorical(seq, num_classes=vocab_size) for seq in X_test]

In [882]:
print(len(X[0]))

64


# Padding
Our inputs are all of varying length but we need each batch to be uniformly shaped. Therefore, we will use padding to make all the inputs in the batch the same length.

Here we will create a batch of shape (N (i.e., sample size), max_seq_len, vocab_size) so we'll need to be able to pad 3D sequences.

In [883]:
def pad_sequences(sequences, max_seq_len=0): # ACROSS DIFFERENT BATCHES WE CAN MAKE IT CONSISTENT, SAY max_seq_len=382
    """Pad sequences to max length in sequence."""
    max_seq_len = max(max_seq_len, max(len(sequence) for sequence in sequences))
    num_classes = sequences[0].shape[-1]
    padded_sequences = np.zeros((len(sequences), max_seq_len, num_classes))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences


In [884]:
# -- Test -- #
print (X[0].shape, X[1].shape, X[2].shape)
padded = pad_sequences(X[0:3])
print (padded.shape) # (N (i.e., sample size), max_seq_len, vocab_size)

(64, 47) (141, 47) (174, 47)
(3, 174, 47)


# Dataset

In [637]:
import torch
import torch.nn as nn

In [638]:
# -- test--#
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __str__(self):
        return f"<Dataset(N={len(self)})>"
    
    def __getitem__(self, idx):
        return self.data[idx]

# Create some example data
data = [1, 2, 3, 4, 5]

# Create an instance of the custom dataset
dataset = CustomDataset(data)

# Access individual samples using indexing
print(dataset[0])  # Output: 1
print(dataset[2])  # Output: 3
print(len(dataset))
print(str(dataset))


# y = torch.LongTensor(y_train.astype(np.int32))
y = torch.LongTensor(y_train)
print(y.shape)
print(y[0])
print(y_train.shape)

# X = pad_sequences(X_train[0:3], max_seq_len=1)

# # X = torch.FloatTensor(np.array(X).astype(np.int32)) # float because input is kind of continuous in the pixel space
# # #         y = torch.LongTensor(y.astype(np.int32)) # integer casting of labels as we only have 0 and 1
y = torch.LongTensor(np.array(y_train).astype(np.int32))
print(y)
print(y.shape)
# # print(np.array(X).astype(np.int32))
# print(X, y)
# print(X_train[0].shape, X_train[2].shape)

1
3
5
<Dataset(N=5)>
torch.Size([280, 23])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(280, 23)
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
torch.Size([280, 23])


In [639]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y, max_filter_size):
        self.X = X
        self.y = y
        self.max_filter_size = max_filter_size # this is padding arguement

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return X, y

    def collate_fn(self, batch):
        """Custom collate function."""
        batch_X = [item[0] for item in batch]
        batch_y = [item[1] for item in batch]

        # Pad sequences (if necessary)
        batch_X = pad_sequences(batch_X, max_seq_len=self.max_filter_size) # padding for consistent dimensions for each input batch


        # Convert batch_y to tensors and reshape
#         batch_y = torch.stack([torch.LongTensor(y).view(1, -1) for y in batch_y], dim=0)
        batch_y = torch.stack([torch.FloatTensor(y).view(1, -1) for y in batch_y], dim=0)

        # Cast
        batch_X = torch.FloatTensor(batch_X) # float because input is kind of continuous in the pixel space
    #     batch_y = torch.LongTensor(batch_y)

        return batch_X, batch_y


    def create_dataloader(self, batch_size, shuffle=False, drop_last=True):
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True) 



drop_last: If set to True, it drops the last incomplete batch if its size is less than the specified batch size. This is commonly used to ensure all batches have the same size.

pin_memory: If set to True, data loader copies Tensors into pinned memory before returning them. This can speed up GPU transfer.

In [640]:
# Create datasets for embedding
train_dataset = Dataset(X=X_train, y=y_train, max_filter_size=1)
val_dataset = Dataset(X=X_val, y=y_val, max_filter_size=1)
test_dataset = Dataset(X=X_test, y=y_test, max_filter_size=1)
print ("Datasets:\n"
    f"  Train dataset:{str(train_dataset)}\n"
    f"  Val dataset: {str(val_dataset)}\n"
    f"  Test dataset: {str(test_dataset)}\n"
    "Sample point:\n"
    f"  X: {test_dataset[0][0]}\n"
    f"  y: {test_dataset[0][1]}\n"
       f" X_test.shape: {test_dataset[0][0].shape}\n"
       f" Y_test.shape: {test_dataset[0][1].shape}")

Datasets:
  Train dataset:<Dataset(N=280)>
  Val dataset: <Dataset(N=60)>
  Test dataset: <Dataset(N=60)>
Sample point:
  X: [[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
  y: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 X_test.shape: (49, 38)
 Y_test.shape: (23,)


In [436]:
# Iterate over batches
# for batch_X, batch_y in dataloader:
#     # Print shape of y in the current batch
#     print("Shape of y in current batch:", batch_y.shape)
#     print("Shape of x in current batch:", batch_X.shape)


In [641]:
# Create dataloaders
batch_size = 16
train_dataloader = train_dataset.create_dataloader(batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(batch_size=batch_size)
batch_X, batch_y = next(iter(test_dataloader))
print ("Sample batch:\n"
    f"  X: {list(batch_X.size())}\n"
    f"  y: {list(batch_y.size())}\n"
    "Sample point:\n"
    f"  X: {batch_X[0]}\n"
    f"  y: {batch_y[0]}")
print(batch_y.shape)
y_new = batch_y.squeeze(1)
print(y_new.shape)

Sample batch:
  X: [16, 49, 38]
  y: [16, 1, 23]
Sample point:
  X: tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
  y: tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1.]])
torch.Size([16, 1, 23])
torch.Size([16, 23])


In [498]:
# batch_X, batch_y = next(iter(test_dataloader))
# inputs, y_true = batch_X, batch_y
# print(batch_y)
# for i, batch in enumerate(test_dataloader):
#     inputs, y_true = batch
#     print(y_true)
    

# CNN Model

In [642]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [815]:
class CNN(nn.Module):
    def __init__(self, vocab_size, num_filters, filter_size,
                 hidden_dim, dropout_p, num_classes):
        super(CNN, self).__init__()

        # Convolutional filters
        self.filter_size = filter_size # we'll used 1d filters like 1x3
        self.conv = nn.Conv1d(
            in_channels=vocab_size, out_channels=num_filters,
            kernel_size=filter_size, stride=1, padding=0, padding_mode="zeros") # padding 0 is no padding
        self.batch_norm = nn.BatchNorm1d(num_features=num_filters) 

        # FC layers
        self.fc1 = nn.Linear(num_filters, hidden_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation function for multi-label classification

    def forward(self, inputs, channel_first=False,):

        # Rearrange input so num_channels is in dim 1 (N, C, L)
        # With PyTorch, when dealing with convolution, our inputs (X) need to have the 
        # channels as the second dimension, so our inputs will be (N, vocab_size, max_seq_len)
        x_in, = inputs
        if not channel_first:
            x_in = x_in.transpose(1, 2)

        # Padding for `SAME` padding
        max_seq_len = x_in.shape[2] # after transpose, the 3rd dim (i.e. 2) is the seq length
        padding_left = int((self.conv.stride[0]*(max_seq_len-1) - max_seq_len + self.filter_size)/2)
        padding_right = int(math.ceil((self.conv.stride[0]*(max_seq_len-1) - max_seq_len + self.filter_size)/2))

        # Conv outputs
        z = self.conv(F.pad(x_in, (padding_left, padding_right)))
        z = self.batch_norm(z)  # Batch normalization
        z = F.relu(z)  # Activation function
        z = F.max_pool1d(z, z.size(2)).squeeze(2) # a 1-dimensional max pooling operation to the input tensor z along its last dimension

        # FC layer
        z = self.fc1(z)
        z = self.dropout(z)
        z = self.fc2(z)
        z = self.sigmoid(z)  # Sigmoid activation function for multi-label classification
        return z


In [901]:
NUM_FILTERS = 50
HIDDEN_DIM = 300
DROPOUT_P = 0.2
FILTER_SIZE = 3
NUM_CLASSES = test_dataset[0][1].shape[0]

In [902]:
# Set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print (device)

cpu


In [903]:
# Initialize model
model = CNN(vocab_size=VOCAB_SIZE, num_filters=NUM_FILTERS, filter_size=FILTER_SIZE,
            hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
model = model.to(device) # set device
print (model.named_parameters)

<bound method Module.named_parameters of CNN(
  (conv): Conv1d(47, 50, kernel_size=(3,), stride=(1,))
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=50, out_features=300, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=300, out_features=23, bias=True)
  (sigmoid): Sigmoid()
)>


# Training

In [647]:
from torch.optim import Adam

In [851]:
## -- Trainer class to run single or multi-CV training (both exmaples are covered at the following)-- #
class Trainer(object):
    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Train step."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):

            # Step
            batch = [item.to(self.device) for item in batch]  # Set device
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
#             J = self.loss_fn(z, targets)  # Define loss
            J = self.loss_fn(z, targets.squeeze(1))  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step."""
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]

                z = self.model(inputs)  # Forward pass
#                 J = self.loss_fn(z, y_true).item()
                J = self.loss_fn(z, y_true.squeeze(1)).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
#                 y_prob = F.softmax(z).cpu().numpy() # probably we don't need this as we already put sigmoid on the output layer
                y_prob = z.detach().numpy() # probably this is how we cast tensor to numpy
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch[:-1], batch[-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        history = {'train_loss': [], 'test_loss': []}
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
            history['train_loss'].append(train_loss)
            history['test_loss'].append(val_loss)
        avg_train_loss = np.mean(history['train_loss'])
        avg_test_loss = np.mean(history['test_loss'])
#         print(avg_train_loss,avg_test_loss)
        return best_model, avg_train_loss, avg_test_loss

## for single CV training that uses the previously done train_test_split dataset

In [888]:
LEARNING_RATE = 1e-3
PATIENCE = 5
NUM_EPOCHS = 20

In [889]:
# This is necessary as our dataset is massively skewed to positives for all classes
def calculate_pos_weights(data): # data = labels
#     class_vectors = np.array(data)
    class_counts = np.sum(data, axis=0)
#     pos_weights = np.ones_like(class_counts)
    neg_counts = [len(data)-pos_count for pos_count in class_counts]
    pos_weights = []
    for i in range(len(class_counts)):
        pos_weights.append(neg_counts[i]/ class_counts[i] + 1e-5)
    return torch.as_tensor(np.array(pos_weights), dtype=torch.float).to(device)

In [890]:
class_weights = calculate_pos_weights(data=y) # positive weights
print(class_weights)

tensor([0.0550, 0.0591, 0.0827, 0.2009, 0.1587, 0.0300, 0.0061, 0.0073, 0.1411,
        0.0160, 0.0123, 0.0037, 0.0813, 0.0274, 0.1702, 0.0135, 0.0404, 0.0645,
        0.0870, 0.0673, 0.0274, 0.0160, 0.0061])


In [891]:
# Define Loss
# class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
# loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)

In [848]:
# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

The ReduceLROnPlateau scheduler in PyTorch adjusts the learning rate when a metric has stopped improving. 

optimizer: The optimizer for which the learning rate will be adjusted based on the specified metric.

mode: Specifies whether to monitor the metric for improvement by 'min' or 'max'. If 'min', the learning rate will be reduced when the monitored metric has stopped decreasing; if 'max', it will be reduced when the monitored metric has stopped increasing.

factor: The factor by which the learning rate will be reduced. For example, if factor=0.1, the learning rate will be multiplied by 0.1.

patience: The number of epochs with no improvement after which the learning rate will be reduced. If set to 3, for example, the learning rate will be reduced after 3 epochs with no improvement.

In [849]:
# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)

In [850]:
# Train
best_model = trainer.train(
    NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)

Epoch: 1 | train_loss: 0.07757, val_loss: 0.07692, lr: 1.00E-03, _patience: 5
Epoch: 2 | train_loss: 0.07603, val_loss: 0.07680, lr: 1.00E-03, _patience: 5
Epoch: 3 | train_loss: 0.07433, val_loss: 0.07637, lr: 1.00E-03, _patience: 5
Epoch: 4 | train_loss: 0.07305, val_loss: 0.07587, lr: 1.00E-03, _patience: 5
Epoch: 5 | train_loss: 0.07147, val_loss: 0.07624, lr: 1.00E-03, _patience: 4
Epoch: 6 | train_loss: 0.07023, val_loss: 0.07619, lr: 1.00E-03, _patience: 3
Epoch: 7 | train_loss: 0.07002, val_loss: 0.07574, lr: 1.00E-03, _patience: 5
Epoch: 8 | train_loss: 0.06873, val_loss: 0.07624, lr: 1.00E-03, _patience: 4
Epoch: 9 | train_loss: 0.06848, val_loss: 0.07667, lr: 1.00E-03, _patience: 3
Epoch: 10 | train_loss: 0.06757, val_loss: 0.07767, lr: 1.00E-03, _patience: 2
Epoch: 11 | train_loss: 0.06670, val_loss: 0.07744, lr: 1.00E-04, _patience: 1
Stopping early!
0.07129105634310028 0.07656022476201708


In [751]:
print(best_model.parameters)
# print(best_model.eval

<bound method Module.parameters of CNN(
  (conv): Conv1d(38, 50, kernel_size=(3,), stride=(1,))
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=100, out_features=23, bias=True)
  (sigmoid): Sigmoid()
)>


# K-fold CV

In [892]:
from sklearn.model_selection import KFold
import random
from torch.utils.data import random_split,SubsetRandomSampler, ConcatDataset

In [904]:
k=3
splits=KFold(n_splits=k,shuffle=True,random_state=42)

In [905]:
## this class includes sampling from different folds ##
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y, max_filter_size):
        self.X = X
        self.y = y
        self.max_filter_size = max_filter_size # this is padding arguement
#         self.sampler = sampler # sampling list

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return X, y

    def collate_fn(self, batch):
        """Custom collate function."""
        batch_X = [item[0] for item in batch]
        batch_y = [item[1] for item in batch]

        # Pad sequences (if necessary)
        batch_X = pad_sequences(batch_X, max_seq_len=self.max_filter_size) # padding for consistent dimensions for each input batch


        # Convert batch_y to tensors and reshape
#         batch_y = torch.stack([torch.LongTensor(y).view(1, -1) for y in batch_y], dim=0)
        batch_y = torch.stack([torch.FloatTensor(y).view(1, -1) for y in batch_y], dim=0)

        # Cast
        batch_X = torch.FloatTensor(batch_X) # float because input is kind of continuous in the pixel space
    #     batch_y = torch.LongTensor(batch_y)

        return batch_X, batch_y


    def create_dataloader(self, batch_size, sampler, shuffle=False, drop_last=True):
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, sampler=sampler, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True) 



In [906]:
dataset = Dataset(X=X, y=y, max_filter_size=400)
print ("Datasets:\n"
    f"  Train dataset:{str(dataset)}\n"
    "Sample point:\n"
    f"  X: {dataset[0][0]}\n"
    f"  y: {dataset[0][1]}\n"
       f" X.shape: {dataset[0][0].shape}\n"
       f" Y.shape: {dataset[0][1].shape}")

Datasets:
  Train dataset:<Dataset(N=825)>
Sample point:
  X: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
  y: [1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 X.shape: (64, 47)
 Y.shape: (23,)


In [907]:
LEARNING_RATE = 1e-3
PATIENCE = 10
NUM_EPOCHS = 40

# --define class weights
class_weights = calculate_pos_weights(data=y) # positive weights
print(class_weights)

#--Define Loss
loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)

#--Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

tensor([0.0550, 0.0591, 0.0827, 0.2009, 0.1587, 0.0300, 0.0061, 0.0073, 0.1411,
        0.0160, 0.0123, 0.0037, 0.0813, 0.0274, 0.1702, 0.0135, 0.0404, 0.0645,
        0.0870, 0.0673, 0.0274, 0.0160, 0.0061])


In [910]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import hamming_loss, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

# Create dataloaders
batch_size = 32
k=10
splits=KFold(n_splits=k,shuffle=True)


# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)

    
history = {'Hamming loss': [], 'microPrecision': [], 'microRecall': [], 'microF1': [],'micro_auc': [], 'macro_auc': [] }

for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(dataset)))):

    print('Fold {}'.format(fold + 1))

    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(val_idx)
    train_dataloader = dataset.create_dataloader(batch_size=batch_size, sampler=train_sampler)
    val_dataloader = dataset.create_dataloader(batch_size=batch_size, sampler=test_sampler)
    batch_X, batch_y = next(iter(val_dataloader))
    print ("Sample batch:\n"
    f"  X: {list(batch_X.size())}\n"
    f"  y: {list(batch_y.size())}\n")
        # Train
    best_model, avg_train_loss,avg_test_loss = trainer.train(
        NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)
    
    print(f'Performance of {fold} fold:')
    print("Average Training Loss: {:.4f} \t Average Test Loss: {:.4f} ".format(avg_train_loss,avg_test_loss)) 
    
    # For evaluation
    test_loss, y_true, y_pred_prob = trainer.eval_step(dataloader=val_dataloader)

    # 2. Apply thresholding to convert probabilities to binary predictions
    threshold = 0.5
    y_pred_binary = (y_pred_prob > threshold).astype(float)

    # 3. Calculate evaluation metrics
    hamming_loss_value = hamming_loss(y_true, y_pred_binary)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred_binary, average='micro')
    # Micro-average AUC
    try:
        micro_auc = roc_auc_score(y_true.ravel(), y_pred_prob.ravel())
        macro_auc = roc_auc_score(y_true, y_pred_prob, average='macro')
    except ValueError:
        pass
#     micro_auc = roc_auc_score(y_true, y_pred_prob)

#     # Macro-average AUC
#     macro_auc = roc_auc_score(y_true, y_pred_prob, average='macro')

    history['Hamming loss'].append(hamming_loss_value)
    history['microPrecision'].append(precision)
    history['microRecall'].append(recall)
    history['microF1'].append(f1)
    history['micro_auc'].append(micro_auc)
    history['macro_auc'].append(macro_auc)
    # Print or use the evaluation metrics as needed
    print("Hamming Loss: {:.4f} \t mPrecision: {:.4f} \t mRecall: {:.4f} \t mF1: {:.4f} \t micro_auc: {:.4f} \t macros_auc: {:.4f} ".format(hamming_loss_value,precision,recall,f1,micro_auc,macro_auc))
    


Fold 1
Sample batch:
  X: [32, 400, 47]
  y: [32, 1, 23]

Epoch: 1 | train_loss: 0.07371, val_loss: 0.07915, lr: 1.00E-08, _patience: 10
Epoch: 2 | train_loss: 0.07362, val_loss: 0.07950, lr: 1.00E-08, _patience: 9
Epoch: 3 | train_loss: 0.07411, val_loss: 0.07253, lr: 1.00E-08, _patience: 10
Epoch: 4 | train_loss: 0.07345, val_loss: 0.07563, lr: 1.00E-08, _patience: 9
Epoch: 5 | train_loss: 0.07417, val_loss: 0.07595, lr: 1.00E-08, _patience: 8
Epoch: 6 | train_loss: 0.07412, val_loss: 0.07208, lr: 1.00E-08, _patience: 10
Epoch: 7 | train_loss: 0.07412, val_loss: 0.08335, lr: 1.00E-08, _patience: 9
Epoch: 8 | train_loss: 0.07360, val_loss: 0.07875, lr: 1.00E-08, _patience: 8
Epoch: 9 | train_loss: 0.07418, val_loss: 0.07806, lr: 1.00E-08, _patience: 7
Epoch: 10 | train_loss: 0.07377, val_loss: 0.08375, lr: 1.00E-08, _patience: 6
Epoch: 11 | train_loss: 0.07363, val_loss: 0.08586, lr: 1.00E-08, _patience: 5
Epoch: 12 | train_loss: 0.07371, val_loss: 0.07585, lr: 1.00E-08, _patience: 4


Epoch: 4 | train_loss: 0.07396, val_loss: 0.07415, lr: 1.00E-08, _patience: 8
Epoch: 5 | train_loss: 0.07429, val_loss: 0.07445, lr: 1.00E-08, _patience: 7
Epoch: 6 | train_loss: 0.07416, val_loss: 0.07551, lr: 1.00E-08, _patience: 6
Epoch: 7 | train_loss: 0.07410, val_loss: 0.07760, lr: 1.00E-08, _patience: 5
Epoch: 8 | train_loss: 0.07424, val_loss: 0.07580, lr: 1.00E-08, _patience: 4
Epoch: 9 | train_loss: 0.07425, val_loss: 0.07353, lr: 1.00E-08, _patience: 3
Epoch: 10 | train_loss: 0.07413, val_loss: 0.07378, lr: 1.00E-08, _patience: 2
Epoch: 11 | train_loss: 0.07381, val_loss: 0.07475, lr: 1.00E-08, _patience: 1
Stopping early!
Performance of 5 fold:
Average Training Loss: 0.0741 	 Average Test Loss: 0.0750 
Hamming Loss: 0.8492 	 mPrecision: 1.0000 	 mRecall: 0.0968 	 mF1: 0.1765 	 micro_auc: 0.6240 	 macros_auc: 0.7496 
Fold 7
Sample batch:
  X: [32, 400, 47]
  y: [32, 1, 23]

Epoch: 1 | train_loss: 0.07501, val_loss: 0.07082, lr: 1.00E-08, _patience: 10
Epoch: 2 | train_loss: 

In [870]:
# print(y_true)

In [911]:
print("Average performance over all folds: ".format(avg_train_loss,avg_test_loss))

print("Hamming Loss:", np.mean(history['Hamming loss']))
print("Precision:", np.mean(history['microPrecision']))
print("Recall:", np.mean(history['microRecall']))
print("F1-score:", np.mean(history['microF1']))
print("micro_auc:", np.mean(history['micro_auc']))
print("macro_auc:", np.mean(history['macro_auc']))

Average performance over all folds: 
Hamming Loss: 0.8649456521739131
Precision: 0.9992307692307693
Recall: 0.09043640958314962
F1-score: 0.16579009603889885
micro_auc: 0.6200477743306908
macro_auc: 0.74051493043151


So recall and F1 is really poor for the whole dataset. But for a side effect predictor, its best to minimize false negatives as we don't want say no side effect when there can be a true side effect that can exist. So need high recall. Also hamming loss is really high. So i believe, for a whole dataset, we need a more complex models like many kernels and filters and hidden layers to improve prediction.

# Evaluation

In [752]:
import json
from pathlib import Path
from sklearn.metrics import precision_recall_fscore_support

In [859]:
from sklearn.metrics import hamming_loss, accuracy_score, precision_recall_fscore_support

# 1. Get predictions from the model
test_loss, y_true, y_pred_prob = trainer.eval_step(dataloader=test_dataloader)


# 2. Apply thresholding to convert probabilities to binary predictions
threshold = 0.5
# threshold = 0.043 # 1/23 classes
# y_pred_binary = (y_pred_prob > threshold).astype(int)
y_pred_binary = (y_pred_prob > threshold).astype(float)

print(y_true.shape, y_pred_prob.shape, y_pred_binary.shape)
for i in range(y_true.shape[0]):
    if np.array_equal(y_true[i], y_pred_binary[i]):
        print(y_true[i], y_pred_binary[i])

# print(y_true[i], y_pred_binary[i]) for i in range(y_true.shape[0]) if y_true[i] == y_pred_binary[i] 

# 3. Calculate evaluation metrics
hamming_loss_value = hamming_loss(y_true, y_pred_binary)
accuracy = accuracy_score(y_true, y_pred_binary)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred_binary, average='micro')

# Print or use the evaluation metrics as needed
print("Hamming Loss:", hamming_loss_value)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


RuntimeError: Given groups=1, weight of size [50, 45, 3], expected input[16, 38, 51] to have 45 channels, but got 38 channels instead

In [855]:
# for i in range(y_true.shape[0]):
#     print(y_true[i], y_pred_binary[i])
#     print(y_pred_prob[i])