In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

# Reproducibility: seed PyTorch's global RNG once, up front, so that weight
# initialisation, dataset splitting and DataLoader shuffling give the same
# results after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
class DNN(nn.Module):
    """Fully connected classifier with two equally wide hidden layers.

    Each hidden layer is followed by a ReLU; the output layer is linear and
    returns raw, unnormalised class scores.

    Args:
        input_size: number of features in one input sample.
        hidden_size: width shared by both hidden layers.
        num_classes: number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Layers are built in forward order, so parameter initialisation
        # consumes the RNG in a fixed sequence.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [None]:

# Train the model
def train(model, train_loader, optimizer, criterion, epoch):
    """Run one training epoch and log the loss on every 10th batch.

    Args:
        model: module to optimise; it is switched to training mode.
        train_loader: iterable of ``(data, target)`` batches that also exposes
            ``dataset`` and ``__len__`` (both are used for the progress line).
        optimizer: optimiser whose gradients are reset and which is stepped
            once per batch.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss tensor.
        epoch: epoch number; only used in the log message.
    """
    model.train()
    # Send every batch to wherever the model's parameters live instead of
    # relying on a notebook-global ``device`` variable, so the function cannot
    # get out of sync with the model it is given.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else None

    # Samples consumed before the current batch.  A running count stays correct
    # even when the logged batch is the (shorter) final batch, whereas
    # ``batch_idx * len(data)`` under-reports in that case.
    seen = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        batch_size = len(data)
        if run_device is not None:
            data, target = data.to(run_device), target.to(run_device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    seen,
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )
        seen += batch_size
# Test the model
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    ones = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.data.max(1, keepdim=True)[
                1
            ]  # get the index of the max log-probability
            labels = target.data.max(1, keepdim=True)[
                1
            ]
            correct += pred.eq(labels).cpu().sum()
            ones += pred.sum()
    test_loss /= len(test_loader.dataset)
    print(
        " Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%) {}  ".format(
            test_loss,
            correct,
            len(test_loader.dataset),
            100.0 * correct / len(test_loader.dataset),
            ones
        )
    )
    return correct / len(test_loader.dataset)

In [None]:
# Load the pre-computed DataFrame used by the rest of the notebook; later cells
# read its 'vec' column and its side-effect label columns.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
# NOTE(review): the relative path looks like a mounted Google Drive folder in
# Colab - confirm, and consider moving it into a configurable DATA_DIR.
smiles_df = pd.read_pickle('./drive/MyDrive/colab/sider_with_vec_08.pickle')

In [None]:
# Show only the row count and the mean of each numeric column.
# Presumably the label columns are 0/1 flags, in which case the mean is the
# share of positive samples per label - TODO confirm.
smiles_df.describe().loc[['count', 'mean']]

Unnamed: 0,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
count,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,...,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0,1427.0
mean,0.520673,0.697968,0.015417,0.613875,0.806587,0.698669,0.909601,0.175893,0.717589,0.50946,...,0.177295,0.704975,0.742817,0.711983,0.638402,0.087596,0.461808,0.692362,0.913805,0.662929


mean이 0.5에 가까운(즉, 양성/음성 비율이 비슷한) 두 라벨인 Hepatobiliary disorders와 Ear and labyrinth disorders로 실험한다.


In [None]:
# Network dimensions: the input width is the length of one feature vector.
vecs = smiles_df['vec']
# Positional access (.iloc) reads the first row whatever the index labels are;
# vecs[0] is a label lookup and raises KeyError when no row is labelled 0.
input_size = len(vecs.iloc[0])
hidden_size = 10000
print(input_size, hidden_size)

30522 10000


In [None]:
# Two-class model, optimiser and loss that are trained below on the
# 'Hepatobiliary disorders' dataset.
hd_model = DNN(input_size, hidden_size, 2)
# NOTE(review): lr=1e-6 is a very small step size for AdamW; confirm it is
# intentional, since the recorded test accuracy stays around 55-64 %.
hd_optimizer = optim.AdamW(hd_model.parameters(), lr=1e-6)
hd_criterion = nn.CrossEntropyLoss()

In [None]:
class HDDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing a row's feature vector with a one-hot label.

    Args:
        smiles_df: DataFrame with a 'vec' column (one feature vector per row)
            and a label column named ``se``.
        se: name of the label column.  A value equal to 0 becomes the target
            ``[1, 0]``; any other value becomes ``[0, 1]``.
    """

    def __init__(self, smiles_df, se):
        self.smiles_df = smiles_df
        self.vecs = smiles_df['vec']
        self.hd = smiles_df[se]

    def __getitem__(self, idx):
        """Return ``(features, one_hot_target)`` for the row at position ``idx``."""
        # Samplers and random_split hand out positions 0..len-1.  Those only
        # coincide with index labels for a default RangeIndex, so use
        # positional access to stay correct for filtered/re-indexed frames.
        label = self.hd.iloc[idx]
        if label == 0:
            target = torch.tensor([1.0, 0.0])
        else:
            target = torch.tensor([0.0, 1.0])
        # torch.Tensor(...) yields a float32 tensor, as the model expects.
        return (torch.Tensor(self.vecs.iloc[idx]), target)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.smiles_df)

# Dataset whose targets come from the 'Hepatobiliary disorders' label column.
hd_dataset = HDDataset(smiles_df, 'Hepatobiliary disorders')

In [None]:
# 70 % / 20 % / 10 % split into train / test / validation subsets.  A dedicated
# seeded generator keeps the partition identical across kernel restarts, so
# reported metrics always refer to the same held-out samples.
hd_train_dataset, hd_test_dataset, hd_val_dataset = torch.utils.data.random_split(
    hd_dataset, [0.7, 0.2, 0.1], generator=torch.Generator().manual_seed(42)
)

In [None]:
# Only the training loader shuffles.  Evaluation metrics do not depend on
# sample order, so the test/validation loaders iterate in a fixed order to
# keep evaluation deterministic.
hd_train_loader = torch.utils.data.DataLoader(hd_train_dataset, batch_size=24, shuffle=True)
hd_test_loader = torch.utils.data.DataLoader(hd_test_dataset, batch_size=24, shuffle=False)
hd_val_loader = torch.utils.data.DataLoader(hd_val_dataset, batch_size=24, shuffle=False)

In [None]:
# Move the model to the selected device and put it in training mode.
hd_model = hd_model.to(device).train()
# IPython shell escape: print the current GPU status.
!nvidia-smi

Thu Dec 15 01:11:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    50W / 400W |  12680MiB / 40536MiB |     17%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Train for 50 epochs and evaluate on the test loader after every epoch.
# NOTE(review): hd_val_loader is not used in the cells visible here; per-epoch
# monitoring would normally use the validation split and keep the test split
# for a single final evaluation - confirm intent.
for epoch in range(50):
  train(hd_model, hd_train_loader, hd_optimizer, hd_criterion, epoch)
  test(hd_model, hd_test_loader, hd_criterion)

 Test set: Average loss: 0.0292, Accuracy: 173/286 (60%) 134  
 Test set: Average loss: 0.0305, Accuracy: 183/286 (64%) 190  
 Test set: Average loss: 0.0300, Accuracy: 168/286 (59%) 245  
 Test set: Average loss: 0.0295, Accuracy: 172/286 (60%) 227  
 Test set: Average loss: 0.0295, Accuracy: 179/286 (63%) 218  
 Test set: Average loss: 0.0289, Accuracy: 174/286 (61%) 131  
 Test set: Average loss: 0.0295, Accuracy: 180/286 (63%) 153  
 Test set: Average loss: 0.0296, Accuracy: 174/286 (61%) 129  
 Test set: Average loss: 0.0302, Accuracy: 175/286 (61%) 124  
 Test set: Average loss: 0.0292, Accuracy: 182/286 (64%) 209  
 Test set: Average loss: 0.0295, Accuracy: 183/286 (64%) 188  
 Test set: Average loss: 0.0331, Accuracy: 154/286 (54%) 43  
 Test set: Average loss: 0.0296, Accuracy: 177/286 (62%) 156  
 Test set: Average loss: 0.0315, Accuracy: 157/286 (55%) 54  
 Test set: Average loss: 0.0306, Accuracy: 169/286 (59%) 222  
 Test set: Average loss: 0.0302, Accuracy: 170/286 (59%) 

In [None]:
# Pull a single batch from the training loader, e.g. to inspect it by hand.
# NOTE(review): `data` / `targets` are not read by any later cell visible
# here - possibly leftover debugging; confirm before removing.
data, targets = next(iter(hd_train_loader))

In [None]:
# Fraction of samples in the Hepatobiliary test split whose target arg-max is
# class 1, for comparison with the accuracies printed during training.
ones, cnt = 0, 0
for batch in hd_test_loader:
    target = batch[1]
    # Index of the largest entry in each target row, i.e. the true class.
    labels = torch.max(target.data, dim=1, keepdim=True).indices
    ones = ones + labels.sum()
    cnt = cnt + len(target)
print(ones / cnt)

tensor(0.5280)


In [None]:
# Fresh two-class model, optimiser and loss for the second label
# ('Ear and labyrinth disorders'), configured like the hd_* objects.
el_model = DNN(input_size, hidden_size, 2)
# NOTE(review): lr=1e-6 is a very small step size for AdamW - confirm intent.
el_optimizer = optim.AdamW(el_model.parameters(), lr=1e-6)
el_criterion = nn.CrossEntropyLoss()

# Dataset, split and loaders for the 'Ear and labyrinth disorders' label.
el_dataset = HDDataset(smiles_df, 'Ear and labyrinth disorders')
# A seeded generator makes the 70/20/10 train/test/validation partition
# reproducible across kernel restarts.
el_train_dataset, el_test_dataset, el_val_dataset = torch.utils.data.random_split(
    el_dataset, [0.7, 0.2, 0.1], generator=torch.Generator().manual_seed(42)
)
# Only the training loader shuffles; evaluation loaders keep a fixed order so
# evaluation is deterministic.
el_train_loader = torch.utils.data.DataLoader(el_train_dataset, batch_size=24, shuffle=True)
el_test_loader = torch.utils.data.DataLoader(el_test_dataset, batch_size=24, shuffle=False)
el_val_loader = torch.utils.data.DataLoader(el_val_dataset, batch_size=24, shuffle=False)

# Move the model to the selected device and put it in training mode.
el_model = el_model.to(device).train()
# IPython shell escape: print the current GPU status.
!nvidia-smi

Thu Dec 15 01:18:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    51W / 400W |  15012MiB / 40536MiB |     31%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Train for 10 epochs and evaluate on the test loader after every epoch.
# NOTE(review): el_val_loader is not used in the cells visible here - confirm
# whether per-epoch monitoring was meant to use the validation split.
for epoch in range(10):
  train(el_model, el_train_loader, el_optimizer, el_criterion, epoch)
  test(el_model, el_test_loader, el_criterion)

 Test set: Average loss: 0.0296, Accuracy: 142/286 (50%)   
 Test set: Average loss: 0.0297, Accuracy: 143/286 (50%)   
 Test set: Average loss: 0.0316, Accuracy: 136/286 (48%)   
 Test set: Average loss: 0.0292, Accuracy: 154/286 (54%)   
 Test set: Average loss: 0.0313, Accuracy: 135/286 (47%)   
 Test set: Average loss: 0.0292, Accuracy: 154/286 (54%)   
 Test set: Average loss: 0.0291, Accuracy: 162/286 (57%)   
 Test set: Average loss: 0.0297, Accuracy: 148/286 (52%)   
 Test set: Average loss: 0.0296, Accuracy: 157/286 (55%)   
 Test set: Average loss: 0.0297, Accuracy: 149/286 (52%)   


In [None]:
# Fraction of samples in the Ear-and-labyrinth test split whose target arg-max
# is class 1, for comparison with the accuracies printed during training.
ones, cnt = 0, 0
for batch in el_test_loader:
    target = batch[1]
    # Index of the largest entry in each target row, i.e. the true class.
    labels = torch.max(target.data, dim=1, keepdim=True).indices
    ones = ones + labels.sum()
    cnt = cnt + len(target)
print(ones / cnt)

tensor(0.4545)
