In [1]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [2]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [4]:
PandasTools.AddMoleculeColumnToFrame(train,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test,'SMILES','Molecule')

In [5]:
train.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,Molecule
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37,<rdkit.Chem.rdchem.Mol object at 0x000001BC523...
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47,<rdkit.Chem.rdchem.Mol object at 0x000001BC523...
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,<rdkit.Chem.rdchem.Mol object at 0x000001BC523...
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6,<rdkit.Chem.rdchem.Mol object at 0x000001BC523...
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43,<rdkit.Chem.rdchem.Mol object at 0x000001BC523...


In [6]:
fp = AllChem.GetHashedMorganFingerprint(train["Molecule"].iloc[0], 6, nBits=4096)
ar = np.zeros((1,), dtype=np.int8)
DataStructs.ConvertToNumpyArray(fp, ar)

In [7]:
ar

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [5]:
def mol2fp(mol):
  fp = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
  ar = np.zeros((1,), dtype=np.int8)
  DataStructs.ConvertToNumpyArray(fp, ar)
  return ar

In [6]:
train["FPs"] = train.Molecule.apply(mol2fp)
test["FPs"] = test.Molecule.apply(mol2fp)

In [7]:
train = train[['FPs','MLM', 'HLM']]
test = test[['FPs']]

In [8]:
class CustomDataset(Dataset):
  def __init__(self, df, target, transform, is_test=False):
    self.df = df
    self.target = target # HLM or MLM
    self.is_test = is_test # train,valid / test

    self.feature_select = transform
    if not self.is_test:
      self.fp = self.feature_select.fit_transform(np.stack(df['FPs']))
    else: # valid or test
      self.fp = self.feature_select.transform(np.stack(df['FPs']))

  def __getitem__(self, index):
    fp = self.fp[index]
    if not self.is_test: # test가 아닌 경우(label 존재)
      label = self.df[self.target][index]
      return torch.tensor(fp).float(), torch.tensor(label).float().unsqueeze(dim=-1) # feature, label

    else: # test인 경우
      return torch.tensor(fp).float() # feature
      
  def __len__(self):
    return len(self.df)

In [9]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(df=train, target='MLM', transform=transform, is_test=False)
train_HLM = CustomDataset(df=train, target='HLM', transform=transform, is_test=False)

input_size = train_MLM.fp.shape[1]
input_size

251

In [11]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256,
       'EPOCHS': 100,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8,
       'LEARNING_RATE': 0.001}

In [12]:
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [13]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [14]:
class Net(nn.Module):
  def __init__(self, input_size, hidden_size, dropout_rate, out_size):
    super(Net, self).__init__()
    
    # fc 레이어 3개와 출력 레이어
    self.fc1 = nn.Linear(input_size, hidden_size) 
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fc3 = nn.Linear(hidden_size, hidden_size)
    self.fc_out = nn.Linear(hidden_size, out_size)
    
    # 정규화
    self.ln1 = nn.LayerNorm(hidden_size)
    self.ln2 = nn.LayerNorm(hidden_size)
    self.ln3 = nn.LayerNorm(hidden_size)        
    
    # 활성화 함수
    self.activation = nn.LeakyReLU()
    
    # Dropout
    self.dropout = nn.Dropout(dropout_rate)
    
  def forward(self, x):
    out = self.fc1(x)
    out = self.ln1(out)
    out = self.activation(out)
    out = self.dropout(out)
    
    out = self.fc2(out)
    out = self.ln2(out)
    out = self.activation(out)
    out = self.dropout(out)
    
    out = self.fc3(out)
    out = self.ln3(out)
    out = self.activation(out)
    out = self.dropout(out)

    out = self.fc_out(out)
    return out

In [15]:
model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])

In [16]:
criterion = nn.MSELoss()
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])

In [17]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
  model.train()
  
  for epoch in range(epochs):
    running_loss = 0
    for inputs, targets in train_loader:
      optimizer.zero_grad()
      
      output = model(inputs)
      loss = criterion(output, targets)
      loss.backward()
      optimizer.step()
      
      running_loss += loss.item()
    
    if epoch % 100 == 0:
      valid_loss = 0
      with torch.no_grad():
        for inputs, targets in valid_loader:
          output = model(inputs)
          loss = criterion(output, targets)
          valid_loss += loss.item()
              
      print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_HLM_loader)}')
      
      model.train()
  
  return model

In [18]:
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS']) 

Training Start: MLM
Epoch: 0/100, Train Loss: 2092.469171697443, Valid Loss: 1900.7298990885417
Training Start: HLM
Epoch: 0/100, Train Loss: 3231.799161044034, Valid Loss: 2640.923095703125


In [19]:
test_MLM = CustomDataset(df=test, target=None, transform=transform, is_test=True)
test_HLM = CustomDataset(df=test, target=None, transform=transform, is_test=True)

test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [20]:
def inference(test_loader, model):
    model.eval()
    preds = []
    
    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())
    
    return preds

In [21]:
predictions_MLM = inference(test_MLM_loader, model_MLM)
predictions_HLM = inference(test_HLM_loader, model_HLM)

In [23]:
submission = pd.read_csv('./data/sample_submission.csv')
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
485,TEST_485,0,0
486,TEST_486,0,0
487,TEST_487,0,0
488,TEST_488,0,0


In [24]:
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,5.405065,30.902985
1,TEST_001,72.821205,83.917885
2,TEST_002,17.116953,62.560791
3,TEST_003,45.830437,64.533440
4,TEST_004,17.479342,83.008675
...,...,...,...
485,TEST_485,29.197660,52.771233
486,TEST_486,84.147728,85.586189
487,TEST_487,17.749884,85.448059
488,TEST_488,30.138720,82.093597


In [25]:
submission.to_csv('baseline_submission.csv', index=False)