# **2.3 Learning Unsupervised Embeddings for Molecules**

In [5]:
!pip install -qq --pre deepchem
import deepchem
import warnings
warnings.filterwarnings('ignore')
deepchem.__version__

'2.8.1.dev'

In [6]:
import deepchem as dc
# Load MUV with a stratified split. The `ids` of the train and validation
# splits are kept separately because the following cells treat them as the
# SMILES strings fed to the sequence model.
tasks, datasets, transformers = dc.molnet.load_muv(split='stratified')
(train_dataset, valid_dataset, test_dataset) = datasets
train_smiles, valid_smiles = train_dataset.ids, valid_dataset.ids

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


In [7]:
# Vocabulary: every distinct character that occurs in any training string,
# as a sorted list.
tokens = sorted({ch for smiles in train_smiles for ch in smiles})

In [None]:
from deepchem.models.optimizers import Adam, ExponentialDecay
# Length of the longest training string; passed to the model as its
# third positional argument.
max_length = len(max(train_smiles, key=len))
batch_size = 100
# Training strings per batch-sized chunk (true division, so this is a float).
batches_per_epoch = len(train_smiles) / batch_size
# NOTE(review): presumably (initial rate, decay factor, decay steps), i.e. the
# rate is scaled by 0.9 roughly once per pass over the data - confirm against
# the DeepChem ExponentialDecay signature.
learning_rate_schedule = ExponentialDecay(0.001, 0.9, batches_per_epoch)
# The same token list is passed twice: the model is trained below to map each
# string onto itself.
model = dc.models.SeqToSeq(
    tokens,
    tokens,
    max_length,
    encoder_layers=2,
    decoder_layers=2,
    embedding_dimension=256,
    model_dir='fingerprint',
    batch_size=batch_size,
    learning_rate=learning_rate_schedule,
)

SeqToSeq의 손실 함수가 불규칙하므로, 학습률을 점차 줄여 가며(약 1 에폭마다 학습률에 0.9를 곱함) 과적합되지 않고 일반화될 수 있도록 함

Overshooting 방지 / Local Minimum 방지

Scheduling/Learning Rate Decay

In [None]:
def generate_sequences(epochs):
  """Yield (input, target) training pairs for the autoencoder.

  Every string in the global `train_smiles` is paired with itself, and the
  whole collection is replayed `epochs` times in its original order.

  Args:
    epochs: number of full passes over `train_smiles` to emit.

  Yields:
    Tuples `(smiles, smiles)`.
  """
  for _ in range(epochs):
    yield from ((smiles, smiles) for smiles in train_smiles)

# Train the model on 40 passes over the self-paired training strings.
n_epochs = 40
model.fit_sequences(generate_sequences(n_epochs))

In [None]:
# Sanity check: count how many validation strings the model reproduces
# exactly. The sample is sliced once and reused, and the report uses its real
# length, so the message stays correct if fewer than 500 strings exist.
sample_smiles = valid_smiles[:500]
predicted = model.predict_from_sequences(sample_smiles)
# Each prediction is a sequence of tokens; join it back into one string
# before comparing with the original.
count = sum(1 for s, p in zip(sample_smiles, predicted) if ''.join(p) == s)
print('reproduced', count, 'of', len(sample_smiles), 'validation SMILES strings')

In [None]:
import numpy as np
def _embeddings_dataset(smiles, dataset):
  """Embed `smiles` with the trained model and wrap the result as a dataset.

  The embeddings become the features; labels, weights (cast to float32) and
  ids are taken from `dataset`.

  Returns:
    Tuple `(embeddings, NumpyDataset)`.
  """
  embeddings = model.predict_embeddings(smiles)
  wrapped = dc.data.NumpyDataset(embeddings,
                                 dataset.y,
                                 dataset.w.astype(np.float32),
                                 dataset.ids)
  return embeddings, wrapped

train_embeddings, train_embeddings_dataset = _embeddings_dataset(train_smiles, train_dataset)
valid_embeddings, valid_embeddings_dataset = _embeddings_dataset(valid_smiles, valid_dataset)

In [None]:
# Classifier trained on the embeddings: one output per task, 256 input
# features (the same number as `embedding_dimension` of the sequence model)
# and a single hidden layer of 512 units.
classifier_config = dict(n_tasks=len(tasks), n_features=256, layer_sizes=[512])
classifier = dc.models.MultitaskClassifier(**classifier_config)
classifier.fit(train_embeddings_dataset, nb_epoch=10)

In [None]:
# ROC AUC combined across tasks with np.mean, in classification mode.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")
# Score the training split first, then the validation split.
train_score, valid_score = (
    classifier.evaluate(split, [metric], transformers)
    for split in (train_embeddings_dataset, valid_embeddings_dataset)
)
print('Training set ROC AUC:', train_score)
print('Validation set ROC AUC:', valid_score)

SeqToSeq는 인코더와 디코더로 나뉨

Self-Reconstruction으로 smiles->벡터->smiles를 했을 때 얼마나 똑같이 복원하는지 확인

인코딩만 하고, 그 결과로 얻은 Embedding으로 목표로 하는 물성을 학습

# 💡 2.3 SeqToSeq Embedding Insight

AI가 Self-Reconstruction을 통해 분자의 핵심 특성을 스스로 정의하고 수치화할 수 있음

256차원의 벡터로 응축함으로써, 정보 손실 최소화/ 데이터 표준화

이 Latent Space는 미세한 화학적 패턴 포착 가능