In [None]:
pip install transformers torch datasets rdkit tqdm


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd

# Read the dataset from the CSV file (adapt to your format)
data_path = "output.csv"  # Replace with your file path
df = pd.read_csv(data_path)

# Hub checkpoint identifiers, named once so each tokenizer/model pair
# is guaranteed to come from the same checkpoint.
CHEM_CHECKPOINT = "DeepChem/ChemBERTa-77M-MLM"
PROT_CHECKPOINT = "Rostlab/prot_bert_bfd"

# Chemistry-side tokenizer and encoder, resolved through the Auto classes
chem_tokenizer = AutoTokenizer.from_pretrained(CHEM_CHECKPOINT)
chem_model = AutoModel.from_pretrained(CHEM_CHECKPOINT)

# Protein-side tokenizer and encoder, resolved through the Auto classes
prot_tokenizer = AutoTokenizer.from_pretrained(PROT_CHECKPOINT)
prot_model = AutoModel.from_pretrained(PROT_CHECKPOINT)




tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

In [None]:
# Combine SMILES and protein sequence tokenizers with prompt engineering
def tokenize_function(examples):
    """Tokenize one batch of examples for the two-encoder model.

    Args:
        examples: Batch dict with list-valued 'smiles', 'protein_sequence' and
            'ki_value' entries (the layout `Dataset.map(..., batched=True)` passes).

    Returns:
        dict with
        - 'input_ids' / 'attention_mask': tensors of shape (batch, 1024). Columns
          0-511 come from `chem_tokenizer` applied to the text prompt, columns
          512-1023 from `prot_tokenizer` applied to the protein sequence.
        - 'ki_value': the target values, passed through unchanged.
    """
    smiles = examples['smiles']
    protein_sequence = examples['protein_sequence']

    # Create prompts
    # NOTE(review): chem_tokenizer belongs to a SMILES model; an English prompt that
    # also embeds the full protein sequence is likely to be truncated at 512 tokens.
    # Consider tokenizing only the SMILES string here - confirm intent.
    prompts = [f"Given the SMILES: {smi} and the protein sequence: {prot}, predict the Ki value."
               for smi, prot in zip(smiles, protein_sequence)]

    # Tokenize the prompts
    prompt_tokens = chem_tokenizer(prompts, truncation=True, padding='max_length', max_length=512, return_tensors="pt")

    # ProtBERT's vocabulary consists of single upper-case residues, so the sequence
    # must be passed as space-separated letters with the rare residues U/Z/O/B mapped
    # to X (see the model card). An unspaced sequence is one unknown "word" and
    # collapses to a single [UNK] token.
    rare_to_unknown = str.maketrans("UZOB", "XXXX")
    spaced_proteins = [
        " ".join("".join(seq.split()).upper().translate(rare_to_unknown))
        for seq in protein_sequence
    ]

    # Tokenize protein sequence
    protein_tokens = prot_tokenizer(spaced_proteins, truncation=True, padding='max_length', max_length=512, return_tensors="pt")

    # Combine the input_ids and attention_mask from both tokenizations
    combined_input_ids = torch.cat([prompt_tokens['input_ids'], protein_tokens['input_ids']], dim=1)
    combined_attention_mask = torch.cat([prompt_tokens['attention_mask'], protein_tokens['attention_mask']], dim=1)

    # Keep the batch axis: squeezing would turn a single-example batch into a 1-D
    # tensor, which `Dataset.map` would then misread as 1024 separate rows.
    return {
        'input_ids': combined_input_ids,
        'attention_mask': combined_attention_mask,
        'ki_value': examples['ki_value']  # Ki values as targets
    }

# Convert the dataframe to a Huggingface dataset
dataset = Dataset.from_pandas(df)

# Work on a random subset. The shuffle is seeded so a fresh "Restart & Run All"
# picks the same rows, and the subset size is capped at the number of rows that
# exist so a small CSV does not make select() raise an IndexError.
SUBSET_SIZE = 1000
SHUFFLE_SEED = 42
dataset = dataset.shuffle(seed=SHUFFLE_SEED).select(range(min(SUBSET_SIZE, len(dataset))))


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns (only the ones that are actually present;
# remove_columns raises on a name the dataset does not have)
unneeded_columns = [
    column
    for column in ('compound_id', 'target_id', 'protein_sequence', 'smiles')
    if column in tokenized_dataset.column_names
]
tokenized_dataset = tokenized_dataset.remove_columns(unneeded_columns)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:

# Split into train and validation sets.
# Trainer only computes a loss (and therefore reports `eval_loss`) when the batch
# carries a label column that the model's forward() accepts. The regression target
# is therefore exposed under the conventional column name "labels" before splitting.
SPLIT_SEED = 42
if "ki_value" in tokenized_dataset.column_names and "labels" not in tokenized_dataset.column_names:
    labeled_dataset = tokenized_dataset.rename_column("ki_value", "labels")
else:
    labeled_dataset = tokenized_dataset

train_test_split = labeled_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Tell Trainer explicitly which column holds the targets and which metric
    # selects the best checkpoint (lower validation MSE is better).
    label_names=["labels"],
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


class DPIPredictor(torch.nn.Module):
    """Two-encoder regressor predicting a Ki value from a prompt and a protein sequence.

    The first-token embeddings of `chem_model` and `prot_model` are concatenated
    and mapped to a single scalar by a linear layer.

    Args:
        chem_model: Encoder applied to the first `prompt_length` token columns.
        prot_model: Encoder applied to the remaining token columns.
        prompt_length: Number of leading columns of `input_ids` that belong to
            `chem_model`. Defaults to 512, the padding length used at tokenization.
    """

    def __init__(self, chem_model, prot_model, prompt_length=512):
        super(DPIPredictor, self).__init__()
        self.chem_model = chem_model
        self.prot_model = prot_model
        self.prompt_length = prompt_length
        # Linear layer for regression; output is one scalar (Ki value) per example
        self.fc = torch.nn.Linear(chem_model.config.hidden_size + prot_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Predict Ki values and, when targets are supplied, the MSE loss.

        Args:
            input_ids: Tensor of shape (batch, prompt_length + protein_length).
            attention_mask: Tensor with the same shape as `input_ids`.
            labels: Optional tensor of shape (batch,) holding target Ki values.

        Returns:
            A tensor of shape (batch,) when `labels` is None. Otherwise a dict
            with 'loss' (scalar MSE) and 'logits' (the same (batch,) predictions),
            which is the form Trainer needs to obtain a real training/eval loss.
        """
        # Split input_ids and attention_mask for prompts and protein sequences
        protein_length = input_ids.size(1) - self.prompt_length
        sections = [self.prompt_length, protein_length]
        prompt_input_ids, prot_input_ids = torch.split(input_ids, sections, dim=1)
        prompt_attention_mask, prot_attention_mask = torch.split(attention_mask, sections, dim=1)

        # Get embeddings from ChemBERTa (torch.split returns views, so make them contiguous)
        chem_outputs = self.chem_model(prompt_input_ids.contiguous(), attention_mask=prompt_attention_mask.contiguous())
        chem_embedding = chem_outputs.last_hidden_state[:, 0, :]  # CLS token embedding

        # Get embeddings from ProtBERT
        prot_outputs = self.prot_model(prot_input_ids.contiguous(), attention_mask=prot_attention_mask.contiguous())
        prot_embedding = prot_outputs.last_hidden_state[:, 0, :]  # CLS token embedding

        # Concatenate embeddings and pass through regression head
        combined_embedding = torch.cat([chem_embedding, prot_embedding], dim=1)
        ki_prediction = self.fc(combined_embedding).squeeze(-1).contiguous()  # shape: [batch_size]

        if labels is None:
            return ki_prediction

        # Without an explicit loss Trainer falls back to `outputs[0]`, i.e. it would
        # treat the first prediction of the batch as the loss value.
        targets = labels.to(ki_prediction.dtype).reshape(-1)
        loss = torch.nn.functional.mse_loss(ki_prediction, targets)
        return {"loss": loss, "logits": ki_prediction}

    # Override the state_dict method to ensure all weights are contiguous before saving
    def state_dict(self, *args, **kwargs):
        """Return the module state dict with every tensor made contiguous."""
        state_dict = super().state_dict(*args, **kwargs)
        # Reassign in place so the mapping object returned by torch is preserved.
        for key in state_dict:
            if isinstance(state_dict[key], torch.Tensor) and not state_dict[key].is_contiguous():
                state_dict[key] = state_dict[key].contiguous()
        return state_dict




# Initialize model
dpi_model = DPIPredictor(chem_model, prot_model)

# Define the Trainer object.
# No data_collator is passed: every example is already padded to a fixed length and
# the targets are scalars, so Trainer's default collator (used when no tokenizer is
# given) simply stacks them. DataCollatorForSeq2Seq expects token-sequence labels
# and cannot collate scalar regression targets.
trainer = Trainer(
    model=dpi_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()



Epoch,Training Loss,Validation Loss
1,-12.0155,No log


KeyError: "The `metric_for_best_model` training argument is set to 'eval_mse', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."