In [1]:
import os

# Expose only GPU 0 to this process and switch off Weights & Biases logging.
# Done in the very first cell so both variables are already set when the
# later cells import and configure the training libraries.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "WANDB_DISABLED": "true",
    }
)

In [3]:
pip install accelerate

[0mCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0mInstalling collected packages: accelerate
[0mSuccessfully installed accelerate-0.21.0
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
from pathlib import Path

import accelerate
import datasets
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast
from transformers import BertConfig, BertForMaskedLM


from genslm.utils import read_fasta, Sequence

In [5]:
# Sequence pre-processing helpers

# Assign a unique character to each codon so that we can use it as an
# input token to a BPE tokenizer. This implements a codon-pair encoding.
# Besides the 64 nucleotide triplets there is one extra "XXX" entry.
CODON_CHAR = {
    "TCG": "A",
    "GCA": "B",
    "CTT": "C",
    "ATT": "D",
    "TTA": "E",
    "GGG": "F",
    "CGT": "G",
    "TAA": "H",
    "AAA": "I",
    "CTC": "J",
    "AGT": "K",
    "CCA": "L",
    "TGT": "M",
    "GCC": "N",
    "GTT": "O",
    "ATA": "P",
    "TAC": "Q",
    "TTT": "R",
    "TGC": "S",
    "CAC": "T",
    "ACG": "U",
    "CCC": "V",
    "ATC": "W",
    "CAT": "X",
    "AGA": "Y",
    "GAG": "Z",
    "GTG": "a",
    "GGT": "b",
    "GCT": "c",
    "TTC": "d",
    "AAC": "e",
    "TAT": "f",
    "GTA": "g",
    "CCG": "h",
    "ACA": "i",
    "CGA": "j",
    "TAG": "k",
    "CTG": "l",
    "GGA": "m",
    "ATG": "n",
    "TCT": "o",
    "CGG": "p",
    "GAT": "q",
    "ACC": "r",
    "GAC": "s",
    "GTC": "t",
    "TGG": "u",
    "CCT": "v",
    "GAA": "w",
    "TCA": "x",
    "CAA": "y",
    "AAT": "z",
    "ACT": "0",
    "GCG": "1",
    "GGC": "2",
    "CTA": "3",
    "AAG": "4",
    "AGG": "5",
    "CAG": "6",
    "AGC": "7",
    "CGC": "8",
    "TTG": "9",
    "TCC": "!",
    "TGA": "@",
    "XXX": "*"
}

# Inverse lookup (character -> codon) needed by `decode_grouped_context`.
# Derived from CODON_CHAR so the two tables can never drift apart.
CHAR_CODON = {char: codon for codon, char in CODON_CHAR.items()}

# Decoding is only well defined when every codon owns a distinct character.
assert len(CHAR_CODON) == len(CODON_CHAR), "CODON_CHAR characters must be unique"


def group_and_contextualize(seq: str, k: int = 3) -> str:
    """Encode a nucleotide string as one character per codon.

    The sequence is cut into consecutive, non-overlapping chunks of ``k``
    characters, upper-cased, and each chunk is replaced by its character
    from ``CODON_CHAR``.

    Args:
        seq: Nucleotide sequence; case is ignored.
        k: Chunk size. ``CODON_CHAR`` only has 3-letter keys, so the
            default of 3 is the value that yields a non-empty encoding.

    Returns:
        The encoded string. Chunks that are not keys of ``CODON_CHAR``
        (for example a trailing partial codon, or a triplet containing an
        ambiguous base) are dropped silently, so the result may be shorter
        than ``len(seq) // k``.
    """
    chunks = (seq[start : start + k] for start in range(0, len(seq), k))
    grouped_codons = " ".join(chunks).upper()
    # Unknown or incomplete chunks map to "" and therefore vanish from the output.
    return "".join(CODON_CHAR.get(codon, "") for codon in grouped_codons.split())


def decode_grouped_context(seq: str, sep: str = " ") -> str:
    """Turn a codon-character string back into nucleotide triplets.

    Args:
        seq: String made of characters that appear as values in ``CODON_CHAR``.
        sep: Separator placed between the decoded codons.

    Returns:
        The decoded codons joined by ``sep``.

    Raises:
        KeyError: If ``seq`` contains a character with no codon assigned.
    """
    return sep.join(CHAR_CODON[elem] for elem in seq)

In [6]:
def train_model(
    config: BertConfig, train_args: TrainingArguments, tokenizer, dataset, data_collator
):
    """Build a masked-LM BERT from ``config`` and run a full training pass.

    The model is constructed directly from the configuration; no pretrained
    weights are loaded in this function.

    Args:
        config: Architecture description passed to ``BertForMaskedLM``.
        train_args: Training arguments forwarded to the ``Trainer``.
        tokenizer: Tokenizer forwarded to the ``Trainer``.
        dataset: Object indexable by ``"train"`` and ``"test"``; those two
            entries become the training and evaluation sets.
        data_collator: Batch collator forwarded to the ``Trainer``.

    Returns:
        The ``Trainer`` instance, after ``train()`` has returned.
    """
    model = BertForMaskedLM(config)

    # Print the total parameter count, expressed in millions.
    model_size = sum(param.numel() for param in model.parameters())
    print(f"BERT size: {model_size/1000**2:.1f}M parameters")

    trainer_kwargs = dict(
        model=model,
        args=train_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.train()
    return trainer

# Vocab size 50,257

In [7]:
# Setup Tokenizer: load the codon-BPE tokenizer for this vocabulary size and
# register the special tokens on it.

# Role name -> literal token string; handed to `add_special_tokens` below.
special_tokens = {
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "mask_token": "[MASK]",
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
}

# NOTE(review): the saved output of this cell is an OSError stating that this
# name is neither a local folder nor a Hub model id, so the tokenizer directory
# has to exist relative to the working directory before this cell can run.
tokenizer = PreTrainedTokenizerFast.from_pretrained("mdh-codon-bpe-vs50257")
tokenizer.add_special_tokens(special_tokens)
# Make the cell's last expression `None` so the notebook does not echo the
# return value of the call above.
None

OSError: mdh-codon-bpe-vs50257 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
# Dataset: read the FASTA file, codon-encode it, tokenize to a fixed length and
# split into train/test.

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
sequence_file = Path(
    "/lambda_stor/homes/khippe/genslm_foundation/genome_data/mdh_sc23/fasta/mdh_natural_sequences.ffn"
)
sequences = read_fasta(sequence_file)


# One character per codon, ready for the BPE tokenizer.
dataset_seqs = [group_and_contextualize(seq.sequence) for seq in sequences]
# Every example is padded or truncated to exactly 1024 token ids.
tokenized_seqs = tokenizer(
    dataset_seqs,
    max_length=1024,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)


# `Dataset.from_dict` is fed plain Python lists rather than tensors.
data = {
    "input_ids": tokenized_seqs.input_ids.tolist(),
    "attention_mask": tokenized_seqs.attention_mask.tolist(),
}

dataset = Dataset.from_dict(data)
# Fixed seed: without it the 95/5 split is reshuffled on every kernel restart,
# which makes evaluation losses incomparable between runs and lets held-out
# rows end up in the training set when training is resumed from a checkpoint.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
print(dataset)

# Collator configured for masked-language-modelling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

In [None]:
# Modelling

# 50m params
config = BertConfig(
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8,
    intermediate_size=2048,
    # len(tokenizer) also counts tokens registered through add_special_tokens,
    # whereas tokenizer.vocab_size covers the base vocabulary only. Sizing the
    # embedding table from the full length keeps every token id in range.
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    # Must match the max_length used when the dataset was tokenized.
    max_position_embeddings=1024,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-50m",
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    # 28 examples x 2 accumulation steps = 56 examples per optimizer step per device.
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    fp16=True,
    push_to_hub=False,
)

# Keep a handle on the trainer (as the vs30000 cells do) so it can be inspected
# after training instead of only being echoed as the cell output.
trainer = train_model(config, args, tokenizer, dataset, data_collator)

In [None]:
# Modelling

# 125m params
# Only the vocabulary, position and special-token fields are overridden; all
# other architecture fields keep the BertConfig defaults.
config = BertConfig(
    # len(tokenizer) also counts tokens registered through add_special_tokens,
    # whereas tokenizer.vocab_size covers the base vocabulary only. Sizing the
    # embedding table from the full length keeps every token id in range.
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    # Must match the max_length used when the dataset was tokenized.
    max_position_embeddings=1024,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Trainer

args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-125m",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    # 8 examples x 2 accumulation steps = 16 examples per optimizer step per device.
    gradient_accumulation_steps=2,
    # NOTE(review): the other training cells use 10 epochs; confirm 1 is intended here.
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    fp16=True,
    push_to_hub=False,
)

# Keep a handle on the trainer (as the vs30000 cells do) so it can be inspected
# after training instead of only being echoed as the cell output.
trainer = train_model(config, args, tokenizer, dataset, data_collator)

# Vocab Size 30,000

In [11]:
# Setup Tokenizer: load the codon-BPE tokenizer for this vocabulary size and
# register the special tokens on it. Rebinds `tokenizer`, so every later cell
# uses this tokenizer.

# Role name -> literal token string; handed to `add_special_tokens` below.
special_tokens = {
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "mask_token": "[MASK]",
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
}

# The name is resolved as a local folder or a Hub model id.
tokenizer = PreTrainedTokenizerFast.from_pretrained("mdh-codon-bpe-vs30000")
tokenizer.add_special_tokens(special_tokens)
# Make the cell's last expression `None` so the notebook does not echo the
# return value of the call above.
None

In [12]:
# Dataset: read the FASTA file, codon-encode it, tokenize to a fixed length and
# split into train/test.

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
sequence_file = Path(
    "/lambda_stor/homes/khippe/genslm_foundation/genome_data/mdh_sc23/fasta/mdh_natural_sequences.ffn"
)
sequences = read_fasta(sequence_file)


# One character per codon, ready for the BPE tokenizer.
dataset_seqs = [group_and_contextualize(seq.sequence) for seq in sequences]
# Every example is padded or truncated to exactly 256 token ids.
tokenized_seqs = tokenizer(
    dataset_seqs,
    max_length=256,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)


# `Dataset.from_dict` is fed plain Python lists rather than tensors.
data = {
    "input_ids": tokenized_seqs.input_ids.tolist(),
    "attention_mask": tokenized_seqs.attention_mask.tolist(),
}

dataset = Dataset.from_dict(data)
# Fixed seed: without it the 95/5 split is reshuffled on every kernel restart,
# which makes evaluation losses incomparable between runs and lets held-out
# rows end up in the training set when training is resumed from a checkpoint.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
print(dataset)

# Collator configured for masked-language-modelling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34799
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1832
    })
})


In [14]:
# 41.0M params
config = BertConfig(
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8,
    intermediate_size=2048,
    # len(tokenizer) also counts tokens registered through add_special_tokens,
    # whereas tokenizer.vocab_size covers the base vocabulary only. Sizing the
    # embedding table from the full length keeps every token id in range.
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    # Must match the max_length used when the dataset was tokenized.
    max_position_embeddings=256,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-50m-vs30000",
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    # 28 examples x 2 accumulation steps = 56 examples per optimizer step per device.
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    fp16=True,
    push_to_hub=False,
)

trainer = train_model(config, args, tokenizer, dataset, data_collator)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Generate config GenerationConfig {
  "bos_token_id": 5,
  "eos_token_id": 6,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



BERT size: 41.0M parameters


Using cuda_amp half precision backend
***** Running training *****
  Num examples = 34799
  Num Epochs = 10
  Instantaneous batch size per device = 28
  Total train batch size (w. parallel, distributed & accumulation) = 56
  Gradient Accumulation steps = 2
  Total optimization steps = 6210
  Number of trainable parameters = 41005872


Step,Training Loss,Validation Loss
50,10.2365,10.130811
100,9.7482,9.586514
150,9.2644,9.145963
200,9.0118,8.973907
250,8.9147,8.877105
300,8.8249,8.754184
350,8.7226,8.658297
400,8.6243,8.570626
450,8.5252,8.464322
500,8.4577,8.387164


***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
Saving model checkpoint to mdh-cBPE-BERT-50m-vs30000/checkpoint-500
Configuration saved in mdh-cBPE-BERT-50m-vs30000/checkpoint-500/config.json
Configuration saved in mdh-cBPE-BERT-50m-vs30000/checkpoint-500/generation_config.json
Model weights saved in mdh-cBPE-BERT-50m-vs30000/checkpoint

In [None]:
# 108.9M params
# Only the vocabulary, position and special-token fields are overridden; all
# other architecture fields keep the BertConfig defaults.
config = BertConfig(
    # len(tokenizer) also counts tokens registered through add_special_tokens,
    # whereas tokenizer.vocab_size covers the base vocabulary only. Sizing the
    # embedding table from the full length keeps every token id in range.
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    # Must match the max_length used when the dataset was tokenized.
    max_position_embeddings=256,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# NOTE(review): the output saved under this cell reports a per-device batch
# size of 28, which does not match the value 8 configured here. The output
# looks stale; re-run the cell to confirm.
args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-125m-vs30000",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    # 8 examples x 2 accumulation steps = 16 examples per optimizer step per device.
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    fp16=True,
    push_to_hub=False,
)

trainer = train_model(config, args, tokenizer, dataset, data_collator)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Generate config GenerationConfig {
  "bos_token_id": 5,
  "eos_token_id": 6,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 34799
  Num Epochs = 10
  Instantaneous batch size per device = 28
  Total train batch size (w. parallel, distributed & accumulation) = 56
  Gradient Accumulation steps = 2
  Total optimization steps = 6210
  Number of trainable parameters = 108916272


BERT size: 108.9M parameters


Step,Training Loss,Validation Loss
50,10.1941,10.043712
100,9.54,9.351885
150,9.1074,9.03263
200,8.9919,8.934639
250,8.8432,8.77887
300,8.7312,8.63648
350,8.6182,8.569446
400,8.5294,8.447598
450,8.4272,8.35921
500,8.3553,8.28828


***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
***** Running Evaluation *****
  Num examples = 1832
  Batch size = 28
Saving model checkpoint to mdh-cBPE-BERT-125m-vs30000/checkpoint-500
Configuration saved in mdh-cBPE-BERT-125m-vs30000/checkpoint-500/config.json
Configuration saved in mdh-cBPE-BERT-125m-vs30000/checkpoint-500/generation_config.json
Model weights saved in mdh-cBPE-BERT-125m-vs30000/checkp