# In this Notebook we use DeepSpeed to pretrain RoBERTa

## What is DeepSpeed?
### DeepSpeed is a deep learning optimization library. It's used for efficient, effective and easy distributed training.

#### The main reason to use this library in this competition is the memory efficiency that it provides.
#### Even with a single GPU we'll be able to train bigger and better SOTA models
#### Use larger models and increased batch sizes without Out Of Memory errors

In [None]:
import pandas as pd
import torch
import gc
import os

import warnings
# Install a global 'ignore' filter so Python warnings are not shown in the notebook output.
warnings.filterwarnings('ignore')

In [None]:
!pip install git+https://github.com/huggingface/transformers/

In [None]:
from transformers import (AutoModel,AutoModelForMaskedLM,AutoModelForSequenceClassification, AutoTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling,Trainer, TrainingArguments)

In [None]:
!pip install deepspeed

In [None]:
import deepspeed

In [None]:
# Build the masked-language-model pretraining corpus from the excerpts of
# both the train and the test CSV files.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# The corpus is later read with LineByLineTextDataset, which makes one
# example per line. Excerpts are therefore separated by a newline; joining
# them with '.' would fuse the last line of one excerpt with the first line
# of the next and insert a stray period.
text = '\n'.join(train_data.excerpt.tolist() + test_data.excerpt.tolist())

# Write explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('excerpt.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = 'roberta-large'

# Load the tokenizer and the masked-LM model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Store the tokenizer files in the directory the trained model is saved to later.
tokenizer.save_pretrained('./clrp_roberta_large_ds')

In [None]:
# Environment variables describing a one-process setup (rank 0 of world size 1)
# on localhost. Presumably read by torch.distributed / DeepSpeed during
# initialisation -- confirm against the launcher in use.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9998',
    'RANK': "0",
    'LOCAL_RANK': "0",
    'WORLD_SIZE': "1",
})
# os.environ['MAX_JOBS'] = "4"

In [None]:
# DeepSpeed configuration, kept as a plain dict mirroring DeepSpeed's JSON
# config layout. Values given as "auto" are left unresolved here.
# NOTE(review): nothing in this notebook currently passes this dict to the
# Trainer (the `deepspeed=` argument of TrainingArguments is commented out).
deepspeed_config = {
    # Mixed precision is switched off; the loss-scaling values are still listed.
    "fp16": {
        "enabled": False,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },

    # NOTE(review): "stage" is 0 while optimizer/parameter offload sections are
    # present, and the older "cpu_offload" flag is set alongside
    # "offload_optimizer" -- check the DeepSpeed ZeRO documentation to confirm
    # which of these settings actually take effect at this stage.
    "zero_optimization": {
        "stage": 0,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True,
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": True,
        },
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": True,
        "cpu_offload": True,
    },

    "zero_allow_untested_optimizer": True,

    # Optimizer: AdamW with every hyper-parameter left as "auto".
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto",
        },
    },

    # Learning-rate schedule: WarmupLR with "auto" bounds.
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
        },
    },

    # Sparse-attention settings ("fixed" mode, block size 16, bidirectional).
    "sparse_attention": {
        "mode": "fixed",
        "block": 16,
        "different_layout_per_head": True,
        "num_local_blocks": 4,
        "num_global_blocks": 1,
        "attention": "bidirectional",
        "horizontal_global_attention": False,
        "num_different_global_patterns": 4,
        "num_random_blocks": 0,
        "local_window_blocks": [4],
        "global_block_indices": [0],
        "global_block_end_indices": None,
        "num_sliding_window_blocks": 3,
    },

    # Logging and miscellaneous settings.
    "steps_per_print": 2000,
    "wall_clock_breakdown": False,
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_clipping": "auto",
    "prescale_gradients": False,
}


In [None]:
# Training examples read from the excerpt corpus with block_size 256.
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="excerpt.txt", block_size=256)

# NOTE(review): the validation set is built from the very same file as the
# training set, so the evaluation loss is computed on training text.
valid_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="excerpt.txt", block_size=256)

In [None]:
# Batch collator for language modelling with MLM enabled and an
# mlm_probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    # --- output / logging ---
    output_dir="./clrp_roberta_base_trainer",
    overwrite_output_dir=True,
    report_to="none",

    # --- schedule and batch sizes ---
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    fp16=False,

    # --- evaluation, checkpointing and best-model selection ---
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; otherwise transformers rejects the arguments (or,
    # on older releases, never records which checkpoint was best). Both are
    # therefore set to 'epoch', which replaces the former step-based saving.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,  # a lower eval_loss is better
    prediction_loss_only=True,

    # --- distributed / gradient settings ---
    max_grad_norm=1.0,
    local_rank=0,

    # --- optimizer ---
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    weight_decay=1e-7,
)
# DeepSpeed is not enabled here: to use the config dict defined in this
# notebook, pass `deepspeed = deepspeed_config` to TrainingArguments above.

# Bare attribute access kept from the original cell; presumably forces device
# setup now and shows the result as the notebook cell output -- confirm.
training_args._setup_devices

In [None]:
# Wire the model, training arguments, MLM collator and both datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

In [None]:
# Start training with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Save the trained model into the same directory the tokenizer was saved to earlier.
trainer.save_model('./clrp_roberta_large_ds')

## Finetuning : In Progress
## Inference : In Progress