# Requirements

In [None]:
from pathlib import Path
kaggle = Path("/content/kaggle.json")

!sudo apt-get install -y git-lfs
!pip install wandb
!pip install kaggle
!pip install sentencepiece
!pip install transformers
!git config --global user.email "simonmeoni@aol.com"
!git config --global user.name "Simon Meoni"

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c commonlitreadabilityprize
!unzip train.csv.zip

In [None]:
import pandas as pd
import os
import torch
import gc
import math
import numpy as np
from torch import nn
import torch.nn.functional as F
import concurrent.futures
from transformers import (
    AutoTokenizer,
    AlbertModel,
    AutoModel,
    AdamW,
    AutoModelForMaskedLM,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    AutoModelForSeq2SeqLM,
)
import transformers
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader, Subset, random_split
import glob
import random
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR

# Hyperparams

In [None]:
# Checkpoint identifier handed to `from_pretrained` for both the masked-LM model
# and its tokenizer.
model_name: str = "roberta-large"

# Dataset

In [None]:
# Build the pretraining corpus from the excerpts of both the train and test splits.
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")
data = pd.concat([train_data, test_data])

# text.txt is consumed one line per example further down, so line breaks inside
# an excerpt are flattened to spaces; otherwise a single excerpt would be split
# into several shorter training lines.
excerpts = data.excerpt.str.replace(r"[\r\n]+", " ", regex=True)
text = "\n".join(excerpts.tolist())

# Explicit encoding so the file content does not depend on the runtime's locale.
with open("text.txt", "w", encoding="utf-8") as f:
    f.write(text)

# Get Model

In [None]:
# Load the tokenizer and the masked-LM head for the checkpoint selected by
# `model_name` (candidates: roberta-base, roberta-large).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Train

In [None]:
# One example per line of text.txt, truncated to 256 tokens.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="text.txt",  # mention train text file here
    block_size=256,
)

# NOTE: evaluation currently reads the same file as training, so eval_loss is
# not a held-out metric. Point file_path at a separate validation file to get one.
valid_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="text.txt",  # mention valid text file here
    block_size=256,
)

# Randomly masks 15% of the tokens in each batch for the masked-LM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./clrp_pt",  # select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=150,
    # load_best_model_at_end needs a checkpoint at every evaluation step:
    # save_steps must be a multiple of eval_steps (the default of 500 is not).
    save_steps=150,
    # Cap the checkpoints kept on disk; a non-positive limit would keep all of them.
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Upload Model

In [None]:
!rm /content/clrp_pt
trainer.train()