This notebook runs inference for the model produced by my DeBERTa training notebook. 
[You can run the training notebook yourself and then attach your own checkpoints here instead of mine](https://www.kaggle.com/crained/deberta-pytorch-commonlit-readability-train). 

# Imports

In [None]:
import torch 
from torch import nn 
import torch.nn.functional as F
import numpy as np 
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl 
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import transformers
from transformers import get_linear_schedule_with_warmup, AdamW

In [None]:
!pip3 install deberta
from DeBERTa import deberta

# Look up the vocabulary for the 'base' pretrained id and build the tokenizer
# class registered for that vocabulary type.
# NOTE(review): `tokenizer` is not referenced again in the notebook as shown --
# deBertaDataset builds its own tokenizer through transformers.AutoTokenizer --
# so confirm whether this cell (and the pip install above) is still needed.
vocab_path, vocab_type = deberta.load_vocab(pretrained_id='base')
tokenizer = deberta.tokenizers[vocab_type](vocab_path)

# Data and Preprocess

In [None]:
# Load the competition files, keeping only the columns used below:
# id / excerpt / target for train, id / excerpt for test.
df = pd.read_csv(
    "../input/commonlitreadabilityprize/train.csv",
    usecols=["id", "excerpt", "target"],
)
test_df = pd.read_csv(
    "../input/commonlitreadabilityprize/test.csv",
    usecols=["id", "excerpt"],
)
print("train shape", df.shape)
df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
def prep_text(text_df):
    """Clean a string Series and return the result as a NumPy array.

    Two substitutions are applied in order:
      1. every newline character is deleted (no space is inserted in its place);
      2. every occurrence of "'s" is replaced with "s".

    Args:
        text_df: pandas Series of strings (accessed through its ``.str`` API).

    Returns:
        The cleaned values as a NumPy array (``Series.values``).
    """
    without_newlines = text_df.str.replace("\n", "", regex=False)
    without_possessive = without_newlines.str.replace("\'s", r"s", regex=True)
    return without_possessive.values
# Run the same cleaning over the excerpt column of both the train and test frames.
for frame in (df, test_df):
    frame["excerpt"] = prep_text(frame["excerpt"])

In [None]:
# Longest training excerpt, measured in whitespace-separated words.
# NOTE(review): this is a word count, not a tokenizer-token count, yet it is
# later passed as max_len to the dataset -- confirm that is what training used.
words_per_excerpt = df["excerpt"].map(lambda text: len(text.split()))
max_words = words_per_excerpt.max()
print("maximum words in instance:", max_words)

# Create Folds 

Copied directly from [Abhishek's notebook](https://www.kaggle.com/abhishek/step-1-create-folds)

In [None]:
def create_folds(data, num_splits, random_state=None):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule) and those bins are used as the stratification labels,
    so each fold sees a similar target distribution.

    The input frame is left untouched: all work happens on the shuffled copy.

    Args:
        data: DataFrame containing a numeric ``target`` column.
        num_splits: number of folds to create.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the fold assignment) can be reproduced.
            ``None`` keeps the original unseeded behaviour.

    Returns:
        A new DataFrame with the rows of ``data`` in shuffled order, a fresh
        0..n-1 index, and an integer ``kfold`` column holding each row's
        validation-fold id.
    """
    # Shuffle first: sample() returns a new frame, so every write below lands on
    # the copy and never on the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # Number of bins by Sturges' rule (floored).
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin ids are kept in a local Series instead of a temporary column, so an
    # existing "bins" column in the input is not overwritten and dropped.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    # Stratify on the bins rather than on the raw continuous target.
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        # The index was reset above, so positional indices are valid labels.
        data.loc[valid_idx, "kfold"] = fold_id

    return data

# Assign folds on the frame that was already loaded and cleaned above.
# Re-reading train.csv at this point would throw away the cleaned "excerpt"
# column and repeat the file read for no benefit.
df = create_folds(df, num_splits=5)

# Bert Model and Training Module

Constants

In [None]:
# Hyper-parameters.
BATCH_SIZE = 16
EPOCHS = 15
LEARNING_RATE = 2e-5
NUM_WARMUP_STEPS = 0

# Scheduler length: (rows in df / batch size) * epochs, truncated to an int.
NUM_TRAIN_STEPS = int(len(df) / BATCH_SIZE * EPOCHS)

# Fold ids present in df, and how many distinct ones there are.
FOLDS = df["kfold"].unique()
NUM_FOLDS = df["kfold"].nunique()

RMSE as Criterion

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss.

    Computes ``sqrt(mse(output, target) + eps)``; the small ``eps`` keeps the
    value under the square root strictly positive.
    """

    def __init__(self):
        super().__init__()
        self.eps = 1e-8

    def forward(self, output, target):
        """Return the RMSE between ``output`` and ``target`` as a scalar tensor."""
        mean_squared = F.mse_loss(output, target)
        return (mean_squared + self.eps).sqrt()

In [None]:
class deBertaModel(pl.LightningModule):
    """DeBERTa encoder followed by a dropout + linear regression head.

    The hidden state at sequence position 0 of the encoder's first output is
    passed through dropout and a single linear unit, giving one score per
    input sequence. Training and validation both use RMSE as the loss.
    """

    def __init__(self):
        super().__init__()
        self.model = transformers.AutoModel.from_pretrained("../input/deberta/base")
        self.drop = nn.Dropout(0.5)
        # The head consumes the encoder's hidden vector, so 768 must equal the
        # encoder's hidden size.
        self.fc = nn.Linear(768, 1)

    def forward(self, inputs):
        """Run the encoder and regression head.

        Args:
            inputs: mapping of keyword arguments forwarded to the encoder
                (the dataset in this notebook supplies input_ids,
                token_type_ids and attention_mask).

        Returns:
            Tensor of predictions with a trailing dimension of size 1.
        """
        last_hiddens = self.model(**inputs)[0]
        # Indexing position 0 already removes the sequence axis, so no
        # additional squeeze is required before the head.
        first_token = last_hiddens[:, 0, :]
        return self.fc(self.drop(first_token))

    def configure_optimizers(self):
        """Build AdamW (no weight decay on biases / LayerNorm weights) and a
        linear decay schedule that is stepped after every optimizer step."""
        no_decay = ('bias', 'LayerNorm.weight')
        decay_params, no_decay_params = [], []
        for name, param in self.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': 0.01},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        # NOTE(review): the module-level LEARNING_RATE constant (2e-5) is not
        # used here; confirm which of the two values is the intended one.
        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=NUM_WARMUP_STEPS,
            num_training_steps=NUM_TRAIN_STEPS,
        )
        # NUM_TRAIN_STEPS counts optimizer steps (batches * epochs), so the
        # schedule has to advance once per step. Lightning's default interval
        # is "epoch", which would leave the learning rate almost undecayed.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    def loss_fn(self, output, target):
        """RMSE between flattened predictions and flattened targets."""
        return RMSELoss()(output.view(-1), target.view(-1))

    def _shared_step(self, batch):
        """Forward one batch and return its loss (used by train and val)."""
        output = self(batch["inputs"])
        return self.loss_fn(output, batch["label"])

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        return self._shared_step(batch)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and log it as ``val_loss``."""
        loss = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)


# Tokenize Dataset and Dataloader

In [None]:
class deBertaDataset(Dataset):
    """Dataset that tokenizes one excerpt per item for the DeBERTa model.

    Args:
        texts: indexable collection of strings.
        labels: array-like exposing ``.shape`` (its first dimension is the
            dataset length) and integer indexing.
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, max_len):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("../input/deberta/base")

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return ``{"inputs": {...}, "label": tensor}`` for item ``idx``."""
        # Collapse every run of whitespace into a single space.
        normalized = " ".join(self.texts[idx].split())
        target = self.labels[idx]
        encoded = self.tokenizer(
            normalized,
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # taking [0] yields one row per field.
        features = {
            field: encoded[field][0]
            for field in ("input_ids", "token_type_ids", "attention_mask")
        }
        return {
            "inputs": features,
            "label": torch.tensor(target, dtype=torch.float),
        }
    


# Trainer

# Load Weights and Inference 

To run inference with your own checkpoints, change the input path below from my "oopshiooops" dataset to yours. 

In [None]:
# Average the predictions of every fold checkpoint over the test set and write
# the result to submission.csv.
prediction = np.zeros(test_df.shape[0])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The test data is the same for every fold, so the dataset and loader are built
# once. deBertaDataset requires labels, so dummy ones are supplied.
test_dataset = deBertaDataset(test_df.excerpt.values, labels=np.ones(test_df.shape[0]), max_len=max_words)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

for fold in FOLDS:
    print("Fold:", fold)
    loaded_model = deBertaModel.load_from_checkpoint(f"../input/oopshiooops/checkpoint_{fold}fold.ckpt", map_location=device)
    loaded_model.to(device)
    loaded_model.eval()
    output = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch in test_dataloader:
            # Move every input tensor to the model's device. This works on
            # both GPU and CPU, so there is no CUDA-only assertion here.
            x = {key: value.to(device) for key, value in batch["inputs"].items()}
            out = loaded_model(x)
            output.extend(out.cpu().numpy())
    # shuffle=False keeps row order, so the flattened batches line up with test_df.
    prediction += np.hstack(output)

# Mean over folds.
test_df["target"] = prediction / NUM_FOLDS
sub = test_df.drop("excerpt", axis=1)
sub.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()