In [None]:
import os
import pandas as pd
import numpy as np
import random
import gc


from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)
from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)
from transformers import PreTrainedModel, RobertaTokenizerFast, RobertaConfig, RobertaModel, AdamW

# Gradient scaler for mixed-precision training: training() uses it to scale the
# loss before backward() and to drive the optimizer step.
scaler = torch.cuda.amp.GradScaler()

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Set Configs/Constants

class config:
    """Notebook-wide constants, read everywhere as plain class attributes."""

    # Reproducibility
    SEED = 42

    # Input text and tokenisation
    TEXT_COLUMN = 'excerpt'
    MAX_LEN = 256
    ROBERTA_MODEL_PATH = '../input/roberta-base'

    # Optimisation
    LR = 1e-5
    EPOCHS = 3
    TRAIN_BATCH_SIZE = 16
    VAL_BATCH_SIZE = 64

In [None]:
def set_seed(seed = 0):
    """Seed every random number generator the notebook uses.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    CUDA), switches cuDNN to deterministic mode with autotuning disabled, and
    writes the seed to the PYTHONHASHSEED environment variable.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        A fresh `np.random.RandomState` initialised with the same seed.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Global generators
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN: fixed algorithms, no autotuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    return np.random.RandomState(seed)

# Seed all generators once, up front, and keep the returned NumPy RandomState
# in `random_state`.
random_state = set_seed(config.SEED)

In [None]:
# Choose the compute device once; the training and inference cells read `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
def create_kfolds(df, target_col, seed, n_splits=5):
    """Shuffle `df` and assign every row to one of `n_splits` folds.

    Args:
        df: Input DataFrame. It is not modified; a shuffled copy is returned.
        target_col: Unused here; accepted so the signature matches
            `create_Stratkfolds`.
        seed: Seed used for both the row shuffle and the fold split, so the
            result does not depend on the state of the global NumPy generator.
        n_splits: Number of folds (default 5).

    Returns:
        A shuffled copy of `df` with a fresh 0..n-1 index and an integer
        `kfold` column holding the validation fold of each row.
    """
    # `sample` returns a new frame, so the caller's DataFrame is left untouched.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df["kfold"] = -1

    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df)):
        print(len(train_idx), len(val_idx))
        # Index was reset above, so positional indices are valid labels for .loc.
        df.loc[val_idx, 'kfold'] = fold

    return df

def create_Stratkfolds(df, target_col, seed, n_splits=5):
    """Shuffle `df` and assign folds stratified on a binned continuous target.

    The target is cut into equal-width bins (count chosen by Sturges' rule) and
    the bins are used as the stratification labels.

    Args:
        df: Input DataFrame. It is not modified; a shuffled copy is returned.
        target_col: Name of the numeric column to stratify on.
        seed: Seed used for both the row shuffle and the fold split, so the
            result does not depend on the state of the global NumPy generator.
        n_splits: Number of folds (default 5).

    Returns:
        A shuffled copy of `df` with a fresh 0..n-1 index, an integer `bins`
        column (the stratification label) and an integer `kfold` column
        holding the validation fold of each row.
    """
    # `sample` returns a new frame, so the caller's DataFrame is left untouched.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df["kfold"] = -1

    ### This was taken from https://www.kaggle.com/abhishek/step-1-create-folds
    # Number of bins by Sturges' rule; the value is floored (rounding would
    # also be reasonable).
    num_bins = int(np.floor(1 + np.log2(len(df))))

    # Bin the targets; labels=False yields integer bin ids.
    df["bins"] = pd.cut(df[target_col], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df["bins"].to_numpy())):
        print(len(train_idx), len(val_idx))
        # Index was reset above, so positional indices are valid labels for .loc.
        df.loc[val_idx, 'kfold'] = fold

    return df

In [None]:
# Load the competition CSVs: training set, test set and the sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

# Regression targets of the training set as a NumPy array.
target = train['target'].to_numpy()

In [None]:
# Model with classifier layers on top of RoBERTa
class ReadabilityModel(torch.nn.Module):
    """RoBERTa encoder followed by dropout, mean pooling and a linear head.

    Args:
        conf: Encoder configuration passed to `RobertaModel.from_pretrained`.
            It must have `output_hidden_states` enabled because `forward`
            reads `hidden_states`, and it supplies `hidden_size` for the head.
        dropout_rate: Dropout probability applied to the token representations.
    """

    def __init__(self, conf, dropout_rate=0.3):
        super(ReadabilityModel, self).__init__()

        self.roberta = RobertaModel.from_pretrained(config.ROBERTA_MODEL_PATH, config = conf)
        self.dropout = nn.Dropout(dropout_rate)
        # Size the head from the encoder config instead of assuming 768.
        self.linear = nn.Linear(conf.hidden_size, 1)

    def forward(self, ids, mask):
        """Return one scalar prediction per input sequence.

        Args:
            ids: Token id tensor, passed to the encoder as `input_ids`.
            mask: Attention mask tensor, passed to the encoder as `attention_mask`.

        Returns:
            Tensor with the trailing size-1 dimension of the head removed,
            i.e. one value per row of `ids`, including when the batch holds a
            single row.
        """
        encoded = self.roberta(input_ids=ids, attention_mask=mask)
        last_hidden = encoded.hidden_states[-1]
        dropped = self.dropout(last_hidden)

        # NOTE(review): the mean runs over every position, padded ones
        # included; `mask` is only used inside the encoder.
        pooled = torch.mean(dropped, 1, False)
        prediction = self.linear(pooled)

        # Squeeze only the head dimension. A second squeeze would also remove
        # the batch dimension when the batch has one row, yielding a 0-d tensor
        # that np.concatenate cannot handle downstream.
        return prediction.squeeze(-1)

In [None]:
# Release unused cached GPU memory before the model is built in the next cell.

# gc.collect()
torch.cuda.empty_cache()
# torch.cuda.clear_memory_allocated()  # entirely clear all allocated memory

In [None]:
# Tokenizer, encoder config, model, optimizer and LR schedule used by the cells below.
model_name = config.ROBERTA_MODEL_PATH
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

# ReadabilityModel.forward reads `hidden_states`, so the encoder must return them.
model_config = RobertaConfig.from_pretrained(model_name)
model_config.output_hidden_states = True

model = ReadabilityModel(conf = model_config, dropout_rate=0.4).to(device)

optimizer = AdamW(
    model.parameters(),
    lr=config.LR,
    weight_decay=0.01,
)
# Constant learning rate after 100 warm-up steps.
scheduler = get_constant_schedule_with_warmup(optimizer, 100)

def loss_fn(output, target, eps=1e-8):
    """Root-mean-squared error between `output` and `target`.

    Args:
        output: Predicted values.
        target: Ground-truth values, broadcastable against `output`.
        eps: Small constant added under the square root. The derivative of
            sqrt at exactly zero is infinite, so without it a perfect batch
            produces NaN gradients.

    Returns:
        Scalar tensor `sqrt(mean((output - target) ** 2) + eps)`.
    """
    return torch.sqrt(F.mse_loss(output, target) + eps)

# Number of passes over each fold's training split; read by the fold loop.
epochs = config.EPOCHS

# Maps each validation fold to the list of folds that form its training split.
FOLD_MAPPPING = {
    held_out: [fold for fold in range(5) if fold != held_out]
    for held_out in range(5)
}

In [None]:
# Create FOLDS: rebinds `train` to a shuffled frame carrying a stratified
# `kfold` column (five folds, stratified on the binned 'target').
train = create_Stratkfolds(train,'target', config.SEED)

In [None]:
class ReadabiltyDataset(Dataset):
    """Training/validation dataset yielding tokenised text and its target.

    Args:
        data: DataFrame with the text column and a 'target' column.
        tokenizer: Callable tokenizer returning a mapping with "input_ids" and
            "attention_mask" for a single string.
        text_column: Name of the text column; defaults to `config.TEXT_COLUMN`.
        max_len: Sequence length to pad/truncate to; defaults to `config.MAX_LEN`.
    """

    def __init__(self, data, tokenizer, text_column=None, max_len=None):
        text_column = config.TEXT_COLUMN if text_column is None else text_column
        self.sentences = data[text_column].to_numpy()
        # Keep targets as a positional array: indexing a Series with an int is
        # label-based and breaks when the frame's index is not 0..n-1.
        self.target = data['target'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        """Return the `item`-th sample as a dict of "ids", "mask" and "targets" tensors."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.sentences[item]).split())

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

In [None]:
# Taken from https://www.kaggle.com/chumajin/pytorch-bert-beginner-s-room
def training(train_dataloader,model,optimizer,scheduler):
    """Run one mixed-precision training epoch.

    Relies on the module-level `device`, `scaler` (GradScaler) and `loss_fn`.

    Args:
        train_dataloader: Iterable of batches with "ids", "mask" and "targets".
        model: Model called as `model(ids, mask)`.
        optimizer: Optimizer stepped once per batch through the grad scaler.
        scheduler: LR scheduler stepped once per batch.

    Returns:
        Tuple `(mean_batch_loss, rmse)`: the mean of the per-batch losses over
        the whole epoch and the RMSE over all samples seen in the epoch.
    """
    model.train()
    # cuDNN benchmark mode is deliberately left alone here: set_seed() turns it
    # off for reproducibility and flipping it back on would silently undo that.

    # Accumulated across the whole epoch (the loss list must not be reset per batch).
    batch_losses = []
    allpreds = []
    alltargets = []

    for a in tqdm(train_dataloader):

        optimizer.zero_grad()

        ids = a["ids"].to(device,non_blocking=True)
        mask = a["mask"].to(device,non_blocking=True)
        target = a["targets"].to(device,non_blocking=True)

        with torch.cuda.amp.autocast():
            output = model(ids,mask)
            loss = loss_fn(output,target)

        scaler.scale(loss).backward() # backwards of loss
        scaler.step(optimizer) # Update optimizer
        scaler.update() # scaler update

        scheduler.step() # Update learning rate schedule

        # For scoring. reshape(-1) keeps a 1-d array even for a batch of one,
        # which np.concatenate below requires.
        batch_losses.append(loss.item())
        allpreds.append(output.detach().cpu().numpy().reshape(-1))
        alltargets.append(target.detach().cpu().numpy().reshape(-1))

    # Combine the per-batch arrays
    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)

    losses = np.mean(batch_losses)

    # Score with rmse
    train_rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))

    return losses,train_rme_loss

In [None]:
def validating(valid_dataloader, model):
    """Evaluate `model` on a validation loader without gradient tracking.

    Relies on the module-level `device` and `loss_fn`.

    Args:
        valid_dataloader: Iterable of batches with "ids", "mask" and "targets".
        model: Model called as `model(ids, mask)`.

    Returns:
        Tuple `(allpreds, mean_batch_loss, rmse)`: the concatenated
        predictions, the mean of the per-batch losses over the whole loader
        and the RMSE over all samples.
    """
    model.eval()

    # Accumulated across the whole loader (the loss list must not be reset per batch).
    batch_losses = []
    allpreds = []
    alltargets = []

    with torch.no_grad():
        for a in valid_dataloader:

            ids = a["ids"].to(device,non_blocking=True)
            mask = a["mask"].to(device,non_blocking=True)
            target = a["targets"].to(device,non_blocking=True)

            output = model(ids,mask)
            loss = loss_fn(output,target)

            # For scoring. reshape(-1) keeps a 1-d array even for a batch of
            # one, which np.concatenate below requires.
            batch_losses.append(loss.item())
            allpreds.append(output.detach().cpu().numpy().reshape(-1))
            alltargets.append(target.detach().cpu().numpy().reshape(-1))

    # Combine the per-batch arrays
    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)

    losses = np.mean(batch_losses)

    # Score with rmse
    valid_rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))

    return allpreds,losses,valid_rme_loss

In [None]:
# Cross-validation driver: trains one model per fold and keeps, for each fold,
# the checkpoint with the lowest validation RMSE.
for FOLD in FOLD_MAPPPING.keys():

    print(" Fold Number : {0}".format(str(FOLD)))

    train_df = train[train.kfold.isin(FOLD_MAPPPING[FOLD])].reset_index(drop=True)
    valid_df = train[train.kfold == FOLD].reset_index(drop=True)

    train_data = ReadabiltyDataset(data = train_df, tokenizer = tokenizer)
    train_loader = DataLoader(dataset = train_data, shuffle=True, batch_size = config.TRAIN_BATCH_SIZE, num_workers=4,pin_memory=True)

    val_data = ReadabiltyDataset(data = valid_df, tokenizer = tokenizer)
    val_loader = DataLoader(dataset = val_data, shuffle=False, batch_size = config.VAL_BATCH_SIZE, num_workers=4,pin_memory=True)

    # Start every fold from the pretrained weights with a fresh optimizer and
    # schedule. Reusing one model across folds would let it train on rows that
    # a later fold uses for validation, inflating the validation scores.
    # The previous fold's objects are dropped first so two models never sit on
    # the device at once.
    model = optimizer = scheduler = None
    gc.collect()
    torch.cuda.empty_cache()

    model = ReadabilityModel(conf = model_config, dropout_rate=0.4).to(device)
    optimizer = AdamW(model.parameters(), lr=config.LR, weight_decay=0.01)
    scheduler = get_constant_schedule_with_warmup(optimizer, 100)

    trainlosses = []
    vallosses = []
    bestscore = None

    trainscores = []
    validscores = []

    for epoch in range(epochs):

        print("---------------" + str(epoch) + "start-------------")

        trainloss,trainscore = training(train_loader,model,optimizer,scheduler)

        trainlosses.append(trainloss)
        trainscores.append(trainscore)

        print("trainscore is " + str(trainscore))

        preds,validloss,valscore=validating(val_loader,model)

        vallosses.append(validloss)
        validscores.append(valscore)

        print("valscore is " + str(valscore))

        # Save on the first epoch and whenever the validation RMSE improves.
        if bestscore is None or bestscore > valscore:

            print("Save first model" if bestscore is None else "found better point")

            bestscore = valscore

            state = {
                'state_dict': model.state_dict(),
                'optimizer_dict': optimizer.state_dict(),
                "bestscore": bestscore
            }

            torch.save(state, "roberta_model_fold{0}.pth".format(FOLD))

In [None]:
# Drop the last fold's datasets and loaders before inference. The names match
# the ones bound by the fold loop; pop() with a default keeps this cell safe to
# re-run (a plain `del` raises NameError once the names are gone).
for _name in ("train_data", "val_data", "train_loader", "val_loader"):
    globals().pop(_name, None)
_ = gc.collect()

In [None]:
class ReadabiltyInfDataset(Dataset):
    """Inference dataset yielding tokenised text only (no targets).

    Args:
        data: DataFrame with the text column.
        tokenizer: Callable tokenizer returning a mapping with "input_ids" and
            "attention_mask" for a single string.
        text_column: Name of the text column; defaults to `config.TEXT_COLUMN`.
        max_len: Sequence length to pad/truncate to; defaults to `config.MAX_LEN`.
    """

    def __init__(self, data, tokenizer, text_column=None, max_len=None):
        text_column = config.TEXT_COLUMN if text_column is None else text_column
        self.sentences = data[text_column].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        """Return the `item`-th sample as a dict of "ids" and "mask" tensors."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.sentences[item]).split())

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long)
        }

In [None]:
# Wrap the test frame for inference (text only, no targets).
test_dataset = ReadabiltyInfDataset(data=test, tokenizer=tokenizer)

In [None]:
# shuffle=False keeps predictions in the same row order as `test`.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=config.VAL_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Checkpoints found in the working directory. Match on the file extension (a
# substring test would also pick up names such as "x.pth.bak") and sort so the
# ensemble order is the same on every run.
pthes = sorted(os.path.join("./", s) for s in os.listdir("./") if s.endswith(".pth"))
# pthes = ['./model0.pth']
pthes

In [None]:
# Load every checkpoint onto the CPU. Each file also carries the optimizer
# state, so restoring them all to the device they were saved from can exhaust
# GPU memory, and fails outright on a CPU-only host. load_state_dict() copies
# the weights to the model's device later.
states = [torch.load(s, map_location="cpu") for s in pthes]

In [None]:
def predicting(test_dataloader, model, states):
    """Predict the test set once per checkpoint.

    Relies on the module-level `device`.

    Args:
        test_dataloader: Iterable of batches with "ids" and "mask".
        model: Model instance the checkpoint weights are loaded into.
        states: Iterable of checkpoint dicts, each with a "state_dict" entry.

    Returns:
        List with one 1-d NumPy array of predictions per checkpoint, in the
        order of `states`.
    """
    allpreds = []

    for state in states:
        model.load_state_dict(state["state_dict"])
        model.to(device)
        model.eval()

        preds = []

        with torch.no_grad():
            for a in tqdm(test_dataloader):

                ids = a["ids"].to(device)
                mask = a["mask"].to(device)

                output = model(ids,mask)

                # reshape(-1) keeps a 1-d array even for a batch of one, which
                # np.concatenate below requires.
                preds.append(output.cpu().numpy().reshape(-1))

        allpreds.append(np.concatenate(preds))

    return allpreds

In [None]:
# Run every loaded checkpoint over the test set: one prediction array per checkpoint.
allpreds = predicting(test_dataloader,model,states)

In [None]:
# One row per test sample, one column per checkpoint.
findf = pd.DataFrame(allpreds).T

In [None]:
# Average the per-checkpoint predictions row-wise to get the ensemble prediction.
# NOTE(review): this rebinds `findf` from a DataFrame to a Series, so re-running
# this cell on its own would call .mean(axis=1) on a Series.
findf = findf.mean(axis=1)

In [None]:
# Pair each test id with its averaged prediction.
submission_df = pd.DataFrame(
    {
        'id': test['id'],
        'target': findf,
    }
)
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv', index = False)