In [None]:
import copy
import gc
import os
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from loguru import logger
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from torch.cuda import amp
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel, BertConfig
from wordcloud import WordCloud, STOPWORDS
# Download the NLTK stop-word corpus and keep the English list for text cleaning.
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Read the competition's train and test CSV files into DataFrames.
trtdata = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/train.csv')
tstdata = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/test.csv')

In [None]:
# Show the first four training rows.
trtdata.iloc[:4]

In [None]:
# (number of rows, number of columns) of the training frame.
(len(trtdata.index), len(trtdata.columns))

In [None]:
# Percentage of missing values in each training column.
trtdata.isna().sum().mul(100).div(len(trtdata))

In [None]:
# Show the first seven rows of the test data.
tstdata.iloc[:7]

# EDA of the text data

In [None]:
import missingno as msno
%matplotlib inline
msno.matrix(trtdata,figsize=(10,5),fontsize=12,sort="ascending",color=(0.50, 0.50, 0.50))

In [None]:
# Distribution of the `target` column: density histogram with a KDE overlay.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# `sns.distplot` is deprecated (since seaborn 0.11); `histplot` with kde=True and
# stat="density" is the documented replacement for a histogram + KDE curve.
sns.histplot(trtdata['target'], kde=True, stat="density", ax=ax, color=(0.20, 0.20, 0.20))
ax.set_title("Target Distribution", font="Serif", size="18", color=(0.70, 0.70, 0.50))
plt.show()

In [None]:
# Distribution of the `standard_error` column: density histogram with a KDE overlay.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# `sns.distplot` is deprecated (since seaborn 0.11); `histplot` with kde=True and
# stat="density" is the documented replacement for a histogram + KDE curve.
sns.histplot(trtdata['standard_error'], kde=True, stat="density", ax=ax, color=(0.10, 0.10, 0.20))
ax.set_title("Standard_error Distribution", font="Serif", size="18")
plt.show()

In [None]:
# Hex-bin joint plot of target against standard_error, with a figure-level title.
sns.jointplot(
    data=trtdata,
    x='target',
    y='standard_error',
    kind='hex',
    height=8,
    color=(0.30, 0.30, 0.30),
)
plt.suptitle("Target vs Standard error ", font="Serif", size="12")
# Leave head-room so the suptitle does not overlap the marginal plot.
plt.subplots_adjust(top=0.94)
plt.show()

# Cleaning and preprocessing of the text data

In [None]:
# Sample excerpt before cleaning: the lexicographically smallest string in the column.
print(trtdata['excerpt'].min())

In [None]:
# Matches @-mentions, URLs and any run of non-alphanumeric characters.
# Raw string so that `\S` is a regex class rather than an invalid string escape.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(x, stem=False, stop_word_list=None):
    """Lower-case and clean a text, drop stop words and optionally stem the rest.

    Args:
        x: Value to clean; it is converted with ``str(x)`` first.
        stem: When True, each remaining word is stemmed with NLTK's English
            Snowball stemmer.
        stop_word_list: Optional iterable of words to drop. Defaults to the
            notebook-level ``stop_words`` list.

    Returns:
        The remaining words joined by two spaces (the separator the notebook
        has always used); an empty string when nothing remains.
    """
    cleaned = re.sub(text_cleaning_re, '  ', str(x).lower()).strip()
    excluded = set(stop_words if stop_word_list is None else stop_word_list)
    # Split on whitespace: the regex above has already replaced every newline,
    # so splitting on '\n' would yield the whole text as a single "token".
    kept = [word for word in cleaned.split() if word not in excluded]
    if stem:
        # Created here because no `stemmer` object exists at notebook level.
        stemmer = SnowballStemmer('english')
        kept = [stemmer.stem(word) for word in kept]
    # Return only after every token has been examined.
    return '  '.join(kept)
# Clean every training excerpt in place using the default (no stemming) settings.
trtdata['excerpt'] = trtdata['excerpt'].apply(preprocess)

In [None]:
# Sample excerpt after cleaning: the lexicographically smallest string in the column.
print(trtdata['excerpt'].min())

In [None]:
from transformers import BertTokenizer

# Load the uncased BERT tokenizer and pick the first cleaned excerpt as a demo input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
test_s = trtdata.excerpt.iloc[0]
test_s

In [None]:
# Encode the demo excerpt and display the dictionary returned by the tokenizer.
result1 = tokenizer.encode_plus(text=test_s)
result1

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

In [None]:
def create_folds(data, num_splits, seed=42):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule) and the bins are used as the stratification labels.

    Args:
        data: DataFrame containing a ``target`` column. It is not modified.
        num_splits: Number of folds.
        seed: Random state for the shuffle, so fold assignment is reproducible.
            Pass ``None`` for a different shuffle on every call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an integer ``kfold``
        column holding each row's validation fold.
    """
    # Shuffle into a new frame first, so the caller's frame never gains a
    # placeholder `kfold` column as a side effect.
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    data["kfold"] = -1
    # Sturges' rule, taking the floor of the value.
    num_bins = int(np.floor(1 + np.log2(len(data))))
    # Kept as a local Series, so there is no temporary column to drop afterwards.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    # Stratify on the bins rather than the raw targets.
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, "kfold"] = fold
    return data
df = create_folds(trtdata, num_splits=5)
df.head()

In [None]:
class CONFIG:
    """Hyper-parameters and shared objects read by the other cells of this notebook."""

    # Random seed value (no visible cell reads it).
    seed = 42

    # Every excerpt is padded / truncated to this many tokens by BERTDataset.
    max_len = 205

    # DataLoader batch sizes.
    train_batch_size = 64
    valid_batch_size = 64

    # Optimisation settings.
    epochs = 10
    learning_rate = 1e-4
    # Number of batches whose gradients are accumulated before an optimizer step.
    n_accumulate = 1

    # NOTE(review): the fold-assignment cell calls create_folds with num_splits=5,
    # so this value is not what the visible split uses; confirm which is intended.
    folds = 10

    # Tokenizer shared by the datasets; a copy is written to ./tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tokenizer.save_pretrained('./tokenizer')

    # First CUDA device when available, CPU otherwise.
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes one excerpt per item and pairs it with its target.

    Args:
        train: Frame-like object with ``excerpt`` and ``target`` columns.
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: Length every encoded excerpt is padded / truncated to.
    """

    def __init__(self, train, tokenizer, max_len):
        self.text = train['excerpt'].values
        self.target = train['target'].values
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return a dict of tensors: ``ids``, ``mask``, ``token_type_ids``, ``target``."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
        )
        # Output key -> key in the tokenizer's result; all three become long tensors.
        key_pairs = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in key_pairs
        }
        sample['target'] = torch.tensor(self.target[index], dtype=torch.float)
        return sample

In [None]:
def criterion(outputs, targets):
    """Mean squared error between predictions and targets, compared element-wise.

    Both tensors are flattened to 1-D first. The model in this notebook ends in
    ``nn.Linear(768, 1)`` while the dataset yields scalar targets; without the
    flattening the two tensors would be broadcast against each other and the
    loss would average over every prediction/target pair, not matching ones.

    Args:
        outputs: Tensor of predictions.
        targets: Tensor of ground-truth values with the same number of elements.

    Returns:
        Scalar tensor holding the mean squared error.
    """
    return nn.MSELoss()(outputs.reshape(-1), targets.reshape(-1))

In [None]:
class BERTClass(nn.Module):
    """Pre-trained ``bert-base-uncased`` encoder with a dropout + linear regression head."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Maps a 768-wide vector to a single output value.
        self.fc = nn.Linear(768, 1)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and return the head's output.

        Args:
            ids: Token id tensor passed to BERT as its first argument.
            mask: Attention mask tensor.
            token_type_ids: Segment id tensor.

        Returns:
            Output of the linear head applied to the second element of the
            tuple BERT returns (the pooled output in the Hugging Face API).
        """
        # return_dict=False makes the encoder return a plain tuple; only its
        # second element is used.
        _sequence, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.fc(self.dropout(pooled))

# Build the model and move it to the configured device (`.to` returns the module itself).
model = BERTClass().to(CONFIG.device)

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` with mixed precision.

    Gradients are accumulated over ``CONFIG.n_accumulate`` batches before each
    optimizer step; the scheduler (if any) is stepped after every optimizer step.
    Uses the notebook-level ``criterion`` and ``CONFIG``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer updating the model's parameters.
        scheduler: Learning-rate scheduler, or None to keep the rate fixed.
        dataloader: Yields dicts with ``ids``, ``mask``, ``token_type_ids``
            and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean of the per-batch loss over the epoch
        (0.0 when the dataloader is empty).
    """
    model.train()
    scaler = amp.GradScaler()
    dataset_size = 0
    running_loss = 0.0
    # Defined before the loop so an empty dataloader cannot leave it unbound.
    epoch_loss = 0.0
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)
        batch_size = ids.size(0)

        with amp.autocast(enabled=True):
            outputs = model(ids, mask, token_type_ids)
            batch_loss = criterion(outputs, targets)
            # Only the value used for backprop is divided by the accumulation count.
            loss = batch_loss / CONFIG.n_accumulate

        scaler.scale(loss).backward()

        if (step + 1) % CONFIG.n_accumulate == 0:
            scaler.step(optimizer)
            scaler.update()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # Track the undivided loss so the reported value does not shrink when
        # CONFIG.n_accumulate is larger than 1.
        running_loss += (batch_loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    return epoch_loss

In [None]:
@torch.no_grad()
def valid_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Uses the notebook-level ``criterion`` for the loss.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Read only to show the current learning rate in the progress bar.
        scheduler: Unused; kept so the signature mirrors ``train_one_epoch``.
        dataloader: Yields dicts with ``ids``, ``mask``, ``token_type_ids``
            and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Tuple ``(epoch_loss, val_rmse)``: the sample-weighted mean loss and the
        root mean squared error over all predictions.
    """
    model.eval()
    dataset_size = 0
    running_loss = 0.0
    # Defined before the loop so it is always bound.
    epoch_loss = 0.0
    TARGETS = []
    PREDS = []
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)
        batch_size = ids.size(0)
        outputs = model(ids, mask, token_type_ids)
        loss = criterion(outputs, targets)
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size
        # Flatten so predictions and targets are collected as matching 1-D lists.
        PREDS.extend(outputs.reshape(-1).cpu().detach().numpy().tolist())
        TARGETS.extend(targets.reshape(-1).cpu().detach().numpy().tolist())
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn releases.
    val_rmse = float(np.sqrt(mean_squared_error(TARGETS, PREDS)))
    gc.collect()
    return epoch_loss, val_rmse

In [None]:
# Training driver: alternates training and validation epochs and keeps the best weights.
@logger.catch
def run(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and restore the weights with the best validation RMSE.

    Reads the notebook-level ``train_loader`` and ``valid_loader``. Each time
    the validation RMSE is at least as good as the best so far, the weights are
    written to ``Loss<rmse>_epoch<n>.bin`` in the working directory.

    NOTE(review): with loguru's default ``catch`` settings an exception raised
    here is logged and None is returned, so a caller that unpacks the result
    would then fail; confirm this is the desired behaviour.

    Args:
        model: Model to train; modified in place.
        optimizer: Optimizer passed to the epoch functions.
        scheduler: Learning-rate scheduler or None.
        device: Device the batches are moved to.
        num_epochs: Number of train + validation rounds.

    Returns:
        Tuple ``(model, history)``; ``history`` maps 'Train Loss', 'Valid Loss'
        and 'Valid RMSE' to per-epoch lists.
    """
    # Local import: a notebook-level rebinding of the name `time`
    # (e.g. `from datetime import time`) cannot break the timer this way.
    import time

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_rmse = np.inf
    history = defaultdict(list)
    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Pass the `device` argument through (it used to be ignored in favour
        # of CONFIG.device).
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)
        valid_epoch_loss, valid_epoch_rmse = valid_one_epoch(model, optimizer, scheduler,
                                                             dataloader=valid_loader,
                                                             device=device, epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(valid_epoch_loss)
        history['Valid RMSE'].append(valid_epoch_rmse)

        print(f'Valid RMSE: {valid_epoch_rmse}')

        # deep copy the model
        if valid_epoch_rmse <= best_epoch_rmse:
            print(f"Validation RMSE Improved ({best_epoch_rmse} ---> {valid_epoch_rmse})")
            best_epoch_rmse = valid_epoch_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_rmse, epoch)
            torch.save(model.state_dict(), PATH)
            print("Model Saved")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_rmse))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history
def prepare_data(fold):
    """Build the train and validation DataLoaders for one fold.

    Rows of the notebook-level ``df`` whose ``kfold`` equals ``fold`` form the
    validation set; all other rows form the training set.

    Args:
        fold: Fold number to hold out for validation.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    held_out = df.kfold == fold
    df_train = df[~held_out].reset_index(drop=True)
    df_valid = df[held_out].reset_index(drop=True)

    def _make_loader(frame, batch_size, shuffle):
        # Wrap a frame in a BERTDataset and a 4-worker, pinned-memory DataLoader.
        dataset = BERTDataset(frame, CONFIG.tokenizer, CONFIG.max_len)
        return DataLoader(dataset, batch_size=batch_size,
                          num_workers=4, shuffle=shuffle, pin_memory=True)

    # Only the training loader shuffles.
    train_loader = _make_loader(df_train, CONFIG.train_batch_size, True)
    valid_loader = _make_loader(df_valid, CONFIG.valid_batch_size, False)
    return train_loader, valid_loader

In [None]:
# Hold out fold 0 for validation; `run` reads these two notebook-level loaders.
train_loader, valid_loader = prepare_data(0)

In [None]:
# Optimizer: weight decay is applied to every parameter except biases and LayerNorm weights.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    # A parameter skips weight decay when its name contains any `no_decay` marker.
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_parameters = [
    {'params': decay_params, 'weight_decay': 0.0001},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_parameters, lr=CONFIG.learning_rate)

# LR scheduler: linear decay with no warm-up. train_one_epoch steps the scheduler
# once per CONFIG.n_accumulate batches, so the schedule length counts optimizer
# steps rather than batches (identical while n_accumulate == 1).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_loader) // CONFIG.n_accumulate) * CONFIG.epochs
)
# scheduler = None


In [None]:
# Preview the learning-rate schedule on a throwaway optimizer/scheduler pair.
# Stepping the real `scheduler` here would consume part of its schedule (and
# lower the real optimizer's learning rate) before training has started.
total_steps = (len(train_loader) // CONFIG.n_accumulate) * CONFIG.epochs
probe_optimizer = torch.optim.SGD([nn.Parameter(torch.zeros(1))], lr=CONFIG.learning_rate)
probe_scheduler = None
if scheduler is not None:
    probe_scheduler = get_linear_schedule_with_warmup(
        probe_optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

lrs = []
for _step in range(total_steps):
    # Record the rate in effect for this optimizer step, then advance.
    lrs.append(probe_optimizer.param_groups[0]["lr"])
    # The parameter has no gradient, so this is a no-op; it only keeps the
    # optimizer-before-scheduler call order that PyTorch expects.
    probe_optimizer.step()
    if probe_scheduler is not None:
        probe_scheduler.step()
plt.plot(lrs);

In [None]:
# Launch training. `run` calls `time.time()`, which needs the stdlib `time` module;
# `from datetime import time` would rebind the name to the `datetime.time` class
# (which has no `time()` function), so that import is not used here.
model, history = run(model, optimizer, scheduler=scheduler, device=CONFIG.device, num_epochs=CONFIG.epochs)