In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import torch
import os
import random
from matplotlib import pyplot as plt
import time
import tensorflow as tf
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Notebook-wide configuration.
N_FOLD = 5                     # number of cross-validation folds
ROBERTA_PATH = 'roberta-base'  # identifier handed to AutoTokenizer.from_pretrained
SEED = 42                      # single seed shared by every RNG seeded below

train_path = '../input/commonlitreadabilityprize/train.csv'

# Seed the global RNGs so the fold shuffle (pandas falls back to NumPy's global
# state when no random_state is given) and torch's random samplers produce the
# same result on every Restart & Run All.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
from sklearn import model_selection
import seaborn as sns
def create_folds(data, num_splits, random_state=None):
    """Assign a stratified cross-validation fold id to every row.

    The continuous ``target`` column is discretised into
    ``floor(1 + log2(len(data)))`` equal-width bins, and those bin labels are
    used as the stratification classes for ``StratifiedKFold``.

    Args:
        data: DataFrame containing a numeric ``target`` column.
        num_splits: Number of folds to create.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            row shuffle can be reproduced. ``None`` keeps the previous
            behaviour of drawing from the global RNG state.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and two extra
        columns: ``bins`` (the stratification bin) and ``kfold`` (the fold in
        which the row is used for validation). The input frame is not modified.
    """
    # Shuffle first: sample() returns a new frame, so the columns added below
    # never leak into the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1
    num_bins = int(np.floor(1 + np.log2(len(data))))
    data.loc[:, "bins"] = pd.cut(data["target"], bins=num_bins, labels=False)
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    # split() yields positional indices; they are valid .loc labels here only
    # because the index was reset to 0..n-1 above.
    for fold_id, (_, valid_rows) in enumerate(kf.split(X=data, y=data["bins"].values)):
        data.loc[valid_rows, "kfold"] = fold_id
    return data


# Load the training data and attach a stratified fold id to every row.
df_data_raw = pd.read_csv(train_path)
df_data = create_folds(df_data_raw, N_FOLD)

# One histogram per fold, laid out on a 3-column grid. Ceiling division gives
# exactly as many rows as needed (no spare empty row when N_FOLD is a
# multiple of 3).
n_cols = 3
n_rows = (N_FOLD + n_cols - 1) // n_cols
fig = plt.figure(figsize=(25, 2 * N_FOLD))
for k in range(N_FOLD):
    ax = fig.add_subplot(n_rows, n_cols, k + 1)
    # Rows NOT assigned to fold k, i.e. the portion used for training when
    # fold k is held out for validation.
    dis_data = df_data.query(f"kfold != {k}")
    # Draw on the subplot explicitly instead of relying on the current axes.
    sns.histplot(dis_data['target'], kde=True, stat="density", linewidth=0, alpha=0.5, ax=ax)
    ax.set_title('fold{} mean: {:.3f}, std: {:.3f}'.format(k, np.mean(dis_data['target']), np.std(dis_data['target'])))

In [None]:
# Preview the first rows of the fold-annotated frame instead of rendering all of it.
df_data.head()

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer named by ROBERTA_PATH; the same instance is passed to both
# the PyTorch and the TensorFlow dataset builders further down.
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)

In [None]:
# Optional text normalisation for encoders that were pretrained on uncased data.
def processing(text):
    """Normalise an excerpt for an uncased encoder.

    Line breaks are replaced by a single space (so the words on either side of
    a newline are not fused together), the text is lower-cased, and leading and
    trailing whitespace is stripped.

    Args:
        text: Raw excerpt string.

    Returns:
        The normalised string.
    """
    text = text.replace('\n', ' ')  # a space, not '', keeps word boundaries intact
    text = text.lower()  # only appropriate if the encoder is uncased
    return text.strip()

# How to use

## PyTorch

In [None]:
class LitDataset(Dataset):
    """Map-style dataset that tokenizes every excerpt once, up front.

    Args:
        data_df: DataFrame with an ``excerpt`` column and, when ``is_train`` is
            true, a numeric ``target`` column.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=...,
            max_length=..., truncation=..., return_attention_mask=...)`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` sequences
            indexable by sample position.
        is_train: When true, targets are stored and returned with each sample.
        max_length: Length every sequence is padded/truncated to (default 256,
            the previously hard-coded value).
    """

    def __init__(self, data_df, tokenizer, is_train=True, max_length=256):
        super().__init__()
        self.is_train = is_train
        # Newlines are replaced by spaces so the words around them stay separate.
        self.text = [text.replace("\n", " ") for text in data_df['excerpt']]
        if self.is_train:
            self.target = torch.tensor(data_df.target.values, dtype=torch.float32)
        self.tokenized = tokenizer(
            self.text,
            padding='max_length',
            max_length=max_length,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask[, target])`` for one sample.

        The target is included only when the dataset was built with
        ``is_train=True``.
        """
        input_ids = torch.tensor(self.tokenized['input_ids'][index])
        attention_mask = torch.tensor(self.tokenized['attention_mask'][index])

        if self.is_train:
            return (input_ids, attention_mask, self.target[index])
        return (input_ids, attention_mask)

In [None]:
# Template: build per-fold PyTorch loaders on top of one shared tokenized dataset.
BATCH_SIZE = 16
dataset = LitDataset(df_data, tokenizer)
for fold in range(N_FOLD):
    print('Fold {}:'.format(fold))
    #####################################################
    # LitDataset is addressed by row POSITION, so derive positional indices
    # from a boolean mask rather than reusing DataFrame index labels, and hand
    # the sampler plain Python lists (it expects an ordinary sequence).
    valid_mask = (df_data['kfold'] == fold).to_numpy()
    train_idx = np.flatnonzero(~valid_mask).tolist()
    valid_idx = np.flatnonzero(valid_mask).tolist()

    # Training samples are drawn in random order from the training positions.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    train_loader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=BATCH_SIZE, sampler=train_subsampler)

    # Validation is iterated in a fixed order (matching shuffle=False in the
    # TensorFlow example) so predictions line up with valid_idx.
    val_subset = torch.utils.data.Subset(dataset, valid_idx)
    val_loader = torch.utils.data.DataLoader(
                      val_subset,
                      batch_size=BATCH_SIZE, shuffle=False)
    #####################################################
    # model = ...
    # model.train()
    # for epoch in range(...):
    # for i, data in enumerate(train_loader, 0):
    #      ..............................
    #####################################################

    print('Train: {}, Validation: {}'.format(len(train_idx), len(valid_idx)))

## TensorFlow

In [None]:
def get_dataset(data_df, tokenizer, is_train = True, batch_size=16, seq_len=256,shuffle=True):
    """Build a batched, prefetched ``tf.data.Dataset`` from a DataFrame of excerpts.

    Args:
        data_df: DataFrame with an ``excerpt`` column and, when ``is_train`` is
            true, a ``target`` column.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_length=...,
            truncation=..., padding=...)`` whose result exposes ``'input_ids'``
            and ``'attention_mask'``.
        is_train: When true, each element is ``(features, target)``; otherwise
            only the feature dict is yielded.
        batch_size: Number of samples per batch.
        seq_len: Length every sequence is padded/truncated to.
        shuffle: Whether to shuffle samples before batching. NOTE: the default
            is True even when ``is_train`` is False; pass ``shuffle=False`` for
            validation/inference so outputs stay aligned with the input rows.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``{'input_ids', 'attention_mask'}``
        dicts, paired with float32 targets (one extra axis added at position 1)
        when ``is_train`` is true.
    """
    text = [text.replace("\n", " ") for text in data_df['excerpt']]
    tokenized_inputs = tokenizer(text=text, max_length=seq_len, truncation=True, padding='max_length')
    features = {'input_ids': tokenized_inputs['input_ids'],
                'attention_mask': tokenized_inputs['attention_mask']}
    if is_train:
        target_value = tf.cast(data_df.target, dtype=tf.float32)
        dataset = tf.data.Dataset.from_tensor_slices((features, tf.expand_dims(target_value, axis=1)))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(features)
    if shuffle:
        # A buffer at least as large as the dataset is needed for a full
        # uniform shuffle; the former fixed 1024 only shuffled within a
        # sliding window for larger inputs. max(..., 1) keeps the buffer valid
        # for an empty frame.
        dataset = dataset.shuffle(max(len(text), 1))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
# Template: build per-fold TensorFlow datasets.
for fold in range(N_FOLD):
    print('Fold {}:'.format(fold))
    #####################################################
    valid_mask = df_data['kfold'] == fold
    train_idx = df_data.index[~valid_mask]
    valid_idx = df_data.index[valid_mask]
    # Select rows with the boolean mask: the index values above are LABELS,
    # so feeding them to .iloc (positional) is only correct when the frame
    # happens to have a default 0..n-1 index.
    df_train, df_val = df_data[~valid_mask], df_data[valid_mask]
    dataset_train = get_dataset(df_train, tokenizer)
    # Keep validation order fixed so predictions align with df_val rows.
    dataset_val = get_dataset(df_val, tokenizer, shuffle=False)
    #####################################################
    # model = ...
    # history_train = model.fit(dataset_train, ...)
    # history_val = model.evaluate(dataset_val, ...)
    #####################################################

    print('Train: {}, Validation: {}'.format(len(train_idx), len(valid_idx)))