In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets

In [None]:
# Configurations: every tunable value used by the training cells below.
# Number of Keras epochs per fold (train_and_evaluate shortens each epoch via steps_per_epoch)
EPOCHS = 70
# Batch size used for both the training and validation tf.data pipelines
BATCH_SIZE = 6 # 6 ok # other values noted: 10 12 # 24
# Seed shared by the random number generators and the fold split
SEED = 123
# Learning rate passed to the Adam optimizer
# LR = 0.000040 # original value; with roberta-large, at 20 epochs the val loss only reached 0.9 and did not converge
LR = 0.000010 # used instead

# Verbosity level passed to model.fit and to the ModelCheckpoint callback
VERBOSE = 2
# Number of folds for training
FOLDS = 5

# Max length: token sequences are padded / truncated to this many tokens
MAX_LEN = 250

# Name of the pretrained checkpoint; used for both the tokenizer and the model
# MODEL = 'roberta-base'
MODEL = 'roberta-large'

# Load the tokenizer that matches MODEL
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

# Passed to Dataset.prefetch so tf.data tunes the prefetch buffer size itself
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# -------------- add new --------------------------
from sklearn import datasets
from sklearn import model_selection


def create_folds(data, num_splits, SEED=42):
    """Assign every row to one of ``num_splits`` stratified folds.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule) and the bins are used as the stratification labels, so
    that every fold sees a similar target distribution.

    Args:
        data: DataFrame with a numeric ``target`` column. It is not modified.
        num_splits: Number of folds to create.
        SEED: Seed used for both the row shuffle and the stratified split, so
            the same seed always yields the same fold assignment regardless of
            the global NumPy random state.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        integer ``kfold`` column holding the validation fold of each row.
    """
    # Shuffle with an explicit seed. ``sample`` returns a new frame, so the
    # caller's DataFrame is left untouched (no stray ``kfold`` column on it).
    data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

    # -1 marks "not assigned yet"; every row is overwritten in the loop below.
    data["kfold"] = -1

    # Number of bins by Sturges' rule (floored).
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin the targets. Kept as a local Series rather than a temporary column,
    # so nothing has to be dropped from the frame afterwards.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(
        n_splits=num_splits, shuffle=True, random_state=SEED
    )

    # Stratify on the bins, not on the raw targets. The split yields positional
    # indices, which equal the index labels because the index was just reset.
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, "kfold"] = fold_id

    return data

# df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
# df = create_folds(df, FOLDS, SEED)
# df.head()
# ----------------------------------------

In [None]:
# Function to seed everything
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Also stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value handed to each seeding function.
    """
    # Python's and NumPy's global generators first ...
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # ... then the environment variable and TensorFlow's global seed.
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    tf.random.set_seed(seed)

# This function tokenize the text according to a transformers model tokenizer
def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    """Turn a list of texts into an array of token ids.

    The tokenizer is asked to pad to ``max_length`` and to truncate, so every
    encoded sequence is requested at exactly ``maxlen`` tokens.

    Args:
        texts: List of strings to encode.
        tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        maxlen: Target sequence length.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``input_ids``; any other
        fields the tokenizer returns are discarded.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded['input_ids']
    return np.array(token_ids)

# This function encode our training sentences
def encode_texts(x_train, x_val, MAX_LEN):
    """Tokenize the training and validation texts with the module tokenizer.

    Args:
        x_train: Training texts; must provide ``tolist()``.
        x_val: Validation texts; must provide ``tolist()``.
        MAX_LEN: Sequence length forwarded to ``regular_encode``.

    Returns:
        Tuple ``(encoded_train, encoded_val)`` of token-id arrays.
    """
    # Training split is encoded first, then the validation split.
    encoded_train, encoded_val = (
        regular_encode(split.tolist(), tokenizer, maxlen = MAX_LEN)
        for split in (x_train, x_val)
    )
    return encoded_train, encoded_val

# Function to transform arrays to tensors
def transform_to_tensors(x_train, x_val, y_train, y_val):
    """Wrap the encoded features and targets in batched ``tf.data`` pipelines.

    Args:
        x_train: Training features.
        x_val: Validation features.
        y_train: Training targets.
        y_val: Validation targets.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``. The training dataset repeats
        indefinitely and is shuffled with a 2048-element buffer; the validation
        dataset is a single ordered pass. Both use ``BATCH_SIZE`` batches and
        prefetching.
    """
    # Training pipeline: repeat -> shuffle -> batch -> prefetch.
    train_pairs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_pairs = train_pairs.repeat()
    train_pairs = train_pairs.shuffle(2048)
    train_dataset = train_pairs.batch(BATCH_SIZE).prefetch(AUTO)

    # Validation pipeline: original order, one pass.
    valid_pairs = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    valid_dataset = valid_pairs.batch(BATCH_SIZE).prefetch(AUTO)

    return train_dataset, valid_dataset

# Function to build our model
def build_roberta_base_model(max_len = MAX_LEN):
    """Build and compile a RoBERTa regressor with a single linear output.

    Despite the function name, the pretrained checkpoint that is loaded is
    whichever one ``MODEL`` names.

    Args:
        max_len: Length of the token-id sequences fed to the model.

    Returns:
        A compiled Keras model taking ``input_word_ids`` of shape
        ``(batch, max_len)`` and producing one float32 value per example.
        It is compiled with Adam at learning rate ``LR``, mean squared error
        loss and a root-mean-squared-error metric.
    """
    transformer = TFRobertaModel.from_pretrained(MODEL)
    input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    # NOTE(review): only token ids are passed here, no attention mask, although
    # the inputs are padded to max_len upstream. Confirm whether padded
    # positions should be masked; changing it would alter model outputs.
    sequence_output = transformer(input_word_ids)[0]
    # We only need the first (cls) token of every sequence, giving a 2d tensor
    cls_token = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(cls_token)
    model = tf.keras.models.Model(inputs = [input_word_ids], outputs = [output])
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
    return model

# Function to train and evaluate our model
def train_and_evaluate():
    """Train one model per stratified fold and report the out-of-fold RMSE.

    For every fold the function encodes the excerpts, builds a fresh model,
    trains it while checkpointing the weights with the best validation RMSE
    to ``Roberta_Base_{SEED}_{fold}.h5``, reloads those weights and stores the
    validation predictions in an out-of-fold array.

    Returns:
        float: RMSE between ``target`` and the out-of-fold predictions
        (the same value that is printed).
    """
    # Local import: gc is not imported at module level in this notebook.
    import gc

    # Read our training data
    df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

    # Seed everything
    seed_everything(SEED)

    # Add a 'kfold' column with a stratified fold id for every row
    df = create_folds(df, FOLDS, SEED)

    # Create out of folds array to store predictions
    oof_predictions = np.zeros(len(df))

    for fold in range(FOLDS):

        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1}')
        K.clear_session()

        # Boolean mask of this fold's validation rows, computed once and reused
        # below to write the predictions back in the same row order.
        valid_mask = (df['kfold'] == fold).values

        # Get text features and target
        df_train = df[~valid_mask].reset_index(drop=True)
        df_valid = df[valid_mask].reset_index(drop=True)
        x_train, x_val = df_train['excerpt'], df_valid['excerpt']
        y_train, y_val = df_train['target'].values, df_valid['target'].values
        del df_train, df_valid

        # Encode our text with Roberta tokenizer
        x_train, x_val = encode_texts(x_train, x_val, MAX_LEN)
        # Function to transform our numpy array to a tf Dataset
        train_dataset, valid_dataset = transform_to_tensors(x_train, x_val, y_train, y_val)
        # Build model
        model = build_roberta_base_model(max_len = MAX_LEN)

        # Model checkpoint: keep only the weights of the best validation epoch
        weights_path = f'Roberta_Base_{SEED}_{fold + 1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor = 'val_root_mean_squared_error',
                                                        verbose = VERBOSE,
                                                        save_best_only = True,
                                                        save_weights_only = True,
                                                        mode = 'min')

        # The training dataset repeats forever, so an "epoch" is defined by
        # steps_per_epoch. Each one covers about 1/16 of the training rows,
        # which makes validation and checkpointing run ~16 times per pass.
        # Clamp to 1 so a very small training set does not produce 0 steps.
        steps = max(1, x_train.shape[0] // (BATCH_SIZE * 16))

        # Training phase. batch_size is intentionally not passed: the
        # datasets are already batched and Keras documents that batch_size
        # must not be specified for dataset inputs.
        model.fit(train_dataset,
                  epochs = EPOCHS,
                  verbose = VERBOSE,
                  callbacks = [checkpoint],
                  validation_data = valid_dataset,
                  steps_per_epoch = steps)

        # Load best epoch weights
        model.load_weights(weights_path)
        # Predict validation set to save them in the out of folds array
        val_pred = model.predict(valid_dataset)
        oof_predictions[valid_mask] = val_pred.reshape(-1)

        # Free the per-fold objects before the next model is built
        del model, x_train, x_val, y_train, y_val, train_dataset, valid_dataset
        gc.collect()

    print('\n')
    print('-'*50)
    # Calculate out of folds root mean squared error
    oof_rmse = np.sqrt(mean_squared_error(df['target'], oof_predictions))
    print(f'Our out of folds RMSE is {oof_rmse}')
    return oof_rmse
    

# Run the cross-validated training; the out-of-fold RMSE is printed at the end.
train_and_evaluate()