In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import RepeatVector

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Flatten, Dense, MaxPooling1D, GlobalMaxPool1D, GlobalMaxPool2D, Conv2D, MaxPool2D
from tensorflow.keras.layers import Embedding, Dropout, Bidirectional, LSTM, GlobalMaxPool1D, Dense, GRU, Reshape, Concatenate

from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.layers import TimeDistributed, Activation, Dot
from tensorflow.keras.models import Model, load_model

from tensorflow.keras.layers import BatchNormalization, concatenate, multiply, add, dot
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.python.keras.layers import Layer
from tensorflow.keras.metrics import RootMeanSquaredError


from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import plot_model, to_categorical
import tensorflow as tf

from scipy.stats import describe
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
from time import gmtime, strftime
from tensorflow.keras.callbacks import TensorBoard
import re
import string

import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 100)
from tqdm import tqdm

import nltk
stopwords = nltk.corpus.stopwords.words('english')
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy
import re

import lightgbm as lgb
import gensim

from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import TFRobertaModel, RobertaConfig, RobertaTokenizerFast, RobertaTokenizer
from transformers import BertTokenizer, BertConfig, TFBertModel
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel


In [None]:
# Load the competition train/test CSVs, keep only the columns used later and
# lower-case the excerpts.
# `.assign` builds a fresh frame instead of writing a column into the
# `df[[...]]` selection; the latter is the pattern pandas flags with
# SettingWithCopyWarning (hidden here by the global warnings filter).
train = (
    pd.read_csv('../input/commonlitreadabilityprize/train.csv')[['id', 'excerpt', 'target']]
    .assign(excerpt=lambda frame: frame['excerpt'].str.lower())
)
actual_test = (
    pd.read_csv('../input/commonlitreadabilityprize/test.csv')[['id', 'excerpt']]
    .assign(excerpt=lambda frame: frame['excerpt'].str.lower())
)

In [None]:
# Directory the BERT tokenizer, config and TF model are all loaded from.
pretrained_dir = '../input/tfbert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_dir)
# The keyword overrides the loaded config attribute, so the encoder is set up
# to also return the hidden states of every layer.
model_config = BertConfig.from_pretrained(pretrained_dir, output_hidden_states=True)
bert_model = TFBertModel.from_pretrained(pretrained_dir, config=model_config)

In [None]:
# Directory the DistilBERT tokenizer, config and TF model are all loaded from.
distill_path = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
distill_tokenizer = DistilBertTokenizer.from_pretrained(distill_path)
# The keyword overrides the loaded config attribute, so the encoder is set up
# to also return the hidden states of every layer.
distill_config = DistilBertConfig.from_pretrained(distill_path, output_hidden_states=True)
distill_model = TFDistilBertModel.from_pretrained(distill_path, config=distill_config)

In [None]:
%%time
# Fixed token length: the encoders truncate/pad every excerpt to this length
# and the Keras model inputs are declared with shape (max_len,).
max_len = 250

def distill_encode(texts, tokenizer, max_len=max_len):
    """Tokenize each text into fixed-length inputs for the DistilBERT model.

    Each text is passed to ``tokenizer`` individually with special tokens
    added, truncation enabled and padding to ``max_len``; a tqdm bar tracks
    progress over ``texts``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: callable returning a mapping that contains
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays built from the
        per-text tokenizer outputs, in input order.
    """
    encoded = [
        tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )
        for text in tqdm(texts)
    ]
    ids = np.array([item['input_ids'] for item in encoded])
    mask = np.array([item['attention_mask'] for item in encoded])
    return ids, mask


def bert_encode(texts, tokenizer, max_len=max_len):
    """Tokenize each text into fixed-length inputs for the BERT model.

    Each text is passed to ``tokenizer`` individually with special tokens
    added, truncation enabled and padding to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: callable returning a mapping that contains
            ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.
        max_len: value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask)`` of NumPy arrays
        built from the per-text tokenizer outputs, in input order.
    """
    encoded = [
        tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )
        for text in texts
    ]
    ids = np.array([item['input_ids'] for item in encoded])
    segments = np.array([item['token_type_ids'] for item in encoded])
    mask = np.array([item['attention_mask'] for item in encoded])
    return ids, segments, mask

In [None]:

# Encode the train excerpts and then the test excerpts with each tokenizer.
# Each generator is unpacked into exactly two results, so the call order is
# distill/train, distill/test, bert/train, bert/test.
X_distill, test_encoded_distill = (
    distill_encode(frame['excerpt'].values, distill_tokenizer, max_len=max_len)
    for frame in (train, actual_test)
)

X_bert, test_encoded_bert = (
    bert_encode(frame['excerpt'].values, tokenizer, max_len=max_len)
    for frame in (train, actual_test)
)

In [None]:
def build_distill_model(bert_model, max_len=max_len):
    """Build and compile a regression head on top of a frozen DistilBERT encoder.

    Pipeline: encoder output ``[0]`` -> bidirectional CuDNNLSTM (1024 units per
    direction, full sequence) -> Dropout(0.2) -> attention pooling ->
    Dense(1024, relu) -> Dense(1, linear).

    Args:
        bert_model: encoder invoked as
            ``bert_model(input_ids, attention_mask=attention_mask)``. It is
            frozen (``trainable = False``) as a side effect of this call.
        max_len: fixed token length of both inputs.

    Returns:
        A Keras ``Model`` taking ``[input_ids, attention_mask]``, compiled
        with Adam (learning rate 1e-5), mean-squared-error loss and a
        ``RootMeanSquaredError`` metric.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    sequence_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    print(sequence_output.shape)

    x = Bidirectional(CuDNNLSTM(1024, return_sequences=True))(sequence_output)
    x = Dropout(0.2)(x)

    # Attention pooling: one score per timestep, softmax across the max_len
    # positions, then a weighted sum of the BiLSTM states (Dot over axis 1).
    # NOTE(review): the softmax covers every position, padded ones included;
    # attention_mask is not applied at this stage.
    att_vector = TimeDistributed(Dense(1))(x)
    att_vector = Reshape((max_len,))(att_vector)
    att_vector = Activation('softmax', name='attention_vec')(att_vector)
    att_output = Dot(axes=1)([x, att_vector])

    out = Dense(1024, activation='relu')(att_output)
    out = Dense(1, activation='linear')(out)

    # Freeze the encoder through the object itself rather than a positional
    # model.layers[...] lookup, which silently targets another layer if the
    # layer ordering ever changes. Must happen before compile().
    bert_model.trainable = False

    model = Model(inputs=[input_ids, attention_mask], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is deprecated.
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    return model

In [None]:
# Build a DistilBERT-based model only to print its layer summary; the model
# object itself is not kept.
build_distill_model(distill_model, max_len=max_len).summary()

In [None]:
def build_bert_model(bert_model, max_len=max_len):
    """Build and compile a regression head on top of a frozen BERT encoder.

    Pipeline: encoder output ``[0]`` -> bidirectional CuDNNLSTM (1024 units per
    direction, full sequence) -> Dropout(0.2) -> attention pooling ->
    Dense(1024, relu) -> Dense(1, linear).

    Args:
        bert_model: encoder invoked as ``bert_model(input_ids,
            token_type_ids=token_type_ids, attention_mask=attention_mask)``.
            It is frozen (``trainable = False``) as a side effect of this call.
        max_len: fixed token length of all three inputs.

    Returns:
        A Keras ``Model`` taking ``[input_ids, token_type_ids,
        attention_mask]``, compiled with Adam (learning rate 1e-5),
        mean-squared-error loss and a ``RootMeanSquaredError`` metric.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    sequence_output = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    print(sequence_output.shape)

    x = Bidirectional(CuDNNLSTM(1024, return_sequences=True))(sequence_output)
    x = Dropout(0.2)(x)

    # Attention pooling: one score per timestep, softmax across the max_len
    # positions, then a weighted sum of the BiLSTM states (Dot over axis 1).
    # NOTE(review): the softmax covers every position, padded ones included;
    # attention_mask is not applied at this stage.
    att_vector = TimeDistributed(Dense(1))(x)
    att_vector = Reshape((max_len,))(att_vector)
    att_vector = Activation('softmax', name='attention_vec')(att_vector)
    att_output = Dot(axes=1)([x, att_vector])

    out = Dense(1024, activation='relu')(att_output)
    out = Dense(1, activation='linear')(out)

    # Freeze the encoder through the object itself rather than a positional
    # model.layers[...] lookup, which silently targets another layer if the
    # layer ordering ever changes. Must happen before compile().
    bert_model.trainable = False

    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is deprecated.
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    return model

In [None]:
# Build a BERT-based model only to print its layer summary; the model object
# itself is not kept.
build_bert_model(bert_model, max_len=max_len).summary()

In [None]:
# Mini-batch size that run_fold uses for both fit() and predict().
batch_size = 32
# Shuffled 4-fold splitter with a fixed seed; run_fold reads it as a global.
kf = KFold(random_state=60, shuffle=True, n_splits=4)

In [None]:

def run_fold(train_data, train_target, test_data, b_model=str):
    """Cross-validate one transformer-based regressor and predict the test set.

    For every split of the notebook-level ``kf`` a fresh model is built and
    trained, the weights with the best validation RMSE are restored, and that
    model predicts both the held-out fold and the full test set.

    Reads these notebook globals: ``kf``, ``batch_size``, ``epochs``,
    ``max_len``, ``bert_model`` and ``distill_model``.

    Args:
        train_data: sequence of equally long arrays (the encoded inputs); each
            one is sliced with the fold indices.
        train_target: regression targets aligned with ``train_data``. Converted
            with ``np.asarray`` so fold indices are always positional, even
            for a pandas Series with a non-default index.
        test_data: sequence of encoded test arrays, in the same order as
            ``train_data``.
        b_model: ``'bert'`` or ``'distill'``. The historical default (the
            ``str`` type) is kept for signature compatibility but is not a
            valid choice.

    Returns:
        ``(oof_preds, test_preds)``: out-of-fold predictions for every training
        row, and test predictions averaged over the folds.

    Raises:
        ValueError: if ``b_model`` is neither ``'bert'`` nor ``'distill'``
            (previously this surfaced later as an unbound ``model`` variable).

    NOTE(review): the checkpoint is selected on the same validation fold that
    produces the out-of-fold predictions, so the reported RMSE is likely
    somewhat optimistic.
    """
    import gc

    # Fail fast, before any expensive work, on an unsupported model key.
    if b_model not in ('bert', 'distill'):
        raise ValueError(f"b_model must be 'bert' or 'distill', got {b_model!r}")

    train_target = np.asarray(train_target)
    test_inputs = list(test_data)  # same container type as the per-fold train inputs

    oof_preds = np.zeros(train_data[0].shape[0])
    test_preds = np.zeros(test_data[0].shape[0])
    filepath = 'my_best_model.hdf5'

    for cnt, (tr_ind, val_ind) in enumerate(kf.split(train_data[0], train_target)):
        print(f'Fold:{cnt}, Train set: {len(tr_ind)}, Test set:{len(val_ind)}')

        X_train = [x[tr_ind] for x in train_data]
        X_val = [x[val_ind] for x in train_data]
        y_train, y_val = train_target[tr_ind], train_target[val_ind]

        # A new callback per fold resets its "best so far" state; the weights
        # file is shared and overwritten by each fold. RMSE is minimised.
        checkpoint = ModelCheckpoint(filepath, monitor='val_root_mean_squared_error', verbose=1,
                                     save_best_only=True, save_weights_only=True, mode='min')
        if b_model == 'bert':
            model = build_bert_model(bert_model, max_len=max_len)
        else:
            model = build_distill_model(distill_model, max_len=max_len)

        model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                  callbacks=[checkpoint], verbose=1)

        # Restore the best epoch before predicting.
        model.load_weights(filepath)
        oof_preds[val_ind] = model.predict(X_val, batch_size=batch_size).ravel()

        test_preds += model.predict(test_inputs, batch_size=batch_size).ravel()

        # Drop the fold's model right away so memory does not pile up across folds.
        del model
        gc.collect()

    test_preds = test_preds / kf.n_splits
    overall_rmse = np.sqrt(mean_squared_error(train_target, oof_preds))

    print(f"Overall RMSE : {overall_rmse}")
    gc.collect()
    print("")
    return oof_preds, test_preds

# REGRESSION PREDS

In [None]:
%%time
# run_fold reads `epochs` from the notebook globals, so it is set right here.
epochs = 15
oof_distill, test_distill = run_fold(
    train_data=X_distill,
    train_target=train['target'],
    test_data=test_encoded_distill,
    b_model='distill',
)

In [None]:
# RMSE of the DistilBERT out-of-fold predictions against the training targets
# (displayed as the cell output).
np.sqrt(mean_squared_error(train['target'], oof_distill))

In [None]:
%%time
# run_fold reads `epochs` from the notebook globals, so it is set right here.
epochs = 15
oof_bert, test_bert = run_fold(
    train_data=X_bert,
    train_target=train['target'],
    test_data=test_encoded_bert,
    b_model='bert',
)

In [None]:
# RMSE of the BERT out-of-fold predictions against the training targets
# (displayed as the cell output).
np.sqrt(mean_squared_error(train['target'], oof_bert))

In [None]:
# Level-2 stacking: fit a LightGBM regressor on the two models' out-of-fold
# predictions and average its per-fold predictions on the models' test
# predictions.

# Named columns instead of the duplicate `0, 0` labels that concatenating an
# unnamed Series with an unnamed DataFrame produces.
train_pred_df = pd.DataFrame({'distill': oof_distill, 'bert': oof_bert})
test_pred_df = pd.DataFrame({'distill': test_distill, 'bert': test_bert})

X1 = train_pred_df.values
X1_test = test_pred_df.values  # ndarray, matching the ndarray the model is fitted on
y1 = train['target'].values

# Separate name on purpose: run_fold reads the global `kf`, so rebinding it
# here would silently switch the transformer CV to 5 folds on any re-run.
stack_kf = KFold(n_splits=5, shuffle=True, random_state=60)

oof_preds1 = np.zeros(len(train))
test_preds1 = np.zeros(len(actual_test))

for cnt, (tr_ind, val_ind) in enumerate(stack_kf.split(X1, y1)):
    print(f'Fold:{cnt}, Train set: {len(tr_ind)}, Test set:{len(val_ind)}')

    X_train, X_val = X1[tr_ind], X1[val_ind]
    y_train, y_val = y1[tr_ind], y1[val_ind]

    model_lgb = lgb.LGBMRegressor(random_state=60, n_jobs=-1)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
    # kept as in the original; newer LightGBM releases expect callbacks
    # instead -- confirm against the installed version before upgrading.
    # NOTE(review): early stopping is evaluated on the same fold that yields
    # the out-of-fold predictions, so the RMSE below is likely optimistic.
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse',
                  early_stopping_rounds=100, verbose=50)

    oof_preds1[val_ind] = model_lgb.predict(X_val)
    test_preds1 += model_lgb.predict(X1_test) / stack_kf.n_splits

    print(f"Fold {cnt} RMSE: ", np.sqrt(mean_squared_error(y_val, oof_preds1[val_ind])))

overall_rmse = np.sqrt(mean_squared_error(y1, oof_preds1))
print(f"Overall RMSE : {overall_rmse}")

In [None]:
# Attach the stacked test predictions as the `target` column, then display
# the resulting frame as the cell output.
actual_test = actual_test.assign(target=test_preds1)
actual_test

In [None]:
# Write only the id/target pair, without the index, to the submission file.
actual_test.to_csv('submission.csv', columns=['id', 'target'], index=False)

In [None]:
# Read the written submission file back and display it as the cell output.
pd.read_csv('submission.csv')