In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

from transformers import TFAutoModel, AutoTokenizer
from transformers import BertConfig
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

import tensorflow as tf
import tensorflow.keras as k
import transformers

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the labelled training excerpts and discard the two licensing metadata
# columns in the same expression, so re-running this cell is always safe.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv').drop(
    columns = ['url_legal', 'license']
)

# NOTE: despite its name, `validation` is loaded from test.csv.
validation = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

(train.shape, validation.shape)

In [None]:
# Standardise the raw target and store the result in a separate column so the
# original `target` column is left untouched.
scalar = StandardScaler()
target_column = train['target'].to_numpy().reshape(-1, 1)  # scaler is fed a 2-D column
train['target_normal'] = scalar.fit_transform(target_column)

train.head()

In [None]:
# Hold out 10% of the excerpts for evaluation; the fixed seed keeps the split
# reproducible. Inputs are passed as plain Python lists, and the labels come
# from the raw `target` column (not `target_normal`).
x_train, x_test, y_train, y_test = train_test_split(
    list(train['excerpt'].values),
    list(train['target'].values),
    test_size = 0.1,
    random_state = 0,
)

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
checkpoint = "bert-base-uncased"

# NOTE(review): `config` is built here but is not passed to either
# `from_pretrained` call below, so it has no effect on the loaded model.
config = BertConfig(output_hidden_states = False)

# Tokenizer and encoder weights are both resolved from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint)

In [None]:
def create_model(max_len = 314):
    """Build the regression network on top of the module-level `model` encoder.

    Architecture: encoder output -> BiLSTM(100) -> BiLSTM(50) -> global max
    pooling over the sequence axis -> Dense(50, relu) -> Dense(1), with
    dropout between the stages.

    Parameters
    ----------
    max_len : int, default 314
        Fixed token length of both inputs. It must equal the `max_length`
        used when tokenizing, because the Input layers have a static shape.

    Returns
    -------
    tf.keras.Model
        Uncompiled model that takes `[input_ids, attention_mask]` and emits
        one value per sample.

    Notes
    -----
    The encoder referenced by the global `model` is marked non-trainable as a
    side effect, so only the layers added here are updated during training.
    """
    input_ids = k.layers.Input(shape = (max_len, ), name = 'input_ids', dtype = 'int32')
    att_mask = k.layers.Input(shape = (max_len, ), name = 'attention_mask', dtype = 'int32')

    # First element of the encoder output: the per-token sequence that the
    # recurrent layers below consume.
    sequence_output = model(input_ids, attention_mask = att_mask)[0]

    X = k.layers.Bidirectional(k.layers.LSTM(100, return_sequences = True))(sequence_output)
    X = k.layers.Dropout(0.1)(X)
    X = k.layers.Bidirectional(k.layers.LSTM(50, return_sequences = True))(X)
    X = k.layers.Dropout(0.1)(X)
    X = k.layers.GlobalMaxPool1D()(X)
    X = k.layers.Dense(50, activation = 'relu')(X)
    X = k.layers.Dropout(0.2)(X)
    outs = k.layers.Dense(1)(X)

    mod = k.Model(inputs = [input_ids, att_mask], outputs = outs)

    # Freeze the encoder explicitly rather than by position in `mod.layers`:
    # the positional slice only worked while the encoder happened to sit
    # right after the two Input layers.
    model.trainable = False

    return mod

def root_mean_squared_error(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like or tensor
        Ground-truth values, one per sample; may be rank 1 or `(batch, 1)`.
    y_pred : array-like or tensor
        Predicted values with the same number of elements as `y_true`.

    Returns
    -------
    tf.Tensor
        Scalar RMSE over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Labels often arrive as float64 NumPy data while predictions are float32;
    # align the dtypes so the subtraction is valid outside of Keras as well.
    y_true = tf.cast(y_true, y_pred.dtype)

    # Flatten both sides: subtracting a (batch,) tensor from a (batch, 1)
    # tensor would otherwise broadcast to (batch, batch) and silently produce
    # a wrong error value.
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])

    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [None]:
batch = 128
num_epochs = 20

# Shared tokenizer options: every text is truncated/padded to exactly 314
# tokens, which must match the fixed Input shape used in create_model().
tokenizer_kwargs = dict(truncation = True, padding = 'max_length', return_tensors = 'tf', max_length = 314)

tokenized_train = tokenizer(x_train, **tokenizer_kwargs)
tokenized_test = tokenizer(x_test, **tokenizer_kwargs)
tokenized_validation = tokenizer(list(validation.excerpt.values), **tokenizer_kwargs)

# Total optimizer steps for the learning-rate schedule. `fit` also runs the
# final partial batch of every epoch, so steps per epoch is the *ceiling* of
# n_samples / batch. Floor division undercounts and makes the schedule reach
# its end learning rate before training has finished.
steps_per_epoch = -(-tokenized_train['input_ids'].shape[0] // batch)
train_steps = steps_per_epoch * num_epochs

In [None]:
# Instantiate the network defined by create_model() and print its layer summary.
new_model = create_model()
new_model.summary()

In [None]:
# Learning rate starts at 5e-5 and decays to 0 over `train_steps` optimizer steps.
lr_schedule = PolynomialDecay(
    initial_learning_rate = 5e-5,
    end_learning_rate = 0,
    decay_steps = train_steps,
)
opt = Adam(learning_rate = lr_schedule)

# Optimise RMSE directly and report MSE alongside it.
new_model.compile(optimizer = opt, loss = root_mean_squared_error, metrics = ['mse'])

# The model takes its two inputs as a list: token ids first, attention mask second.
train_inputs = [tokenized_train[key] for key in ('input_ids', 'attention_mask')]
holdout_inputs = [tokenized_test[key] for key in ('input_ids', 'attention_mask')]

history = new_model.fit(
    train_inputs,
    np.array(y_train),
    validation_data = (holdout_inputs, np.array(y_test)),
    epochs = num_epochs,
    batch_size = batch,
)

In [None]:
# Per-epoch loss on the training data and on the hold-out split. Labels and
# a legend are needed so the two curves can be told apart.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label = 'train')
ax.plot(history.history['val_loss'], label = 'hold-out')
ax.set(title = 'Loss per epoch', xlabel = 'epoch', ylabel = 'RMSE loss')
ax.legend();

In [None]:
# Predict in mini-batches: calling the model directly would push every row
# through the encoder in a single batch, which can exhaust memory on a large
# test set. `.ravel()` turns the (n, 1) output into a plain 1-D column.
validation['target'] = new_model.predict(
    [tokenized_validation['input_ids'], tokenized_validation['attention_mask']],
    batch_size = batch,
).ravel()

In [None]:
# Keep only the two columns that make up a submission row.
final_output = validation[['id', 'target']]

# Write the `id,target` header and omit the DataFrame index. The previous
# call dropped the header and wrote the index as an extra unnamed first
# column. NOTE(review): confirm the layout against sample_submission.csv.
final_output.to_csv('submission.csv', index = False)
final_output