In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# Submission frame: starts as a copy of the test ids; the 'target' column
# is filled in after prediction.
result = test[['id']].copy()

# Show (rows, columns) of each table, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Tokenizer configuration shared by the encoding helper and the model inputs.
max_seq_length = 200   # maximum text length (in tokens) each excerpt is truncated/padded to
# Load the BERT tokenizer from the local Kaggle dataset directory.
tokenizer = BertTokenizer.from_pretrained('../input/huggingface-bert/bert-base-uncased')

In [None]:
def input_for_model(df, tokenizer, max_seq_length):
    """Tokenize ``df['excerpt']`` into fixed-length arrays for a BERT model.

    Each excerpt is encoded with ``tokenizer.encode_plus`` using special
    tokens ([CLS], [SEP]), truncation to ``max_seq_length`` and padding up to
    ``max_seq_length``, so every row has exactly ``max_seq_length`` entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'excerpt'`` column of texts. If a ``'target'``
        column is present, its values are returned as labels.
    tokenizer : object
        Tokenizer exposing ``encode_plus(text, **kwargs)`` that returns a
        mapping with ``'input_ids'`` and ``'attention_mask'`` entries.
    max_seq_length : int
        Fixed token length of every encoded row.

    Returns
    -------
    tuple
        ``(input_ids, attention_masks)`` when ``df`` has no ``'target'``
        column, otherwise ``(input_ids, attention_masks, labels)``.
        ``input_ids`` and ``attention_masks`` are int32 arrays of shape
        ``(len(df), max_seq_length)``; ``labels`` is a float64 array of shape
        ``(len(df), 1)``.
    """
    n_rows = len(df)

    # Token ids and masks are integers; allocate them as int32 instead of the
    # float64 default of np.zeros so they match the model's int32 inputs and
    # take half the memory.
    input_ids = np.zeros((n_rows, max_seq_length), dtype=np.int32)
    input_attention_masks = np.zeros((n_rows, max_seq_length), dtype=np.int32)

    for i, sequence in enumerate(df['excerpt']):
        tokens = tokenizer.encode_plus(
            sequence,
            max_length=max_seq_length,      # max length of the text that can go to BERT
            truncation=True,
            padding='max_length',
            add_special_tokens=True,        # add [CLS], [SEP]
            return_token_type_ids=False,
            return_attention_mask=True,     # mask so the model ignores pad tokens
            return_tensors='np',            # NumPy output: no per-row TF tensor needed
        )
        # The encoder returns a (1, max_seq_length) batch; flatten it to one row.
        input_ids[i, :] = np.asarray(tokens['input_ids']).reshape(-1)
        input_attention_masks[i, :] = np.asarray(tokens['attention_mask']).reshape(-1)

    if 'target' in df.columns:
        # Column vector of regression targets, copied so it never aliases df.
        train_labels = df['target'].to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
        return input_ids, input_attention_masks, train_labels
    return input_ids, input_attention_masks

In [None]:
# Encode both tables; the training table also yields its target labels.
train_ids, train_attention_masks, train_labels = input_for_model(
    train, tokenizer, max_seq_length
)
test_ids, test_attention_masks = input_for_model(test, tokenizer, max_seq_length)

# Training split: the first 2500 encoded rows.
train_inputs = dict(
    input_ids=train_ids[:2500],
    attention_mask=train_attention_masks[:2500],
)
train_outputs = train_labels[:2500]

# Validation split: every row after the first 2500.
valid_inputs = dict(
    input_ids=train_ids[2500:],
    attention_mask=train_attention_masks[2500:],
)
valid_outputs = train_labels[2500:]

# Test inputs use all encoded test rows.
test_inputs = dict(input_ids=test_ids, attention_mask=test_attention_masks)

In [None]:
# train_data = tf.data.Dataset.from_tensor_slices((train_ids, train_attention_masks, train_labels))
# train_data = train_data.shuffle(500).batch(batch_size)
# test_data = tf.data.Dataset.from_tensor_slices((test_ids, test_attention_masks))
# test_data = test_data.batch(batch_size)

In [None]:
# Build the regression model: pretrained BERT encoder -> dropout -> single linear output.
# Load the pretrained BERT weights from the local Kaggle dataset directory.
bert_model = TFBertModel.from_pretrained('../input/huggingface-bert/bert-base-uncased')

# Two int32 inputs of length max_seq_length; their names match the keys of the
# input dictionaries passed to fit/predict.
input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="attention_mask")

# Take the first element of the BERT output and keep only position 0 along the
# sequence axis (the [CLS] token the tokenizer adds when special tokens are on).
# NOTE(review): element [0] is presumably the last hidden state — confirm
# against the installed transformers version.
sequence_output = bert_model(input_ids, attention_mask=attention_mask)[0][:,0,:]
x = tf.keras.layers.Dropout(0.1)(sequence_output)
# One linear unit: the model predicts a single unbounded real value per excerpt.
out = tf.keras.layers.Dense(1, activation='linear', name="outputs")(x)

# Alternative head kept for reference (disabled): max-pooling followed by dense layers.
# x = tf.keras.layers.GlobalMaxPool1D()(sequence_output)
# x = tf.keras.layers.BatchNormalization()(x)
# x = tf.keras.layers.Dense(128, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.1)(x)
# x = tf.keras.layers.Dense(32, activation='relu')(x)
# out = tf.keras.layers.Dense(1, activation='linear', name="outputs")(x)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=out)

# Optional (disabled): freeze the encoder so only the head is trained.
# model.layers[2].trainable = False   # Freeze the BERT model

In [None]:
# Print the layer-by-layer summary of the assembled Keras model.
model.summary()

In [None]:
# Fine-tune the model with Adam on mean squared error, reporting RMSE as the
# tracked metric for both the training and the validation split.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)
model.fit(
    train_inputs,
    train_outputs,
    epochs=10,
    batch_size=8,
    validation_data=(valid_inputs, valid_outputs),
)

In [None]:
# Predict a score for every test excerpt and write the submission file.
# The model has a single output unit, so predict() yields an (n, 1) array;
# flatten it explicitly to one value per row rather than relying on pandas to
# coerce a 2-D array into a column.
result['target'] = model.predict(test_inputs).reshape(-1)
result.to_csv("/kaggle/working/submission.csv", index=False)