In [None]:
# Install the extra packages this notebook imports (kerastuner, tensorflow_addons).
!pip install keras-tuner
!pip install tensorflow_addons

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import tensorflow.keras.layers as layers
import tensorflow_addons as tfa
import random
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel
import kerastuner as kt

In [None]:
# Directory holding the competition CSVs; train.csv and test.csv are read from it.
ds_path = "../input/commonlitreadabilityprize/"

In [None]:
# Choose the tf.distribute strategy: a TPUStrategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently has in effect.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Read the training and test CSVs from ds_path and preview the first rows of each.
train_df = pd.read_csv(f"{ds_path}train.csv")
test_df = pd.read_csv(f"{ds_path}test.csv")
print(train_df.head())
print(test_df.head())


In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably the number of rows in train.csv — confirm before relying on it.
train_row = 2834
def prepare_train_df(train_df):
    """Build the two-column frame used for training.

    Args:
        train_df: DataFrame with at least an ``excerpt`` and a ``target`` column.

    Returns:
        A new DataFrame with the columns ``excerpt`` and ``target`` (in that
        order) and a fresh 0..n-1 index. ``train_df`` itself is not modified.

    Raises:
        KeyError: if ``excerpt`` or ``target`` is missing from ``train_df``.
    """
    # Only these two columns are used. ``standard_error`` is deliberately not
    # read: it was needed only by a target-noise augmentation that is disabled,
    # so frames without that column are accepted as well.
    return pd.DataFrame(
        {
            "excerpt": train_df["excerpt"].to_list(),
            "target": train_df["target"].to_list(),
        },
        columns=["excerpt", "target"],
    )

# Build the (excerpt, target) training frame and show it as the cell output.
final_train_df = prepare_train_df(train_df)
final_train_df

In [None]:
# Local copy of the roberta-base files; the tokenizer is loaded from it.
model_path = "../input/huggingface-roberta-variants/roberta-base/roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Tokenize every training excerpt, truncating / padding each to 350 ids.
encodings = tokenizer(
    final_train_df["excerpt"].to_list(),
    max_length=350,
    padding='max_length',
    truncation=True,
)

# The model is fed a single input: the array of token ids.
encoded_input = [np.array(encodings["input_ids"])]

In [None]:
def create_model():
    """Build and compile the RoBERTa-based regression model.

    A ``TFRobertaModel`` with ``trainable = False`` encodes 350 token ids; its
    last hidden state goes through ``GlobalAveragePooling1D``, two
    Dense(512, relu) + Dropout stages and a final Dense(1) output. The model
    is compiled with AdamW, mean-squared-error loss and an RMSE metric.
    Everything is created inside ``strategy.scope()``.

    Returns:
        The compiled ``tf.keras.Model`` named ``"nlp"``.
    """
    # Fixed hyperparameters (no tuner ``hp`` object is consulted here).
    learning_rate = 0.0005539112740501631
    weight_decay = 5.994404522700284e-05
    dropout_rates = (0.2597885402021029, 0.23003497097673098)

    with strategy.scope():
        checkpoint_dir = "../input/huggingface-roberta-variants/roberta-base/roberta-base"
        backbone = TFRobertaModel.from_pretrained(checkpoint_dir)
        backbone.trainable = False  # freeze the pretrained encoder

        input_ids = layers.Input(shape=(350,), dtype=tf.int32, name='input_ids')
        inputs = [input_ids]

        hidden = backbone.roberta(inputs).last_hidden_state
        hidden = layers.GlobalAveragePooling1D()(hidden)
        # Two identical Dense -> Dropout stages, differing only in dropout rate.
        for rate in dropout_rates:
            hidden = layers.Dense(512, activation="relu")(hidden)
            hidden = layers.Dropout(rate)(hidden)
        outputs = layers.Dense(1)(hidden)

        model = tf.keras.Model(inputs=inputs, outputs=outputs, name="nlp")

        optimizer = tfa.optimizers.AdamW(
            learning_rate=learning_rate,
            weight_decay=weight_decay,
        )
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
    return model
# Disabled keras-tuner Bayesian search, kept as an inert string literal for reference.
# NOTE(review): it passes create_model as the hypermodel, but create_model()
# currently takes no `hp` argument — presumably it would need one again before
# this search can be re-enabled; confirm.
'''
tuner = kt.BayesianOptimization(
    create_model,
    objective=kt.Objective("val_root_mean_squared_error", direction="min"),
    max_trials=25,
    overwrite=True,
    directory="tuner",
    project_name="commonlitreadabilityprize",
)
tuner.search_space_summary()
tuner.search(
    encoded_input,
    final_train_df["target"].to_numpy(),
    epochs=8, 
    validation_split=0.2
)
best_hp = tuner.get_best_hyperparameters()[0]
model = tuner.hypermodel.build(best_hp)
tuner.results_summary()
'''

# Build the model with the fixed hyperparameters and print its layer summary.
model = create_model()
model.summary()


In [None]:
# Disabled training run, kept as an inert string literal for reference.
# Stage 1 fits the model with a best-val_loss weights checkpoint callback.
# Stage 2 sets model.layers[1].trainable = True, recompiles with a much
# smaller learning rate (1e-7) and fits again with the same callback.
# NOTE(review): model.layers[1] is presumably the RoBERTa layer — confirm.
'''
checkpoint_filepath = 'best_checkpoint'
options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    options=options
)

model.fit(
    encoded_input,
    final_train_df["target"].to_numpy(),
    batch_size=64,
    validation_split=0.2,
    epochs=50,
    callbacks=[model_checkpoint_callback]
)

with strategy.scope():
    model.layers[1].trainable = True
    
    model.compile(
        optimizer=tfa.optimizers.AdamW(learning_rate=1e-7, weight_decay=1e-10),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    
model.fit(
    encoded_input,
    final_train_df["target"].to_numpy(),
    batch_size=64,
    validation_split=0.2,
    epochs=50,
    callbacks=[model_checkpoint_callback]
)
'''
# Disabled export of the whole model to 'saved-model' (inert string literal).
'''
save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
model.save(
    'saved-model', 
    options=save_locally
)
'''

In [None]:
# TODO: revisit this section later.
# Tokenize the test excerpts with the same settings as the training excerpts
# (truncate / pad to 350 ids).
encodings_test = tokenizer(
    test_df["excerpt"].to_list(),
    truncation=True,
    padding='max_length',
    max_length=350,
)

# The model takes a single input: the array of token ids.
encoded_test_input = [
    np.array(encodings_test["input_ids"]),
]

# Restore the trained weights. The checkpoint is read from a local path, so
# checkpoint I/O is routed through the local host, mirroring the
# CheckpointOptions used when the checkpoint was written. This matters when
# `strategy` is a TPUStrategy, where the default is to access the filesystem
# from the device hosts.
load_options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
model.load_weights(
    "../input/commonlitpretrained/best_checkpoint",
    options=load_options,
)
# Alternative (disabled): load a full SavedModel instead of weights only:
#   load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
#   model = tf.keras.models.load_model('saved-model', options=load_locally)

predict_data = model.predict(encoded_test_input)
print(predict_data)

# The model ends in Dense(1), so predictions are expected to come back with a
# trailing axis of size 1; flatten explicitly to one value per test row rather
# than relying on pandas to coerce a 2-D array into a column.
test_df = test_df.assign(target=predict_data.reshape(-1))
selected_column = ["id", "target"]
final_result = test_df[selected_column]
final_result.to_csv("submission.csv", index=False)