In [None]:
!pip install keras-tuner
# !pip install tensorflow_addons
# !pip install autokeras
import sys
sys.path.append('../input/autokeras')

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import tensorflow.keras.layers as layers
import tensorflow_addons as tfa
import random
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import BartTokenizer, TFBartModel
from transformers import ConvBertTokenizer, TFConvBertModel
from transformers import GPT2Tokenizer, TFGPT2Model
# import keras_tuner as kt
import autokeras as ak
from tensorflow.keras.models import load_model

In [None]:
# Directory that holds the competition CSV files. Paths are built by plain string
# concatenation (ds_path + "train.csv"), so the trailing slash is required.
ds_path = "../input/commonlitreadabilityprize/"

In [None]:
# Select the tf.distribute strategy for this session: a TPUStrategy when a TPU
# cluster can be resolved, otherwise the strategy that is currently in effect.
try:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # The TPU system must be connected and initialized before a strategy is built on it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy alias.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# NOTE(review): in the visible cells `strategy` is only used for this printout;
# no model is created inside strategy.scope().
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Read the train and test tables from ds_path, then show the first rows of each.
train_df = pd.read_csv(f"{ds_path}train.csv")
test_df = pd.read_csv(f"{ds_path}test.csv")
print(train_df.head())
print(test_df.head())


In [None]:
# Presumably the number of rows in the training set — TODO confirm.
# NOTE(review): this constant is not referenced by any other visible cell.
train_row = 2834

def get_pooling_data(data, tokenizer, model):
    """Return a mean-pooled embedding of ``data`` as a flat NumPy array.

    The text is tokenized to exactly 350 positions (longer input is truncated,
    shorter input is padded to that length), fed through ``model``, and the
    model's ``last_hidden_state`` is averaged over the sequence dimension with
    a Keras ``GlobalAveragePooling1D`` layer.

    Args:
        data: Text to embed.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            below and returns TensorFlow tensors.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        The pooled output converted to NumPy and flattened to one dimension.

    NOTE(review): no mask is passed to the pooling layer, so padded positions
    appear to be included in the average — confirm this is intended.
    """
    encoded = tokenizer(
        data,
        max_length=350,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )
    hidden_states = model(encoded).last_hidden_state
    pooled = tf.keras.layers.GlobalAveragePooling1D()(hidden_states)
    return pooled.numpy().flatten()
    
def prepare_data(df, training=True):
    """Embed every excerpt with five pretrained encoders and stack the features.

    Each text in ``df["excerpt"]`` is passed through ``get_pooling_data`` once
    per encoder; the five pooled vectors are concatenated, in the order the
    encoders are listed below, into one 1-D feature vector per excerpt.

    Args:
        df: DataFrame with an ``"excerpt"`` column and, when ``training`` is
            true, a ``"target"`` column.
        training: When true, the targets are returned alongside the features.

    Returns:
        ``(features, target)`` when ``training`` is true, otherwise
        ``features``. ``features`` holds one row per excerpt; ``target`` is
        the ``"target"`` column as a NumPy array.

    NOTE(review): all five encoders are held in memory at the same time and
    excerpts are encoded one by one, so this is slow on large inputs.
    """
    excerpt = df["excerpt"].to_list()

    # (tokenizer class, model class, local checkpoint directory). The order
    # defines the layout of the concatenated feature vector, so it must not
    # change without recomputing every feature that depends on it.
    encoder_specs = [
        (RobertaTokenizer, TFRobertaModel,
         "../input/huggingface-roberta-variants/roberta-large/roberta-large"),
        (BertTokenizer, TFBertModel,
         "../input/tfbert-base-uncased"),
        (RobertaTokenizer, TFRobertaModel,
         "../input/huggingface-roberta-variants/roberta-base/roberta-base"),
        (BartTokenizer, TFBartModel,
         "../input/tfbart-base"),
        (ConvBertTokenizer, TFConvBertModel,
         "../input/convbertbase/conv-bert-base"),
    ]
    # Load each tokenizer followed by its model, one encoder after another.
    encoders = [
        (tokenizer_cls.from_pretrained(path), model_cls.from_pretrained(path))
        for tokenizer_cls, model_cls, path in encoder_specs
    ]

    outputs = []
    for index, data in enumerate(excerpt):
        # get_pooling_data already returns flat vectors, so the concatenation
        # is a single 1-D feature vector for this excerpt.
        x = np.concatenate(
            [get_pooling_data(data, tokenizer, model) for tokenizer, model in encoders]
        )
        if index % 200 == 0:
            # Periodic progress report, including the per-excerpt feature shape.
            print(f"prepared data {index}")
            print(x.shape)
        outputs.append(x)

    outputs = np.asarray(outputs)
    if training:
        target = np.asarray(df["target"].to_list())
        return outputs, target
    return outputs
    

# Training features are not computed in this run; the call is left disabled:
# x_train, y_train = prepare_data(train_df)
# Build the feature matrix for the test excerpts only (training=False returns no targets).
x_test = prepare_data(test_df, training=False)

In [None]:
# model_creator = ak.StructuredDataRegressor(max_trials=70, overwrite=True)
# Feed the structured data regressor with training data.
# model_creator.fit(x_train, y_train)


In [None]:
# Disabled: export and save a freshly fitted model (depends on the commented-out
# `model_creator` from the previous cell).
# model = model_creator.export_model()
# model.save("saved_model")
# Load the previously saved model instead. ak.CUSTOM_OBJECTS is passed so Keras
# can resolve AutoKeras-specific objects during deserialization (presumably
# custom layers — confirm).
model = load_model("../input/commonlitautokeraspretrained/saved_model", custom_objects=ak.CUSTOM_OBJECTS)

In [None]:
# Score the test features with the loaded model and write the submission file.
predict_data = model.predict(x_test)
print(predict_data)
# Flatten the predictions to 1-D before attaching them as a column, so the
# assignment does not depend on pandas accepting a 2-D (n, 1) array
# (assumes one prediction per test row — TODO confirm the model has a single output).
test_df = test_df.assign(target=np.ravel(predict_data))
# Only the identifier and the predicted target are written out.
selected_column = ["id", "target"]
final_result = test_df[selected_column]
final_result.to_csv("submission.csv", index=False)