# Preparation

## Loading modules

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import tensorflow as tf
# import tensorflow_text as text
import tensorflow_hub as hub
import transformers
from transformers import AutoTokenizer
from contextlib import nullcontext

In [None]:
# Locate the TPU attached to this session. The resolver's ValueError is treated
# as "no TPU available" and turned into a clear, catchable error.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
    # RuntimeError (an Exception subclass) rather than BaseException, so regular
    # `except Exception` handlers and tooling see it; `from err` keeps the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

# Connect to the TPU cluster and initialise it before creating the strategy.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# tf.distribute.TPUStrategy is the stable replacement for the deprecated
# tf.distribute.experimental.TPUStrategy.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

## Loading data

In [None]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

# Preview the first rows of the training data.
train_df.head()

In [None]:
# Display the test DataFrame as this cell's output.
test_df

Let's find out how long the excerpts are (measured in whitespace-separated words).

In [None]:
# Number of whitespace-separated words in each training excerpt.
# NOTE: these are word counts; the tokenizer used later may produce a different
# number of tokens per excerpt.
excerpts_lengths = train_df.excerpt.map(lambda text: len(text.split())).tolist()

# Density plot of the word counts, followed by the largest count.
sns.kdeplot(excerpts_lengths);
print('Max len: ', max(excerpts_lengths))

In [None]:
# Fixed sequence length: used as the tokenizer's max_length for the training
# excerpts and as the length of each model input.
MAX_LENGTH = 256

In [None]:
# Min-max scale the target into the [0, 1] range and store it in its own column.
# NOTE(review): the training dataset is built from train_df.target, not from this
# column - confirm whether the normalised target is still needed.
TARGET_MIN = train_df['target'].min()
TARGET_MAX = train_df['target'].max()
target_span = TARGET_MAX - TARGET_MIN
train_df['target_normilized'] = (train_df['target'] - TARGET_MIN) / target_span

# Creating TF Dataset

In [None]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint; the same name is used to load both the
# tokenizer here and the transformer model later in the notebook.
# Alternative checkpoint kept for reference: 'bert-base-cased'
MODEL_NAME = 'albert-base-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Tokenizer settings: add special tokens, pad/truncate every excerpt to exactly
# MAX_LENGTH, and return attention masks and token type ids as TensorFlow tensors.
train_tokenizer_options = dict(
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='tf',
)

# Encode all training excerpts in a single tokenizer call.
train_texts_encoded = tokenizer(text=train_df.excerpt.tolist(), **train_tokenizer_options)
train_texts_encoded

In [None]:
# Encode the test excerpts with the same settings as the training excerpts.
# max_length uses MAX_LENGTH instead of a hard-coded 256 so that the test
# encoding can never drift away from the model's input length.
test_texts_encoded = tokenizer(
    text=list(test_df.excerpt.values),
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='tf',
)
test_texts_encoded

In [None]:
BATCH_SIZE = 1024
AUTOTUNE = tf.data.AUTOTUNE
# Seed for the training shuffle.
SHUFFLE_SEED = 42

# Pair every encoded training excerpt with its raw (un-normalised) target.
dataset = tf.data.Dataset.from_tensor_slices((dict(train_texts_encoded), train_df.target.values))

# The first 20% of the rows are held out for validation; the rest is for training.
val_size = int(len(train_df) * 0.2)
train_size = len(train_df) - val_size

val_dataset = dataset.take(val_size).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

# Shuffle the training rows so batches are not always formed in file order.
# The shuffle is applied AFTER skip(), so validation rows can never end up in
# the training split. The buffer covers the whole split for a full shuffle;
# max(..., 1) keeps the buffer size valid if the split is empty.
train_dataset = (
    dataset.skip(val_size)
    .shuffle(buffer_size=max(train_size, 1), seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Batched view of the encoded test excerpts (no targets).
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_texts_encoded)).batch(BATCH_SIZE)

In [None]:
# Fetch the first batch of the training dataset and display it as the cell output.
next(iter(train_dataset))

In [None]:
# Load the pretrained transformer for MODEL_NAME inside the TPU strategy scope.
# (The notebook previously kept `with nullcontext():` as a commented-out
# alternative - presumably for running without a TPU.)
strategy_scope = tpu_strategy.scope()
with strategy_scope:
    transformer = transformers.TFAutoModel.from_pretrained(MODEL_NAME)

In [None]:
def create_model(dropout_rate=0.1, hidden_units=200):
    """Build the regression model: the shared transformer plus a small dense head.

    The model has three int32 inputs of length MAX_LENGTH, named 'input_ids',
    'attention_mask' and 'token_type_ids'. They are passed to the module-level
    `transformer`; a slice of its first output is fed through dropout, a ReLU
    dense layer and a single linear output unit.

    Args:
        dropout_rate: Dropout rate applied to the transformer features
            (default 0.1, the value previously hard-coded).
        hidden_units: Width of the hidden dense layer
            (default 200, the value previously hard-coded).

    Returns:
        An uncompiled tf.keras.Model with one output per example.
    """
    input_1 = tf.keras.Input(shape=(MAX_LENGTH,), name='input_ids', dtype='int32')
    input_2 = tf.keras.Input(shape=(MAX_LENGTH,), name='attention_mask', dtype='int32')
    input_3 = tf.keras.Input(shape=(MAX_LENGTH,), name='token_type_ids', dtype='int32')

    # The inputs are passed positionally as one tuple, so their order matters.
    # NOTE(review): this assumes the transformer's call signature starts with
    # (input_ids, attention_mask, token_type_ids) - confirm when changing MODEL_NAME.
    transformer_outputs = transformer((input_1, input_2, input_3))
    # Take output 0 and keep only sequence position 0 for every example
    # (presumably the last hidden state of the first/[CLS] token - TODO confirm).
    x = transformer_outputs[0][:, 0, :]

    x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(hidden_units, activation='relu')(x)
    y = tf.keras.layers.Dense(1, activation='linear', name='output_layer')(x)
    model = tf.keras.Model(inputs=(input_1, input_2, input_3), outputs=y)
    return model

In [None]:
# Build and compile the model inside the TPU strategy scope.
with tpu_strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # This is a regression task trained with mean squared error. `loss` used to
    # hold an unused SparseCategoricalCrossentropy (a classification loss) while
    # compile() received 'mse'; it now holds the loss that is actually used.
    loss = tf.keras.losses.MeanSquaredError()
    model.compile(optimizer=optimizer, loss=loss)
    model.summary()

In [None]:
# Multiply the learning rate by 0.2 whenever the validation loss has not improved
# for 2 epochs, never going below 1e-8.
lr_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=1e-8,
    verbose=1
)

# train_dataset and val_dataset are already batched tf.data.Datasets, so
# `batch_size` must not be passed to fit(); the datasets define the batching.
history = model.fit(
    train_dataset,
    epochs=20,
    verbose=2,
    callbacks=[lr_reduction],
    validation_data=val_dataset
)

In [None]:
from matplotlib import pyplot as plt

# Plot training and validation loss per epoch. The labels, legend and title
# make the two curves distinguishable and let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training loss')
ax.plot(history.history['val_loss'], label='validation loss')
ax.set(title='Training history', xlabel='epoch', ylabel='MSE loss')
ax.legend();

In [None]:
# Predict a readability score for every test excerpt.
predictions = model.predict(dict(test_texts_encoded), verbose=1)

# The model's final layer has a single unit, so predictions has one column per
# row; flatten it to 1-D before storing it. Bracket assignment is used instead of
# attribute assignment so the column is written even if it did not already exist.
submission_df['target'] = predictions.reshape(-1)

# Write the submission file and show the result.
submission_df.to_csv('submission.csv', index=False)
submission_df