In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the read-only input directory, as the comment above
# describes, so the dataset paths read later in the notebook can be checked.
# (The loop previously walked '/kaggle/working', the output directory.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from tensorflow import keras


# Hyperparameters shared by the text vectorizer and the embedding layer.
max_features = 30_000    # vocabulary cap (max_tokens) and base for the embedding input size
embedding_dim = 256      # width of each embedding vector
sequence_length = 220    # fixed token length produced by the vectorizer

In [None]:
# Layer that maps raw strings to fixed-length sequences of integer token ids.
# Text is lower-cased and stripped of punctuation, and `ngrams=2` is passed to
# the Keras n-gram option. The vocabulary is fitted later by calling `adapt`.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize="lower_and_strip_punctuation",
    ngrams=2,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [None]:
# Column names used when reading the competition CSV files.
id_col, target_col, text_col = 'id', 'target', 'excerpt'

# Run settings. NOTE(review): apart from the column names above, none of these
# values is referenced by the cells visible in this notebook.
max_len = 220
n_fold, n_est, n_stop = 5, 9, 2
batch_size = 8
seed = 42

In [None]:
# Read both competition splits, using the id column as the index.
train_path = '/kaggle/input/commonlitreadabilityprize/train.csv'
test_path = '/kaggle/input/commonlitreadabilityprize/test.csv'
trn = pd.read_csv(train_path, index_col=id_col)
tst = pd.read_csv(test_path, index_col=id_col)

# Regression targets as a NumPy array, followed by a quick shape check.
y = trn[target_col].to_numpy()
print(trn.shape, y.shape, tst.shape)

In [None]:
# Stack train and test rows so the vocabulary can be fitted on all excerpts.
df_vocab = pd.concat((trn, tst), axis=0)

In [None]:
# (excerpt, target) pairs for the training split as a tf.data pipeline.
# NOTE(review): not consumed by any cell visible in this notebook.
dataset_train = tf.data.Dataset.from_tensor_slices(
    tensors=(trn['excerpt'], trn['target'])
)

In [None]:
# Every excerpt (train + test) as a dataset of strings for fitting the vocabulary.
dataset_vocab = tf.data.Dataset.from_tensor_slices(df_vocab['excerpt'])

In [None]:
# Fit the vectorizer's vocabulary on the combined train + test excerpts.
vectorize_layer.adapt(data=dataset_vocab)

In [None]:
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and training shuffles draw from a fixed seed.
# `seed` comes from the config cell and was previously never applied.
np.random.seed(seed)
tf.random.set_seed(seed)

# One raw string per example; the vectorizer turns it into integer token ids.
inputs = tf.keras.Input(shape=(1,), dtype=tf.string, name='Input')
x = vectorize_layer(inputs)
x = layers.Embedding(max_features + 1, embedding_dim)(x)
x = layers.Dropout(0.5)(x)

# Three strided Conv1D layers followed by global max pooling.
x = layers.Conv1D(256, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(256, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(256, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# Dense head with L2-regularised kernels, then dropout before the output.
x = layers.Dense(256, activation="relu", kernel_regularizer='l2')(x)
x = layers.Dense(256, activation="relu", kernel_regularizer='l2')(x)
x = layers.Dense(128, activation="relu", kernel_regularizer='l2')(x)
x = layers.Dropout(0.5)(x)

# Single linear unit: the regression output, named 'target' so the loss and
# metric dictionaries in `compile` can refer to it.
target_layer = layers.Dense(1, name='target')(x)

In [None]:
# Wrap the graph from the string input to the single regression output.
model = keras.Model(inputs=inputs, outputs=[target_layer])

In [None]:
# Adam at a 1e-3 learning rate; mean squared error is both the training loss
# and the reported metric for the 'target' output.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss={"target": keras.losses.MeanSquaredError()},
    metrics={'target': 'mse'},
)

In [None]:
 # Train on the raw excerpts with 20% of the rows held out for validation
 # (validation_split=0.2); batch size is left at the Keras default.
 # Training stops once val_loss has not improved for 30 epochs.
 # restore_best_weights=True rolls the model back to the epoch with the lowest
 # val_loss; without it, prediction would use the weights from 30 epochs past
 # the best point.
 model.fit(
     {"Input": trn['excerpt']},
     {"target": trn['target']},
     callbacks=[
         tf.keras.callbacks.EarlyStopping(
             monitor='val_loss',
             patience=30,
             restore_best_weights=True,
         )
     ],
     epochs=2000,
     validation_split=0.2,
 )

In [None]:
# Score the test excerpts with the trained model.
predictions = model.predict(x=tst['excerpt'])

In [None]:
# The model ends in a single-unit Dense layer, so predict() yields one value
# per row in a 2-D array. Flatten it so the new column receives a plain 1-D
# array instead of relying on pandas to accept a single-column 2-D value.
tst['target'] = np.ravel(predictions)

In [None]:
# Strip the columns that do not belong in the submission file.
# Rebinding (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
tst = tst.drop(columns=['url_legal', 'license', 'excerpt'], errors='ignore')

In [None]:
# Write the submission file; the id index is emitted as the first column.
tst.to_csv(path_or_buf='submission.csv', index=True)