In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print at most 20 file paths per directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files[:20]:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re

In [None]:
def clean_text(txt):
    """Normalize a piece of text for tokenization.

    The input is coerced with ``str`` and lower-cased, then every run of
    characters outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by a single
    space. Leading/trailing spaces produced by that replacement are kept.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
# Load the training split of the CommonLit readability data and preview it.
df_train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlitreadabilityprize/train.csv'
)
df_train.head(n=5)

In [None]:
# Load the test split (used for the submission further down) and preview it.
df_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlitreadabilityprize/test.csv'
)
df_test.head(n=20)

In [None]:
# Model input is the raw excerpt text; the regression label is the 'target' column.
train, targets = df_train['excerpt'], df_train['target']

In [None]:
# Load the tokenizer from a local Kaggle dataset directory.
# NOTE(review): presumably a saved copy of 'distilbert-base-uncased'
# (the hub call would be DistilBertTokenizer.from_pretrained('distilbert-base-uncased')) - confirm.
tokenizer = DistilBertTokenizer.from_pretrained(
    '/kaggle/input/first-try/tokenizer/'
)

In [None]:
# Hold out 30% of the excerpts for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    targets,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Apply clean_text to every excerpt; each split becomes a plain list of strings.
X_train = list(map(clean_text, X_train))
X_test = list(map(clean_text, X_test))

In [None]:
# Tokenize both splits into fixed-length TensorFlow tensors.
#
# max_length is passed explicitly: with padding="max_length" and no max_length,
# the tokenizer falls back to its own configured maximum input length, which
# depends on the files in the loaded tokenizer directory. The model's Input
# layers in get_model are declared with shape (512,), so the padded width must
# be exactly 512.
#
# NOTE: this cell rebinds X_train / X_test from lists of strings to dicts of
# tensors, so it must be run exactly once after the cleaning cell.
train_encoded = tokenizer(
    X_train,
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors='tf',
)
test_encoded = tokenizer(
    X_test,
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors='tf',
)

# Keep only the two tensors the Keras model consumes, keyed by its Input names.
X_train = {"input_ids": train_encoded['input_ids'], "attention_mask": train_encoded['attention_mask']}
X_test = {"input_ids": test_encoded['input_ids'], "attention_mask": test_encoded['attention_mask']}

In [None]:
# Convert both label splits to float32 tensors for the MSE loss.
y_train, y_test = (
    tf.constant(labels, dtype=tf.float32) for labels in (y_train, y_test)
)

In [None]:
# Load the DistilBERT sequence-classification network from a local Kaggle dataset directory.
bert = TFDistilBertForSequenceClassification.from_pretrained(
    '/kaggle/input/first-try/distil_bert/'
)

In [None]:
def get_model(bert=None, max_len=512):
    """Build a Keras regression model on top of a DistilBERT network.

    Args:
        bert: An already-loaded ``TFDistilBertForSequenceClassification``
            instance. When ``None``, the ``'distilbert-base-uncased'``
            checkpoint is loaded with ``from_pretrained``.
        max_len: Length of the ``input_ids`` / ``attention_mask`` inputs.
            Defaults to 512, the value previously hard-coded here. It must
            equal the padded sequence length produced by the tokenizer.
            NOTE(review): values above the network's maximum position count
            are expected to fail - confirm against the checkpoint config.

    Returns:
        A ``tf.keras.Model`` that takes ``[input_ids, attention_mask]``
        (both int32 with shape ``(max_len,)``) and emits the output of a
        single-unit linear ``Dense`` layer with L2 kernel regularization.
    """
    if bert is None:
        bert = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Index 0 takes the first element of the network's output
    # (presumably the classification logits - see the transformers docs).
    x = bert(input_ids=input_ids, attention_mask=attention_mask)[0]

    # Single linear unit: the model predicts one real-valued score per example.
    out = tf.keras.layers.Dense(
        1,
        use_bias=True,
        activation='linear',
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
    )(x)
    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs=out)

In [None]:
# Wrap the loaded DistilBERT network in the Keras regression model and show its layers.
model = get_model(bert=bert)
model.summary()

In [None]:
# Training configuration for the regression model.
BATCH = 8
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.MeanSquaredError()

# Optimize MSE and report RMSE as the monitored metric.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Fine-tune for 4 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH,
    epochs=4,
    validation_data=(X_test, y_test),
)

In [None]:
# Persist the tokenizer and the DistilBERT network under /kaggle/working/.
# `bert` is the same object that get_model(bert) wired into `model`.
# Only `tokenizer` and `bert` are written here; the Dense layer added in
# get_model is not saved by this cell.
tokenizer.save_pretrained(save_directory='/kaggle/working/tokenizer/')
bert.save_pretrained(save_directory='/kaggle/working/distil_bert/')

Submission

In [None]:
# Clean and tokenize the test excerpts the same way as the training data.
# max_length is passed explicitly so the padded width is exactly 512, matching
# the (512,) Input layers declared in get_model; without it the tokenizer falls
# back to its own configured maximum input length.
test_texts = [clean_text(x) for x in df_test['excerpt']]
test_encoded = tokenizer(
    test_texts,
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors='tf',
)

# Keep only the two tensors the Keras model consumes, keyed by its Input names.
X_val = {"input_ids": test_encoded['input_ids'], "attention_mask": test_encoded['attention_mask']}

In [None]:
# Run the fine-tuned model over the tokenized test excerpts.
result = model.predict(x=X_val)

In [None]:
# Flatten the predictions to 1-D so the 'target' column is built from a plain
# vector instead of relying on pandas to accept a 2-D array in a column assignment.
predictions = np.ravel(result)
assert len(predictions) == len(df_test), "expected exactly one prediction per test row"

# Build the submission frame in a single step (no placeholder column to overwrite).
submission_df = pd.DataFrame({'id': df_test.id, 'target': predictions})

submission_file = 'submission.csv'
submission_df.to_csv(submission_file, index=False)

submission_df