In [None]:
import numpy as np
import pandas as pd
import os
import sys
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold, KFold
import logging
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string,unicodedata
from kaggle_datasets import KaggleDatasets
import matplotlib.pyplot as plt

# Notebook-wide settings: seaborn's "darkgrid" theme for every figure and
# INFO-level messages from the standard logging module.
sns.set_style(style="darkgrid")
logging.basicConfig(level=logging.INFO)

**Note:** we will need the BERT `tokenization` module (it provides the `FullTokenizer` class used below).

# Step_1: load packages and data

In [None]:
# Read the competition's labelled training excerpts and the unlabelled test
# excerpts; both CSVs are parsed the same way, in this order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

# Show the first rows of the training frame as the cell output.
train.head()

In [None]:
# Debug switch: set to True to run the rest of the notebook on a small random
# subset of the training data for a quick end-to-end check.
debug = False
if debug:
    # A fixed seed keeps debug runs repeatable, and capping the sample size at
    # the frame length avoids a ValueError when fewer than 1000 rows exist.
    train = train.sample(min(1000, len(train)), random_state=42)

# Step_2: exploratory data analysis

**Wordcloud for HIGH readability text**

In [None]:
# Word cloud built from every excerpt whose target is above zero (the group this
# notebook labels as high readability), with the default English stop words removed.
high_readability_text = " ".join(train[train.target > 0].excerpt)
wc = WordCloud(max_words=1000, width=1600, height=800, stopwords=STOPWORDS).generate(high_readability_text)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation='bilinear')
# The image has no meaningful axes; hiding them also stops the "darkgrid" theme
# from drawing grid lines across the picture.
plt.axis('off')
# Render the figure explicitly so the cell does not end on an AxesImage repr.
plt.show()

**Wordcloud for LOW readability text**

In [None]:
# Word cloud built from every excerpt whose target is below zero (the group this
# notebook labels as low readability), with the default English stop words removed.
low_readability_text = " ".join(train[train.target < 0].excerpt)
wc = WordCloud(max_words=1000, width=1600, height=800, stopwords=STOPWORDS).generate(low_readability_text)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation='bilinear')
# The image has no meaningful axes; hiding them also stops the "darkgrid" theme
# from drawing grid lines across the picture.
plt.axis('off')
# Render the figure explicitly so the cell does not end on an AxesImage repr.
plt.show()

**Number of characters in texts**

In [None]:
# Side-by-side histograms of excerpt length in characters: negative-target
# excerpts on the left, positive-target excerpts on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

char_panels = (
    (ax1, train.target < 0, 'red', 'Low Readability'),
    (ax2, train.target > 0, 'green', 'High Readability'),
)
for axis, selector, colour, heading in char_panels:
    axis.hist(train[selector]['excerpt'].str.len(), color=colour)
    axis.set_title(heading)

fig.suptitle('Characters in texts')
plt.show()

**Number of words in each text**

In [None]:
# Side-by-side histograms of the number of whitespace-separated words per
# excerpt: negative-target excerpts on the left, positive-target on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

word_panels = (
    (ax1, train.target < 0, 'red', 'Low Readability'),
    (ax2, train.target > 0, 'green', 'High Readability'),
)
for axis, selector, colour, heading in word_panels:
    words_per_text = train[selector]['excerpt'].str.split().map(len)
    axis.hist(words_per_text, color=colour)
    axis.set_title(heading)

fig.suptitle('Words in texts')
plt.show()

**Average word length in a text**

In [None]:
# Distribution of the mean word length (in characters) per excerpt:
# negative-target excerpts on the left, positive-target excerpts on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

panels = (
    (ax1, train.target < 0, 'red', 'Low Readability'),
    (ax2, train.target > 0, 'green', 'High Readability'),
)
for ax, mask, color, title in panels:
    # Split each excerpt on whitespace and average the lengths of its words.
    mean_word_len = (
        train.loc[mask, 'excerpt']
        .str.split()
        .map(lambda words: np.mean([len(w) for w in words]))
    )
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement seaborn documents for it.
    sns.histplot(mean_word_len, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax, color=color)
    ax.set_title(title)

fig.suptitle('Average word length in each text')
# Render explicitly so the cell does not end on a Text repr.
plt.show()

# Step_2: build bert_layer and model

In [None]:
%%time
# The `tokenization` module lives under ../input/tokenization, so that folder
# must be on sys.path before the import below can succeed (which is why this
# import sits here rather than in the first cell). The membership check keeps
# sys.path free of duplicates when the cell is re-run.
tokenization_dir = '../input/tokenization'
if tokenization_dir not in sys.path:
    sys.path.append(tokenization_dir)
import tokenization

# Local copy of the BERT hub module. `hub` is already imported in the first
# cell, so it is not imported again here.
module_url = '../input/bert-en-uncased-l12-h768-a122'
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Alias the TF2 file API under the old `tf.gfile` name.
# NOTE(review): presumably needed because the `tokenization` module still
# refers to `tf.gfile` — confirm against that module.
tf.gfile = tf.io.gfile

# The loaded hub module exposes its vocabulary file path and its lower-casing
# flag; both are read back as plain values and handed to the WordPiece tokenizer.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into the three fixed-width integer arrays BERT consumes.

    Each text is tokenised, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returning a list of ids).
        max_len: width of every encoded row, special tokens included.

    Returns:
        A tuple ``(token_ids, attention_mask, segment_ids)`` of NumPy arrays,
        one row per input text. The mask is 1 over real tokens and 0 over
        padding; the segment ids are all zeros.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:content_budget]
        wrapped = ["[CLS]"] + pieces + ["[SEP]"]

        used = len(wrapped)
        padding = max_len - used

        id_rows.append(tokenizer.convert_tokens_to_ids(wrapped) + [0] * padding)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

def build_model(bert_layer, max_len=512, learning_rate=1e-5):
    """Build and compile a BERT regression model with a small dense head.

    Args:
        bert_layer: callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a
            ``(pooled_output, sequence_output)`` pair.
        max_len: sequence length of the three integer inputs; must match the
            ``max_len`` used when encoding the texts.
        learning_rate: step size for the Adam optimizer. Defaults to 1e-5,
            the value that was previously hard-coded.

    Returns:
        A compiled ``tf.keras`` model with a single linear output, trained
        with mean squared error.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of the sequence is the representation fed to the head.
    clf_output = sequence_output[:, 0, :]

    # Regression head: two ReLU layers with dropout, then one linear unit.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(16, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(1, activation='linear')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
    )

    return model

# Step_3: Run model and export predictions

In [None]:
# Settings that are the same for every fold, defined once before the loop.
N_SPLITS = 5
max_len = 300
BATCH_SIZE = 16

# The test excerpts do not change between folds, so they are tokenised once
# instead of once per fold.
test_input = bert_encode(test.excerpt.values, tokenizer, max_len=max_len)

# Options for reloading the hub module inside each fold; the I/O device is
# pinned to the local host.
# NOTE(review): typically required when the accelerator cannot read local
# files directly (e.g. TPU) — confirm.
load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

preds = None
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
for fold, (train_index, val_index) in enumerate(kf.split(train["excerpt"], train['target'])):
    print("Training Fold {}".format(fold))

    x_train, x_val = train.excerpt.values[train_index], train.excerpt.values[val_index]
    y_train, y_val = train.target.values[train_index], train.target.values[val_index]

    train_input = bert_encode(x_train, tokenizer, max_len=max_len)
    val_input = bert_encode(x_val, tokenizer, max_len=max_len)

    name = "model_fold_{}".format(fold) + ".h5"

    # Keep only the epoch with the lowest validation loss on disk, and halve
    # the learning rate after two epochs without improvement.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(name, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, mode='min', verbose=1)

    # Reload the BERT module for every fold so that each fold starts from the
    # original weights rather than the previous fold's fine-tuned ones.
    bert_layer = hub.KerasLayer(module_url, trainable=True, load_options=load_locally)
    model = build_model(bert_layer, max_len=max_len)

    train_history = model.fit(
        train_input, y_train,
        epochs=15,
        callbacks=[checkpoint, reduce_lr],
        batch_size=BATCH_SIZE,
        validation_data=(val_input, y_val),
        verbose=1)

    # Restore the best checkpoint before predicting, not the last epoch.
    model.load_weights(name)

    fold_preds = model.predict(test_input)
    preds = fold_preds if preds is None else preds + fold_preds

# Average over the number of folds actually run, not a hard-coded 5.
preds = preds / N_SPLITS

In [None]:
# Display the first five fold-averaged test predictions as a quick sanity check.
preds[:5]

In [None]:
%%time
# Start from the competition's submission template so the id column and row
# order are exactly what the scorer expects.
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# The model ends in a single output unit, so the predictions come back with a
# trailing axis of length 1; flatten them before filling the column.
flat_preds = np.ravel(preds)
assert len(flat_preds) == len(sub), "prediction count does not match the submission template"

sub['target'] = flat_preds
sub.to_csv('submission.csv', index=False)