In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold,train_test_split 

import tensorflow as tf
# Log the TensorFlow build in use so the run can be reproduced later.
print('Tensorflow Version', tf.__version__)
import tensorflow_hub as hub

from tqdm.notebook import tqdm

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Read the competition CSVs: the test excerpts and the full training set.
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
train_full_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')

In [None]:
# (rows, columns) of the full training set read from train.csv.
train_full_df.shape

In [None]:
# Keep 80% of the rows for training and hold out the rest for validation;
# the fixed random_state makes the split repeatable.
train_df, valid_df = train_test_split(
    train_full_df,
    train_size=0.8,
    random_state=42,
)
train_df.shape, valid_df.shape

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Whitespace-delimited word count of every excerpt in the train split and test set.
train_sent_len = [len(i.split()) for i in train_df['excerpt']]
test_sent_len = [len(i.split()) for i in test_df['excerpt']]

# Bin edges run to max + 1 (hence `max + 2` in range) so every integer count
# gets its own unit-width bin; stopping the edges at max would merge the counts
# max-1 and max into the last, closed bin.
# density=True normalises each histogram so the y axis really is a proportion,
# which keeps the two sets comparable even though their sizes differ.
plt.hist(train_sent_len, bins=range(min(train_sent_len), max(train_sent_len) + 2, 1),
         alpha=0.4, color="red", density=True)

plt.hist(test_sent_len, bins=range(min(test_sent_len), max(test_sent_len) + 2, 1),
         alpha=0.4, color="blue", density=True)

# Legend entries follow the order in which the histograms were drawn.
labels = ['Train','Test']
plt.legend(labels)
plt.xlabel("length of sentence")
plt.ylabel("proportion")
plt.title("comparing number of words per sentence distribution in Train and Test")
plt.show()

In [None]:
# Histogram of the regression target on the training split.
train_df['target'].plot.hist(title='Target distribution');

In [None]:
# (max, min) of the target over the full training set; compare these with the
# target_min/target_max defaults of mapping_to_target_range used by the model.
train_full_df['target'].max(),train_full_df['target'].min()

In [None]:
def train_and_evaluate_model(module_url, embed_size, name, trainable=False):
  """Build a dense regression head on a TF-Hub text encoder, train it, and return it.

  Training uses the notebook-level ``train_df`` and validation uses
  ``valid_df``; both must already be defined and expose ``excerpt`` and
  ``target`` columns.

  Args:
    module_url: Path or URL of the TF-Hub module handed to ``hub.KerasLayer``.
    embed_size: Embedding width, passed to ``hub.KerasLayer`` as ``output_shape``.
    name: Label for the encoder. Currently unused; kept so existing calls
      keep working.
    trainable: Whether the hub layer's weights are updated during training.

  Returns:
    A ``(model, history)`` tuple: the fitted Keras model (holding the weights
    of the epoch with the lowest ``val_loss``) and the object returned by
    ``model.fit``.
  """
  hub_layer = hub.KerasLayer(module_url, input_shape=[], output_shape=[embed_size,], dtype=tf.string, trainable=trainable)

  # A tanh-bounded output can never leave [target_min, target_max]. Start from
  # the original (-3, 2) range and widen it to the enclosing whole numbers only
  # when the training targets fall outside it, so every target stays reachable.
  range_min = min(-3.0, float(np.floor(train_df['target'].min())))
  range_max = max(2.0, float(np.ceil(train_df['target'].max())))

  def mapping_to_target_range(x, target_min=range_min, target_max=range_max):
    """Squash ``x`` with tanh and rescale it into (target_min, target_max)."""
    x02 = tf.keras.backend.tanh(x) + 1  # tanh is in (-1, 1), so x02 is in (0, 2)
    scale = (target_max - target_min) / 2.
    return x02 * scale + target_min

  # NOTE(review): the 64- and 32-unit hidden layers also use the range-mapping
  # activation. That looks unintended (it is normally only meaningful on the
  # output layer) but is kept as-is here -- confirm before changing.
  model = tf.keras.models.Sequential([
      hub_layer,
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dropout(0.3),
      tf.keras.layers.Dense(64, activation=mapping_to_target_range),
      tf.keras.layers.Dropout(0.4),
      tf.keras.layers.Dense(32, activation=mapping_to_target_range),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(1, activation=mapping_to_target_range),
  ])
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='mean_squared_error',
                metrics=[tf.keras.metrics.RootMeanSquaredError()])
  model.summary()

  # restore_best_weights=True rolls the model back to the epoch with the lowest
  # val_loss; without it the returned model keeps the weights of the last epoch
  # run, which can be up to `patience` epochs past the best one.
  early_stopping = tf.keras.callbacks.EarlyStopping(
      monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

  history = model.fit(train_df['excerpt'], train_df['target'],
                      epochs=20,
                      batch_size=64,
                      validation_data=(valid_df['excerpt'], valid_df['target']),
                      callbacks=[early_stopping],
                      verbose=1)
  return model, history

In [None]:
# Directory (relative to the working directory) holding the TF-Hub module to load.
module_url = '../input/universalsentenceencoderlarge5/'

# Train the regression head on 512-wide embeddings from that module.
model, history = train_and_evaluate_model(
    module_url=module_url,
    embed_size=512,
    name='universal-sentence-encoder-large',
)

In [None]:
# Learning curves: RMSE on the left, MSE loss on the right, one point per epoch.
fig, axes = plt.subplots(1, 2, figsize=(15,5))

# (history key, y-axis label) for each panel; the matching validation series
# is stored by Keras under the same key with a 'val_' prefix.
panels = (
    ('root_mean_squared_error', 'Root Mean Square Error'),
    ('loss', 'Loss'),
)
for ax, (metric, ylabel) in zip(axes, panels):
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    # The second curve comes from the held-out validation split, not from the
    # competition test set, so it is labelled 'Validation'.
    ax.legend(['Training', 'Validation'])
    ax.grid(True)

Prediction

In [None]:
# Run the trained model over every test excerpt.
preds = model.predict(test_df.excerpt)

In [None]:
# preds

In [None]:
# Reduce the model output to one value per excerpt by keeping the last column.
# The ndim guard makes this cell safe to re-run: once preds is already 1-D,
# an unconditional `preds[:, -1]` would raise IndexError.
preds = np.asarray(preds)
if preds.ndim > 1:
    preds = preds[:, -1]

Submission

In [None]:
# Pair each test id with its predicted target and write the submission file.
submission = pd.DataFrame({'id': test_df['id'], 'target': preds})
submission.to_csv('submission.csv', index=False)

* Version 1: loss: 0.1164 - root_mean_squared_error: 0.3409 - val_loss: 0.3881 - val_root_mean_squared_error: 0.6230
* Version 2: loss: 0.2204 - root_mean_squared_error: 0.4692 - val_loss: 0.3971 - val_root_mean_squared_error: 0.6302

