In [None]:
# Loading required packages
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import collections
import re

from itertools import chain
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seed every random number generator the notebook relies on, so that a fresh
# "Restart & Run All" is repeatable as far as the backend allows. Seeding NumPy
# alone leaves TensorFlow's generator (used when the model is built and
# trained) unseeded.
np.random.seed(0)
tf.random.set_seed(0)

In [None]:
# Read the training split from the competition input folder and preview it.
train_csv_path = "../input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Read the test split from the competition input folder and preview it.
test_csv_path = "../input/commonlitreadabilityprize/test.csv"
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Overview of the training frame: column dtypes, non-null counts, memory use.
df.info()

In [None]:
# Summary statistics of the `target` column (the value the model regresses on).
df["target"].describe()

In [None]:
# Summary statistics of the `standard_error` column.
df["standard_error"].describe()

In [None]:
# Train/validation split: hold out 20% of the excerpts for validation.
# The seed is passed explicitly so the split is identical every time this cell
# runs, instead of depending on how far the global NumPy generator has advanced
# since it was seeded.
excerpt_train, excerpt_val, y_train, y_val = train_test_split(
    df.excerpt, df.target, test_size=0.20, random_state=0)

In [None]:
# Convert the excerpt Series of each split into a NumPy array of texts.
train_data, val_data, test_data = (
    excerpts.to_numpy()
    for excerpts in (excerpt_train, excerpt_val, test.excerpt)
)

# Tokenizer and vocabulary helpers.
# A token is a maximal run of "word" characters (letters, digits, underscore).
TOKEN_RE = re.compile(r'[\w\d]+')

def tokenize_text_simple_regex(txt, min_token_size=4):
    """Lower-case `txt` and split it into regex tokens, dropping short ones.

    Args:
        txt: the text to tokenize.
        min_token_size: minimum number of characters a token must have to be
            kept.

    Returns:
        List of lower-cased tokens, in order of appearance, whose length is at
        least `min_token_size`.
    """
    kept_tokens = []
    for candidate in TOKEN_RE.findall(txt.lower()):
        if len(candidate) < min_token_size:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Apply `tokenizer` to every text and collect the results.

    Args:
        texts: iterable of texts.
        tokenizer: callable invoked as `tokenizer(text, **tokenizer_kwargs)`.
        **tokenizer_kwargs: extra keyword arguments forwarded to `tokenizer`.

    Returns:
        List with one tokenizer result per input text, in input order.
    """
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized

def add_fake_token(word2id, token='<PAD>'):
    """Return a copy of `word2id` with all ids shifted by one and `token` at id 0.

    Args:
        word2id: mapping from word to integer id.
        token: the extra token that receives id 0 in the result.

    Returns:
        New dict; the input mapping is not modified.
    """
    shifted = {}
    for word, idx in word2id.items():
        shifted[word] = idx + 1
    # Assigned last, so it also overrides an existing entry with the same key.
    shifted[token] = 0
    return shifted

def texts_to_token_ids(tokenized_texts, word2id):
    """Replace tokens by their ids, skipping tokens missing from `word2id`.

    Args:
        tokenized_texts: iterable of token sequences.
        word2id: mapping from token to id.

    Returns:
        List with one list of ids per input text; token order is preserved and
        out-of-vocabulary tokens are dropped.
    """
    encoded_texts = []
    for text in tokenized_texts:
        ids = []
        for token in text:
            if token in word2id:
                ids.append(word2id[token])
        encoded_texts.append(ids)
    return encoded_texts


def build_vocabulary(tokenized_texts, max_size=10000, max_doc_freq=0.8,
                     min_count=5, pad_word=None):
    """Build a document-frequency based vocabulary from tokenized texts.

    Args:
        tokenized_texts: iterable of token sequences, one per document.
        max_size: maximum number of vocabulary entries, the padding entry
            included.
        max_doc_freq: words appearing in more than this fraction of the
            documents are discarded.
        min_count: words appearing in fewer than this many documents are
            discarded.
        pad_word: if not None, this token is placed at index 0 with a
            frequency of 0.

    Returns:
        Tuple `(word2id, word2freq)`: a dict mapping each kept word to its
        index, and a float32 array holding, at the same index, the fraction of
        documents containing that word.
    """
    doc_counts = collections.Counter()
    doc_n = 0

    # count the number of documents in which each word is used
    # as well as the total number of documents
    for txt in tokenized_texts:
        doc_n += 1
        doc_counts.update(set(txt))

    # remove too rare and too frequent words
    kept_counts = [(word, cnt) for word, cnt in doc_counts.items()
                   if cnt >= min_count and cnt / doc_n <= max_doc_freq]

    # sort words by descending frequency; ties are broken by the word itself so
    # that ids do not depend on set iteration order (which varies with the
    # interpreter's hash seed)
    sorted_word_counts = sorted(kept_counts,
                                key=lambda pair: (-pair[1], pair[0]))

    # add a nonexistent word with index 0 for batch processing convenience
    if pad_word is not None:
        sorted_word_counts = [(pad_word, 0)] + sorted_word_counts

    # if we still have too many entries, keep only the max_size most frequent
    # ones; the check is done on the final list so the padding entry counts
    # towards the limit
    if len(sorted_word_counts) > max_size:
        sorted_word_counts = sorted_word_counts[:max_size]

    # we number the words
    word2id = {word: i for i, (word, _) in enumerate(sorted_word_counts)}

    # normalize the frequency of words
    word2freq = np.array([cnt / doc_n for _, cnt in sorted_word_counts], dtype='float32')

    return word2id, word2freq

In [None]:
# Tokenize the excerpts of every split with the default tokenizer settings.
train_tokenized, val_tokenized, test_tokenized = (
    tokenize_corpus(split_texts)
    for split_texts in (train_data, val_data, test_data)
)

# Sanity check: tokens kept for the first training excerpt.
first_train_tokens = train_tokenized[0]
print(' '.join(first_train_tokens))

In [None]:
# Build the vocabulary from the training split only; '<PAD>' is requested as
# the padding word.
vocab_settings = {'max_doc_freq': 0.8, 'min_count': 5, 'pad_word': '<PAD>'}
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, **vocab_settings)

UNIQUE_WORDS_N = len(vocabulary)
print('Number of unique tokens', UNIQUE_WORDS_N)

# Peek at the first ten (word, id) pairs.
first_vocab_entries = list(vocabulary.items())[:10]
print(first_vocab_entries)

In [None]:
# Histogram of the per-word document frequencies, with a log-scaled count axis.
fig, ax = plt.subplots()
ax.hist(word_doc_freq, bins=20)
ax.set_title('Distribution of relative word frequencies')
ax.set_yscale('log');

In [None]:
# Replace tokens by their vocabulary ids (out-of-vocabulary tokens are dropped).
train_token_ids, val_token_ids, test_token_ids = (
    texts_to_token_ids(split_tokens, vocabulary)
    for split_tokens in (train_tokenized, val_tokenized, test_tokenized)
)

# Preview the encoded form of the first ten training excerpts, one per line.
preview_lines = [' '.join(map(str, sent)) for sent in train_token_ids[:10]]
print('\n'.join(preview_lines))

In [None]:
# Distribution of the number of in-vocabulary tokens per training excerpt.
fig, ax = plt.subplots()
train_lengths = list(map(len, train_token_ids))
ax.hist(train_lengths, bins=20)
ax.set_title('Histogram of article lengths');

In [None]:
MAX_SEQ_LEN = 256 # Final sequence length


def _pad_to_max_len(token_ids):
    """Bring every id sequence to MAX_SEQ_LEN entries, padding at the end.

    Padding uses the id of the '<PAD>' vocabulary entry; how over-long
    sequences are cut is left at the Keras default.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        token_ids,
        value=vocabulary["<PAD>"],
        padding='post',
        maxlen=MAX_SEQ_LEN)


# NOTE: from here on train_data / val_data / test_data hold padded id matrices
# and no longer the raw excerpt arrays assigned to these names earlier.
train_data = _pad_to_max_len(train_token_ids)
val_data = _pad_to_max_len(val_token_ids)
test_data = _pad_to_max_len(test_token_ids)

print("Length examples: {}".format([len(train_data[0]), len(train_data[1])]))
print('=====================================')
print("Entry example: {}".format(train_data[0]))

In [None]:
# Create the model: token embeddings averaged over the sequence, followed by a
# stack of fully connected ReLU layers and a single linear output unit.
EMB_SIZE = 32 # The size of the vector representation (embedding)

model = tf.keras.Sequential([
    # Use the EMB_SIZE constant so changing it above actually changes the model.
    tf.keras.layers.Embedding(UNIQUE_WORDS_N, EMB_SIZE),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

model.summary()

In [None]:
# Train the model with Adam on the mean absolute error, tracking the loss on
# the validation split after every epoch.
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='mae'
              )

history = model.fit(
    train_data, y_train,
    validation_data=(val_data, y_val),
    batch_size=128,
    epochs=100)

In [None]:
# Score the trained model on the held-out validation split.
readable_preds = model.predict(val_data)
val_mae = mean_absolute_error(y_true=y_val, y_pred=readable_preds)
print("Validation MAE for Deep learning Model: {}".format(val_mae))

In [None]:
# Predict on the test split and flatten the (n, 1) model output into a plain
# list with one float per excerpt.
test_preds = model.predict(test_data).ravel().tolist()

In [None]:
# Pair every test excerpt id with its predicted target.
submission_df = pd.DataFrame({
    'id': test['id'].tolist(),
    'target': test_preds,
})

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
submission_df.to_csv('submission.csv', index = False)