# Competition Challenge

- In this competition, we are required to build models that rate the complexity of reading passages for students in grades 3 to 12

# Evaluation Metrics

- Root Mean Squared Error (RMSE)

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from glob import glob
import gc

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import plotly.express as px #Plotly Express

from plotly.offline import iplot
#to link plotly to pandas
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

# Global plotting defaults for every figure in the notebook.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so fall back to the new name when the old one
# is no longer recognised.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

import os
# Show what is available under the input directory.
print(os.listdir('../input/'))

import warnings
# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

from time import time, strftime, gmtime
# Start of the wall-clock timer; the last cell subtracts this from time() to report the run time.
start = time()
import datetime
# Timestamp of when the run started.
print(str(datetime.datetime.now()))

In [None]:
# Folder holding train.csv, test.csv and sample_submission.csv. File paths are built
# by plain string concatenation, so the trailing slash is required.
base_dir = '../input/commonlitreadabilityprize/'

In [None]:
# Read the training split, report its (rows, columns) and preview the first rows.
train_csv = base_dir + 'train.csv'
train = pd.read_csv(train_csv)
print(train.shape)
train.head(n = 5)

In [None]:
# Read the test split, report its (rows, columns) and preview the first rows.
test_csv = base_dir + 'test.csv'
test = pd.read_csv(test_csv)
print(test.shape)
test.head(n = 5)

In [None]:
# Read the sample submission, which is filled with predictions at the end of the notebook.
sub_csv = base_dir + 'sample_submission.csv'
sub = pd.read_csv(sub_csv)
print(sub.shape)
sub.head(n = 5)

In [None]:
# Count distinct ids in each split.
n_train_ids = train['id'].nunique()
n_test_ids = test['id'].nunique()
print(f"Number of unique id in trainset: {n_train_ids}")
print(f"Number of unique id in testset: {n_test_ids}")

Check for number of missing values

In [None]:
# Per-column NaN count for the training frame, plus the same figure as a percentage
# of the number of rows. The table is indexed by column name ('features').
null_counts = train.isna().sum()
missing = pd.DataFrame({
    'total_missing': null_counts,
    'percent': (null_counts / len(train)) * 100,
}).rename_axis('features')

missing['total_missing'].iplot(
    kind = 'bar',
    title = 'Missing Values Count in train',
    xTitle = 'Features',
    colors = 'blue',
    yTitle = 'Count',
)
missing.T

In [None]:
# Per-column NaN count for the test frame, plus the same figure as a percentage
# of the number of rows. The table is indexed by column name ('features').
null_counts = test.isna().sum()
missing = pd.DataFrame({
    'total_missing': null_counts,
    'percent': (null_counts / len(test)) * 100,
}).rename_axis('features')

missing['total_missing'].iplot(
    kind = 'bar',
    title = 'Missing Values Count in test',
    xTitle = 'Features',
    colors = 'red',
    yTitle = 'Count',
)
missing.T

- As stated in the competition description, URL and License are blank in the test set

<code>__Distribution of Target__</code>

In [None]:
# Kernel density estimate of the `target` column, drawn on an explicitly created Axes.
fig, ax = plt.subplots()
sns.kdeplot(train['target'], shade = True, color = 'green', ax = ax);

<code>__Distribution of Standard Error__</code>

- The standard error is not provided for the test set

In [None]:
# Kernel density estimate of the `standard_error` column, drawn on an explicitly created Axes.
fig, ax = plt.subplots()
sns.kdeplot(train['standard_error'], shade = True, color = 'grey', ax = ax);

In [None]:
# Display the first two training rows.
train.head(2)

In [None]:
# Add character and word counts for every excerpt in both splits.
# Words are split on any run of whitespace (str.split() with no argument):
# split(' ') would glue together words separated by a newline or tab and would
# count an empty "word" for every repeated space.
for frame in (train, test):
    excerpts = frame['excerpt'].astype(str)
    frame['excerpt_len'] = excerpts.str.len()
    frame['excerpt_wordlen'] = excerpts.str.split().str.len()

In [None]:
# Range of excerpt word counts per split. The second pair reports the test split
# (its labels previously said "train").
print(f"Max. word length in train - Excerpt: {train['excerpt_wordlen'].max()}")
print(f"Min. word length in train - Excerpt: {train['excerpt_wordlen'].min()}")
print()
print(f"Max. word length in test - Excerpt: {test['excerpt_wordlen'].max()}")
print(f"Min. word length in test - Excerpt: {test['excerpt_wordlen'].min()}")

- The maximum word count is used below to set the tokenizer's `max_len`

<code>__Distribution of Text Lengths__</code>

In [None]:
# Training-set excerpt sizes side by side: characters on the left, words on the right.
fig, (char_ax, word_ax) = plt.subplots(1, 2)

sns.distplot(train['excerpt_len'], bins = 50, ax = char_ax)
char_ax.set_title('Train Character Length')

sns.distplot(train['excerpt_wordlen'], bins = 50, ax = word_ax)
word_ax.set_title('Train Word Length');

In [None]:
# Test-set excerpt sizes side by side: characters on the left, words on the right.
fig, (char_ax, word_ax) = plt.subplots(1, 2)

sns.distplot(test['excerpt_len'], bins = 50, ax = char_ax)
char_ax.set_title('Test Character Length')

sns.distplot(test['excerpt_wordlen'], bins = 50, ax = word_ax)
word_ax.set_title('Test Word Length');

In [None]:
import itertools
import collections
from collections import Counter

from nltk.corpus import stopwords

import re
from wordcloud import WordCloud

def plot_wordcloud(data, col, text = None):
    """Draw a word cloud of the most frequent non-stop-words in a text column.

    Args:
        data: DataFrame (or mapping) whose ``col`` entry iterates over the texts.
        col: Name of the text column.
        text: Optional figure title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # A set gives constant-time membership tests (the NLTK value is a list).
    stop_words = set(stopwords.words('english'))

    # Split on any whitespace so newlines do not glue words together, and compare
    # in lower case so capitalised stop words ("The", "And") are filtered as well.
    # The counted word keeps its original casing.
    word_freq = Counter(
        word
        for passage in data[col]
        for word in str(passage).split()
        if word.lower() not in stop_words
    )

    wordcloud = WordCloud(width = 900,
                          height = 500,
                          max_words = 200,
                          max_font_size = 100,
                          relative_scaling = 0.5,
                          background_color = "rgba(255, 255, 255, 0)", 
                          mode = "RGBA",
                          normalize_plurals = True).generate_from_frequencies(word_freq)
    plt.figure(figsize = (18, 16))
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.title(text, fontsize = 20, color = 'grey', y = 1.05)
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud for the training excerpts.
plot_wordcloud(data = train, col = 'excerpt', text = 'WordCloud of Train - Excerpt')

In [None]:
# Word cloud for the test excerpts.
plot_wordcloud(data = test, col = 'excerpt', text = 'WordCloud of Test - Excerpt')

# Tokenize

In [None]:
# Import only the names this notebook uses. A wildcard import hides where names
# come from and can silently shadow names that are already defined.
from transformers import BertConfig, BertTokenizer, TFBertModel

In [None]:
# The vocabulary and weights loaded from `bert_url` are the bert-base-uncased files,
# so the model name is kept consistent with them.
model_name = 'bert-base-uncased'
bert_url = '../input/bert-base-uncased-huggingface-transformer/'

# Sequence length used both for encoding and for the model inputs, capped at 512,
# the default number of position embeddings in a BERT configuration.
# NOTE(review): this is a word count. WordPiece tokenization usually produces more
# tokens than words, so bert_encode truncates longer excerpts to max_len - 2 tokens.
max_len = min(int(train['excerpt_wordlen'].max()), 512)

In [None]:
# Build the WordPiece tokenizer from the vocabulary file stored next to the weights.
vocab_file = bert_url + 'bert-base-uncased-vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file)

In [None]:
#https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
def bert_encode(texts, tokenizer, max_len = max_len):
    """Encode texts into fixed-length BERT inputs.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Length of every encoded row.

    Returns:
        Tuple ``(token_ids, attention_masks, segment_ids)`` of arrays with one row
        per text; masks are 1 on real tokens and 0 on padding, segment ids are all 0.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw_text)[:content_budget], "[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# Encode the training excerpts and pull out the regression targets.
train_texts = train['excerpt'].to_numpy()
Xtrain = bert_encode(train_texts, tokenizer, max_len = max_len)
targets = train['target'].to_numpy()

print(Xtrain[0].shape, targets.shape)

In [None]:
# Encode the test excerpts with the same tokenizer and sequence length.
test_texts = test['excerpt'].to_numpy()
Xtest = bert_encode(test_texts, tokenizer, max_len = max_len)
print(Xtest[0].shape)

# __Train Model__

In [None]:
# Maximum number of passes over the training data, passed to model.fit().
epochs = 10

In [None]:
def build_model(max_len = max_len):
    """Build and compile a BERT-based regressor with a single linear output.

    Args:
        max_len: Length of each of the three integer inputs (token ids,
            attention mask, segment ids).

    Returns:
        A compiled ``tf.keras.Model`` trained with mean squared error.
    """
    input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_word_ids")
    input_mask = Input(shape = (max_len,), dtype = tf.int32, name = "input_mask")
    segment_ids = Input(shape = (max_len,), dtype = tf.int32, name = "segment_ids")
    
    # NOTE(review): a default BertConfig is paired with the bert-base-uncased
    # weights below; this assumes the defaults match that checkpoint - confirm.
    config = BertConfig()
    config.output_hidden_states = False
    
    bert_model = TFBertModel.from_pretrained(
        bert_url + 'bert-base-uncased-tf_model.h5', config = config)

    # Element 0 of the model output is used as the per-token sequence output.
    sequence_output = bert_model([input_word_ids, input_mask, segment_ids])[0]
    
    # NOTE(review): no mask is passed to the pooling layer, so the average runs
    # over all max_len positions, padding included.
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    
    out = tf.keras.layers.Dense(1, activation = 'linear')(x)
    
    model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = out)
    
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate = 1e-5), loss = tf.keras.losses.MeanSquaredError())
    
    return model

In [None]:
# Callbacks only take effect when they are included in the `callbacks` list
# passed to model.fit().

# Stop after 2 epochs without val_loss improvement and restore the best weights.
early = EarlyStopping(monitor = 'val_loss', min_delta = 0., patience = 2,
                   verbose = 1, mode = 'min', restore_best_weights = True)
# The keyword was misspelled `ave_weights_only`, so weights-only saving was never
# actually requested; the correct name is `save_weights_only`.
# NOTE(review): `monitor` only matters together with save_best_only = True, which is not set here.
check = ModelCheckpoint(filepath = 'commonlit_model.h5', monitor = 'val_loss', verbose = 1, 
                        save_weights_only = True)
# Halve the learning rate after 2 epochs without val_loss improvement.
reduce = ReduceLROnPlateau(monitor = 'val_loss', patience = 2, verbose = 1, factor = 0.5)

model = build_model(max_len = max_len)

model.summary()

In [None]:
# Fine-tune on the encoded excerpts, holding out 20% of the rows for validation.
# Only the learning-rate scheduler is used as a callback.
fit_options = dict(
    validation_split = 0.2,
    epochs = epochs,
    batch_size = 16,
    callbacks = [reduce],
)
history = model.fit(Xtrain, targets, **fit_options)

In [None]:
def display_training_curves(training, validation, title, subplot):
    """
    Plot the training and validation values of one metric against the epoch index.

    Source: https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu

    Args:
        training: Sequence of per-epoch training values.
        validation: Sequence of per-epoch validation values.
        title: Metric name, used for the plot title and the y-axis label.
        subplot: Matplotlib subplot specifier (e.g. 211) placing the Axes in the figure.
    """
    # Start from an empty figure. plt.subplots() would also create a full-size Axes,
    # which newer Matplotlib versions leave in place behind the requested subplot.
    fig = plt.figure(figsize = (10, 10), facecolor = '#F0F0F0')
    ax = fig.add_subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    ax.plot(training)
    ax.plot(validation)
    ax.set_title('model '+ title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])
    plt.show()

In [None]:
# List the metrics recorded during training.
history.history.keys()

In [None]:
# Training vs. validation loss per epoch.
display_training_curves(
    training = history.history['loss'],
    validation = history.history['val_loss'],
    title = 'loss',
    subplot = 211,
)

In [None]:
# Predict on the encoded test excerpts and preview the first ten predictions.
preds = model.predict(Xtest, verbose = 1)
preds[:10]

In [None]:
# The model has a single output unit, so the predictions are flattened to one value
# per row before being stored in the submission frame.
flat_preds = np.asarray(preds).reshape(-1)
sub['target'] = flat_preds
sub.to_csv('./submission.csv', index = False)

In [None]:
# Compare the training target distribution (left) with the predicted one (right).
fig, (train_ax, pred_ax) = plt.subplots(1, 2)

sns.kdeplot(train['target'], shade = True, color = 'green', ax = train_ax)
train_ax.set_title('Train Target')

sns.kdeplot(sub['target'], shade = True, color = 'blue', ax = pred_ax)
pred_ax.set_title('Predicted Target');

In [None]:
# Total wall-clock time since the timer was started in the first cell, as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))