# CommonLit Readability

## Imports

In [None]:
import nltk
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from nltk.stem.snowball import SnowballStemmer

## Read Dataset

In [None]:
# Load the competition files from the Kaggle input directory.
df_train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df_test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# Submission template; the submission cells re-read this file on their own.
df_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Report (rows, columns) for both splits.
print(f'Training shape : {df_train.shape}')
print(f'Testing shape : {df_test.shape}')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Print column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
df_train.describe()

## Data Processing

### Check null data

In [None]:
# Number of missing values in each training column.
df_train.isnull().sum()

### Function to lowercase the text and remove punctuation and line breaks

In [None]:
def transform(sentence):
    """Normalize raw text for tokenization.

    The text is lowercased, each newline becomes a single space, and every
    character in ``string.punctuation`` is deleted (not replaced by a space,
    so ``well-known`` becomes ``wellknown``).

    Args:
        sentence: The string to clean.

    Returns:
        The cleaned string.
    """
    # Mapping that deletes every ASCII punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return sentence.lower().replace('\n', ' ').translate(drop_punctuation)

In [None]:
# Clean every training excerpt; this overwrites the raw text column.
df_train['excerpt'] = df_train['excerpt'].apply(transform)
df_train.head()

In [None]:
# Show the excerpt at index label 0 after cleaning.
df_train['excerpt'][0]

### Function to reduce each word to its stem (root form)

In [None]:
def stemWord(text):
    """Replace every word of *text* with its English Snowball stem.

    Args:
        text: The string to stem; it is split on whitespace.

    Returns:
        The stemmed tokens joined by single spaces, with no leading or
        trailing padding. Empty or whitespace-only input yields ``''``.
    """
    stemmer = SnowballStemmer(language='english')
    # Join once instead of growing a string inside the loop; this also drops
    # the stray leading spaces the accumulator-based version produced.
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Stem every (already cleaned) training excerpt; overwrites the column again.
df_train['excerpt'] = df_train['excerpt'].apply(stemWord)

In [None]:
# Show the excerpt at index label 0 after stemming.
df_train['excerpt'][0]

## Data split

In [None]:
# Features are the preprocessed excerpts; labels are the 'target' column.
# Copies are taken so later changes to X / y do not write back into df_train.
X = df_train['excerpt'].copy()
y = df_train['target'].copy()

### Functions to plot loss and rmse history

In [None]:
def plot_loss(history):
    """Draw the training and validation loss curves on the current figure.

    Args:
        history: Object whose ``history`` attribute maps ``'loss'`` and
            ``'val_loss'`` to per-epoch values (this notebook passes the
            value returned by ``model.fit``).
    """
    # Each curve is labelled with the key it was read from.
    for series_name in ('loss', 'val_loss'):
        plt.plot(history.history[series_name], label=series_name)
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)


def plot_rmse(history):
    """Draw the training and validation RMSE curves on the current figure.

    Args:
        history: Object whose ``history`` attribute maps
            ``'root_mean_squared_error'`` and
            ``'val_root_mean_squared_error'`` to per-epoch values (this
            notebook passes the value returned by ``model.fit``).
    """
    # Each curve is labelled with the key it was read from.
    for series_name in ('root_mean_squared_error',
                        'val_root_mean_squared_error'):
        plt.plot(history.history[series_name], label=series_name)
    plt.xlabel('Epoch')
    plt.ylabel('root mean squared error')
    plt.legend()
    plt.grid(True)

## Train Model

### Define parameters

In [None]:
# Corpus used to fit the tokenizer (an alias of X, not a copy).
text = X
# Passed as num_words to the Tokenizer and as the Embedding input size.
vocab_size = 60000
# Width of each embedding vector; also reused as the Conv1D filter count.
embedding_dim = 64
# Every sequence is padded or truncated to this many tokens.
# NOTE(review): with 'post' truncation only the first 60 tokens of an excerpt
# are kept - confirm that is enough of each excerpt.
max_length = 60
# Truncate and pad at the end of each sequence.
trunc_type='post'
pad_type='post'
# Placeholder the tokenizer substitutes for out-of-vocabulary words.
oov_tok = "<OOV>"

### Create tokenizer

In [None]:
# Build the word -> integer vocabulary from the training excerpts.
# NOTE(review): the tokenizer is fitted on all of `text`, including the rows
# that model.fit later holds out through validation_split.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(text)
# Full word -> index mapping; not referenced again in the visible notebook.
word_index = tokenizer.word_index

In [None]:
# Convert each excerpt to a list of token ids.
training_sequences = tokenizer.texts_to_sequences(text)
# Force every sequence to exactly max_length ids, truncating/padding at the end.
training_padded = pad_sequences(training_sequences,maxlen=max_length, truncating=trunc_type, padding=pad_type)
# Regression targets as a NumPy array for model.fit.
training_labels_final = np.array(y)

### Create model

In [None]:
# Small 1-D convolutional regressor over token embeddings.
model = tf.keras.Sequential([
    # Token id -> embedding_dim-wide vector, for sequences of max_length ids.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # embedding_dim filters, each spanning a window of 5 tokens.
    tf.keras.layers.Conv1D(embedding_dim, 5, activation='relu'),
    # Keep the strongest response of every filter across the sequence.
    tf.keras.layers.GlobalMaxPooling1D(), 
    # Hidden layer with an L2 penalty on its weights.
    tf.keras.layers.Dense(24, activation='relu', kernel_regularizer = tf.keras.regularizers.l2(0.01)),
    # Single output with no activation: the predicted target value.
    tf.keras.layers.Dense(1)
])

model.summary()

In [None]:
# Train on MSE with Adam (learning rate 0.0001) and track RMSE.
# The metric is what plot_rmse reads back as 'root_mean_squared_error' /
# 'val_root_mean_squared_error'.
model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(0.0001), 
              metrics=[RootMeanSquaredError()])

### Train model

In [None]:
# Fit for 30 epochs, holding out 10% of the rows for validation.
# Keras takes the validation slice from the end of the arrays, before any
# shuffling. `his` carries the per-epoch history used by the plot helpers.
his = model.fit(training_padded,
                training_labels_final,
                epochs=30,
                validation_split=0.1)

### Plot graphs

In [None]:
# Training vs. validation loss per epoch.
plot_loss(his)

In [None]:
# Training vs. validation RMSE per epoch.
plot_rmse(his)

## Submission

In [None]:
# Run the test excerpts through the same cleaning and stemming as the training
# excerpts. The tokenizer vocabulary was built from cleaned, stemmed text, so
# raw test text would map many words to the OOV token.
test_excerpts = df_test['excerpt'].apply(transform).apply(stemWord)
sample_sequences = tokenizer.texts_to_sequences(test_excerpts)
# Pad and truncate exactly as for training. Without `truncating`,
# pad_sequences defaults to 'pre' and would keep the end of each excerpt,
# whereas the model was trained on the beginning.
excerpt_padded = pad_sequences(sample_sequences, maxlen=max_length, truncating=trunc_type, padding=pad_type)
# One predicted target per test excerpt.
classes = model.predict(excerpt_padded)

# Submission template to be filled with the predictions.
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# `submit` is the same object as `sample` (no copy), so the frame read from
# sample_submission.csv is filled in directly.
submit = sample
# Overwrite the template's target column with the model predictions.
submit["target"] = classes
# Write the submission file without the index column.
submit.to_csv("submission.csv", index=False)
submit