# Project Introduction

**Project Goal:** Use machine learning to predict the reading difficulty of a text passage, so that passages can be matched to the appropriate reading level for students in grades 3-12.

**Data:** 
- train.csv size: 2834 rows, 6 columns:
    - id
    - url_legal
    - license
    - excerpt (feature)
    - target (**dependent variable**)
    - standard_error
- test.csv size: 7 rows, 4 columns

#### In this project, I did not use any pre-trained models. Therefore, there is no need to turn on the Internet toggle in the kernel in order to download anything.

# Loading Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import spacy
import string
from collections import Counter

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
import kerastuner as kt


# Loading Datasets into Memory

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the training data, the rows to predict, and the sample submission in one pass.
df, df_topredict, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()

# Feature Engineering
- Add some columns
    - text length
    - length of the longest word in the text
    - length of the longest sentence in the text

## Add a text_length column

In [None]:
# df.apply(lambda row: row.name, axis=1)
# Number of whitespace-separated tokens in each excerpt. The vectorised string
# accessors replace a row-wise apply that looked every row up again via df.loc.
df['text_length'] = df['excerpt'].str.split().str.len()

## Add the length of the longest word in the text as a column

In [None]:
def maxword_len(row_idx, frame=None):
    """Return the length of the longest whitespace-separated word in an excerpt.

    Args:
        row_idx: Index label of the row whose ``excerpt`` is inspected.
        frame: DataFrame with an ``excerpt`` column. Defaults to the
            module-level ``df`` so existing one-argument calls keep working.

    Returns:
        Character count of the longest word, or 0 when the excerpt contains
        no words (empty or whitespace-only text).
    """
    source = df if frame is None else frame
    words = source.loc[row_idx, 'excerpt'].split()
    # default=0 avoids the ValueError that max() raises on an empty sequence.
    return max(map(len, words), default=0)

In [None]:
# Longest-word length for every row, evaluated one index label at a time.
df['maxword_length'] = [maxword_len(label) for label in df.index]

## Add length of the longest sentence in the text as a column

In [None]:
def maxsent_len(row_idx, frame=None):
    """Return the word count of the longest sentence in an excerpt.

    Sentences are delimited by runs of '.', '!' or '?'. Splitting on '.' alone
    merged questions and exclamations into the following sentence and so
    overstated the longest-sentence length.

    Args:
        row_idx: Index label of the row whose ``excerpt`` is inspected.
        frame: DataFrame with an ``excerpt`` column. Defaults to the
            module-level ``df`` so existing one-argument calls keep working.

    Returns:
        Number of whitespace-separated words in the longest sentence
        (0 for an empty excerpt).
    """
    import re  # local import: the notebook's import cell does not load re

    source = df if frame is None else frame
    paragraph = source.loc[row_idx, 'excerpt']
    # re.split always yields at least one piece, so max() never sees an empty iterable.
    sentences = re.split(r'[.!?]+', paragraph)
    return max(len(sentence.split()) for sentence in sentences)

In [None]:
# Longest-sentence word count for every row, evaluated one index label at a time.
df['maxsent_length'] = [maxsent_len(label) for label in df.index]

## Normalize the 3 created columns using min/max normalization
formula:  (df-df.min())/(df.max()-df.min())

In [None]:
# Min/max-scale each engineered feature and store it under an "n"-prefixed column.
for feature in ('text_length', 'maxword_length', 'maxsent_length'):
    column = df[feature]
    low, high = column.min(), column.max()
    df[f'n{feature}'] = (column - low) / (high - low)

### Take a look at the engineered df:

In [None]:
# Preview the first rows again, now including the engineered and normalised columns.
df.head()

# EDA 

## Check the target and standard_error distribution:

In [None]:
# Side-by-side histograms: target score on the left, standard error on the right.
hist_specs = [
    (df.target, 'tab:cyan', 'Text Difficulty Score(target)', 'Distribution of Target Score'),
    (df.standard_error, 'tab:purple', 'Standard Error', 'Distribution of Error'),
]

plt.figure(figsize=(10,4))

for position, (values, colour, x_label, title) in enumerate(hist_specs, start=1):
    plt.subplot(1, 2, position)
    plt.hist(x=values, color=colour, bins=40, edgecolor='k')
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.title(title)

plt.tight_layout()
plt.show()

## Check the text with lowest and highest target score:

In [None]:
# Print the excerpts with the lowest and the highest target score so their
# difference can be examined. A single row is selected for each extreme and its
# scalar values are printed. Printing the filtered frame's columns directly
# showed pandas Series reprs (index label, dtype footer, truncated text).
# Because plain strings are printed in full, the pandas display width is no
# longer changed here.
def _print_extreme(label, row):
    """Print the score, the full excerpt and the engineered lengths of one row."""
    print(f'{label} Target Score:', row['target'], '-'*20, 'TEXT', '-'*20)
    print(f'"{row["excerpt"]}"')
    print(f'Text Length: {row["text_length"]}')
    print(f'Longest Word Length: {row["maxword_length"]}')
    print(f'Longest Sentence Length: {row["maxsent_length"]}')


detail_columns = ['excerpt','target','text_length','maxword_length','maxsent_length']

# idxmin/idxmax return the label of the first row holding the extreme value.
min_target = df.loc[df.target.idxmin(), detail_columns]
_print_extreme('Min', min_target)

print()

max_target = df.loc[df.target.idxmax(), detail_columns]
_print_extreme('Max', max_target)


## Visualize the relationship between target and 
- text_length
- maxword_length
- maxsent_length

In [None]:
# One regression scatter plot per engineered feature against the target score.
corr_list = ['text_length','maxword_length','maxsent_length']

sns.set_theme(style="white", color_codes=True)

plt.figure(figsize=(15,5))
for panel, feature in enumerate(corr_list, start=1):
    plt.subplot(1, 3, panel)
    sns.regplot(x=df[feature], y=df.target, marker='+')

plt.show()

# RNN Model
## Preprocessing: convert the dataframe into tf dataset

In [None]:
# Constant handed to the dataset shuffling step in the next cell.
SEED = 5

In [None]:
# Build a (text, label) dataset from the training frame.
# Labels are float32: float16 keeps only about three significant digits and
# would quantise the regression targets for no benefit.
dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(df['excerpt'].values, tf.string),
             tf.cast(df['target'].values, tf.float32)))

# Dataset.shuffle returns a NEW dataset, so the result must be assigned;
# calling it without assignment left the data unshuffled. Its first positional
# argument is the buffer size, not the seed, so both are passed by keyword.
# A buffer as large as the data gives a full shuffle, and
# reshuffle_each_iteration=False keeps the order fixed so the later take/skip
# split cannot leak rows between the two splits.
dataset = dataset.shuffle(buffer_size=len(df), seed=SEED,
                          reshuffle_each_iteration=False)

print(dataset)

In [None]:
# Let's print out an instance in the dataset
for example, label in dataset.take(1):
    print('Text: ', example.numpy(), sep='\n')
    print()
    print('Label: ', label.numpy(),sep='\n')

## Train test split

In [None]:
# The first 70% of the examples form the training split; the rest is held out.
n_examples = len(dataset)
TRAIN_SIZE = int(n_examples*0.7)

train_dataset, test_dataset = dataset.take(TRAIN_SIZE), dataset.skip(TRAIN_SIZE)

## Configure the train and test datasets (cache, batch, prefetch) to feed into TensorFlow

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 4


def _as_input_pipeline(split):
    """Cache, batch and prefetch one dataset split, in that order."""
    return split.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


train_dataset = _as_input_pipeline(train_dataset)
test_dataset = _as_input_pipeline(test_dataset)


## Create the tokenizer (encoder) and embedding layers

In [None]:
# Count how often each whitespace-separated token occurs across all excerpts.
# Counting the tokens directly gives real frequencies; the earlier Counter was
# built from a set, so every count was 1. It also replaces a Series.apply that
# was used only for its side effect on a set.
total_words = df['excerpt'].str.split()
count_dict = Counter(word for words in total_words for word in words)
total_word_set = set(count_dict)

# One Counter key per distinct token, so this is the unique-word count.
VOCAB_SIZE = len(count_dict)

print('total unique words number:', VOCAB_SIZE)

In [None]:
# Text-vectorisation layer capped at VOCAB_SIZE tokens.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)

# Adapt the layer on the texts of the training split only (labels are dropped).
train_texts = train_dataset.map(lambda excerpt, _target: excerpt)
encoder.adapt(train_texts)

In [None]:
# Embedding sized to the vocabulary the encoder actually learned. mask_zero=True
# marks index 0 as padding for the layers that follow.
encoder_vocab_size = len(encoder.get_vocabulary())
embedding_layer = Embedding(input_dim=encoder_vocab_size, output_dim=128, mask_zero=True)

# Tune the LSTM Model
### Define the model

In [None]:
def model_builder(hp):
    """Build and compile the bidirectional-LSTM regressor for one tuner trial.

    Architecture: text encoder -> embedding -> Bidirectional(LSTM(128)) ->
    two Dense(relu) + Dropout(0.5) stages -> Dense(1).

    Args:
        hp: Keras Tuner hyperparameter object. Two values are sampled from it:
            ``units`` (96 to 512 in steps of 32, shared by both dense layers)
            and ``learning_rate`` (1e-4 or 1e-5).

    Returns:
        A compiled ``Sequential`` model using Adam and mean squared error.
    """
    hp_units = hp.Int('units', min_value=96, max_value=512, step=32)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-4, 1e-5])

    model = Sequential()
    # The adapted encoder has no trainable weights, so sharing it across builds is safe.
    model.add(encoder)
    # A fresh Embedding is created on every build. Reusing one module-level
    # layer instance would share its trained weights between tuner trials and
    # with the final "re-instantiated" model, so no build would start from scratch.
    model.add(Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=128,
        mask_zero=True,
    ))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))

    for _ in range(2):
        model.add(keras.layers.Dense(units=hp_units, activation='relu'))
        model.add(tf.keras.layers.Dropout(0.5))

    # Single linear output unit: the model predicts a real-valued score.
    model.add(tf.keras.layers.Dense(1))

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.MeanSquaredError())

    return model

### Instantiate the tuner and perform hypertuning

In [None]:
# Hyperband search over model_builder's hyperparameters, ranked by validation loss.
tuner = kt.Hyperband(
    hypermodel=model_builder,
    objective='val_loss',
    max_epochs=10,
    factor=3,
)

In [None]:
# Stop a run once validation loss has not improved for 5 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# NOTE(review): Hyperband appears to set the per-trial epoch count itself
# (bounded by max_epochs), so epochs=30 may have no effect here - confirm.
search_options = dict(validation_data=test_dataset, epochs=30, callbacks=[stop_early])
tuner.search(train_dataset, **search_options)

In [None]:
# Display the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete... 
The optimal number of neurons in the dense layers is {best_hps.get('units')};
The optimal learning rate is {best_hps.get('learning_rate')}.
""")

### Build the model with the optimal hyperparameters and train it on the data for 50 epochs

In [None]:
# Train a model built from the best hyperparameters for up to 50 epochs,
# with early stopping, to find the epoch with the lowest validation loss.
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=50,
    callbacks=[stop_early],
)

val_loss_per_epoch = history.history['val_loss']
# Epochs are numbered from 1; on ties the earliest epoch is chosen.
best_epoch, _lowest_val_loss = min(
    enumerate(val_loss_per_epoch, start=1), key=lambda pair: pair[1])
print(f'Best epoch: {best_epoch}')

### Re-instantiate the hypermodel and train it with the optimal number of epochs from above.

In [None]:
# Build the model again from the best hyperparameters and train it for
# exactly the number of epochs found above.
hypermodel = tuner.hypermodel.build(best_hps)

hypermodel.fit(
    train_dataset,
    epochs=best_epoch,
    validation_data=test_dataset,
    callbacks=[stop_early],
)

# Re-train the model with the entire dataset

In [None]:
# Continue training on every labelled example (train + held-out split).
# The batched pipeline gets its own name rather than rebinding `dataset`.
# Rebinding made this cell non-idempotent, because running it a second time
# batched data that was already batched.
full_dataset = dataset.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
hypermodel.fit(full_dataset, epochs=best_epoch)

# Make Prediction

In [None]:
def make_prediction(row_idx):
    """Predict the target score for a single row of ``df_topredict``.

    Args:
        row_idx: Index label of the row whose ``excerpt`` is scored.

    Returns:
        The model's scalar prediction for that excerpt.
    """
    result = hypermodel.predict(np.array([df_topredict.excerpt[row_idx]]))
    return result[0][0]


# Score every excerpt in one batched predict() call instead of one call per
# row, which paid the full predict() overhead for each excerpt. The model
# outputs shape (n_rows, 1); column 0 holds the scores, in row order.
excerpts = np.array(df_topredict['excerpt'].tolist(), dtype=str)
df_topredict['target'] = hypermodel.predict(excerpts)[:, 0]

In [None]:
# Keep only the two submission columns and write them out without the index.
submission_columns = ['id','target']
df_sub = df_topredict[submission_columns]
df_sub.to_csv('submission.csv', index=False)