# Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Data loading and preprocessing

In [None]:
# Hyperparameters, grouped by what they control.

# Model size: vocabulary cap (tokenizer + embedding), embedding width, LSTM units.
vocab_size = 30000
embedding_size = 200
hidden_units_size = 128

# Regularisation: recurrent dropout inside the LSTMs and regular (input) dropout.
dropout_lstm = dropout_regular = 0.6

# Training schedule.
epochs, batch_size = 50, 32

In [None]:
# Read the train and test CSV files into DataFrames.
train_data = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/train.csv"
)
test_data = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/test.csv"
)

In [None]:
# Preview the first two rows of the training frame.
train_data.head(n=2)

In [None]:
# Preview the first two rows of the test frame.
test_data.head(n=2)

In [None]:
# Pull the model inputs (excerpt text) and regression labels (target) out of
# the frames as NumPy arrays; the test frame only contributes its excerpts.
x, y = (np.array(train_data[column]) for column in ("excerpt", "target"))
x_test = np.array(test_data["excerpt"])

In [None]:
# Build the word-level tokenizer (word -> integer id), capped by vocab_size,
# and fit its vocabulary on the training excerpts only.
tokenizer = Tokenizer(
    num_words=vocab_size,
)
tokenizer.fit_on_texts(texts=x)

In [None]:
# Replace every training excerpt by its list of integer word ids.
x = tokenizer.texts_to_sequences(texts=x)

In [None]:
# Show the first 15 word ids of the second training example.
x[1][:15]

In [None]:
# Length (in tokens) of the longest training excerpt; used as the padded
# sequence length for both train and test inputs.
maximum_length = np.max(list(map(len, x)))
maximum_length

In [None]:
# Append padding at the end of every training sequence so that they all
# share the same length (maximum_length).
x = pad_sequences(
    sequences=x,
    maxlen=maximum_length,
    padding='post',
)

In [None]:
# Run the test excerpts through the same pipeline as the training data:
# the tokenizer fitted on the training excerpts, then post-padding to the
# training maximum_length.
x_test = pad_sequences(
    tokenizer.texts_to_sequences(x_test),
    maxlen=maximum_length,
    padding='post',
)

# Create model and train

In [None]:
# Bidirectional LSTM regressor:
#   embedding -> dropout -> BiLSTM (full sequence) -> BiLSTM (final state) -> 1 linear output
model = Sequential([
    # mask_zero=True: the inputs are zero-padded at the end (padding='post'), so
    # index 0 is declared a padding value. The mask is propagated through the
    # Dropout layer to both recurrent layers, which then skip the padded steps
    # instead of running over long tails of padding before emitting their state.
    Embedding(
        vocab_size,
        embedding_size,
        input_length=maximum_length,
        mask_zero=True,
        trainable=True,
    ),
    Dropout(dropout_regular),
    # First recurrent layer returns the whole sequence so it can feed the second one.
    Bidirectional(
        LSTM(
            hidden_units_size,
            recurrent_dropout=dropout_lstm,
            dropout=dropout_regular,
            return_sequences=True,
        )
    ),
    # Second recurrent layer only returns its final output.
    Bidirectional(
        LSTM(
            hidden_units_size,
            recurrent_dropout=dropout_lstm,
            dropout=dropout_regular,
        )
    ),
    # Single unbounded output for the regression target.
    Dense(1, activation='linear'),
])

In [None]:
# Training callbacks; all three watch the validation loss.

# Persist the best model seen so far (lowest val_loss). The original passed an
# empty filepath, which gives the callback no usable target; an explicit
# ".h5" file name is accepted by both older and current Keras releases.
checkpoint = ModelCheckpoint("best_model.h5", monitor="val_loss", verbose=1, save_best_only=True)

# Stop after 12 epochs without improvement and roll the in-memory model back to
# its best weights, so later predictions do not come from the last
# (possibly worse) epoch.
early_stop = EarlyStopping(monitor="val_loss", patience=12, restore_best_weights=True)

# Lower the learning rate after 5 stagnant epochs (monitor made explicit; it is
# the callback's default).
reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=5)

In [None]:
# Compile for regression (MSE loss, Adam) and train with a 20% validation
# hold-out that drives the callbacks.
model.compile(optimizer="Adam", loss="mean_squared_error", metrics=["MeanSquaredError"])

# NOTE: `use_multiprocessing` was dropped. It only applies to generator /
# keras.utils.Sequence inputs, has no effect on in-memory NumPy arrays, and is
# no longer an accepted argument of Model.fit in Keras 3.
history = model.fit(
    x,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stop, checkpoint, reduce_lr],
    verbose=1,
)

In [None]:
# Score the padded test sequences with the trained model.
output = model.predict(x=x_test)

In [None]:
# Assemble the submission frame: the test ids plus the predicted target.
output_table = pd.DataFrame(data=test_data['id'])
output_table['target'] = output

In [None]:
# Write the submission file without the DataFrame index column.
output_table.to_csv(
    path_or_buf="submission.csv",
    index=False,
)

In [None]:
# Display the submission frame (ids and predicted targets) as the cell output.
output_table