# CommonLit Readability | biLSTM Sentence Encoder



In [None]:
# import
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
    )
)
# Last expression of the cell: the notebook renders the training frame.
train

## Data preparation

In [None]:
# data preparation

# Combine the train and test excerpts so a single vocabulary (and a single
# maximum length) covers both splits.
combined = pd.concat([train['excerpt'], test['excerpt']])
# set vocab count (passed to the Tokenizer as `num_words` and reused as the
# embedding input size when the model is built)
vocab = 10000
# The OOV token has to be a string: the Tokenizer stores it as a key of
# `word_index` / value of `index_word`, and `sequences_to_texts` joins those
# values as strings. The OOV index itself is unaffected by the token's spelling.
tokenizer = Tokenizer(num_words=vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(combined)
sequence_combined = tokenizer.texts_to_sequences(combined)
# Longest tokenized excerpt across both splits; every sequence is padded to it.
max_len = max(len(seq) for seq in sequence_combined)
# `combined` lists the train excerpts first, so the leading slice is exactly
# the encoded training set -- no need to tokenize the train texts a second time.
sequences = sequence_combined[:len(train)]
# Left-pad (and, if ever needed, left-truncate) with 0 up to `max_len`.
padded_seq = pad_sequences(
    sequences,
    maxlen=max_len,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0,
)

## biLSTM based Regression Model

- Sentence encoder: bidirectional LSTM (biLSTM) over learned word embeddings
- Target decoder: feed-forward regression head that outputs a single score

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding

In [None]:
# model
# Widths of the hidden layers in the regression head, in the order they are stacked.
head_units = (256, 128, 64, 32, 16, 8)

model = Sequential()

# encoder: padded token ids -> 300-d embeddings -> bidirectional LSTM
model.add(keras.Input(shape=(padded_seq.shape[1], )))
model.add(Embedding(vocab, 300))
model.add(Bidirectional(LSTM(256)))

# decoder: each hidden layer is a ReLU Dense layer followed by 20% dropout
for units in head_units:
    model.add(Dense(units, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))

# single linear unit producing the regression output
model.add(Dense(1, activation='linear'))

# summary
model.summary()

In [None]:
# callbacks
# Stop training once the training loss has failed to improve for 3 epochs in a
# row. No validation data is passed to `fit`, so the training loss is the only
# quantity available to monitor.
earlystopping = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# compile: optimise mean squared error; also report MSE and MAE per epoch
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

# fit
# `epochs=100` is an upper bound: the early-stopping callback must be handed to
# `fit` to take effect, otherwise all 100 epochs always run.
history = model.fit(
    padded_seq,
    train['target'],
    epochs=100,
    batch_size=32,
    verbose=2,
    callbacks=[earlystopping],
)

In [None]:
import matplotlib.pyplot as plt

# Show which metrics were recorded during training.
print(history.history.keys())

# "Loss": per-epoch training loss curve
train_loss = history.history['loss']
plt.plot(train_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train'], loc='upper left')
plt.show()

## Test data

In [None]:
# Encode the test excerpts with the already-fitted tokenizer.
test_sequences = tokenizer.texts_to_sequences(test['excerpt'])
# Pad exactly like the training data so the input width matches the model.
test_pad_sequences = pad_sequences(
    test_sequences,
    maxlen=max_len,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0,
)
# Model predictions for the padded test sequences.
y_pred = model.predict(test_pad_sequences)

## Submission

In [None]:
# Start the submission frame from an independent copy of the test ids.
sub = test.loc[:, ['id']].copy()

In [None]:
# Attach the predictions as the `target` column and write the submission
# file without the index column.
sub = sub.assign(target=y_pred)
sub.to_csv("submission.csv", index=False)