In [None]:
import pandas as pd
import numpy as np
import spacy
import re
import string

In [None]:
# Read the three competition CSV files from the relative input directory.
# Note: `submission` is only inspected here; it is rebuilt from the model
# predictions in the last cell of the notebook.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show the first rows of the test frame.
test.head()

In [None]:
# Show the first rows of the sample submission file.
submission.head()

In [None]:
# Number of rows in the training frame.
len(train)

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
def clean_text(text):
    """Normalize a piece of text for vectorization.

    The input is converted to ``str``, lowercased, stripped of every
    character in ``string.punctuation``, and each run of line breaks is
    replaced by a single space.

    Args:
        text: Value to clean. Non-string values are passed through ``str()``.

    Returns:
        The cleaned, lowercase string.
    """
    text = str(text).lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space instead of deleting them; deleting
    # them would glue the last word of a line onto the first word of the next.
    text = re.sub(r'\n+', ' ', text)
    return text

In [None]:
# Clean every training excerpt in place (lowercase, punctuation and line breaks handled by clean_text).
train["excerpt"] = train["excerpt"].apply(clean_text)

In [None]:
# Load spaCy's 'en_core_web_lg' pipeline; its word vectors are used below
# to build the embedding matrix.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Cleaned excerpt column; this is the text the vectorizer is adapted on.
train_samples = train["excerpt"]

In [None]:
import tensorflow as tf 
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Learn the vocabulary from the training excerpts, streamed in batches of 128.
vectorizer = TextVectorization()
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [None]:
# First five entries of the vocabulary learned by the vectorizer.
vectorizer.get_vocabulary()[:5]

In [None]:
# Full learned vocabulary; its order defines the row order of the
# embedding matrix that is filled from it below.
voc = vectorizer.get_vocabulary()

In [None]:
# Vocabulary size.
len(voc)

In [None]:
# Probe one word to find the width of the spaCy vectors, then allocate an
# all-zero matrix with one row per vocabulary entry.
embedding_dim = nlp('The').vector.shape[0]
num_tokens = len(voc)
embedding_matrix = np.zeros(shape=(num_tokens, embedding_dim))

In [None]:
# Shape of the (still all-zero) embedding matrix: (num_tokens, embedding_dim).
embedding_matrix.shape

In [None]:
%%time
#generate the embedding matrix
for i, word in enumerate(voc):
        embedding_matrix[i] = nlp(word).vector

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow import keras

# Frozen embedding layer initialised from the pre-computed spaCy vectors.
# mask_zero=True marks token id 0 (the id TextVectorization pads with) as
# padding so downstream recurrent layers skip those steps. Without it the
# LSTMs would consume the padding, and the amount of padding differs between
# the train, validation and test arrays because each is padded to its own
# longest excerpt.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from keras.utils import np_utils


# Randomly assign 70% of the rows to training (fixed seed for a repeatable
# split); every row that was not sampled becomes validation data.
df_train = train.sample(frac=0.7, random_state=0)
df_valid = train.drop(index=df_train.index)

In [None]:
# Separate the regression label ('target') from the remaining columns
# for both the training and the validation split.
y_train, y_valid = df_train['target'], df_valid['target']
X_train = df_train.drop(columns=['target'])
X_valid = df_valid.drop(columns=['target'])

In [None]:
def _to_token_ids(excerpts):
    """Run the adapted vectorizer over an iterable of strings and return a NumPy array."""
    # One string per row, i.e. a column of shape (n, 1).
    column = np.array([[s] for s in excerpts])
    return vectorizer(column).numpy()


x_train = _to_token_ids(X_train["excerpt"])
x_valid = _to_token_ids(X_valid["excerpt"])

In [None]:
# Shape of the vectorized training inputs.
x_train.shape

In [None]:
from tensorflow.keras import layers

# Variable-length sequences of integer token ids go in; a single regression
# value comes out.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two stacked bidirectional LSTMs: the first emits one vector per time step,
# the second reduces the sequence to a single vector.
sequence_encoder = layers.Bidirectional(layers.LSTM(64, return_sequences=True))
summary_encoder = layers.Bidirectional(layers.LSTM(32))
regression_head = layers.Dense(1)

x = summary_encoder(sequence_encoder(embedded_sequences))
preds = regression_head(x)
model = keras.Model(inputs=int_sequences_input, outputs=preds)
model.summary()

In [None]:
# Stop training once the validation loss has not improved by at least
# `min_delta` for `patience` consecutive epochs, and roll the model back to
# the weights of the best epoch.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # the Keras default, spelled out for clarity
    min_delta=0.001,  # minimum amount of change to count as an improvement
    patience=20,  # how many epochs to wait before stopping
    restore_best_weights=True,
)

model.compile(
    optimizer='adam',
    loss='mse',
    # `metrics` is documented as a list; newer Keras versions reject a bare string.
    metrics=['mse'],
)

# `epochs` is only an upper bound; early stopping normally ends training sooner.
history = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=128,
    epochs=500,
    callbacks=[early_stopping],  # callbacks must be passed as a list
    verbose=1,  # turn on training log
)

In [None]:
# Clean the test excerpts with the same clean_text() that was applied to the
# training excerpts before the vectorizer was adapted, so that train and test
# text are tokenized identically. The result is kept in a separate variable;
# the `test` frame itself is left untouched.
test_excerpts = test["excerpt"].apply(clean_text)
x_test = vectorizer(np.array([[s] for s in test_excerpts])).numpy()

In [None]:
# Shape of the first vectorized test excerpt.
x_test[0].shape

In [None]:
# Predict a target for every vectorized test excerpt. The model ends in
# Dense(1), so each row of the result holds a single value.
predictions = model.predict(x_test)

In [None]:
# Pair the first column of the test frame with the single predicted value per
# row, write the result to submission.csv, and preview it.
test_ids = test.iloc[:, 0].values
predicted_targets = predictions[:, 0]
submission = pd.DataFrame({"id": test_ids, "target": predicted_targets})
submission.to_csv("submission.csv", index=False)
submission.head()