In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path

import tensorflow as tf
# Import callbacks from the public Keras namespace. `tensorflow.python.keras`
# is a private package; its classes are not guaranteed to interoperate with
# models built through `tf.keras`.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Ask TensorFlow to grow GPU memory on demand for every GPU it reports,
# instead of using its default allocation behaviour.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os


In [None]:
# Directory that holds the input CSV files.
BASE_DATA_PATH = Path("../input/commonlitreadabilityprize/")

# Load the training, test and sample-submission tables in one pass.
df_train, df_test, df_sub = (
    pd.read_csv(BASE_DATA_PATH / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Copy the `target` and `excerpt` columns out of the training frame as
# NumPy arrays.
targets, excerpt_text = (
    np.array(df_train[column]) for column in ("target", "excerpt")
)

In [None]:
# Token substituted for words the tokenizer has not seen.
oov = "<OOV>"
# Initial vocabulary size; recomputed from the tokenizer once it has been fit.
total_num_of_words = 28651
# Pad at the front of each sequence, truncate at the back.
padding_type, trunc_type = "pre", "post"
# Width of each word-embedding vector.
embedding_output_dim = 200

In [None]:
# Build the vocabulary from the training excerpts; words outside it are
# mapped to the `oov` token.
tokenizer = Tokenizer(oov_token=oov)
tokenizer.fit_on_texts(excerpt_text)

word_index = tokenizer.word_index
# One extra slot on top of the vocabulary: Keras tokenizers never assign
# index 0 to a word.
total_num_of_words = len(word_index) + 1

In [None]:
# Fixed sequence length fed to the network. (The alternative of using the
# longest encoded excerpt would be len(max(seqs, key=len)).)
maxsentencelen = 200

# Convert every excerpt to a list of word indices, then pad/truncate to a
# uniform length.
seqs = tokenizer.texts_to_sequences(excerpt_text)
pads = pad_sequences(
    seqs,
    maxlen=maxsentencelen,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Hold out the last 5% of rows (in file order, no shuffling) for validation.
# The split index is derived from the number of padded sequences instead of a
# hard-coded row count, so it stays correct if the training file changes size.
split_point = int(len(pads) * 0.95)

TRAIN_DATA = pads[:split_point]
TRAIN_targets = targets[:split_point]

VAL_DATA = pads[split_point:]
VAL_targets = targets[split_point:]

In [None]:
# Parse the GloVe text file into a {token: vector} lookup.
# Every line is "<token> <v1> ... <vN>" separated by single spaces. GloVe files
# are UTF-8 encoded; decoding them as cp850 garbles every non-ASCII token, so
# those tokens could never match a word produced by the tokenizer.
embeddings_index = {}
with open("../input/glove6b200d/glove.6B.200d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        if len(values) <= embedding_output_dim:
            # Not enough columns for a token plus a full vector; skip the line.
            continue
        # Take the vector from the right-hand side so that a token containing
        # unusual characters cannot shift the numeric columns.
        word = " ".join(values[:-embedding_output_dim])
        coefs = np.asarray(values[-embedding_output_dim:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Words missing from GloVe (and the unused row 0) stay all-zero.
embeddings_matrix = np.zeros((total_num_of_words, embedding_output_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector


In [None]:
# Convolution + bidirectional-LSTM regressor stacked on frozen, pre-trained
# word embeddings. Layers are appended one at a time in forward order.
Model = tf.keras.models.Sequential()

# Embedding lookup initialised from `embeddings_matrix` and excluded from training.
Model.add(
    tf.keras.layers.Embedding(
        input_dim=total_num_of_words,
        output_dim=embedding_output_dim,
        input_length=maxsentencelen,
        weights=[embeddings_matrix],
        trainable=False,
    )
)
Model.add(tf.keras.layers.Dropout(0.2))

# Local n-gram features, downsampled by a factor of two.
Model.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
Model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

# Sequence summary read in both directions.
Model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
Model.add(tf.keras.layers.Dropout(0.2))

# Small dense head ending in a single linear output.
Model.add(tf.keras.layers.Dense(16, activation='relu'))
Model.add(tf.keras.layers.Dense(1, activation='linear'))

Model.summary()

In [None]:
# Mean squared error optimised with Adam at a learning rate of 1e-5.
# No additional metrics are tracked.
selected_loss = tf.keras.losses.MeanSquaredError()
selected_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-05)
Model.compile(loss=selected_loss, optimizer=selected_optimizer)

In [None]:
savedmodel_filepath = './SAVED_MODELs/Model.h5'
# Make sure the checkpoint folder exists before training tries to write to it.
Path(savedmodel_filepath).parent.mkdir(parents=True, exist_ok=True)

# Callbacks are taken from the public `tf.keras.callbacks` namespace so they
# match the `tf.keras` model they are attached to.

# Stop when val_loss has not improved for 10 epochs and roll the model back to
# the best weights seen, so later predictions do not use a degraded model.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Lower the learning rate when val_loss stalls for 3 epochs. `min_lr` has to be
# below the optimizer's starting rate (1e-5); with min_lr equal to that rate
# the callback could never reduce anything.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    min_lr=1e-7,
    patience=3,
    mode='min',
    verbose=1,
)

# Keep the best model (by val_loss) on disk.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=savedmodel_filepath,
    monitor='val_loss',
    save_best_only=True,
)

#lr_schedule = tf.keras.callbacks.LearningRateScheduler( lambda epoch: 0.00001 * 10**(epoch / 20),verbose=1)

# Actually hand the callbacks to training; previously they were constructed
# and then discarded because this list was empty.
selected_callbacks = [early_stopping, reduce_lr, model_checkpoint]

In [None]:
# Train for 200 epochs (fewer if a callback ends the run), scoring the
# held-out validation split after every epoch.
history = Model.fit(
    x=TRAIN_DATA,
    y=TRAIN_targets,
    validation_data=(VAL_DATA, VAL_targets),
    epochs=200,
    callbacks=selected_callbacks,
    verbose=1,
)

In [None]:
# Copy the test `excerpt` column out as a NumPy array, then preview the frame.
excerpt_test = np.array(df_test.loc[:, "excerpt"])
df_test.head(n=10)

In [None]:
# Encode the test excerpts with the same tokenizer and padding settings used
# for training, then run them through the model.
test_seqs = tokenizer.texts_to_sequences(excerpt_test)
test_pads = pad_sequences(
    test_seqs,
    maxlen=maxsentencelen,
    padding=padding_type,
    truncating=trunc_type,
)
pedictions_testdata = Model.predict(test_pads)

# Attach the predictions to the test frame for a quick visual check.
df_test["target"] = pedictions_testdata
df_test.head(n=10)

In [None]:
# Fill the submission template with the predictions and write it out
# without the index column.
df_sub["target"] = pedictions_testdata
df_sub.to_csv(path_or_buf='submission.csv', index=False)