In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import *
from tensorflow.keras.preprocessing.sequence import *
from tensorflow.keras.models import *
import tensorflow.keras.backend as k
from tensorflow.keras.optimizers import *
from sklearn.model_selection import train_test_split,StratifiedKFold
from tensorflow.keras.callbacks import *
from nltk.corpus import *
from nltk.stem import *
import string
from sklearn.preprocessing import *
from tqdm import tqdm

In [None]:
# Read the CommonLit Readability Prize files: the training set, the test set,
# and the sample submission that is filled in at the end of the notebook.
train_df=pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df=pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample_sub=pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
#Load Word Embeddings
# Each line of the GloVe text file is "<token> <v1> ... <v300>".
# embedding_dict maps token -> float32 vector of those values.
embedding_path='../input/glove840b300dtxt/glove.840B.300d.txt'
embedding_dict={}
# `with` guarantees the file is closed even if a line fails to parse.
with open(embedding_path,'r',errors = 'ignore',encoding='utf8') as embd_file:
    for line in tqdm(embd_file):
        # Split from the right so that exactly the last 300 fields are treated
        # as numbers; a token that itself contains a space then stays intact
        # instead of leaking non-numeric pieces into the vector.
        values=line.rsplit(' ',300)
        word=values[0]
        coef=np.asarray(values[1:],dtype='float32')
        embedding_dict[word]=coef

In [None]:
#Clean Text
# Module-level helpers read by clean_text():
#   sp - NLTK English stop-word list
#   lm - WordNet lemmatizer used to normalize each remaining token
sp=stopwords.words('english')
lm=WordNetLemmatizer()

def clean_text(df) -> list:
    """Normalize the ``excerpt`` texts of ``df`` ahead of tokenization.

    Every excerpt goes through four passes, in this order:

    1. every ``string.punctuation`` character is deleted (deleted, not
       replaced by a space, so ``"well-known"`` becomes ``"wellknown"``);
    2. the text is lower-cased and only purely alphabetic tokens are kept;
    3. tokens present in the module-level stop-word list ``sp`` are dropped;
    4. each remaining token is lemmatized with the module-level ``lm``.

    Args:
        df: Object whose ``df['excerpt']`` yields the raw strings
            (the notebook passes a pandas DataFrame).

    Returns:
        A list with one cleaned string per excerpt; tokens are joined by
        single spaces.
    """
    # Built once per call instead of once per excerpt / once per token:
    # the translation table is loop-invariant, and set membership is O(1)
    # whereas scanning the stop-word list is linear for every token.
    punct_table=str.maketrans('','',string.punctuation)
    stop_words=set(sp)

    #Remove punctuation
    print('Cleaning Punctuations')
    cleaned_text=[txt.translate(punct_table) for txt in df['excerpt']]

    print('Cleaning numbers')
    cleaned_text=[' '.join([i for i in txt.lower().split() if i.isalpha()]) for txt in cleaned_text]

    print('Cleaning Stopwords')
    cleaned_text=[' '.join(i for i in txt.split() if i not in stop_words) for txt in cleaned_text]

    #Normalize Word
    print('Word Normalizing')
    cleaned_text=[' '.join(lm.lemmatize(i) for i in txt.split()) for txt in cleaned_text]

    return cleaned_text

In [None]:
# Run the cleaning pipeline over the training excerpts; the result is a list of cleaned strings.
train_cleaned=clean_text(train_df)

In [None]:
# Sequence length fed to the network and the vocabulary cap handed to the
# tokenizer (which ranks words by frequency).
maxlen_ = 500
max_words = 20000

print('Word Tokenization and Transforming')

# Learn the vocabulary from the cleaned training text only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_cleaned)
word_index = tokenizer.word_index  # word -> integer id learned during fitting

# Encode every excerpt as integer ids, then pad/truncate to maxlen_ ids.
sequences = tokenizer.texts_to_sequences(train_cleaned)
train_data_preped = pad_sequences(sequences, maxlen=maxlen_)

print('Tokenization Done!')

In [None]:
# Weight matrix for the Embedding layer: row `ix` receives the pre-trained
# vector of the word whose tokenizer id is `ix`. Rows for ids without a
# pre-trained vector (and row 0) keep their initial zeros.
embedding_matrix=np.zeros((max_words,300))
print('Loading Embedding Matrix..\n')
for word,ix in tqdm(word_index.items()):
    # Ids at or beyond max_words fall outside the matrix.
    if ix>=max_words:
        continue
    embed_vec=embedding_dict.get(word)
    if embed_vec is None:
        continue
    embedding_matrix[ix]=embed_vec

In [None]:
#Split Dataset
# Hold out 15% of the padded training sequences for validation.
# random_state pins the shuffle so that the split (and therefore the reported
# validation loss) is the same on every Restart & Run All.
X_train,X_val,y_train,y_val=train_test_split(train_data_preped,train_df['target'],test_size=0.15,
                                             random_state=42)
print('Size of Train: ',X_train.shape)
print('Size of Validation: ',X_val.shape)

In [None]:
# Regression network: embedding -> BiLSTM -> four strided Conv1D stages
# -> GRU -> dense head producing a single value per excerpt.

# `shape` is given as an explicit tuple keyword; a bare int is not a valid
# shape in every Keras version.
inp=Input(shape=(maxlen_,))
x=Embedding(max_words,300)(inp)
x=Bidirectional(LSTM(256,return_sequences=True))(x)

# Each strided convolution shortens the time axis by its stride
# (padding='same' => ceil(length/stride)), so the GRU below sees a much
# shorter sequence than the BiLSTM does.
x=Conv1D(16,5,strides=2,padding='same')(x)
x=Activation('relu')(x)

x=Conv1D(32,3,strides=2,padding='same')(x)
x=Activation('relu')(x)

x=Conv1D(64,3,strides=4,padding='same')(x)
x=Activation('relu')(x)

x=Conv1D(128,3,strides=4,padding='same')(x)
x=Activation('relu')(x)

# GRU without return_sequences collapses the sequence to one vector.
x=GRU(256)(x)
x=Dense(128,activation='relu')(x)
# Single linear unit: the predicted target.
out=Dense(1)(x)
model=Model(inp,out)
model.summary()

In [None]:
# model.layers[1] is the Embedding layer (index 0 is the Input layer).
# Initialize it from the pre-trained matrix and leave it trainable so the
# vectors are fine-tuned during fitting.
embedding_layer=model.layers[1]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable=True

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between predictions and targets.

    Computed with the Keras backend ``k`` so it can serve as a loss:
    square the element-wise difference, average over all elements, then
    take the square root.
    """
    squared_error = k.square(y_pred - y_true)
    return k.sqrt(k.mean(squared_error))
    
# Train directly on the custom rmse loss with RMSprop at learning rate 0.001.
model.compile(loss=rmse,optimizer=RMSprop(0.001))

#Callbacks
# Shrink the learning rate when the monitored quantity (val_loss by default)
# has not improved for 10 epochs, never going below 1e-8.
rop=ReduceLROnPlateau(min_lr=0.00000001,patience=10)
# Write model.h5 only when val_loss improves. Without save_best_only the file
# is overwritten after every epoch, so what gets reloaded for inference is
# whatever the last epoch produced rather than the best-validating model.
mc=ModelCheckpoint('model.h5',monitor='val_loss',save_best_only=True,save_freq='epoch')

In [None]:
# Fit on the training split and evaluate on the validation split each epoch.
# Both configured callbacks are passed: `mc` checkpoints the model and `rop`
# lowers the learning rate on plateaus (it was created but previously never
# handed to fit, so it had no effect).
history=model.fit(X_train,y_train,batch_size=128,epochs=300,validation_data=(X_val,y_val),
                  callbacks=[mc,rop])

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training vs. validation loss recorded by model.fit().
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x axis

plt.figure(figsize=(10,5))
# Colour is set via the keyword only; combining it with a 'b' format string
# specifies the colour twice and makes matplotlib emit a warning.
plt.plot(epochs, loss, color='red', label='Training loss')
plt.plot(epochs, val_loss, color='blue', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (RMSE)')
plt.legend()
plt.grid()
plt.show()

**Test Data**

In [None]:
# Apply the same cleaning pipeline to the test excerpts.
# NOTE(review): `claned_test` is a misspelling of "cleaned_test"; the name is
# left as is because the following cell reads it.
claned_test=clean_text(test_df)

In [None]:
print('Word Tokenization and Transforming of Test data')
# Encode the test text with the tokenizer that was fitted on the TRAINING
# text. Fitting a fresh Tokenizer here would assign different integer ids to
# the same words, so the ids would no longer match the rows of the embedding
# matrix or anything the model learned, and predictions would be meaningless.
sequences=tokenizer.texts_to_sequences(claned_test)
test_data_preped=pad_sequences(sequences,maxlen=maxlen_)
# Unchanged vocabulary; kept so `word_index` still refers to the mapping in use.
word_index=tokenizer.word_index
print('Tokenization Done!')

In [None]:
# Reload the checkpoint written during training. The custom loss has to be
# supplied through custom_objects so the saved model can be deserialized.
model=load_model('model.h5',custom_objects={'rmse': rmse})
preds=model.predict(test_data_preped)
# The single-unit output layer yields an (n_samples, 1) array; flatten it to
# one value per row before storing it as a column.
# NOTE(review): assumes sample_sub rows are in the same order as test_df - confirm.
sample_sub['target']=preds.ravel()
sample_sub.to_csv('submission.csv',index=False)