# Versions
1. Version 4 – used GloVe 6B 200d
2. Version 5 – used GloVe 840B 300d

# Imports

In [None]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import re
import pickle

from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.layers import LSTM,RNN,Conv2D,Dense,Flatten,GlobalAveragePooling2D,Embedding,Bidirectional,Input,Dropout,Conv1D,MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam,SGD


# Global random seed. It is applied to NumPy and TensorFlow right away so that
# weight initialisation and batch shuffling are repeatable between runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Competition files, in the order: sample submission, train, test.
paths = [
    "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    '/kaggle/input/commonlitreadabilityprize/train.csv',
    '/kaggle/input/commonlitreadabilityprize/test.csv',
]
sample_submission_path, train_path, test_path = paths

# Load each CSV into its own DataFrame.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_ss = pd.read_csv(sample_submission_path)

In [None]:
# Notebook display: show the training DataFrame as the cell output.
df_train

In [None]:
# Notebook display: show the test DataFrame as the cell output.
df_test

In [None]:
# Notebook display: show the raw 'excerpt' text stored under index label 0.
df_train['excerpt'][0]

# Cleaning

In [None]:
def clean(string):
    """Return *string* with line breaks flattened and apostrophes removed.

    Newlines are replaced by a single space (not by nothing) so that the last
    word of one line and the first word of the next are not fused together.
    Apostrophes are deleted outright, e.g. ``"don't"`` becomes ``"dont"``.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # The previous version stored the newline substitution in an unused
    # variable, so newlines were never actually removed from the result.
    return string.replace('\n', ' ').replace('\'', '')

In [None]:
# Stop-word setup: fetch the NLTK 'stopwords' corpus and keep the English word
# list in the module-level name `stop`, which remove_stopwords() reads.
nltk.download('stopwords')
stop=stopwords.words('english')


def remove_stopwords(df, stop_words=None):
    """Return a copy of *df* with a ``cleaned_text`` column added.

    Each value of ``df['excerpt']`` is split on whitespace, tokens found in
    the stop-word collection are dropped, and the remaining tokens are joined
    back with single spaces. Matching is exact and case-sensitive.

    The new column is aligned on the frame's own index, so frames with a
    non-default index (filtered, shuffled, or split data) are handled
    correctly. Calling the function again on its own output replaces
    ``cleaned_text`` instead of adding a duplicate column.

    Args:
        df: DataFrame containing an ``excerpt`` column of strings.
        stop_words: Optional iterable of words to remove. Defaults to the
            module-level ``stop`` list.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    blocked = frozenset(stop if stop_words is None else stop_words)

    def _strip(text):
        # Drop blocked tokens and normalise whitespace to single spaces.
        return ' '.join(word for word in text.split() if word not in blocked)

    return df.assign(cleaned_text=df['excerpt'].map(_strip))

In [None]:
# Run clean() over the 'excerpt' column of both frames, overwriting it in place.
for frame in (df_train, df_test):
    frame['excerpt'] = frame['excerpt'].map(clean)


In [None]:
# Rebind both frames to the versions that carry the extra 'cleaned_text' column.
df_train, df_test = map(remove_stopwords, (df_train, df_test))


In [None]:
# Character count of every cleaned training text, followed by a quick
# mean / min / max summary of those counts.
length = np.array([len(text) for text in df_train['cleaned_text']])
print(length.mean(), length.min(), length.max())

In [None]:
# Model inputs (cleaned text) and regression target from the training frame,
# plus the cleaned text of the test frame.
X, y = df_train['cleaned_text'], df_train['target']
test = df_test['cleaned_text']



In [None]:
# Notebook display: show the cleaned text stored under index label 0.
df_train['cleaned_text'][0]

In [None]:
# Text-pipeline hyperparameters.
VOCAB = 25000                # num_words passed to the Tokenizer
max_len = 681                # length every sequence is padded / truncated to
oov_token = '<OOV_TOKEN>'    # placeholder token for out-of-vocabulary words
truncate_type = 'post'       # cut over-long sequences at the end
padding_type = 'post'        # pad short sequences at the end
# Width of one word vector. This was a stale 16; the GloVe 840B vectors used
# for the embedding matrix and the Embedding layer are 300-dimensional.
embedding_dim = 300


# Tokenizing

In [None]:
# Fit the tokenizer on the training texts and report the size of its word index.
tokenizer = Tokenizer(num_words=VOCAB, oov_token=oov_token)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
print(len(word_index))

# Shared padding settings so train and test are shaped identically.
pad_options = dict(maxlen=max_len, padding=padding_type, truncating=truncate_type)

# Encode the texts as integer sequences, then pad / truncate to max_len.
train_sequences = tokenizer.texts_to_sequences(X)
test_sequences = tokenizer.texts_to_sequences(test)

train_padding = pad_sequences(train_sequences, **pad_options)
test_padding = pad_sequences(test_sequences, **pad_options)

# Glove Embedding Prep

In [None]:
import pickle
from time import time

# Load the pre-pickled GloVe 840B 300d lookup into `embeddings_index`.
# NOTE(review): `t` records the start time but is never read in the visible
# cells, so the load duration is not actually reported.
t = time()
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as fp:
    embeddings_index  = pickle.load(fp)

In [None]:
# Build the lookup table handed to the Embedding layer: one 300-wide row per
# tokenizer index (plus row 0). Words without a pre-trained vector keep an
# all-zero row.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))
for token, row in tqdm(word_index.items()):
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

print(f'Shape of Embedding: {embedding_matrix.shape}')

In [None]:
# Disabled code kept as a bare string literal: nothing inside it is executed.
# It parses embeddings line by line and fills a 200-column matrix named
# `embeddings_matrix`.
# NOTE(review): it looks like the loader from the earlier 200-d GloVe version;
# the path inside now points at the .pkl file, which presumably would not
# parse as text lines — confirm before re-enabling, or delete.
"""embeddings_index = {}
with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embeddings_matrix = np.zeros(((len(word_index)+1),200))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector """

# Model Construction

In [None]:
# Architecture: frozen pre-trained embedding -> Conv1D -> max-pooling ->
# bidirectional LSTM -> stack of dense layers -> single linear output.

# The input length is tied to max_len so it always matches the padded sequences.
input_layer = Input(shape=(max_len,))

# The pre-trained vectors are supplied through an initializer; the `weights=`
# constructor argument is rejected by newer Keras releases. Both dimensions
# are read from the matrix itself so the layer cannot drift out of sync.
x = embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(input_layer)
x = Conv1D(32, 3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Bidirectional(LSTM(150))(x)
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(16, activation='relu')(x)
predictions = Dense(1, activation='linear')(x)

model1 = Model(inputs=input_layer, outputs=predictions)

# summary() prints the table itself; wrapping it in print() added a stray "None".
model1.summary()

model1.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='mse',
    # Keras expects a list here; newer releases reject a bare string.
    metrics=['mae'],
)

# All three callbacks watch the training loss.
model_checkpoint = ModelCheckpoint('golve_840b300d.h5', monitor='loss', save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor="loss", min_delta=0, patience=10, verbose=0, mode="min", restore_best_weights=True)
# NOTE(review): min_lr equals the optimizer's initial learning rate (1e-5), so
# this callback cannot lower the rate, and its patience matches early
# stopping's. Confirm whether a lower min_lr / shorter patience was intended.
reduce_lr = ReduceLROnPlateau(monitor="loss", factor=0.2, patience=10, min_lr=0.00001)


# Submission code

In [None]:
# Train on the padded training sequences against the targets, with
# checkpointing, LR scheduling and early stopping attached.
model1.fit(
    train_padding,
    y,
    batch_size=256,
    epochs=200,
    callbacks=[model_checkpoint, reduce_lr, early_stopping],
)

In [None]:
# Run the trained model over the padded test sequences.
# NOTE(review): with the single-unit output layer this is presumably a 2-D
# (n_samples, 1) array — confirm before using it as a flat column.
y_pred = model1.predict(test_padding)

In [None]:
# Collect the prediction rows into a plain Python list and display it.
sub_scores = list(y_pred)
sub_scores

In [None]:
# Build the submission file: ids from the sample submission, one predicted
# target per id. The predictions are flattened to 1-D first because a
# DataFrame column cannot be built from a 2-D (n, 1) array.
sub = pd.DataFrame({'id': df_ss['id'], 'target': np.asarray(y_pred).reshape(-1)})
sub.to_csv('submission.csv', index=False)
sub.head()