In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.optimizers as O
import tensorflow.keras.losses as Los

from sklearn.model_selection import KFold

In [None]:
# Read the training and test splits from the competition's input folder.
train_path = '../input/commonlitreadabilityprize/train.csv'
test_path = '../input/commonlitreadabilityprize/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Number of distinct values in each column of the training data.
unique_counts = train_data.nunique()
print('Number of Unique values in the data :')
unique_counts

In [None]:
# Length (in characters) of the longest training excerpt.
# Stored under a descriptive name so the built-in `max` is not shadowed
# for the remainder of the kernel session.
max_excerpt_len = train_data.excerpt.str.len().max()
print("Maximum Length of an Excerpt:", max_excerpt_len)

In [None]:
# Summary statistics of the training data.
train_data.describe()

In [None]:
# Distribution of the readability target over the training excerpts.
plot_labels = {'xlabel': 'Readability', 'title': 'Density plot of Readability'}
ax = sns.displot(x=train_data['target'].values)
ax.set(**plot_labels)

In [None]:
# Box plot of the readability target (spread and outliers at a glance).
sns.boxplot(x=train_data['target'])

In [None]:
# Count how many excerpts have a non-negative vs. a negative target.
target_values = train_data.target.values
n_positive = len(target_values[target_values >= 0])
n_negative = len(target_values[target_values < 0])
print('Total number of Excerpts: ', len(train_data))
print("Number of Positive ease of read excerpts: ", n_positive)
print("Number of Negative ease of read excerpts: ", n_negative)

In [None]:
# Split the training rows by the sign of the target: >= 0 vs. < 0.
positive_excerpts = train_data.loc[train_data.target.values >= 0]
negative_excerpts = train_data.loc[train_data.target.values < 0]

In [None]:
def visualize_wordcloud(data):
    """Draw a word cloud built from every excerpt in ``data``.

    Args:
        data: Object exposing an ``excerpt`` column whose ``.values`` is a
            sequence of strings (e.g. a slice of the training DataFrame).

    The cloud is rendered on the current matplotlib axes with the axis
    hidden; nothing is returned, so callers may add a title afterwards.
    """
    # Join once instead of growing a string inside a loop, which copies the
    # accumulated text on every iteration.
    text = " ".join(data.excerpt.values)
    stopwords = set(STOPWORDS)
    wordcld = WordCloud(background_color='white',
                        stopwords=stopwords,
                        min_font_size=10).generate(text)
    plt.imshow(wordcld)
    plt.axis("off")
    plt.tight_layout(pad=0)

In [None]:
# Word cloud for the excerpts whose target is >= 0 (title typo fixed).
visualize_wordcloud(positive_excerpts)
plt.title('Positive readability word cloud')

In [None]:
# Word cloud for the excerpts whose target is < 0 (title typo fixed).
visualize_wordcloud(negative_excerpts)
plt.title('Negative readability word cloud')

In [None]:
# Character lengths of the excerpts in each group.
positive_len = [len(x) for x in positive_excerpts.excerpt.values]
negative_len = [len(x) for x in negative_excerpts.excerpt.values]

# One histogram + KDE per group. The lengths are passed explicitly as the
# x variable, and the "Desnity" typo in both titles is corrected.
ax = sns.displot(x=positive_len, kde=True, color='green')
ax.set(title='Density plot Lengths of Positive Readability Excerpts', xlabel='Length')
ax = sns.displot(x=negative_len, kde=True, color='red')
ax.set(title='Density plot Lengths of Negative Readability Excerpts', xlabel='Length')

In [None]:
# Fit a Keras tokenizer (created with num_words=500) on the training excerpts.
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(train_data['excerpt'].values)

In [None]:
# Turn every training excerpt into a list of token ids and report the longest one.
data_seq = tokenizer.texts_to_sequences(train_data['excerpt'].values)
seq_lengths = [len(seq) for seq in data_seq]
print("Maximum sequence length: ", np.max(seq_lengths))

In [None]:
# Training hyper-parameters.
MAX_LEN = 172     # length every token sequence is padded/truncated to
                  # NOTE(review): presumably taken from the maximum sequence
                  # length printed in the previous cell - confirm.
EPOCHS = 10       # passes over the training split per fit() call
BATCH_SIZE = 16   # samples per gradient update

In [None]:
# Bring every training sequence to MAX_LEN tokens, padding at the end.
pad_data_seq = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=data_seq,
    maxlen=MAX_LEN,
    padding='post',
)

In [None]:
def build_model(vocab_size=500, embedding_dim=62, max_len=None):
    """Build and compile the bidirectional-LSTM readability regressor.

    Architecture: Embedding -> Bidirectional(LSTM(32)) -> Dense(64, relu)
    -> Dense(32, relu) -> Dense(1) with a linear output.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. Defaults to 500,
            the same value the notebook passes as ``num_words`` to its
            Tokenizer.
        embedding_dim: Size of each embedding vector (default 62).
        max_len: Length of the input sequences. When ``None`` (default) the
            module-level ``MAX_LEN`` is looked up at call time.

    Returns:
        A compiled Keras model using mean-squared-error loss, the Adam
        optimizer, and root-mean-squared error as the reported metric.
    """
    if max_len is None:
        max_len = MAX_LEN

    inp = L.Input(shape=(max_len,))
    emb = L.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    X = L.Bidirectional(L.LSTM(32))(emb)
    X = L.Dense(64, activation='relu')(X)
    X = L.Dense(32, activation='relu')(X)
    out = L.Dense(1)(X)

    model = M.Model(inputs=inp, outputs=out)
    # The target is a continuous score, so classification accuracy ('acc')
    # carries no information here; report RMSE instead.
    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])
    return model

In [None]:
# Instantiate the network and print its layer-by-layer summary.
model = build_model()
model.summary()

In [None]:
# 5-fold cross-validation over the padded training sequences.
kf = KFold(n_splits=5,random_state=24,shuffle=True)

# Training history of every fold, so per-fold validation curves can be compared.
fold_histories = []

for index,(t_idx,v_idx) in enumerate(kf.split(pad_data_seq)):
    print(f"######## STEP {index+1} ########")
    train_data_seq = pad_data_seq[t_idx]
    val_data_seq = pad_data_seq[v_idx]
    train_target = train_data.target.values[t_idx]
    val_target = train_data.target.values[v_idx]

    # Start each fold from freshly initialised weights. Re-using one model
    # across folds means that, from the second fold on, the "validation"
    # rows were already trained on in an earlier fold, so the reported
    # validation loss would be leaked / over-optimistic.
    model = build_model()

    history = model.fit(train_data_seq,
                        train_target,
                        validation_data=(val_data_seq,val_target),
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE)
    fold_histories.append(history.history)

# After the loop `model` refers to the model trained on the last fold.

In [None]:
# Tokenise and pad the test excerpts exactly like the training excerpts.
test_data_seq = tokenizer.texts_to_sequences(test_data.excerpt.values)
# padding='post' matches the training data; the default ('pre') would put the
# zeros at the opposite end of each sequence from where the model saw them
# during training.
test_data_seq = tf.keras.preprocessing.sequence.pad_sequences(test_data_seq,
                                                              maxlen=MAX_LEN,
                                                              padding='post')

In [None]:
# Sanity check on the training data. Kept under its own name so it is not
# confused with (or silently overwritten by) the test-set predictions that
# go into the submission.
train_pred = model.predict(pad_data_seq)
train_rmse = np.sqrt(np.mean((train_pred.ravel() - train_data.target.values) ** 2))
print("Training RMSE:", train_rmse)

In [None]:
# Predict the readability score of every test excerpt.
pred = model.predict(x=test_data_seq, verbose=1)

In [None]:
# Load the sample submission file; its `target` column is filled in below.
sample_path = '../input/commonlitreadabilityprize/sample_submission.csv'
sampl = pd.read_csv(sample_path)

In [None]:
# Write the predictions into the submission frame.
# Item assignment is used instead of attribute assignment (which only works
# when the column already exists), and the (n, 1) model output is flattened
# to a 1-D vector so the column holds plain scalars.
sampl['target'] = np.ravel(pred)

In [None]:
# Display the submission frame with the predicted targets filled in.
sampl

In [None]:
# Save the submission without the DataFrame index column.
sampl.to_csv(path_or_buf='submission.csv', index=False)