In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.optimizers as O
import tensorflow.keras.losses as Los

from sklearn.model_selection import KFold

from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import string

# Shared text-processing helpers: a Porter stemmer instance and NLTK's English
# stop-word list (requires the NLTK `stopwords` corpus to be available).
stemmer = nltk.PorterStemmer()
stop_words = stopwords.words('english')

In [None]:
import transformers

In [None]:
# Load the competition's train and test tables (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# The print is only a caption; the cell output is the distinct-value count per column.
print('Number of Unique values in the data :')
train_data.nunique()

In [None]:
# Longest excerpt, measured in characters.
# Stored under a dedicated name: binding the result to `max` would shadow the
# builtin `max` for every cell executed afterwards in this kernel.
max_excerpt_len = max(len(x) for x in train_data.excerpt.values)
print("Maximum Length of an Excerpt:", max_excerpt_len)

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train_data.describe()

In [None]:
# Distribution of the readability target; `displot` returns a figure-level grid,
# and `.set` applies the labels to its axes.
ax = sns.displot(x=train_data.target.values)
ax.set(title='Density plot of Readability', xlabel='Readability')

In [None]:
# Box plot of the target column.
sns.boxplot(x=train_data.target)

In [None]:
# How many excerpts fall on each side of a zero target (zero counts as positive).
print('Total number of Excerpts: ', len(train_data))
print("Number of Positive ease of read excerpts: ", np.count_nonzero(train_data.target.values >= 0))
print("Number of Negative ease of read excerpts: ", np.count_nonzero(train_data.target.values < 0))

In [None]:
# Partition the training rows by the sign of the target (zero goes with positive).
positive_excerpts = train_data.loc[train_data.target.values >= 0]
negative_excerpts = train_data.loc[train_data.target.values < 0]

In [None]:
def visualize_wordcloud(data):
    """Draw a word cloud built from every excerpt in ``data``.

    Parameters
    ----------
    data : object with an ``excerpt`` attribute
        ``data.excerpt.values`` must be a sequence of strings (the notebook
        passes pandas DataFrames).

    Returns
    -------
    None
        The cloud is drawn with ``plt.imshow`` on the current matplotlib axes,
        with the axis frame hidden and tight layout applied.
    """
    # Join once instead of growing a string inside a loop (which is quadratic).
    text = " ".join(data.excerpt.values)
    # Named so it does not shadow the `stopwords` module imported from nltk.
    excluded_words = set(STOPWORDS)
    cloud = WordCloud(
        background_color='white',
        stopwords=excluded_words,
        min_font_size=10,
    ).generate(text)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

In [None]:
# Word cloud for excerpts whose target is >= 0.
visualize_wordcloud(positive_excerpts)
plt.title('Positive readability word cloud')

In [None]:
# Word cloud for excerpts whose target is < 0.
visualize_wordcloud(negative_excerpts)
plt.title('Negative readability word cloud')

In [None]:
# Word cloud over the entire training set.
visualize_wordcloud(train_data)
plt.title('Whole data Readability word cloud')

In [None]:
# Character lengths of the excerpts on each side of the zero-target split.
positive_len = [len(x) for x in positive_excerpts.excerpt.values]
negative_len = [len(x) for x in negative_excerpts.excerpt.values]

# One figure per group; `displot` returns a figure-level grid whose `.set`
# forwards the title and label to its axes.
ax = sns.displot(data=positive_len, kde=True, color='green')
ax.set(title='Density plot Lengths of Positive Readability Excerpts', xlabel='Length')
ax = sns.displot(data=negative_len, kde=True, color='red')
ax.set(title='Density plot Lengths of Negative Readability Excerpts', xlabel='Length')

In [None]:
# Resolve TextVectorization from its stable location when the installed Keras
# exposes it there, and fall back to the older experimental namespace otherwise,
# so this cell runs on both old and new TensorFlow releases.
_text_vectorization_cls = getattr(L, 'TextVectorization', None)
if _text_vectorization_cls is None:
    _text_vectorization_cls = L.experimental.preprocessing.TextVectorization

# Vocabulary capped at 21000 tokens; each excerpt is emitted as 205 token ids.
vectorizer = _text_vectorization_cls(max_tokens=21000, output_sequence_length=205)

# Learn the vocabulary from the training excerpts only.
train_data_ds = tf.data.Dataset.from_tensor_slices(train_data.excerpt.values)
vectorizer.adapt(train_data_ds)

In [None]:
# Apply the adapted vectorizer to every training excerpt and keep the result as a NumPy array.
train_data_vec = vectorizer(train_data.excerpt.values).numpy()

In [None]:
path_to_glove_file = '../input/glove42b300dtxt/glove.42B.300d.txt'
embeddings_index = {}

# Each line is "<word> <v1> <v2> ...". The file is read as UTF-8 explicitly so
# parsing does not depend on the platform's default locale encoding.
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in tqdm(f):
        # Split only on the first whitespace run: token first, numbers after.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Training schedule.
EPOCHS = 15
BATCH_SIZE = 16

# Sequence and embedding geometry.
MAX_LEN = 205  # input length expected by the model's Input layer
EMBEDDING_DIM = 301  # pretrained vector components plus one trailing flag slot
VOCAB_SIZE = len(vectorizer.get_vocabulary())

In [None]:
num_tokens = VOCAB_SIZE + 2
word_index = {
    token: position
    for token, position in zip(vectorizer.get_vocabulary(), range(VOCAB_SIZE))
}
hits = 0
misses = 0

# Build the embedding matrix. Each vocabulary row holds EMBEDDING_DIM - 1
# pretrained components followed by one flag slot:
#   * token found in the GloVe index -> its vector, flag 0
#   * token missing from the index   -> zeros, flag 0.5
# Rows with no vocabulary entry keep their all-zero initial value.
embedding_matrix = np.zeros((num_tokens, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        embedding_matrix[row] = np.concatenate((np.zeros((EMBEDDING_DIM-1)), [5e-1]))
        misses += 1
        continue
    embedding_matrix[row] = np.concatenate((pretrained, [0]))
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
def build_model():
    """Build and compile the Conv1D + bidirectional-LSTM regression model.

    Reads the module-level ``MAX_LEN``, ``num_tokens``, ``EMBEDDING_DIM`` and
    ``embedding_matrix``. The embedding layer is initialised from
    ``embedding_matrix`` and frozen (``trainable=False``).

    Returns
    -------
    A Keras model with a single-unit linear output, compiled with MSE loss,
    the Adam optimizer and a root-mean-squared-error metric.
    """
    inp = L.Input(shape=(MAX_LEN,))
    emb = L.Embedding(
        input_dim=num_tokens,
        output_dim=EMBEDDING_DIM,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )(inp)

    # Three stacked conv blocks (16 -> 32 -> 64 filters, kernel size 3).
    # Each block must consume the previous block's output `X`; wiring a later
    # convolution to `emb` would disconnect every block before it from the graph.
    X = emb
    for n_filters in (16, 32, 64):
        X = L.Conv1D(n_filters, 3)(X)
        X = L.BatchNormalization()(X)
        X = L.Activation('relu')(X)
        X = L.MaxPooling1D()(X)

    X = L.Dropout(0.3)(X)
    X = L.Bidirectional(L.LSTM(32, recurrent_initializer='glorot_uniform'))(X)
    X = L.Dense(64, activation='relu')(X)
    X = L.Dense(32, activation='relu')(X)
    out = L.Dense(1, kernel_initializer='glorot_uniform')(X)

    rms = tf.keras.metrics.RootMeanSquaredError()
    model = M.Model(inputs=inp, outputs=out)
    model.compile(loss='mse', optimizer='adam', metrics=[rms])

    return model

In [None]:
# Instantiate the model once and print its layer summary.
model = build_model()
model.summary()

In [None]:
kf = KFold(n_splits=5, random_state=24, shuffle=True)

# Models trained in each fold, kept so they can be inspected or combined later.
fold_models = []

for index, (t_idx, v_idx) in enumerate(kf.split(train_data_vec)):
    print(f"\n ######## STEP {index+1} ######## \n")
    train_x = train_data_vec[t_idx]
    val_x = train_data_vec[v_idx]
    train_y = train_data.target.values[t_idx]
    val_y = train_data.target.values[v_idx]

    # Start every fold from freshly initialised weights. Re-using one model
    # across folds would mean each fold's validation rows had already been
    # trained on in an earlier fold, making the validation metrics meaningless.
    model = build_model()

    history = model.fit(train_x,
                        train_y,
                        validation_data=(val_x, val_y),
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE)
    fold_models.append(model)

    # Predicted vs. real target over the full training set for this fold's model.
    pred = model.predict(train_data_vec)
    plt.scatter(pred, train_data.target.values)
    plt.xlabel('Predicted')
    plt.ylabel('Real')
    plt.show()

# NOTE: after the loop `model` refers to the model trained in the last fold.

In [None]:
# Vectorize the test excerpts with the vectorizer adapted on the training data.
test_data_vec = vectorizer(test_data.excerpt.values).numpy()

In [None]:
# Predict targets for the vectorized test excerpts using the current `model`.
pred = model.predict(test_data_vec,verbose=1)

In [None]:
# Load the competition's sample submission file as the frame to fill in.
sampl = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# Write predictions into the `target` column.
# Item assignment is used because attribute assignment only updates a column
# that already exists and otherwise just sets a plain attribute. The
# predictions are flattened so the column receives a 1-D vector instead of
# relying on pandas to coerce an (n, 1) array.
sampl['target'] = np.asarray(pred).ravel()

In [None]:
# Display the submission frame as the cell output.
sampl

In [None]:
# Write the submission file without the index column.
sampl.to_csv('submission.csv',index=False)