In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
def master_seed(seed):
    """Seed the random number generators of Python, NumPy and TensorFlow.

    Args:
        seed: Integer seed passed unchanged to ``random.seed``,
            ``np.random.seed`` and ``tf.random.set_seed``.
    """
    # `random` is imported by the notebook, so seed it too; otherwise any
    # code drawing from Python's own RNG stays non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Seed once, before any data handling or model construction, so reruns start from the same RNG state.
master_seed(31416)
    

In [None]:
# Load the training CSV and preview its first ten rows.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_df.head(10)

In [None]:
# Summary statistics of the numeric columns.
train_df.describe()

In [None]:
# Print the first 20 excerpts, each followed by its target value.
# Columns are addressed by name (not by position) so the cell keeps working
# if the column order changes, and head() copes with fewer than 20 rows.

sample = train_df[['excerpt', 'target']].head(20)

for excerpt, target in sample.itertuples(index = False):

    print(excerpt, target, '\n')

In [None]:
# Compare the most complex excerpt with the simplest one.
# Bigger values in the target column are related to easier readability.
# idxmin()/idxmax() return index *labels*, so the rows are fetched with the
# label-based .loc; the position-based .iloc is only correct while the index
# happens to be the default 0..n-1 range.
mostcomplex_index = train_df['target'].idxmin()
simplest_index = train_df['target'].idxmax()

print('The simplest excerpt:\n')
print(train_df.loc[simplest_index, 'excerpt'], '\n')
print('With a target value of: {}'.format(train_df.loc[simplest_index, 'target']), '\n')

print('The most complex excerpt:\n')
print(train_df.loc[mostcomplex_index, 'excerpt'], '\n')
print('With a target value of: {}'.format(train_df.loc[mostcomplex_index, 'target']), '\n')

In [None]:
def remove_small(word_list):
    """Return the entries of ``word_list`` that are longer than three characters.

    Order is preserved and the input is not modified.
    """
    return [token for token in word_list if len(token) > 3]

def tokenize(ser):
    """Tokenize every text in ``ser`` with ``nltk.word_tokenize``.

    Returns:
        A 3-tuple ``(cleaned_tokens, full_tokens, unique_tokens)``:
        all tokens that pass ``remove_small``, all tokens, and the set of
        distinct tokens. Both lists keep the order of the texts in ``ser``.
    """
    tokens_per_text = [nltk.word_tokenize(text) for text in ser]

    all_tokens = [tok for toks in tokens_per_text for tok in toks]
    long_tokens = [tok for toks in tokens_per_text for tok in remove_small(toks)]

    return long_tokens, all_tokens, set(all_tokens)

def remove_stopwords(word_list):
    """Return the words of ``word_list`` that are not English stop words.

    The comparison is case-insensitive: NLTK's English stop-word list is
    lowercase, so capitalised tokens such as "There" or "They" would
    otherwise slip through. The returned words keep their original casing
    and order.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [w for w in word_list if w.lower() not in stop_words]

def create_vectorizer(text_list, sequence_length, batch_size, max_tokens = 33000):
    """Build a Keras ``TextVectorization`` layer adapted to ``text_list``.

    Args:
        text_list: Iterable of strings used to build the vocabulary.
        sequence_length: Value passed as ``output_sequence_length``.
        batch_size: Batch size of the dataset fed to ``adapt``.
        max_tokens: Upper bound on the vocabulary size; defaults to the
            previously hard-coded 33000.

    Returns:
        The adapted vectorization layer.
    """
    # Newer Keras releases dropped the `experimental.preprocessing` namespace,
    # older ones only have it there; try the stable location first.
    try:
        text_vectorization = tf.keras.layers.TextVectorization
    except AttributeError:
        text_vectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

    vectorizer = text_vectorization(max_tokens = max_tokens,
                                    output_sequence_length = sequence_length)
    text_array = np.array(text_list)

    text_ds = tf.data.Dataset.from_tensor_slices(text_array).batch(batch_size)

    vectorizer.adapt(text_ds)

    return vectorizer


def create_vector_mapping_dict(path_to_glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each useful line holds a word followed by space-separated coefficients.
    Lines that do not contain both parts (e.g. blank lines) are skipped.

    Args:
        path_to_glove_file: Path of the embeddings text file.

    Returns:
        Dict mapping each word to a 1-D float32 NumPy array.
    """
    embeddings_index = {}
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which can fail on non-ASCII tokens.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank line or a word without coefficients: nothing to store.
                continue
            word, coefs = parts
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    print("Found %s word vectors." % len(embeddings_index))

    return embeddings_index

def create_embedding_matrix(len_voc, embedding_dim, word_index, embedding_index):
    """Assemble the embedding matrix for the words in ``word_index``.

    Args:
        len_voc: Vocabulary size; the matrix gets ``len_voc + 2`` rows.
        embedding_dim: Number of columns of the matrix.
        word_index: Dict mapping each word to its row number.
        embedding_index: Dict mapping words to their embedding vectors.

    Returns:
        A ``(len_voc + 2, embedding_dim)`` array. Rows of words that are
        missing from ``embedding_index`` stay all-zero.
    """
    matrix = np.zeros((len_voc + 2, embedding_dim))

    found = 0
    for token, row in word_index.items():
        vector = embedding_index.get(token)
        if vector is None:
            # No pretrained vector for this word: keep its row at zero.
            continue
        matrix[row] = vector
        found += 1

    not_found = len(word_index) - found
    print("Converted %d words (%d misses)" % (found, not_found))

    return matrix




In [None]:
# cl: tokens longer than three characters, fll: every token, unq: set of distinct tokens
cl, fll, unq = tokenize(train_df['excerpt'])

In [None]:
# Size of the distinct-token set versus the total number of tokens
print(f'The number of unique words is {len(unq)}')
print(f'The total number of words in all the texts is {len(fll)}')

In [None]:
# Horizontal bar chart of the 20 most frequent tokens in `cl` (stop words still included)
_ = plt.figure(figsize = (7,7))
b = pd.Series(cl)
top_counts = b.value_counts().head(20)
top_counts.plot.barh()
plt.title('Most Common Words with stop words')
plt.xlabel('Occurences')

In [None]:
# Horizontal bar chart of the 20 most frequent tokens once stop words are removed
cl_stop = remove_stopwords(cl)
_ = plt.figure(figsize = (7,7))
b = pd.Series(cl_stop)
top_counts_stop = b.value_counts().head(20)
top_counts_stop.plot.barh()
plt.title('Most Common Words without stop words')
plt.xlabel('Occurences')

In [None]:
# Histogram of the `target` column of the training data
_ = plt.figure(figsize = (7,7))
sns.histplot(train_df['target'])
plt.title('Readibility Train data Histogram')
plt.xlabel('Readibility')

In [None]:
# Length of each excerpt, measured in NLTK tokens

word_count = [len(nltk.word_tokenize(para)) for para in train_df['excerpt']]

_ = plt.figure(figsize = (7,7))
sns.histplot(word_count)
plt.title('Word Count histogram')
plt.xlabel('Number of words in Exerpt')

In [None]:
# Adapt the vectorizer on the training excerpts (output sequence length 250, adapt batch size 128)
vectorizer = create_vectorizer(train_df['excerpt'], 250, 128)
voc = vectorizer.get_vocabulary()
# Map every vocabulary token to its position in the vocabulary list
word_index = dict(zip(voc, range(len(voc))))
# Load the 300-dimensional GloVe vectors and line them up with the vocabulary positions
embedding_index = create_vector_mapping_dict('../input/glove6b/glove.6B.300d.txt')
emb_matrix = create_embedding_matrix(len(voc), 300, word_index, embedding_index)

In [None]:
# Frozen embedding layer whose weights start from the pretrained matrix.
# The sizes must match emb_matrix, which has len(voc) + 2 rows and 300 columns.
embedding_dim = 300
num_tokens = len(voc) + 2

glove_initializer = tf.keras.initializers.Constant(emb_matrix)

embedding_layer = layers.Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=glove_initializer,
    trainable=False,
)

In [None]:
# Define the model: embedding -> LSTM -> two dense ReLU layers -> one linear output unit

# Input length 250 is the same value passed as sequence length when the vectorizer was created
inputs = tf.keras.Input(shape = (250,))

interm = embedding_layer(inputs)
interm = layers.LSTM(128, dropout = 0.2, recurrent_dropout = 0.2)(interm)
interm = layers.Dense(64, activation = 'relu')(interm)
interm = layers.Dense(32, activation = 'relu')(interm)
# Single linear unit: the network predicts one real-valued score per input
outputs = layers.Dense(1, activation = 'linear')(interm)

model = tf.keras.Model(inputs, outputs)

model.compile(optimizer = 'adam', loss = 'MSE')
model.summary()

In [None]:
# Sequential split: the first 80% of the rows train the model, the rest validate it
split = 0.8
points = train_df.shape[0]
n_train = int(points*split)

x_train, x_val = train_df['excerpt'][:n_train], train_df['excerpt'][n_train:]
y_train, y_val = train_df['target'][:n_train], train_df['target'][n_train:]

# Turn the raw excerpts into token-id sequences
x_train, x_val = vectorizer(x_train), vectorizer(x_val)

In [None]:
# Early stopping on validation loss with patience 1; restore_best_weights=True
# puts the best epoch's weights back once training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=1,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

callback_list = [early_stopping]


In [None]:
# Train for at most 20 epochs; the callbacks may end the run earlier
model.fit(
    x_train,
    y_train,
    batch_size = 128,
    epochs = 20,
    validation_data = (x_val, y_val),
    callbacks = callback_list,
)

In [None]:
# Predict on the test excerpts and write the id/target pairs to submission.csv
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
x_test = vectorizer(test_df['excerpt'])

# model.predict output is squeezed along axis 1 to get one value per row
results = pd.Series(np.squeeze(model.predict(x_test), 1))
ids = test_df['id']

# Naming the prediction series 'target' yields the final column name directly
submission = pd.concat([ids, results.rename('target')], axis = 1)
submission.to_csv('submission.csv', index = False)
