In [None]:
import numpy as np
import pandas as pd
import time
import string
import re
import math
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau

import kerastuner as kt

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk
from nltk.stem.snowball import SnowballStemmer

import spacy
# Load the 'en_core_web_lg' spaCy pipeline once at start-up; removeStopwords()
# calls this object on each text and reads `is_stop` from the resulting tokens.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Read the train and test CSVs, then report the shape of each on its own line.
df_train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
df_test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

print(*(frame.shape for frame in (df_train, df_test)), sep='\n')

In [None]:
# Preview the first two training rows.
df_train.head(2)

In [None]:
# Preview the first two test rows.
df_test.head(2)

In [None]:
# Character length of each excerpt.
df_train['excerpt_len'] = df_train['excerpt'].str.len()
# Word count: split on any run of whitespace. Splitting on a single ' ' would
# glue together words separated by newlines/tabs and count empty strings
# between repeated spaces as words.
df_train['excerpt_word_count'] = df_train['excerpt'].str.split().str.len()

In [None]:
# Count the distinct lower-cased, whitespace-separated tokens in the raw excerpts.
results = Counter()
for tokens in df_train['excerpt'].str.lower().str.split():
    results.update(tokens)
print(len(results))

In [None]:
def removeStopwords(text):
    """Return ``text`` with every token flagged as a stop word removed.

    The text is run through the module-level ``nlp`` pipeline; tokens whose
    ``is_stop`` attribute is false are kept and joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by one space each, with no leading or
        trailing padding. An input with no surviving tokens yields ``''``.
    """
    doc = nlp(text)
    # join() avoids both the stray leading spaces and the quadratic cost of
    # growing a string one token at a time.
    return ' '.join(str(token) for token in doc if not token.is_stop)

def removePunctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def removeLinks(text):
    """Return ``text`` with URLs removed.

    A URL is anything starting with ``http://`` or ``https://``, or with
    ``www.``, up to the next whitespace character.

    Args:
        text: The string to filter.

    Returns:
        ``text`` with each matched URL replaced by the empty string;
        surrounding whitespace is left untouched.
    """
    # Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
    # In a plain literal '\S' is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    clean_text = re.sub(r'https?://\S+|www\.\S+', '', text)
    return clean_text

def removeNumbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
def clean(text):
    """Normalise one excerpt for modelling.

    Steps, in order: lower-case, strip links, drop stop words, strip
    punctuation, strip digits.

    Args:
        text: The raw excerpt string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Links must go first: removeLinks() recognises them by '://' and '.',
    # which removePunctuations() would otherwise have deleted already,
    # leaving the mangled URL in the text.
    text = removeLinks(text)
    text = removeStopwords(text)
    text = removePunctuations(text)
    text = removeNumbers(text)
    return text

In [None]:
# Run the cleaning pipeline over both splits, then preview the training frame.
for frame in (df_train, df_test):
    frame['excerpt_clean'] = frame['excerpt'].map(clean)
df_train.head(2)

In [None]:
# Count the distinct whitespace-separated tokens left after cleaning.
results = Counter()
for tokens in df_train['excerpt_clean'].str.lower().str.split():
    results.update(tokens)
print(len(results))

In [None]:
def stemWord(text):
    """Return ``text`` with every whitespace-separated token stemmed.

    Uses the English Snowball stemmer.

    Args:
        text: The string whose tokens should be stemmed.

    Returns:
        The stemmed tokens joined by single spaces, with no leading or
        trailing padding. Empty or whitespace-only input yields ``''``.
    """
    stemmer = SnowballStemmer(language='english')
    # join() avoids both the stray leading spaces and the quadratic cost of
    # growing a string one token at a time.
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Replace the cleaned text with its stemmed form in both splits.
for frame in (df_train, df_test):
    frame['excerpt_clean'] = frame['excerpt_clean'].map(stemWord)

In [None]:
# Count the distinct whitespace-separated tokens left after stemming.
results = Counter()
for tokens in df_train['excerpt_clean'].str.lower().str.split():
    results.update(tokens)
print(len(results))

In [None]:
def rmse(y_true, y_pred):
    """Root of the mean squared error between ``y_true`` and ``y_pred``."""
    return np.sqrt(mse(y_true, y_pred))


def rmse_loss(Estimator, X, y):
    """RMSE of ``Estimator.predict(X)`` measured against ``y``."""
    predictions = Estimator.predict(X)
    return rmse(y, predictions)

In [None]:
# Features are the cleaned excerpts; the label is the `target` column.
x, y = df_train['excerpt_clean'], df_train['target']

# Hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [None]:
# One shape per line, in the order: x_train, x_test, y_train, y_test.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)), sep='\n')

In [None]:
# Halve the learning rate (down to a floor of 1e-5) after 3 epochs without
# improvement in validation RMSE.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_root_mean_squared_error',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)
# Stop after 5 epochs without at least a 0.001 improvement in the monitored
# quantity and roll back to the best weights seen.
early_stopping = EarlyStopping(
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences.
        string: Metric name; ``'val_' + string`` is plotted alongside it.
    """
    series_names = [string, 'val_' + string]
    for name in series_names:
        plt.plot(history.history[name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()

In [None]:
def predict_complexity(model, excerpt):
    """Print each excerpt followed by the model's prediction for it.

    The texts are tokenized with the module-level ``tokenizer`` and
    padded/truncated to ``max_length`` before being passed to
    ``model.predict``.

    Args:
        model: Object exposing ``predict(padded_sequences)``.
        excerpt: An iterable of strings (list, pandas Series, ...) or a single
            string, which is treated as a one-element batch.

    Returns:
        None. Output is printed: one line for the text, one for its prediction.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(excerpt, str):
        excerpt = [excerpt]
    padding_type = 'post'
    # pad_sequences truncates from the front unless told otherwise; the
    # training sequences in this notebook are truncated from the back ('post'),
    # so inference must keep the same part of each text.
    truncating_type = 'post'
    sample_sequences = tokenizer.texts_to_sequences(excerpt)
    excerpt_padded = pad_sequences(sample_sequences, maxlen=max_length,
                                   padding=padding_type, truncating=truncating_type)
    classes = model.predict(excerpt_padded)
    # Iterate values directly: positional `excerpt[i]` is a label lookup on a
    # pandas Series and fails when the index is not 0..n-1.
    for passage, score in zip(excerpt, classes):
        print(passage)
        print(score)

In [None]:
# Corpus and hyper-parameters shared by the tokenizer, padding and model cells.
# Cleaned training excerpts: the tokenizer is fitted on these and the training
# sequences are built from them.
text = df_train['excerpt_clean']
# Passed as `num_words` to the Tokenizer and as the Embedding layer's first argument.
vocab_size = 60000
# Second argument of the Embedding layer.
embedding_dim = 64
# `maxlen` for pad_sequences and `input_length` of the Embedding layer.
max_length = 60
# Truncation and padding sides used when building the training sequences.
trunc_type='post'
pad_type='post'
# Passed to the Tokenizer as its `oov_token`.
oov_tok = "<OOV>"

In [None]:
# Fit the tokenizer on the cleaned training corpus and keep its word -> id map.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [None]:
# Convert the corpus to id sequences, bring every sequence to `max_length`,
# and collect the regression targets as an array.
training_sequences = tokenizer.texts_to_sequences(text)
training_padded = pad_sequences(
    training_sequences,
    padding=pad_type,
    truncating=trunc_type,
    maxlen=max_length,
)
training_labels_final = np.array(df_train['target'])

In [None]:
# One shape per line: padded inputs first, then labels.
print(*(arr.shape for arr in (training_padded, training_labels_final)), sep='\n')

In [None]:
# Embedding -> bidirectional GRU (32 units per direction) -> single linear output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)))
model.add(tf.keras.layers.Dense(1))

model.summary()

In [None]:
# Train on mean squared error with Adam; report RMSE as the metric.
learning_rate = 0.0001
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss='mean_squared_error',
    metrics=[RootMeanSquaredError()],
)

In [None]:
# Up to 200 epochs with 10% of the data held out for validation; the callbacks
# handle early stopping and learning-rate reduction.
num_epochs = 200
his = model.fit(
    x=training_padded,
    y=training_labels_final,
    validation_split=0.1,
    epochs=num_epochs,
    callbacks=[early_stopping, learning_rate_reduction],
)

In [None]:
# Per-epoch curves from the fit history: train/validation RMSE, then train/validation loss.
get_acc, value_acc, get_loss, validation_loss = (
    his.history[key]
    for key in (
        'root_mean_squared_error',
        'val_root_mean_squared_error',
        'loss',
        'val_loss',
    )
)

In [None]:
# Training vs validation RMSE per epoch. The plotted series come from the
# 'root_mean_squared_error' history entries, so they are labelled as RMSE.
epochs = range(len(get_acc))
plt.plot(epochs, get_acc, 'r', label='RMSE of Training data')
plt.plot(epochs, value_acc, 'b', label='RMSE of Validation data')
plt.title('Training vs validation root_mean_squared_error')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend(loc=0)
# No plt.figure() here: opening a new figure right before show() only adds
# an empty extra figure to the output.
plt.show()

In [None]:
# Training vs validation loss per epoch.
epochs = range(len(get_loss))
plt.plot(epochs, get_loss, 'r', label='Loss of Training data')
plt.plot(epochs, validation_loss, 'b', label='Loss of Validation data')
plt.title('Training vs validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc=0)
# No plt.figure() here: opening a new figure right before show() only adds
# an empty extra figure to the output.
plt.show()

In [None]:
# Predict on the test split using exactly the preprocessing applied to the
# training data:
#   * the cleaned + stemmed `excerpt_clean` column (the tokenizer and model
#     never saw raw text), and
#   * the same padding/truncation sides as the training sequences
#     (pad_sequences truncates from the front unless told otherwise).
sample_sequences = tokenizer.texts_to_sequences(df_test['excerpt_clean'])
excerpt_padded = pad_sequences(sample_sequences, maxlen=max_length,
                               truncating=trunc_type, padding=pad_type)
classes = model.predict(excerpt_padded)

sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
sample

In [None]:
# Build the submission as a new frame so `sample` is left untouched, and
# flatten the predictions to 1-D so each row receives one scalar target.
submit = sample.assign(target=np.ravel(classes))
submit.to_csv("submission.csv", index=False)
submit