In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize #(word tokenize, sentence tokenize)
from bs4 import BeautifulSoup
import re, string, unicodedata
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf

In [None]:
# Read train.csv from the commonlitreadabilityprize input dataset into a DataFrame.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Show the value stored in the first row, fourth column (position 3).
# A single .iloc[row, col] lookup is used instead of the chained df.iloc[0][3]:
# the chained form indexes a label-indexed row Series with an integer key,
# which only works through a positional fallback that recent pandas deprecates.
df.iloc[0, 3]

In [None]:
# Histogram of the target column using 100 bins.
df['target'].plot.hist(bins=100)

In [None]:
# Drop the url_legal, license and standard_error columns in place.
df.drop(columns=['url_legal', 'license', 'standard_error'], inplace=True)

In [None]:
# Build the set of tokens to filter out: NLTK's English stop words plus every
# character in string.punctuation.
# The corpus is reached through the `nltk` package instead of the bare
# `stopwords` name: a later cell defines a function that is also called
# `stopwords`, which rebinds the name and would make a re-run of this cell
# fail with an AttributeError.
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation = list(string.punctuation)
# Merge the punctuation characters into the same set.
stop_words.update(punctuation)

In [None]:
#data cleaning

# Remove HTML markup from a string.
def strip_html(text):
    """Return the text content of ``text`` after parsing it with BeautifulSoup's 'html.parser'."""
    return BeautifulSoup(text, 'html.parser').get_text()

def square_brackets(text):
    """Return ``text`` with every square-bracketed span removed.

    A span starts at ``[`` and ends at the next ``]``; the brackets and
    everything between them are replaced by the empty string. Text without
    brackets is returned unchanged.
    """
    # Raw string: in a normal literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

def url_extract(text):
    """Return ``text`` with every ``http...`` run (up to the next whitespace) removed."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [None]:
def stopwords(text):
    """Return ``text`` without the tokens found in the module-level ``stop_words`` set.

    The text is split on whitespace, each token is compared case-insensitively
    against ``stop_words``, and the surviving tokens (original casing kept) are
    joined back with single spaces.

    NOTE(review): this function shares its name with the ``stopwords`` corpus
    imported from ``nltk.corpus`` and rebinds that name once this cell runs.
    """
    # split() already yields whitespace-free tokens, so no extra strip is needed.
    kept_tokens = [token for token in text.split() if token.lower() not in stop_words]
    return " ".join(kept_tokens)

def pre_process(text):
    """Run the full cleaning pipeline on one string and return the cleaned text.

    The steps are applied in this order: strip_html, square_brackets,
    url_extract, stopwords. Each step receives the output of the previous one.
    """
    for cleaning_step in (strip_html, square_brackets, url_extract, stopwords):
        text = cleaning_step(text)
    return text

In [None]:
# Clean every excerpt with pre_process and overwrite the column with the result.
df['excerpt'] = df['excerpt'].apply(pre_process)

In [None]:
# Preview the frame again, now with the cleaned excerpt column.
df.head()

In [None]:
# Flatten the excerpts into a single list of tokens.
def get_corpus(text):
    """Return every whitespace-separated token from an iterable of strings.

    Tokens appear in the order they are encountered; duplicates are kept.
    """
    return [token for passage in text for token in passage.split()]

# Token list built from all (already cleaned) excerpts.
corpus = get_corpus(df.excerpt)

In [None]:
# Peek at the first five tokens.
corpus[:5]

In [None]:
# Count how often each token occurs in the corpus and show the ten most frequent.
from collections import Counter
counter = Counter(corpus)

# most_common(10) yields (token, count) pairs; dict() turns them into a mapping.
most_common_words = dict(counter.most_common(10))
most_common_words

In [None]:
# Split excerpts and targets into train and test subsets (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df['excerpt'],
    df['target'],
    random_state=42,
)

In [None]:
maxfeatures = 10000  # used as Tokenizer num_words and as the Embedding layer's first argument
maxlength = 400  # used as maxlen for pad_sequences and input_length for the Embedding layer

In [None]:
#tokenize
# Fit the tokenizer on the training excerpts only, turn them into integer
# sequences, then pad them with maxlen=maxlength.
# NOTE: X_train is rebound here from the raw text to the padded array, so this
# cell must not be re-run without re-running the train/test split first.
tokenizer = text.Tokenizer(num_words=maxfeatures)
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlength)

In [None]:
# Display the padded training sequences.
X_train

In [None]:
#for test data
# Reuse the tokenizer fitted on the training excerpts (it is not re-fitted here),
# then pad with the same maxlen. X_test is rebound from raw text to the padded array.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlength)

In [None]:
# GloVe model for word embedding: path to the 200-dimensional glove.6B text file.
embedding_file = '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'

In [None]:
def get_coeff(word, *arr):
    """Pair a word with its coefficients.

    Returns a ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
    built from the remaining positional arguments (numeric strings are parsed).
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

# Parse the embedding file into a {word: float32 vector} dict. Each line is
# split on single spaces: the first field is the word, the rest its coefficients.
# The file is opened once inside a context manager so the handle is closed when
# parsing finishes (the previous code opened it twice and never closed either
# handle), and the encoding is stated explicitly instead of relying on the
# platform default.
with open(embedding_file, encoding='utf-8') as glove_file:
    embedding_index = dict(
        get_coeff(*line.rstrip().rsplit(' ')) for line in glove_file
    )

In [None]:
# Stack all pretrained vectors into one 2-D array to measure their statistics.
# np.stack needs a real sequence, so the dict view is materialised as a list.
all_embedd = np.stack(list(embedding_index.values()))
embedd_mean, embedd_std = all_embedd.mean(), all_embedd.std()
embedd_size = all_embedd.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so a vocabulary of N words
# needs N + 1 rows; the row count is still capped at maxfeatures.
nb_words = min(maxfeatures, len(word_index) + 1)

#creating a matrix
# Rows start as random draws matching the pretrained vectors' mean/std, so
# words without a pretrained vector keep a plausible random embedding.
embedding_matrix = np.random.normal(embedd_mean, embedd_std, (nb_words, embedd_size))

In [None]:
# Overwrite the random rows with the pretrained vector wherever one exists.
for word, i in word_index.items():
    # Guard against the matrix's actual row count rather than maxfeatures:
    # when the vocabulary is smaller than maxfeatures the matrix has fewer rows
    # and the largest word index would otherwise raise an IndexError.
    if i >= embedding_matrix.shape[0]:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [None]:
batch_size = 256  # passed to model.fit
epochs = 20  # passed to model.fit
# NOTE(review): embedd_size was already derived from the loaded vectors
# (all_embedd.shape[1]); this hard-coded value overrides it and must match the
# dimensionality of the embedding file.
embedd_size = 200

In [None]:
# Callback (not a number, despite the name): watches val_loss and, after
# 2 epochs without improvement, multiplies the learning rate by 0.5, never
# going below 0.0001.
learning_rate = ReduceLROnPlateau(monitor = 'val_loss',
                                  patience = 2, verbose = 1,
                                  factor = 0.5, min_lr=0.0001)

In [None]:
#creating a model
# Frozen pretrained embedding -> two stacked LSTMs -> small dense head with a
# single linear output, trained as a regression with mean squared error.
model = Sequential()
model.add(Embedding(maxfeatures, output_dim = embedd_size,
                    weights = [embedding_matrix], input_length = maxlength,
                    trainable = False))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(units = 128, return_sequences = True, recurrent_dropout = 0.25,
               dropout = 0.25))
model.add(LSTM(units = 64, recurrent_dropout = 0.1, dropout = 0.1))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(loss='mse', optimizer= keras.optimizers.Adam(learning_rate = 0.01))

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train the model. The held-out split (X_test, y_test) is used as validation
# data, so the val_loss watched by the ReduceLROnPlateau callback comes from it.
history = model.fit(X_train, y_train, batch_size=batch_size,
                    validation_data=(X_test, y_test), epochs = epochs,
                    callbacks=[learning_rate])

In [None]:
#training data Validation Loss
# model.evaluate returns the compiled loss (mse) computed on the training split;
# despite the label, this is not a validation figure.
print("Validation Loss of the model on Training Data is - " , model.evaluate(X_train,y_train))

In [None]:
#Testing validation Loss
# model.evaluate returns the compiled loss (mse) computed on the held-out split,
# the same data that was passed as validation_data during training.
print("Validation Loss of the model on Testing Data is - " , model.evaluate(X_test,y_test))

In [None]:
# Plot training vs. held-out loss per epoch.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# The x-axis is derived from the recorded history instead of a hard-coded 20,
# and it gets its own name so the integer `epochs` setting used by model.fit
# is not overwritten with a list (which would break a re-run of the fit cell).
epoch_axis = range(len(train_loss))

fig, ax = plt.subplots(1, 1)
ax.plot(epoch_axis, train_loss, 'go-', label= 'Training Loss')
ax.plot(epoch_axis, val_loss, 'ro-', label = 'Testing Loss')
ax.set_title('Training and Testing Loss')
ax.legend()
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
plt.show()

In [None]:
#make predictions
# Predict on the held-out split and show the first ten values. display() is
# needed because a bare expression in the middle of a cell renders nothing.
predictions = model.predict(X_test)
display(predictions[:10])

########Submissions#####
submission_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
submission_df.drop(['url_legal', 'license'], axis = 1, inplace = True)

# Apply the same cleaning that the training excerpts went through; the model
# was fitted on pre_process-ed text, so feeding raw excerpts would skew the
# inputs at inference time.
cleaned_excerpts = submission_df.excerpt.apply(pre_process)
tokenized_test = tokenizer.texts_to_sequences(cleaned_excerpts)
submission = sequence.pad_sequences(tokenized_test, maxlen=maxlength)
final_submission = model.predict(submission)

# The model has a single output unit, so predictions come back as an (n, 1)
# array; flatten to one value per row before storing them as a column.
submission_df['target'] = final_submission.ravel()
submission_df.drop('excerpt', axis = 1, inplace = True)
submission_df.to_csv('submission.csv',index=False)
display(submission_df.head(10))