

<h1 style='background-color:Gray; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;' > CommonLit Readability Prize With EDA+GRU+glove.840B.300d </h1>

## What is GloVe?
##### GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.



<img src="https://dz2cdn2.dzone.com/storage/article-thumb/11629773-thumb.jpg" width="800px">




<h1 style='background-color:Gray; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;' > Gated Recurrent Units  </h1>

## What is GRU? 
##### The GRU is a newer generation of recurrent neural network and is quite similar to an LSTM. GRUs drop the separate cell state and use the hidden state to carry information. A GRU also has only two gates: a reset gate and an update gate.

<img src="https://www.researchgate.net/publication/334385520/figure/fig1/AS:779310663229447@1562813549841/Structure-of-a-GRU-cell.ppm" width="800px">


## Data Description

### Files
* train.csv - the training set
* test.csv - the test set
* sample_submission.csv - a sample submission file in the correct format

### Columns
* id - unique ID for excerpt
* url_legal - URL of source - this is blank in the test set.
* license - license of source material - this is blank in the test set.
* excerpt - text to predict reading ease of
* target - reading ease
* standard_error - measure of spread of scores among multiple raters for each excerpt. Not included for test data.


### Dataset link 


#### [Here](https://www.kaggle.com/c/commonlitreadabilityprize)

In [None]:
!pip install dataprep

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
from dataprep.eda import *
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda import plot_missing
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import *
from tensorflow.keras.preprocessing.sequence import *
from tensorflow.keras.models import *
import tensorflow.keras.backend as k
from tensorflow.keras.optimizers import *
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import *
from keras.models import Sequential 

#For training model
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, GRU
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
gpu_devices=tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device,True)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Dropout
from nltk.corpus import *
from nltk.stem import *
import string
from sklearn.preprocessing import *
from tqdm import tqdm

In [None]:
# Read the three competition files in one pass: the labelled training set,
# the unlabelled test set and the sample submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# Preview the first few rows of the training frame.
train.head()

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
train.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train.describe().T

In [None]:
# Generate the Profiling Report

# Build a pandas_profiling report object for the training frame; the cells
# below render it.
# NOTE(review): `sort` is passed as the string "None", not the None object —
# confirm this is the value the installed pandas_profiling version expects.
import pandas_profiling as pp
profile = pp.ProfileReport(
    train,html={"style": {"full_width": True}}, sort="None"
)

In [None]:
# Render the profiling report through its widget interface.
profile.to_widgets()

In [None]:
# Display the profiling report via its rich notebook representation.
profile

In [None]:
# dataprep overview plot of the training frame.
plot(train)

In [None]:
# dataprep correlation plot for the training frame.
plot_correlation(train)

In [None]:
# dataprep overview plot of the test frame.
plot(test)

In [None]:
# Print, for every training column, how many distinct values it holds
# (a missing value counts as one distinct value).
for col, n_distinct in train.nunique(dropna=False).items():
    print(f"{col}: {n_distinct}")

In [None]:
# Show the raw `license` column.
train['license']

In [None]:
# Frequency of each distinct license string.
train['license'].value_counts()

In [None]:
# Bar chart of how many training rows carry each license type.
fig, ax = plt.subplots(figsize=(35, 5))
sns.countplot(data=train, x='license', saturation=0.2, color="r", ax=ax)
ax.set_title('Types of License')
plt.show()

In [None]:
# Side-by-side histograms (with KDE overlay) of the label and of its
# standard error.
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
panels = (
    ('target', "Target Distribution"),
    ('standard_error', "Standard Error Distribution"),
)
for axis, (column, panel_title) in zip(ax, panels):
    sns.histplot(train[column], kde=True, ax=axis)
    axis.set_title(panel_title)
plt.show()

In [None]:
# Summary statistics of the label and of its standard error, separated by a
# horizontal rule.
print(
    train.target.describe(),
    '_____________________________',
    train.standard_error.describe(),
    sep='\n',
)

In [None]:
# `re` is imported explicitly here instead of relying on it leaking into the
# namespace through one of the wildcard imports.
import re

# Extract the host name of every non-missing source URL. Values that do not
# look like an http(s) URL are skipped instead of raising IndexError.
URL_HOST_RE = re.compile(r'https?://([A-Za-z_0-9.-]+)')
url_list = [
    match.group(1)
    for match in map(URL_HOST_RE.search, train['url_legal'].dropna())
    if match is not None
]

# Count the hosts and list them in descending order of frequency.
urls_counts = Counter(url_list)
urls_counts_sorted = urls_counts.most_common()
urls_counts_df = pd.DataFrame(urls_counts_sorted, columns=['sites', 'counts'])
urls_counts_df

In [None]:
# Horizontal bar chart of the 20 most frequent source sites.
site = urls_counts_df['sites'].head(20)
count = urls_counts_df['counts'].head(20)

# Figure Size
fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal Bar Plot
ax.barh(site, count)

# Remove axes spines
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Remove x, y Ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines. The on/off flag is passed positionally because its
# keyword name differs between Matplotlib releases (`b` was renamed to
# `visible`), so `b=True` fails on newer versions.
ax.grid(True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# Show top values
ax.invert_yaxis()

# Add annotation to bars (drawn on the same axes as the bars)
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=10, fontweight='bold',
            color='grey')

# Add Plot Title
ax.set_title('Unique Sites count',
             loc='left')

# Add Text watermark
fig.text(0.9, 0.15, 'kritanjalijain', fontsize=12,
         color='grey', ha='right', va='bottom',
         alpha=0.7)

# Show Plot
plt.show()

In [None]:
# Print the excerpt stored under index label 0 of the training frame.
print("First example from train dataset: \n")
print(train.excerpt[0])

In [None]:
# Top 2 excerpts with lowest target

# Select the two rows directly and read the columns by name, so the output
# does not depend on column order and tied target values print distinct rows.
lowest_rows = train.nsmallest(2, 'target')
for target_value, excerpt in zip(lowest_rows['target'], lowest_rows['excerpt']):
    print("Target:", target_value)
    print(excerpt)
    print("-" * 100)


In [None]:
# Top 2 excerpts with highest target

# Select the two rows directly and read the columns by name, so the output
# does not depend on column order and tied target values print distinct rows.
# Reversed so the highest target is printed last, as before.
highest_rows = train.nlargest(2, 'target').iloc[::-1]
for target_value, excerpt in zip(highest_rows['target'], highest_rows['excerpt']):
    print("Target:", target_value)
    print(excerpt)
    print("-" * 100)

In [None]:
# `re` is imported explicitly instead of relying on a wildcard import to
# provide it.
import re

from nltk.stem import PorterStemmer


def excerpt_to_words(excerpt):
    ''' Convert excerpt text into a sequence of stemmed, stop-word-free words.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split on whitespace, English
    stop words are removed and the remaining words are Porter-stemmed.

    Parameters
    ----------
    excerpt : str
        Raw excerpt text.

    Returns
    -------
    list of str
        The processed words, in their original order.
    '''
    # Load the stop-word list and build the stemmer once per call instead of
    # once per word; set membership is also far cheaper than a list scan.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # lowercase, then blank out everything that is not a letter or digit
    text = re.sub(r"[^a-zA-Z0-9]", " ", excerpt.lower())

    # tokenize, drop stop words, stem
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

# Process the whole training corpus as one text. Excerpts are joined with a
# space so the last word of one excerpt cannot fuse with the first word of
# the next.
words_list = excerpt_to_words(' '.join(train['excerpt']))
# NOTE(review): `s` is not referenced by any other visible cell; kept for compatibility.
s = words_list[:10]

# Word frequencies, most frequent first.
words_list_freq = Counter(words_list)
words_list_freq_sorted = words_list_freq.most_common()

# Keep the 30 most frequent words and preview them.
words_list_freq_sorted_df = pd.DataFrame(words_list_freq_sorted, columns=['words', 'counts'])[:30]
words_list_freq_sorted_df.head()

In [None]:
# Compare one raw excerpt with the output of excerpt_to_words for it.
print("\nOriginal excerpt ->", train['excerpt'][0])
print("\nProcessed excerpt ->", excerpt_to_words(train['excerpt'][0]))

In [None]:
# Processed word list for every training excerpt, in row order.
X = [excerpt_to_words(raw_excerpt) for raw_excerpt in train['excerpt']]

In [None]:
# Horizontal bar chart of the 20 most frequent processed words.
word = words_list_freq_sorted_df['words'].head(20)
count = words_list_freq_sorted_df['counts'].head(20)

# Figure Size
fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal Bar Plot
ax.barh(word, count)

# Remove axes spines
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Remove x, y Ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines. The on/off flag is passed positionally because its
# keyword name differs between Matplotlib releases (`b` was renamed to
# `visible`), so `b=True` fails on newer versions.
ax.grid(True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# Show top values
ax.invert_yaxis()

# Add annotation to bars (drawn on the same axes as the bars)
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=10, fontweight='bold',
            color='grey')

# Add Plot Title
ax.set_title('Top 20 frequent words and no. of times they occured',
             loc='left')

# Add Text watermark
fig.text(0.9, 0.15, 'kritanjalijain', fontsize=12,
         color='grey', ha='right', va='bottom',
         alpha=0.7)

# Show Plot
plt.show()

In [None]:
# Copy the label and text columns out of the frame as standalone NumPy arrays.
targets, excerpt_text = (
    np.array(train[column]) for column in ('target', 'excerpt')
)

In [None]:
# Display the label array.
targets

In [None]:
# Display the array of raw excerpts.
excerpt_text

In [None]:
# Length (in tokens) that every encoded excerpt is padded or cut to.
max_length = 200

# Learn the word -> integer-id vocabulary from the training excerpts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['excerpt'])

# One slot more than the number of known words, because the embedding matrix
# built later is indexed directly by the tokenizer's word ids.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Encode every training excerpt as a list of integer word ids, then bring all
# sequences to `max_length`, adding padding at the end of short ones.
sequences_train = tokenizer.texts_to_sequences(excerpt_text)
X_train = pad_sequences(sequences_train, padding='post', maxlen=max_length)

# Regression labels, aligned row-for-row with X_train.
y_train = train['target'].values

In [None]:
# Display the training labels.
y_train

In [None]:
from tqdm import tqdm

# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line is split on single spaces: the first field is the word and the
# remaining fields are its vector components.
embedding_vector = {}
# The context manager closes the file even if parsing fails, and the encoding
# is stated explicitly instead of depending on the platform default.
with open('../input/glove-version840b300d/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.split(' ')
        word = value[0]
        coef = np.asarray(value[1:], dtype='float32')
        embedding_vector[word] = coef

In [None]:
# One 300-wide row per tokenizer word id. Rows of words that have no
# pretrained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tqdm(tokenizer.word_index.items()):
    if word in embedding_vector:
        embedding_matrix[i] = embedding_vector[word]

In [None]:
embedding_matrix.shape

In [None]:
 # NOTE(review): shape[1] is the embedding width (the matrix is created with
 # 300 columns), not the padded sequence length. The model cell passes
 # `max_length` as the Embedding input_length, and no other visible cell reads
 # this variable — confirm whether it is still needed.
 input_length =embedding_matrix.shape[1]

In [None]:
embid_dim = 300   # width of each word vector
lstm_out = 256    # units of the GRU in each direction

# Frozen pretrained embeddings -> bidirectional GRU -> tanh dense stack ->
# single linear output for the regression target.
model = keras.Sequential([
    Embedding(
        vocab_size,
        embid_dim,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Bidirectional(GRU(lstm_out, dropout=0.2)),
    Dense(128, activation='tanh'),
    Dense(128, activation='tanh'),
    Dropout(0.2),
    Dense(30, activation='tanh'),
    Dense(30, activation='tanh'),
    Dense(1, activation='linear'),
])
model.summary()

In [None]:
# Write a diagram of the model (with layer names and shapes) to a PNG file
# and display that file in the notebook.
from tensorflow.keras.utils import plot_model
from IPython.display import Image
plot_model(model, to_file='convnet.png', show_shapes=True,show_layer_names=True)
Image(filename='convnet.png')

In [None]:
# Callbacks
# Halve the learning rate (never below 1e-7) once `val_loss` has not improved
# for 3 epochs. `val_loss` is only reported when `fit` is given validation data.
lrd = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.50,
    patience=3,
    min_lr=1e-7,
    verbose=1,
)

# Save the model to the same file at the end of every epoch.
mcp = ModelCheckpoint('model_RUN.h5', save_freq='epoch', verbose=1)

# Stop training after 3 epochs without improvement of the monitored metric
# (left at the Keras default).
es = EarlyStopping(patience=3, verbose=1)

# Mean-squared-error regression, optimised with Adam at a small learning rate.
model.compile(
    optimizer=Adam(learning_rate=1e-05),
    loss=tf.keras.losses.MeanSquaredError(),
)

In [None]:
%time
history=model.fit(X_train,y_train,epochs=100,verbose=1,callbacks=[lrd,mcp,es])

In [None]:
# List the metric names recorded during training.
history.history.keys()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss curves. The validation curve is drawn only when the training
# run recorded one, and the title states what is actually shown.
loss = history.history['loss']
val_loss = history.history.get('val_loss')
epochs = range(1, len(loss) + 1)

plt.figure(figsize=(10,5))
plt.plot(epochs, loss, 's', color='C3', label='Training loss')
if val_loss is not None:
    plt.plot(epochs, val_loss, '-', color='C0', label='Validation loss')
    plt.title('Training and validation loss')
else:
    plt.title('Training loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Copy the test excerpts into a NumPy array and preview the test frame.
excerpt_test=np.array(test['excerpt'])
test.head()

In [None]:
# Encode and pad the test excerpts exactly like the training excerpts.
sequences_test = tokenizer.texts_to_sequences(excerpt_test)

X_test = pad_sequences(sequences_test, maxlen=max_length, padding='post')

prediction_testdata = model.predict(X_test)

# The model ends in a single-unit Dense layer, so predictions come back with
# one column; flatten them to 1-D before storing them as a DataFrame column.
test["target"] = prediction_testdata.ravel()
test.head()

In [None]:
# Store the predictions as a flat 1-D column (whatever shape the prediction
# array has) and write the submission file.
submission["target"] = np.asarray(prediction_testdata).ravel()
submission.to_csv('submission.csv',index=False)