
## Overview

This script performs EDA on `train.csv` and then preprocesses it to train a model, which is in turn used to build algorithms that rate the complexity of reading passages for grade 3-12 classroom use.



## Installing and importing dependencies

First let us import all the modules and packages that will be required.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

#For Preprocessing
from nltk.tokenize import word_tokenize
from collections import Counter
import re #regex for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

#For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#For training model
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
# Turn on memory growth for every GPU TensorFlow reports; the loop body never
# runs when no GPU is listed.
gpu_devices=tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device,True)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


## Loading and Exploring dataset

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


Three files have been provided to us in this competition:

* `train.csv`: The CSV file containing all the training excerpts as well as their corresponding metadata, such as their ID and their target complexities.
* `test.csv`: The CSV file containing the excerpts that will be used for testing purposes.
* `sample_submission.csv`: The CSV file containing all the excerpt IDs in the test set, for which we'll have to populate the `target` prediction column.

Let's read these files.

In [None]:
# Load the three competition files into DataFrames.
# train_data: labelled excerpts; test_data: excerpts to predict;
# submission: template whose `target` column is filled in at the end.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

#### Printing first 10 rows of the train data:

In [None]:
# Preview the first ten training rows.
train_data.head(10)

The dataset has the following columns:
 * `id` : Denotes the id of the excerpt
 * `url_legal` : Denotes the URL of the site from where the excerpt was taken
 * `license` : Denotes the License under which the excerpt lies
 * `excerpt` : The text data (readings)
 * `target` : Denotes the ease of readability score 
 * `standard_error` : Denotes the corresponding standard error of the target measure across different rating scores

In [None]:
# Summarise the columns: dtypes and non-null counts.
train_data.info()

In [None]:
# Count the missing values in each column.
train_data.isnull().sum()

Thus, we realise that there are 2834 entries in our dataset, with no missing values except in the `url_legal` and `license` columns.

In [None]:
# Descriptive statistics for the numeric columns.
train_data.describe()

#### Printing the number of unique values of each column in the dataset:

In [None]:
# Report how many distinct values each column holds; NaN is counted as one
# value, matching len(Series.unique()).
for col, n_distinct in train_data.nunique(dropna=False).items():
    print(f"{col}: {n_distinct}")

Thus, we can conclude that all targets and standard errors are unique in the dataset.

Also, out of 2834 excerpts, 830 excerpts have a license, with only 15 unique licenses (the 16th value refers to NaN).


## Understanding the License Distribution

Let us jot down the different types of license and the number of excerpts they apply to.

In [None]:
# Number of excerpts per license type.
train_data['license'].value_counts()

In [None]:
# Horizontal count plot: one bar per license type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=train_data,
    y='license',
    saturation=0.2,
    color="blue",
    ax=ax,
)
ax.set_title('Types of License')
plt.show()

## Understanding the URLs distribution

In [None]:
# Pull the host name out of every non-null source URL.
host_pattern = re.compile('https?://([A-Za-z_0-9.-]+).*')
url_list = [
    host_pattern.findall(url)[0]
    for url in train_data['url_legal'].dropna()
]

# Tally the hosts and list them from most to least frequent.
urls_counts = Counter(url_list)
urls_counts_sorted = urls_counts.most_common()
urls_counts_df = pd.DataFrame(urls_counts_sorted, columns=['sites', 'counts'])
urls_counts_df

In [None]:
# Horizontal bar chart of the 20 most frequent source sites.
site = urls_counts_df['sites'].head(20)
count = urls_counts_df['counts'].head(20)

fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(site, count)

# Strip the frame and the tick marks so only bars and labels remain.
for side in ['top', 'bottom', 'left', 'right']:
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Padding between the axes and their tick labels.
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Faint grid lines. Visibility is passed positionally: the former `b=`
# keyword was renamed to `visible=` in Matplotlib 3.5 and later removed, so
# `b=True` fails on current releases while the positional form works on all.
ax.grid(True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# barh draws the first row at the bottom; invert so the first (most
# frequent) site ends up on top.
ax.invert_yaxis()

# Write each bar's count just past the end of the bar.
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=10, fontweight='bold',
            color='grey')

ax.set_title('Unique Sites count', loc='left')

# Author watermark in the lower-right corner of the figure.
fig.text(0.9, 0.15, 'kritanjalijain', fontsize=12,
         color='grey', ha='right', va='bottom',
         alpha=0.7)

plt.show()

## Understanding the Excerpts

Now, let's look at an excerpt.

In [None]:
# Show the raw text of the first training excerpt.
print("First example from train dataset: \n")
print(train_data.excerpt[0])

Let's look at the distribution of the target and standard error.

In [None]:
# Side-by-side histograms (with KDE) of the target and its standard error.
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
panels = (
    ('target', "Target Distribution"),
    ('standard_error', "Standard Error Distribution"),
)
for axis, (column, title) in zip(ax, panels):
    sns.histplot(train_data[column], kde=True, ax=axis)
    axis.set_title(title)
plt.show()

# Numeric summaries of both columns, separated by a rule.
print(train_data.target.describe())
print("-" * 50)
print(train_data.standard_error.describe())

Targets follow a normal distribution centered at -1. It is apparent that negative targets are more common than positive ones, with the training range going from -3.676 up to 1.711 with 1.033 as standard deviation.

The standard deviation of `standard_error` variable is small, 0.034. Low standard error means that multiple rating systems mostly agreed regarding the ease of readability score and high standard error means that ratings from multiple rating systems are scattered.

#### Let's look at the texts with highest and lowest readability.

In [None]:
# Top 2 excerpts with lowest target

min_targets = sorted(train_data['target'])[:2]
for min_target in min_targets:
    # Filter once per target and address the columns by name, so the cell
    # does not depend on the positional order of the DataFrame columns.
    matches = train_data[train_data['target'] == min_target]
    print("Target:", matches['target'].iloc[0])
    print(matches['excerpt'].iloc[0])
    print("-" * 100)

In [None]:
# Top 2 excerpts with highest target

max_targets = sorted(train_data['target'])[-2:]
for max_target in max_targets:
    # Filter once per target and address the columns by name, so the cell
    # does not depend on the positional order of the DataFrame columns.
    matches = train_data[train_data['target'] == max_target]
    print("Target:", matches['target'].iloc[0])
    print(matches['excerpt'].iloc[0])
    print("-" * 100)

Thus, we can conclude that excerpts with higher target scores have a lower reading complexity, i.e. fewer difficult words and simpler sentences, than excerpts with lower target scores.

#### Visualizing wordcloud of the top 500 excerpts with the highest and lowest target scores:

In [None]:
def wordcloud_draw(data, color = 'white'):
    """Draw a word cloud for an iterable of words.

    Args:
        data: Iterable of strings; they are joined with single spaces before
            being handed to WordCloud.
        color: Background colour of the rendered image.
    """
    text = ' '.join(data)
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=color,
        width=3000,
        height=2000,
    )
    cloud.generate(text)

    # Always draw into figure 1, without axes.
    plt.figure(1, figsize=(12, 8))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()

# Excerpts ordered from lowest to highest target; both ends are used below.
excerpts_by_target = train_data.sort_values('target')

# Whitespace-separated words of the 500 lowest-target excerpts.
lowscore_excerpts_words = [
    token
    for text in excerpts_by_target.head(500)['excerpt']
    for token in text.split()
]

print("Wordcloud for 500 excerpts with lowest targets:")
wordcloud_draw(lowscore_excerpts_words, color='black')

# Whitespace-separated words of the 500 highest-target excerpts.
highscore_excerpts_words = [
    token
    for text in excerpts_by_target.tail(500)['excerpt']
    for token in text.split()
]
print("-" * 100)
print("\nWordcloud for 500 excerpts with highest targets:")
wordcloud_draw(highscore_excerpts_words, color='black')

Words present in lower scoring excerpts such as government, light, matter and current stand out the most.

Words present in higher scoring excerpts such as said, went, little and children stand out the most.

## Preprocessing

Let's convert the passages to lists of partially preprocessed words, i.e. lowercased, stemmed, and without punctuation and stopwords, and then count how many times each word occurred, sorted in descending order.

In [None]:
from nltk.stem import PorterStemmer

# Built once instead of once per token: the previous version evaluated
# stopwords.words("english") and PorterStemmer() inside the comprehensions,
# i.e. for every word of every excerpt, and tested membership against a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()


def excerpt_to_words(excerpt):
    """Convert excerpt text into a list of stemmed, stopword-free tokens.

    Args:
        excerpt: Raw excerpt text.

    Returns:
        The tokens of ``excerpt`` in their original order: lowercased, split
        on every character that is not an ASCII letter or digit, with English
        stopwords removed and the Porter stemmer applied.
    """
    # Lowercase first so the stopword comparison is case-insensitive.
    text = excerpt.lower()
    # Replace anything that is not a letter or digit with a space. Digits are
    # deliberately kept by this pattern.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    tokens = text.split()
    return [
        _STEMMER.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]

# Tokenise the whole corpus in one pass. Excerpts are joined with a space so
# the last word of one excerpt cannot fuse with the first word of the next.
words_list = excerpt_to_words(' '.join(train_data['excerpt']))
s = words_list[:10]  # first ten tokens, kept for a quick look

# Word frequencies, most frequent first.
words_list_freq = Counter(words_list)
words_list_freq_sorted = words_list_freq.most_common()

# Keep only the 30 most frequent words for the chart further down.
words_list_freq_sorted_df = pd.DataFrame(words_list_freq_sorted, columns=['words', 'counts'])[:30]
words_list_freq_sorted_df.head()

In [None]:
# Compare the first excerpt before and after preprocessing.
first_excerpt = train_data['excerpt'][0]
print("\nOriginal excerpt ->", first_excerpt)
print("\nProcessed excerpt ->", excerpt_to_words(first_excerpt))

In [None]:
# One token list per training excerpt.
# NOTE(review): X does not appear to be referenced by the later cells shown here - confirm before relying on it.
X = [excerpt_to_words(text) for text in train_data['excerpt']]

Let's see the most commonly occurring words in the excerpts (except for stopwords).

In [None]:
# Horizontal bar chart of the 20 most frequent (stemmed) words.
word = words_list_freq_sorted_df['words'].head(20)
count = words_list_freq_sorted_df['counts'].head(20)

fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(word, count)

# Strip the frame and the tick marks so only bars and labels remain.
for side in ['top', 'bottom', 'left', 'right']:
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Padding between the axes and their tick labels.
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Faint grid lines. Visibility is passed positionally: the former `b=`
# keyword was renamed to `visible=` in Matplotlib 3.5 and later removed, so
# `b=True` fails on current releases while the positional form works on all.
ax.grid(True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# barh draws the first row at the bottom; invert so the first (most
# frequent) word ends up on top.
ax.invert_yaxis()

# Write each bar's count just past the end of the bar.
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=10, fontweight='bold',
            color='grey')

ax.set_title('Top 20 frequent words and no. of times they occured',
             loc='left')

# Author watermark in the lower-right corner of the figure.
fig.text(0.9, 0.15, 'kritanjalijain', fontsize=12,
         color='grey', ha='right', va='bottom',
         alpha=0.7)

plt.show()

In [None]:
# NumPy copies of the label column and the raw excerpt text.
# NOTE(review): `targets` does not appear to be used by the later cells shown
# here (training reads `y_train`) - confirm.
targets=np.array(train_data['target'])
excerpt_text=np.array(train_data['excerpt'])

In [None]:
# Fit the Keras tokenizer on the raw training excerpts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.excerpt)  
# One extra slot on top of the vocabulary: the embedding matrix built later is
# indexed directly by the tokenizer's word indices.
vocab_size = len(tokenizer.word_index) + 1 
# Every sequence is padded/truncated to this many tokens.
max_length = 200

In [None]:
# Turn each training excerpt into a sequence of integer word indices.
sequences_train = tokenizer.texts_to_sequences(excerpt_text) 

# Fixed-length model input: sequences are padded at the end up to max_length.
X_train = pad_sequences(sequences_train, maxlen=max_length, padding='post')

# Regression labels. The test set is tokenised in a later cell, after training.
y_train = train_data.target.values

In [None]:
# Load the pre-trained 100-dimensional GloVe vectors into a word -> vector
# lookup table.
embeddings_dictionary = dict()
embedding_dim = 100

# `with` closes the file even if a line fails to parse, and the explicit
# encoding keeps the read independent of the platform default.
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is "<word> <v1> <v2> ...".
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row `index` holds the GloVe vector of the tokenizer word with that index;
# words missing from GloVe keep an all-zero row.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

## Defining Model

In [None]:
# Embedding layer initialised from the GloVe matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated in training.
embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False)

In [None]:
# Upper bound on the number of training epochs passed to model.fit.
num_epochs = 100
# No batch size is set here; model.fit below is called without one.
#batch_size = 1000

In [None]:
# Regression network: GloVe embedding -> Conv1D feature extractor ->
# bidirectional LSTM -> small dense head with a single linear output.
# Every layer is taken from tf.keras; the previous version wrapped an LSTM
# imported from the standalone `keras` package in a tf.keras Bidirectional,
# and mixing the two packages in one model is not supported.
model = tf.keras.models.Sequential([
    embedding_layer,
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.2),
    # Kept from the original architecture.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    # Single unbounded output: the predicted readability target.
    tf.keras.layers.Dense(1, activation='linear'),
])
model.summary()

In [None]:
# Render the layer graph with tensor shapes.
# (tensorflow is already imported at the top of the notebook; the import is
# repeated here so the cell can be run on its own.)
import tensorflow as tf
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Adam with a learning rate of 1e-05, minimising mean squared error on the
# target.
selected_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-05)
selected_loss=tf.keras.losses.MeanSquaredError()

model.compile(optimizer=selected_optimizer,loss=selected_loss)

In [None]:
savedmodel_filepath = './model_1.h5'

# Callbacks are built from the public tf.keras.callbacks namespace so they
# belong to the same Keras implementation as the tf.keras model.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss')
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=0.00001,
                                                 patience=3, mode='min', verbose=1)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor='val_loss',
                                                      filepath=savedmodel_filepath,
                                                      save_best_only=True)

# Only early stopping is active. reduce_lr and model_checkpoint are configured
# but, as before, not passed to fit.
selected_callbacks = [early_stopping]

# validation_split holds out part of the training data so that `val_loss`,
# the metric early stopping monitors, actually exists. Without validation data
# the callback can never trigger and training always runs all num_epochs.
history = model.fit(X_train, y_train,
                    epochs=num_epochs,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=selected_callbacks)



In [None]:
# Persist the model in its current (end-of-training) state to model_1.h5.
model.save('model_1.h5')
print("Model 1 saved")


In [None]:
# Raw test excerpts as a NumPy array, followed by a preview of the test frame.
excerpt_test=np.array(test_data['excerpt'])
test_data.head(10)

In [None]:
# Encode the test excerpts with the tokenizer fitted on the training data and
# pad them exactly like the training sequences.
sequences_test = tokenizer.texts_to_sequences(excerpt_test) 

X_test = pad_sequences(sequences_test, maxlen=max_length, padding='post')

prediction_testdata = model.predict(X_test)

# NOTE(review): predict presumably returns shape (n, 1) for the single-unit
# output layer; confirm the column assignment behaves as intended.
test_data["target"]= prediction_testdata
test_data.head(10)

In [None]:
# Fill the submission template with the predictions and write it out without
# the DataFrame index.
submission["target"]=prediction_testdata
submission.to_csv('submission.csv',index=False)