# CommonLit Readability Prize with AutoKeras

This kernel is based on multiple sources (in References) along with my edits.

## References
- https://www.kaggle.com/meihanw/commonlit-autokeras-try-out

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
# Add '../input/autokeras' to the module search path; this runs before the
# `import autokeras` cell below (presumably so the package resolves from there).
sys.path.append('../input/autokeras')


### Import necessary libraries

In [None]:
import re
import string
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files
import autokeras as ak


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy

In [None]:
# Load the three competition CSV files: training set, test set and the
# sample submission that is later filled with predictions.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/commonlitreadabilityprize/train.csv",
        "/kaggle/input/commonlitreadabilityprize/test.csv",
        "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
 train.sort_values(by=['target']).head()

In [None]:
import itertools
import collections
import matplotlib.pyplot as plt


In [None]:
# Full text of the excerpt with the lowest `target` value.
train.sort_values(by='target')['excerpt'].iloc[0]


In [None]:
# Full text of the excerpt with the second-lowest `target` value.
train.sort_values(by='target')['excerpt'].iloc[1]


In [None]:
# Excerpts of the 20 rows with the lowest `target` values; input for the
# word-frequency analysis below. (`head(20)` already limits the selection to
# 20 rows, so no additional slicing is needed.)
d = list(train.sort_values(by=['target']).head(20)['excerpt'])

In [None]:
# Lower-case every low-target excerpt and split it on whitespace, giving one
# token list per excerpt (punctuation stays attached to the words).
words_in_lowest = [excerpt.lower().split() for excerpt in d]

Stop-word removal is deliberately skipped and the text is left uncleaned, on the assumption that the stop words and the raw (pre-cleaning) context genuinely differ between the high-target and low-target excerpt groups.

In [None]:
# List of all words 
# Flatten the per-excerpt token lists into one list of words.
all_words = list(itertools.chain.from_iterable(words_in_lowest))

# Tally every word (stop words included) and keep the 50 most frequent ones.
counts_no_words = collections.Counter(all_words)
lowest_count = counts_no_words.most_common(50)

# Show the (word, count) pairs as the cell output.
lowest_count

In [None]:
# Tabulate the most frequent words of the low-target group and order them by
# ascending count for the horizontal bar chart.
clean_lowest = pd.DataFrame(lowest_count, columns=['words', 'count'])
lowest_by_count = clean_lowest.sort_values(by='count')

fig, ax = plt.subplots(figsize=(8, 8))
lowest_by_count.plot.barh(x='words', y='count', ax=ax, color="grey")
ax.set_title("Common Words Found in Lowest Target Excerpt (With Stop Words)")

plt.show()

A working assumption from this plot is that the excerpts with the lowest target scores tend to contain many sub-sentences (subordinate clauses, often separated by ";"), possessives such as "its", and perfect-tense constructions.

### Let's see the one with the highest target score

In [None]:
# Full text of the excerpt with the highest `target` value.
train.sort_values(by='target', ascending=False)['excerpt'].iloc[0]

In [None]:
# Excerpts of the 20 rows with the highest `target` values.
highest_first = train.sort_values(by='target', ascending=False)
h = highest_first['excerpt'].head(20).tolist()

In [None]:
# Lower-case every high-target excerpt and split it on whitespace, giving one
# token list per excerpt (punctuation stays attached to the words).
words_in_highest = [excerpt.lower().split() for excerpt in h]

In [None]:
# Flatten the per-excerpt token lists into one list of words.
all_words = list(itertools.chain.from_iterable(words_in_highest))

# Tally every word (stop words included) and keep the 50 most frequent ones.
counts_no_words = collections.Counter(all_words)
highest_count = counts_no_words.most_common(50)

# Show the (word, count) pairs as the cell output.
highest_count

In [None]:
# Tabulate the most frequent words of the high-target group and order them by
# ascending count for the horizontal bar chart.
clean_highest = pd.DataFrame(highest_count, columns=['words', 'count'])
highest_by_count = clean_highest.sort_values(by='count')

fig, ax = plt.subplots(figsize=(8, 8))
highest_by_count.plot.barh(x='words', y='count', ax=ax, color="pink")
ax.set_title("Common Words Found in Highest Target Excerpt (With Stop Words)")

plt.show()

### Examine the missing value


In [None]:
import seaborn as sns

# Boolean mask of missing cells (True where a value is null), drawn as a
# heatmap without a colour bar.
missing_mask = train.isna()
sns.heatmap(missing_mask, cmap="Blues", cbar=False)

Only the `url_legal` and `license` columns have missing values; however, the proportion of missing values in them is very large.

### Count keywords feature 

In [None]:
keywords = ['while', 'which', 'as', 'been', 'its', 'through']

# Count the keywords as whole words only. A bare 'while|which|as|...'
# alternation also matches inside other words (the 'as' in 'was' or 'has',
# the 'its' in 'itself'), which inflates the feature with unrelated hits.
# Matching ignores case so that sentence-initial 'While' / 'As' are counted too.
keyword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b'

train_count_keyword = train['excerpt'].str.count(keyword_pattern, flags=re.IGNORECASE)
test_count_keyword = test['excerpt'].str.count(keyword_pattern, flags=re.IGNORECASE)



### Count sub sentence mark

In [None]:
# number of sub sentence
def _count_sub_sentence_marks(text):
    """Return the number of ';' characters in *text*."""
    return text.count(';')


# Number of sub-sentence marks (';') in each excerpt.
train_count_mark = train['excerpt'].map(_count_sub_sentence_marks)
test_count_mark = test['excerpt'].map(_count_sub_sentence_marks)

### Stack the above two features together

In [None]:
# Hand-crafted features, one row per excerpt: [keyword count, ';' count].
train_feature_columns = (train_count_keyword, train_count_mark)
test_feature_columns = (test_count_keyword, test_count_mark)

train_additional_feature = np.column_stack(train_feature_columns)
test_additional_feature = np.column_stack(test_feature_columns)

### Parse Text for features

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Build one document vector per excerpt with the complete pipeline enabled.
# No component is disabled on purpose: calling `nlp.disable_pipes()` without
# component names disables nothing, so wrapping the loop in it has no effect.
# NOTE(review): 'en_core_web_sm' presumably has no static word vectors, in which
# case `doc.vector` is derived from pipeline output -- verify the vectors before
# disabling any component for speed.
# `nlp.pipe` streams the texts through the pipeline in batches instead of
# invoking the pipeline once per excerpt.
train_array = np.array([doc.vector for doc in nlp.pipe(train.excerpt)])
test_array = np.array([doc.vector for doc in nlp.pipe(test.excerpt)])

In [None]:
# Regression target as a NumPy array, one value per training excerpt.
y_train = np.array(train['target'])

### Combine the additional feature (keywords + subsentence) with the spacy features

In [None]:
# Final design matrices: spaCy document vectors followed by the two
# hand-crafted count features as extra columns.
train_feature_blocks = (train_array, train_additional_feature)
test_feature_blocks = (test_array, test_additional_feature)

x_train = np.column_stack(train_feature_blocks)
x_test = np.column_stack(test_feature_blocks)

In [None]:
# Sanity check: (number of training rows, number of feature columns).
x_train.shape

### StructuredDataRegressor
Here we initialize the baseline `StructuredDataRegressor`. For faster processing we set `max_trials` to 20.
(Note: in this notebook's experiments the baseline structured-data regressor performs better than feeding the text input directly into a `TextBlock`, as done in the later part of the notebook.)

In [None]:
# AutoKeras model search for regression on the tabular feature matrix.
sdr = ak.StructuredDataRegressor(
    project_name="structured_data_regressor",
    overwrite=False,
    max_trials=20,  # number of candidate models tried during the search
    objective="val_loss",
    loss="mean_squared_error",
    seed=5,
)

In [None]:
# Run the model search, holding out 20% of the training data for validation.
sdr.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    validation_split=0.2,
    verbose=2,
)

In [None]:
# Predict targets for the test feature matrix with the fitted regressor.
predicted_test = sdr.predict(x_test)

In [None]:
# Display the raw predictions.
predicted_test

In [None]:
# The regressor's predictions typically come back as a 2-D (n_samples, 1) array;
# flatten them so the column assignment always receives a plain 1-D vector
# instead of relying on pandas' implicit handling of 2-D values.
submission['target'] = np.asarray(predicted_test).ravel()
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)

### TextRegressor
One can also use the `TextRegressor` directly; the user basically only needs to provide the input data.

In [None]:
# # Initialize the text regressor.
# reg = ak.TextRegressor(overwrite=True, max_trials=8)  
# # Feed the text regressor with training data.
# split = round(len(x_train)*0.85)
# x_val = x_train[split:]
# y_val = y_train[split:]
# x_train = x_train[:split]
# y_train = y_train[:split]
# reg.fit(
#     x_train,
#     y_train,
#     epochs=20,
#     # Use your own validation set.
#     validation_data=(x_val, y_val),
# )


In [None]:
# auto_predict = reg.predict(x_test)
# submission['target'] = auto_predict
# submission.to_csv('submission_clrp_txtregressor.csv', index=False)

### TextBlock
One can also use the `TextBlock` directly and specify the `block_type` (sequence, ngram, transformer, or none, in which case it is tuned automatically). Furthermore, a pretrained word embedding (if available) can be fed into the `TextBlock`.

In [None]:
# input_node = ak.TextInput()
# output_node = ak.TextBlock(block_type="sequence")(input_node)
# output_node = ak.RegressionHead()(output_node)
# reg = ak.AutoModel(
#     inputs=input_node, outputs=output_node, overwrite=True, max_trials=9
# )
# reg.fit(x_train, y_train, epochs=100)

In [None]:
# auto_predict = reg.predict(x_test)
# submission['target'] = auto_predict
# submission.to_csv('submission_clrp_txtblock.csv', index=False)

### Customize search space - Automodel 
Configure the blocks for a standard (vanilla) ConvNet trained with standard backpropagation.
(The following will take a bit more time to run)

In [None]:
# input_node = ak.TextInput()
# output_node = ak.TextToIntSequence()(input_node)
# #do not have to feed the max_features as we use TextToIntSequence
# output_node = ak.Embedding()(output_node)
# # Use separable Conv layers
# output_node = ak.ConvBlock(separable=True)(output_node)
# output_node = ak.RegressionHead()(output_node)
# reg = ak.AutoModel(
#     inputs=input_node, outputs=output_node, overwrite=True, max_trials=8
# )
# reg.fit(x_train, y_train, epochs=100)

In [None]:
# auto_predict = reg.predict(x_test)
# submission['target'] = auto_predict
# submission.to_csv('submission_clrp_automodel.csv', index=False)