# CommonLit Readability
The object of this kernel is to investigate feature engineering methods for this dataset rather than to attain the best possible (i.e. lowest) RMSE score. This is implemented primarily through two packages, namely `textstat` and `nltk`. The features identified are contextual and reflect the reading level, difficulty level and parts of speech of each record. The text data is thereby converted into tabular format to allow the use of shallow learning algorithms such as Linear Regression. This method provides an RMSE score of 0.7201, an MSE score of 0.5186 and an $R^2$ score of 0.5047.



# Initialisation
## Imports

In [None]:
# Using the internet
#!pip install textstat
#!pip install rich

# Without the internet
#!pip download textstat -d ./textstat/
#!pip download rich -d ./rich/

In [None]:
# import os
# from zipfile import ZipFile

# dirName = "./"
# zipName = "packages.zip"

# # Create a ZipFile Object
# with ZipFile(zipName, 'w') as zipObj:
#     # Iterate over all the files in directory
#     for folderName, subfolders, filenames in os.walk(dirName):
#         for filename in filenames:
#             if (filename != zipName):
#                 # create complete filepath of file in directory
#                 filePath = os.path.join(folderName, filename)
#                 # Add file to zip
#                 zipObj.write(filePath)

In [None]:
!pip install ../input/additionalpackages/textstat/Pyphen-0.10.0-py3-none-any.whl
!pip install ../input/additionalpackages/textstat/textstat-0.7.0-py3-none-any.whl
!pip install ../input/additionalpackages/rich/Pygments-2.9.0-py3-none-any.whl
!pip install ../input/additionalpackages/rich/colorama-0.4.4-py2.py3-none-any.whl
!pip install ../input/additionalpackages/rich/commonmark-0.9.1-py2.py3-none-any.whl
!pip install ../input/additionalpackages/rich/rich-10.2.0-py3-none-any.whl
!pip install ../input/additionalpackages/rich/typing_extensions-3.10.0.0-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import textstat
import rich
import nltk
import random
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, tree, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
#from sklearn.metrics import mean_squared_error as mse
from rich.console import Console
from rich import print
from rich.theme import Theme
#from IPython.display import display

## Definitions

I have defined my own metrics as I was having issues with sklearn's implementation.

In [None]:
def custom_palette(custom_colors):
    """Install ``custom_colors`` as the active seaborn palette and preview it.

    Args:
        custom_colors: Colour specification accepted by ``sns.color_palette``
            (the notebook passes a list of hex strings).
    """
    colors = sns.color_palette(custom_colors)
    # set_palette() is called for its side effect only; its result was never
    # used, so it is no longer bound to a local name.
    sns.set_palette(colors)
    sns.palplot(colors, size=0.8)
    # Hide tick marks and tick labels on the swatch strip.
    plt.tick_params(axis='both', labelsize=0, length=0)

# Five hex colours that custom_palette() installs as the seaborn palette below.
palette = ["#7209B7","#3F88C5","#136F63","#F72585","#FFBA08"]
# 20-colour diverging palette; not referenced again in the visible part of the notebook.
palette2 = sns.diverging_palette(120, 220, n=20)
custom_palette(palette)

# Named rich styles; selected with console.print(..., style="<name>").
custom_theme = Theme({
    "info" : "italic bold cyan",
    "warning": "italic bold magenta",
    "danger": "bold blue"
})
# Shared console used by print_score() for the styled headings.
console = Console(theme=custom_theme)

def rmse(x, y):
    """Return the root mean squared error between ``x`` and ``y``.

    Both arguments must support element-wise subtraction, and the squared
    difference must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    mean_squared_error = squared_error.mean()
    return np.sqrt(mean_squared_error)

def mse(x, y):
    """Return the mean squared error between ``x`` and ``y``.

    Both arguments must support element-wise subtraction, and the squared
    difference must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    return squared_error.mean()

def print_score(m):
    """Print RMSE, MSE and R-square of a fitted model on the train and test splits.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; headings go through ``console`` and the values through the
    module-level ``print``.

    Args:
        m: Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    """
    # Predict once per split so RMSE and MSE reuse the same predictions.
    train_pred = m.predict(X_train)
    test_pred = m.predict(X_test)

    # Compute every metric before printing, so a failure leaves no partial report.
    rmse_train = rmse(train_pred, y_train)
    rmse_test = rmse(test_pred, y_test)
    mse_train = mse(train_pred, y_train)
    mse_test = mse(test_pred, y_test)
    r2_train = m.score(X_train, y_train)
    r2_test = m.score(X_test, y_test)

    console.print("RMSE SCORES (lower the better):", style="info")
    print(f'RMSE (train): {rmse_train}\nRMSE (test): {rmse_test}')
    console.print("MSE SCORES (lower the better):", style="info")
    print(f'MSE (train): {mse_train}\nMSE (test): {mse_test}')
    console.print("R-square SCORES (higher the better):", style="info")
    print(f'R-square (train): {r2_train}\nR-square (test): {r2_test}')

## Read data

In [None]:
# Directory the competition CSVs are read from, and directory outputs are written to.
DATA_DIR_INPUT = '/kaggle/input/commonlitreadabilityprize'
DATA_DIR_OUTPUT = '/kaggle/working'
data_train = pd.read_csv(f'{DATA_DIR_INPUT}/train.csv')
# NOTE: despite its name, data_val is read from test.csv, not from a held-out validation split.
data_val = pd.read_csv(f'{DATA_DIR_INPUT}/test.csv')

# Exploratory Data Analysis
## Missing values
Check if any missing values are present.

In [None]:
# Draw missingno's matrix plot of the training frame to check for missing values.
msno.matrix(data_train)

`url_legal` and `license` fields are being dropped as they are largely empty and do not add much information.

In [None]:
# Drop the url_legal and license columns, in place, from both the training
# frame and the frame read from test.csv (data_val).
data_train.drop(columns=['url_legal', 'license'], inplace=True)
data_val.drop(columns=['url_legal', 'license'], inplace=True)

# Feature Engineering
## textstat
The `textstat` package helps in identifying a number of different readability indices. It also helps in the identification of statistics like the number of difficult words. As a result, all of the indices currently implemented within `textstat` are being added as individual features for each record.

In [None]:
# Names of the textstat functions evaluated on every excerpt. Each name is also
# the name of the column that stores the result; the tuple order is the order
# in which the columns are added.
_TEXTSTAT_FEATURES = (
    'lexicon_count',
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'smog_index',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'linsear_write_formula',
    'gunning_fog',
    'fernandez_huerta',
    'szigriszt_pazos',
    'gutierrez_polini',
    'spache_readability',
    'crawford',
    'difficult_words',
    'syllable_count',
    'sentence_count',
    'polysyllabcount',
    'char_count',
    'letter_count',
    'avg_character_per_word',
    'avg_letter_per_word',
    'avg_sentence_length',
    'avg_sentence_per_word',
    'avg_syllables_per_word',
    'lix',
    'rix',
    'reading_time',
)


def add_features(df):
    """Add one column per textstat statistic, computed from ``df.excerpt``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with an ``excerpt`` column containing the text to score.

    Returns:
        The same ``df`` object, with one new column for every name in
        ``_TEXTSTAT_FEATURES``.
    """
    for feature in _TEXTSTAT_FEATURES:
        # Hand the textstat function straight to apply(); a lambda wrapper
        # around a single-argument call adds nothing.
        df[feature] = df.excerpt.apply(getattr(textstat, feature))
    return df

The cell below takes a long time to execute (approximately 22 minutes on the Kaggle kernel). Its output has therefore been saved as pickle files, which are loaded in a later cell to save time.

In [None]:
# Run this only if needed, pickles have been provided in the cell below to save time.

# # Add indices and paragraph features
# %time df_train = add_features(data_train)
# df_val = add_features(data_val)
# #df_train.head().T

### Load / Save DFs

In [None]:
# Save df_train and df_val as pickles to save time in the future
#import pickle 
#file_df_train = open(f'{DATA_DIR_OUTPUT}/df_train.pkl', 'wb')
#with open('df.pkl', 'wb') as file:
#file_df_val = open(f'{DATA_DIR_OUTPUT}/df_val.pkl', 'wb')
#pickle.dump(df_train, file_df_train)
#pickle.dump(df_val, file_df_val)

In [None]:
# Load the saved DataFrames instead of recomputing the textstat features.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load these files from a dataset you trust.
import pickle

# Context managers close both file handles as soon as loading finishes.
with open('../input/pickled-dfs/df_train.pkl', 'rb') as file_df_train:
    df_train = pickle.load(file_df_train)
with open('../input/pickled-dfs/df_val.pkl', 'rb') as file_df_val:
    df_val = pickle.load(file_df_val)

## nltk
The `nltk` package is useful for several NLP tasks; the current implementation uses only its part-of-speech tagging feature. Here, the parts of speech are identified and counted for each record.

In [None]:
def count_pos_tags(df):
    """Add one column per part-of-speech tag holding its count in each excerpt.

    Every ``excerpt`` is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. The tag-count columns are added to ``df`` in place, and
    a copy in which every missing value is replaced by 0 is returned.

    Args:
        df: DataFrame with an ``excerpt`` column of text.

    Returns:
        ``df.fillna(0)`` evaluated after the tag-count columns were added.
        Note that this fills missing values in all columns, not only the
        tag columns.
    """
    from collections import Counter  # local import keeps the cell self-contained

    per_row_counts = []
    for paragraph in df.excerpt:
        tagged = nltk.pos_tag(nltk.word_tokenize(paragraph))
        tag_counter = Counter(tag for _, tag in tagged)
        # most_common() lists the most frequent tags first, so columns are
        # discovered in descending-frequency order within each excerpt.
        per_row_counts.append(dict(tag_counter.most_common()))

    # Build all tag columns at once instead of enlarging df cell by cell.
    # Tags absent from an excerpt are NaN here and become 0 in the return
    # value; the float cast keeps dtypes independent of which tags are missing.
    counts = pd.DataFrame(per_row_counts, index=df.index).astype(float)
    for tag in counts.columns:
        # to_numpy() assigns by position, so no index alignment is involved.
        df[tag] = counts[tag].to_numpy()
    return df.fillna(0)

In [None]:
# Add parts-of-speech features to training set
df_train = count_pos_tags(df_train)

# Add parts-of-speech features to validation set
df_val = count_pos_tags(df_val)
# Columns present in df_train but absent from df_val
differences = list(set(df_train.columns).difference(set(df_val.columns)))
# Remove target column names so they are not created in df_val;
# list.remove raises ValueError if a name is not in the list.
differences.remove('target')
differences.remove('standard_error')
# Add missing columns as zero columns
df_val[differences] = 0

# Notebook display: shapes of both frames
df_train.shape, df_val.shape

## Normalisation
Normalisation is being done for best practice. Not all models need the data to be normalised. However, if you wish to test other models, this may be useful.

In [None]:
# Manual identification of features calculated by textstat
continuous_features = [
    'lexicon_count',
       'flesch_reading_ease', 'flesch_kincaid_grade', 'smog_index',
       'coleman_liau_index', 'automated_readability_index',
       'dale_chall_readability_score', 'linsear_write_formula', 'gunning_fog',
       'fernandez_huerta', 'szigriszt_pazos', 'gutierrez_polini',
       'spache_readability', 'crawford', 'difficult_words', 'syllable_count',
       'sentence_count', 'polysyllabcount', 'char_count', 'letter_count',
       'avg_character_per_word', 'avg_letter_per_word', 'avg_sentence_length',
       'avg_sentence_per_word', 'avg_syllables_per_word', 'lix', 'rix',
       'reading_time'
]

# Automatic identification of features calculated by nltk: every remaining
# df_train column that is neither a textstat feature nor one of the
# identifier / text / target columns.
# Walking df_train.columns (instead of taking a set difference) keeps the
# feature order the same from run to run, and a missing non-feature column no
# longer raises ValueError.
non_feature_columns = {'standard_error', 'target', 'excerpt', 'id'}
textstat_features = set(continuous_features)
additional_features = [
    column for column in df_train.columns
    if column not in textstat_features and column not in non_feature_columns
]
continuous_features.extend(additional_features)

In [None]:
# Normalisation
# NOTE(review): sklearn's Normalizer rescales each row (sample) to unit norm; it
# does not scale each feature column. Confirm this is intended rather than a
# per-feature scaler such as StandardScaler.
normalizer = preprocessing.Normalizer()
# Normalise training set
# The transformed array is wrapped in a new DataFrame, so the column names are
# restored on the following line.
normalized_train_X = pd.DataFrame(normalizer.fit_transform(df_train[continuous_features]))
normalized_train_X.columns = df_train[continuous_features].columns
# Normalise validation set
# Uses the same feature list and order as the training set.
normalized_val_X = pd.DataFrame(normalizer.transform(df_val[continuous_features]))
normalized_val_X.columns = df_val[continuous_features].columns

In [None]:
# Feature matrix (a copy of the normalised training features) and the target column.
X = normalized_train_X.copy()
y = df_train.target
# Notebook display: shapes of X and y
X.shape, y.shape

## Train-Test Split
Split the data into 80% for training and 20% for testing purposes.

In [None]:
# Hold out 20% of the labelled rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.2,
                                                    random_state=42)
# Normalised features of df_val, used later for the submission predictions.
X_valid = normalized_val_X.copy()

# Notebook display: shapes of every split
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_valid.shape

# Model Training
This is a base Linear Regression model.

In [None]:
# Linear Regression
# Fit on the training split, then print train/test RMSE, MSE and R-square via print_score().
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print_score(lr_model)

# Predictions
Obtain predictions for submission.

In [None]:
# Predict the target for every row of X_valid and pair it with the row's id.
y_pred = lr_model.predict(X_valid)
predictions = pd.DataFrame({'id': df_val.id, 'target': y_pred})
# Write the submission file without the index column.
predictions.to_csv(f'{DATA_DIR_OUTPUT}/submission.csv', index=False)
# Notebook display of the submission frame
predictions