In [None]:
!pip install pyphen
!pip install tidybear

In [None]:
import numpy as np
import pandas as pd
import tidybear as tb

from tqdm import tqdm

import nltk
from pyphen import Pyphen

import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)
        
def summarise_cv_scores(scores):
    """Summarise a collection of cross-validation scores.

    Returns a 3-tuple ``(count, mean, std)`` where ``count`` is ``len(scores)``
    and the mean / standard deviation are computed with NumPy.
    """
    n_scores = len(scores)
    average = np.mean(scores)
    spread = np.std(scores)
    return n_scores, average, spread

## Set Up

To start, I read in the training data and select the columns I care about.

Then I assign an approximate grade by dividing the target into 10 regions (deciles, mapped to grades 3-12).

In [None]:
# Load the training data and keep only the columns used in this notebook.
train_ = (
    pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
    .loc[:, ["id", "excerpt", "target", "standard_error"]]
)

# Labels run 12 -> 3, so the lowest-target decile is assigned grade 12 and the
# highest-target decile grade 3.
grade_lbls = list(range(12, 2, -1))
grade_bins = pd.qcut(train_["target"], q=len(grade_lbls), labels=grade_lbls)
train_["grade"] = grade_bins.astype(int)

def get_school_level(grade):
    """Map a numeric grade to a school level.

    Grades up to 5 are "elementary", grades up to 8 are "middle", and
    anything else is "high".
    """
    # Thresholds are checked in ascending order; the first upper bound that
    # the grade does not exceed decides the level.
    for upper_bound, level in ((5, "elementary"), (8, "middle")):
        if grade <= upper_bound:
            return level
    return "high"
    
# Derive the coarse school level from the approximate grade of each excerpt.
train_["school"] = train_["grade"].map(get_school_level)

print(train_.shape)
train_.tail()

Next, we look at an example of the easiest and hardest (by target) excerpts to read. As per the discussion and by example, higher scores are easier (lower grade level), lower scores are harder (higher grade level).

In [None]:
# Boolean masks selecting the rows with the highest / lowest target value.
easiest_mask = train_["target"] == train_["target"].max()
hardest_mask = train_["target"] == train_["target"].min()

print("Max Target - Easiest to read - lowest grade level\n")
print(train_.loc[easiest_mask, "excerpt"].iloc[0])

print("\n-------------------------\n")

print("Min Target - Hardest to read - highest grade level\n")
print(train_.loc[hardest_mask, "excerpt"].iloc[0])

In [None]:
# Histogram of the target; the trailing semicolon hides the Axes repr.
train_["target"].plot(kind="hist");

Looking at the distribution above, the target is roughly normally distributed. This tells me there is a lot of overlap in readability between grade levels... or, as the discussion puts it, the categories are squishy.

However below, using the approx grade level, the words being used on average for higher grade levels are longer.

## Non-Text Features

In [None]:
# Work on a copy so the feature columns added below do not touch `train_`.
train = train_.copy(deep=True)

# Hyphenation dictionary used to approximate syllable counts.
pyphen = Pyphen(lang="en")

def syllables(word):
    """Estimate the syllable count of ``word``.

    The estimate is the number of hyphenation positions reported by the
    module-level ``pyphen`` dictionary, plus one.
    """
    break_points = pyphen.positions(word)
    return 1 + len(break_points)

def engineer_features(df):
    """Add readability features to ``df`` in place.

    ``df.excerpt`` is tokenised into word tokens and sentences with NLTK, and
    each token gets a syllable estimate from ``syllables``. Six ratio columns
    are then written onto ``df``:

    * ``unique_words`` - distinct tokens / total tokens
    * ``words_geq_len8`` - share of tokens with at least 8 characters
    * ``hard_words`` - share of tokens with at least 3 syllables
    * ``characters_per_word`` - total characters / total tokens
    * ``syllables_per_word`` - total syllables / total tokens
    * ``words_per_sentence`` - total tokens / total sentences

    Nothing is returned; the frame passed in is mutated.
    """
    tokens = df.excerpt.apply(nltk.tokenize.word_tokenize)
    sentences = df.excerpt.apply(nltk.tokenize.sent_tokenize)
    syllable_counts = tokens.apply(lambda toks: [syllables(tok) for tok in toks])

    # Per-excerpt totals used as numerators / denominators below.
    n_words = tokens.map(len)
    n_sentences = sentences.map(len)
    n_chars = tokens.apply(lambda toks: np.sum([len(tok) for tok in toks]))
    n_syllables = syllable_counts.apply(lambda counts: np.sum(counts))

    # Per-excerpt counts of distinct, long (8+ chars) and hard (3+ syllables) tokens.
    n_distinct = tokens.apply(lambda toks: len(np.unique(toks)))
    n_long = tokens.apply(lambda toks: np.sum([len(tok) >= 8 for tok in toks]))
    n_hard = syllable_counts.apply(lambda counts: np.sum([c >= 3 for c in counts]))

    df["unique_words"] = n_distinct / n_words
    df["words_geq_len8"] = n_long / n_words
    df["hard_words"] = n_hard / n_words

    df["characters_per_word"] = n_chars / n_words
    df["syllables_per_word"] = n_syllables / n_words
    df["words_per_sentence"] = n_words / n_sentences

# Adds the engineered feature columns to `train` in place.
engineer_features(train)

# Correlate numeric columns only. `train` still carries string columns such as
# `excerpt` and `school`; pandas >= 2.0 raises on those in `.corr()` instead of
# silently dropping them, so select the numeric columns explicitly first.
train.drop(columns=["standard_error", "grade"]).select_dtypes(include="number").corr()

In [None]:
# Engineered features that are summarised per grade here and used as the
# model inputs (`X = train[non_text_features]`) further down.
non_text_features = [
    "unique_words",
    "words_geq_len8",
    "words_per_sentence"
]

# NOTE(review): presumably tidybear's GroupBy context collects the per-grade
# mean of each feature (rounded to 2 decimals) and `summarise()` returns them
# as a frame indexed by grade — confirm against the tidybear docs.
with tb.GroupBy(train, "grade") as g:
    g.mean(non_text_features, decimals=2)
    grade_summary = g.summarise()
    
# Reshape to long format: stack the feature columns into rows, name the
# stacked values "value", and rename the stacked level to "feature".
grade_summary = grade_summary \
    .stack() \
    .rename("value") \
    .reset_index() \
    .rename(columns={"level_1": "feature"})

# One bar chart per feature, each with its own y-axis scale.
# Note: `g` is rebound here from the tidybear group object to the seaborn grid.
g = sns.FacetGrid(grade_summary, col="feature", col_wrap=3, sharey=False, height=4)
g.map_dataframe(sns.barplot, x="grade", y="value");

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.feature_selection import RFECV

In [None]:
# Cross-validation settings shared by every grid search / CV call below.
gs_params = dict(
    cv=10,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# Candidate regressors swapped into the pipeline's final "reg" step.
candidate_regressors = [
    DummyRegressor(),
    LinearRegression(),
    RidgeCV(),
    RandomForestRegressor(max_depth=3, random_state=123),
]
model_grid = {"reg": candidate_regressors}

# Impute -> scale -> regress; the "reg" step is a placeholder overridden by the grid.
baseline_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("reg", RidgeCV()),
]
pipe = Pipeline(steps=baseline_steps)

X = train[non_text_features]
y = train["target"]

grid = GridSearchCV(pipe, param_grid=model_grid, **gs_params)
grid.fit(X, y)

baseline_results = pd.DataFrame(grid.cv_results_)
baseline_results[["param_reg", "mean_test_score", "std_test_score"]]

In [None]:
# Same preprocessing as the baseline, plus polynomial expansion and recursive
# feature elimination (10-fold CV, linear model) ahead of the regressor.
non_text_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("poly", PolynomialFeatures()),
    ("feat", RFECV(estimator=LinearRegression(), cv=10)),
    ("reg", RidgeCV()),
]
non_text_pipe = Pipeline(steps=non_text_steps)

# Re-run the regressor comparison on the richer pipeline.
grid = GridSearchCV(estimator=non_text_pipe, param_grid=model_grid, **gs_params)
grid.fit(X, y)

poly_results = pd.DataFrame(grid.cv_results_)
poly_results[["param_reg", "mean_test_score", "std_test_score"]]

In [None]:
# Plain linear regression on the imputed, standardised non-text features.
non_text_lm = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("reg", LinearRegression()),
])

# The scorer returns negative RMSE, so flip the sign before summarising.
scores = cross_val_score(non_text_lm, X, y, **gs_params)
n_folds, rmse_mean, rmse_std = summarise_cv_scores(-scores)
print("RMSE (cv={}): {:.3f} ({:.3f})".format(n_folds, rmse_mean, rmse_std))

# Refit on all rows and plot the fitted coefficient of each feature.
non_text_lm.fit(X, y)
fitted_reg = non_text_lm.named_steps["reg"]
coefs = pd.DataFrame({"coef": fitted_reg.coef_}, index=non_text_features)
coefs.sort_values(by="coef").plot.barh()

With no NLP, just summary stats about the vocab of the text, we get an average RMSE across 10 folds of 0.84 for the linear regression. We'll need to halve that to win...

## Simple NLP

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Linear models to compare on the bag-of-words representation.
text_regressors = [LinearRegression(), Ridge(), Lasso()]
tm_grid = {"reg": text_regressors}

# Token counts -> TF-IDF weighting -> regressor (overridden by the grid).
text_steps = [
    ("count", CountVectorizer()),
    ("scale", TfidfTransformer()),
    ("reg", Ridge()),
]
text_pipe = Pipeline(steps=text_steps)

grid = GridSearchCV(estimator=text_pipe, param_grid=tm_grid, **gs_params)
grid.fit(train["excerpt"], train["target"])

text_results = pd.DataFrame(grid.cv_results_)
text_results[["param_reg", "mean_test_score", "std_test_score"]]

## Combine Non-Text and Simple NLP

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Numeric branch: impute, scale, expand to polynomial terms, then keep the
# features selected by recursive elimination.
non_text_trans = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
    PolynomialFeatures(),
    RFECV(estimator=LinearRegression(), cv=10),
)

# Text branch: token counts followed by TF-IDF weighting.
text_trans = make_pipeline(CountVectorizer(), TfidfTransformer())

# Route the numeric feature columns and the raw excerpt to their branches;
# every other column is dropped.
column_router = make_column_transformer(
    (non_text_trans, non_text_features),
    (text_trans, "excerpt"),
    remainder="drop",
)

combined_pipe = Pipeline(steps=[
    ("transform", column_router),
    ("predict", Ridge()),
])

# The scorer returns negative RMSE, so flip the sign before summarising.
scores = cross_val_score(combined_pipe, train, train["target"], **gs_params)
n_folds, rmse_mean, rmse_std = summarise_cv_scores(-scores)
print("RMSE (cv={}): {:.3f} ({:.3f})".format(n_folds, rmse_mean, rmse_std))

In [None]:
# Load the test set and add the same engineered columns as for training.
test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
engineer_features(test)

# Fit the combined pipeline on all training rows, then score the test set.
train_cols = ["excerpt", *non_text_features]
combined_pipe.fit(train[train_cols], train["target"].values)
test_pred = combined_pipe.predict(test[train_cols])

# Two-column submission frame: excerpt id and predicted target.
submission = test[["id"]].assign(target=test_pred)

submission

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)