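This notebook is an example of a quick and (relatively) simple regression model for the CommonLit dataset: hand-crafted text features are extracted from each excerpt with spaCy and fed to a random forest regressor.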



In [None]:
import spacy
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Read the training CSV and use its `id` column as the row index.
train_file = "../input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_file).set_index('id')
print(f"df.shape = {df.shape}")
df.head()

In [None]:
# Separate the raw excerpt text (model input) from the regression target.
X_raw = df['excerpt']
y = df['target']

print(X_raw.shape)
print(y.shape)
X_raw.head()

In [None]:
# Show the first excerpt. `X_raw` is indexed by the `id` column, so the row
# must be selected by position (`.iloc`); a bare integer key is a label lookup.
X_raw.iloc[0]

In [None]:
def preprocess(X_raw, nlp=None):
    """Derive a table of simple linguistic features from raw text excerpts.

    Parameters
    ----------
    X_raw : iterable of str
        The excerpts to featurise. When a ``pandas.Series`` is given, its
        index is reused for the result so the features stay aligned (by
        label) with the Series they were computed from.
    nlp : callable, optional
        A loaded spaCy pipeline (any callable mapping a string to a
        Doc-like object). When omitted, ``en_core_web_lg`` is loaded; pass a
        pipeline explicitly to avoid reloading the model on every call.

    Returns
    -------
    pandas.DataFrame
        One row per excerpt with these columns, in order:

        * ``pos_<tag>`` - share of tokens carrying each part-of-speech tag
          in ``pos_all``;
        * ``n_toks`` / ``n_chars`` - token count and character count;
        * ``prop_stop_toks`` - share of tokens flagged as stop words;
        * ``tok_len_mean`` / ``tok_len_var`` - mean and (population)
          variance of token length in characters;
        * ``prop_punct`` - punctuation tokens divided by ``n_chars``;
        * ``prop_nonstd_punct`` - share of punctuation tokens whose lemma
          is ``-`` or ``,``.

        Every ratio whose denominator is zero (empty excerpt, or an excerpt
        without punctuation) is reported as ``0.0`` instead of raising or
        producing NaN.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_lg')
    docs = [nlp(exc) for exc in X_raw]

    # see https://universaldependencies.org/docs/u/pos/ for pos used by spaCy
    pos_all = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']
    pos_dict = {pos: col for col, pos in enumerate(pos_all)}
    pos_cols = ['pos_' + pos.lower() for pos in pos_all]

    def _ratio(numerator, denominator):
        # Guard against 0/0 and x/0, which would raise or put NaN/inf into the features.
        return numerator / denominator if denominator else 0.0

    scalar_cols = ['n_toks', 'n_chars', 'prop_stop_toks', 'tok_len_mean',
                   'tok_len_var', 'prop_punct', 'prop_nonstd_punct']
    features = {col: [] for col in pos_cols + scalar_cols}

    for i, doc in enumerate(docs):
        tok_lens = [len(tok.text) for tok in doc]
        n_toks = len(doc)
        n_chars = len(doc.text)
        n_stop_toks = sum(bool(tok.is_stop) for tok in doc)
        n_punct = sum(tok.pos_ == 'PUNCT' for tok in doc)
        n_nonstd_punct = sum(tok.lemma_ in ('-', ',') for tok in doc)

        # Part-of-speech histogram. A tag outside `pos_all` is reported and
        # counted under 'X' (the "other" tag) rather than raising KeyError.
        pos_counts = [0] * len(pos_all)
        for tok in doc:
            if tok.pos_ not in pos_dict:
                print("i = ", i, "   lemma = ", tok.lemma_)
            pos_counts[pos_dict.get(tok.pos_, pos_dict['X'])] += 1

        # Var[x] = E[x^2] - E[x]^2 over the token lengths of this excerpt.
        tok_len_mean = _ratio(sum(tok_lens), n_toks)
        tok_len_sq_mean = _ratio(sum(tok_len**2 for tok_len in tok_lens), n_toks)

        for col, count in zip(pos_cols, pos_counts):
            features[col].append(_ratio(count, n_toks))
        features['n_toks'].append(n_toks)
        features['n_chars'].append(n_chars)
        features['prop_stop_toks'].append(_ratio(n_stop_toks, n_toks))
        features['tok_len_mean'].append(tok_len_mean)
        features['tok_len_var'].append(tok_len_sq_mean - tok_len_mean**2)
        features['prop_punct'].append(_ratio(n_punct, n_chars))
        features['prop_nonstd_punct'].append(_ratio(n_nonstd_punct, n_punct))

    # Keep the caller's row labels when there are any; otherwise a default RangeIndex.
    index = X_raw.index if isinstance(X_raw, pd.Series) else None
    return pd.DataFrame(features, index=index)

In [None]:
# Build the feature table. This loads the spaCy model and parses every
# excerpt, so it is the slow step of the notebook.
X = preprocess(X_raw)

Split the data into training and validation subsets to see how well the model generalizes (i.e. how well it can predict targets for excerpts it has not seen during training).

In [None]:
# Hold out 15% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train_actual, y_val_actual = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=1,
)

Fit the random forest model. 

In [None]:
# A 100-tree forest with a fixed seed, fitted on the training subset only.
forest_params = dict(n_estimators=100, random_state=1)
rf = RandomForestRegressor(**forest_params)
rf.fit(X_train, y_train_actual)

In [None]:
# Score the fitted forest on the data it was trained on and on the held-out
# validation rows; a large gap between the two indicates over-fitting.
y_train_predicted = rf.predict(X_train)
y_val_predicted = rf.predict(X_val)
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
train_mse = mean_squared_error(y_train_actual, y_train_predicted)
valid_mse = mean_squared_error(y_val_actual, y_val_predicted)
print(f"baseline model training MSE = {train_mse:.6f}")
print(f"baseline model validation MSE = {valid_mse:.6f}")

In [None]:
# Actual-vs-predicted scatter plots for the training and validation subsets.
matplotlib.rc('figure', figsize=(15,4))
_, (ax1, ax2) = plt.subplots(1, 2)

panels = [
    (ax1, "Training data", y_train_actual, y_train_predicted, 'b', 0.2),
    (ax2, "Validation data", y_val_actual, y_val_predicted, 'g', 0.4),
]
for ax, title, actual, predicted, colour, alpha in panels:
    # Draw the "perfect prediction" line (predicted == actual) in DATA
    # coordinates. A line in axes coordinates is only the diagonal of the plot
    # box, which is not y = x whenever the x and y limits differ.
    lo = min(np.min(actual), np.min(predicted))
    hi = max(np.max(actual), np.max(predicted))
    ax.plot([lo, hi], [lo, hi], c='k', alpha=0.2)
    ax.scatter(actual, predicted, c=colour, alpha=alpha)
    ax.set_title(title)
    ax.set_xlabel("actual")
    ax.set_ylabel("predicted")

# Render the figure explicitly so no stray object repr is shown as cell output.
plt.show()