In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Logic:

Assuming that readability is related to the words used in a text, we can train a model that checks which unique words are present in the text and returns a readability score. The following model does exactly that, using only single words and no word combinations:

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(f'/kaggle/input/commonlitreadabilityprize/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# Show the (rows, columns) shape of the training frame as the cell output.
train_df.shape

# Minimal Solution:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Baseline: every excerpt becomes a binary "word present / absent" vector over
# single-word tokens, which is fed to an ordinary least-squares regressor.
word_presence = CountVectorizer(
    ngram_range=(1, 1),  # single words only, no multi-word combinations
    binary=True,         # mark presence (0/1) rather than occurrence counts
    stop_words=None,
    preprocessor=None,
    max_features=None,
)
model = make_pipeline(word_presence, LinearRegression())

# Fit on all training excerpts against their readability targets.
model.fit(train_df['excerpt'], train_df['target'])

# Predict the test excerpts and write the id/target pairs as the submission file.
test_df['target'] = model.predict(test_df['excerpt'])
test_df.to_csv('submission.csv', columns=['id', 'target'], index=False)

# Comparison

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error as mse, make_scorer

def rmse(y_true, y_pred):
    """Return the root-mean-squared error, i.e. the square root of `mse(y_true, y_pred)`."""
    return np.sqrt(mse(y_true, y_pred))


def rmse_loss(Estimator, X, y):
    """Scorer-style callable: RMSE of `Estimator.predict(X)` measured against `y`."""
    predictions = Estimator.predict(X)
    return rmse(y, predictions)


def _mean_cv_rmse(estimator):
    """Return the mean cross-validated RMSE of `estimator` on the training data.

    The estimator is evaluated with `cross_val_score` (default fold splitting)
    on `train_df['excerpt']` -> `train_df['target']`, scored by `rmse_loss`;
    the per-fold scores are averaged into a single number.
    """
    fold_scores = cross_val_score(estimator,
                                  train_df['excerpt'],
                                  train_df['target'],
                                  scoring=rmse_loss)
    return fold_scores.mean()


def _tfidf_regressor(ngram_range):
    """Build an unfitted TfidfVectorizer -> LinearRegression pipeline for `ngram_range`."""
    return make_pipeline(
        TfidfVectorizer(stop_words=None,
                        preprocessor=None,
                        max_features=None,
                        ngram_range=ngram_range),
        LinearRegression())


# Baseline: the binary CountVectorizer pipeline held in `model`.
print('Count Vectorizer, ngram(1,1):', _mean_cv_rmse(model))

# TF-IDF variants that differ only in the largest n-gram size they include.
for max_n in (1, 2, 3):
    print(f'Tfidf Vectorizer, ngram(1,{max_n}):',
          _mean_cv_rmse(_tfidf_regressor((1, max_n))))



## The best model (among the models above)

In [None]:
# Cross-validate the TF-IDF pipeline with 1- and 2-grams again, this time
# keeping the estimator fitted on each fold so it can be reused for prediction.
bigram_tfidf_model = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words=None,
        preprocessor=None,
        max_features=None,
    ),
    LinearRegression(),
)
result = cross_validate(
    bigram_tfidf_model,
    train_df['excerpt'],
    train_df['target'],
    return_estimator=True,
    scoring=rmse_loss,
)
result

In [None]:
# Ensemble: average, element-wise, the test predictions of the estimators that
# cross_validate fitted on each fold.
fold_predictions = [fold_model.predict(test_df['excerpt'])
                    for fold_model in result['estimator']]
test_df['target'] = np.mean(fold_predictions, axis=0)

# Write the id/target pairs as the submission file, then display the frame.
test_df.to_csv('submission.csv', columns=['id', 'target'], index=False)
test_df