This is a simple solution to start the competition and get a feel for the problem.

I will be training a Decision Tree by evaluating the features below for each excerpt:
 - Readability
 - Length
 - Sentiment

# Initialization

I am using the [readability Python package](https://pypi.org/project/readability/) to evaluate readability of each excerpt and [textblob](https://pypi.org/project/textblob/) for sentiment analysis.

In [None]:
! pip install -q /kaggle/input/readability/readability-0.3.1-py3-none-any.whl
from textblob import TextBlob
import readability

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

import pandas as pd

In [None]:
# Both competition CSVs sit in the same Kaggle input directory.
train_csv = '/kaggle/input/commonlitreadabilityprize/train.csv'
test_csv = '/kaggle/input/commonlitreadabilityprize/test.csv'

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [None]:
# Print the column/dtype summary, then show the first rows as the cell output.
train_data.info()
train_data.head()

In [None]:
# Same quick look for the test set: summary first, then the first rows.
test_data.info()
test_data.head()

In [None]:
# Load the sample submission only to look at it; the result is not stored.
pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')

# Functions

In [None]:
def readability_analysis(text):
    """Return the Flesch Reading Ease score of ``text``.

    The score is read from the 'readability grades' group of the measures
    computed by ``readability.getmeasures`` with ``lang='en'``.
    """
    # NOTE(review): the readability package describes its input as tokenized
    # text (one sentence per line); raw excerpts are passed here -- confirm
    # whether pre-tokenizing would change the scores meaningfully.
    measures = readability.getmeasures(text, lang='en')
    grades = measures['readability grades']
    return grades['FleschReadingEase']

Currently I am using **FleschReadingEase**, but I will be testing other readability measures and additional features that could be used later, when I work on more complex models.

In [None]:
def length_analysis_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

In [None]:
def length_analysis_chars(text):
    """Return the length of ``text`` in characters, whitespace included."""
    n_chars = len(text)
    return n_chars

In [None]:
def sentiment_analysis(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

# Creating Features

In [None]:
# Build one feature column per analysis function, keeping 'id' next to the
# features. Every function only needs the excerpt text, so it is applied to
# that single column instead of row by row over the whole frame.
excerpts = train_data['excerpt']
X = train_data[['id']].copy()
X['readability'] = excerpts.apply(readability_analysis)
X['len_words'] = excerpts.apply(length_analysis_words)
X['len_chars'] = excerpts.apply(length_analysis_chars)
X['sentiment'] = excerpts.apply(sentiment_analysis)

In [None]:
# Check the computed feature columns: summary first, then the first rows.
X.info()
X.head()

In [None]:
# Keep only the model inputs; the 'id' column is dropped from X here.
feature_columns = ['readability', 'len_words', 'len_chars', 'sentiment']
X = X.loc[:, feature_columns]

In [None]:
# Regression target: the 'target' column of the training data.
y = train_data.loc[:, 'target']

In [None]:
# Summary statistics of the target, shown as the cell output.
y.describe()

# Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation; the fixed random_state
# keeps the split the same on every run.
split_parts = train_test_split(X, y, random_state=0)
train_X, val_X, train_y, val_y = split_parts

In [None]:
# Sweep the max_leaf_nodes setting and record the train/validation mean
# absolute error together with the number of leaves the fitted tree ended up
# with, one row per setting.
sweep_rows = []
for max_leaf_nodes in range(2, 100):
    model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    model.fit(train_X, train_y)

    train_preds = model.predict(train_X)
    val_preds = model.predict(val_X)

    sweep_rows.append({
        'train_error': mean_absolute_error(train_y, train_preds),
        'val_error': mean_absolute_error(val_y, val_preds),
        'num_leaf': model.get_n_leaves(),
    })

# Build the frame once from the collected rows rather than growing it with
# .loc one row at a time; this lets pandas infer the column dtypes from the
# complete data. The default integer index matches the old max_leaf_nodes-2.
errors = pd.DataFrame(sweep_rows, columns=['train_error', 'val_error', 'num_leaf'])

errors

In [None]:
import matplotlib.pyplot as plt

# Leaf count goes on the x-axis and the error on the y-axis, so each curve
# reads as "error as the tree grows". Labels and a legend tell the training
# and validation curves apart.
plt.plot(errors['num_leaf'], errors['train_error'], label='train_error')
plt.plot(errors['num_leaf'], errors['val_error'], label='val_error')
plt.xlabel('num_leaf')
plt.ylabel('mean absolute error')
plt.legend()
plt.show()

In [None]:
# Final tree size. NOTE(review): presumably picked by reading the sweep
# results -- confirm 20 is still the best value if the features change.
final_max_leaf_nodes = 20
model = DecisionTreeRegressor(max_leaf_nodes=final_max_leaf_nodes, random_state=0)

model.fit(train_X, train_y)

# Evaluating the result

In [None]:
# Mean absolute error on the data the model was fitted on (cell output).
train_preds = model.predict(train_X)
mean_absolute_error(train_y, train_preds)

In [None]:
# Mean absolute error on the held-out validation split (cell output).
val_preds = model.predict(val_X)
mean_absolute_error(val_y, val_preds)

# Creating features for test set and predicting results

In [None]:
# Compute the same four features for the test excerpts, keeping 'id' for the
# submission. Every function only needs the excerpt text, so it is applied to
# that single column instead of row by row over the whole frame.
test_excerpts = test_data['excerpt']
X_test = test_data[['id']].copy()
X_test['readability'] = test_excerpts.apply(readability_analysis)
X_test['len_words'] = test_excerpts.apply(length_analysis_words)
X_test['len_chars'] = test_excerpts.apply(length_analysis_chars)
X_test['sentiment'] = test_excerpts.apply(sentiment_analysis)

In [None]:
# Check the computed test features: summary first, then the first rows.
X_test.info()
X_test.head()

In [None]:
# Predict on the test features, passing the columns in the order used for
# training. NOTE(review): the result is stored as val_preds, overwriting the
# validation predictions, because the submission cell reads that name; a
# rename to something like test_preds would need both cells changed together.
test_features = X_test[['readability', 'len_words', 'len_chars', 'sentiment']]
val_preds = model.predict(test_features)

In [None]:
# Submission frame: one row per test excerpt with its id and predicted target
# (val_preds holds the test-set predictions at this point).
solution = pd.DataFrame({'id': X_test['id'], 'target': val_preds})

In [None]:
# Sanity-check the submission frame's columns and row count before writing.
solution.info()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
solution.to_csv('submission.csv', index=False)