In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)
from sklearn import metrics
from sklearn.model_selection import KFold
import lightgbm
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition tables and use the `id` column as the row index of each.
train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv', index_col='id')

In [None]:
# Stack train and test rows into a single frame that keeps only the raw text
# (renamed to `txt`) and the regression label.
# NOTE(review): presumably test.csv has no `target`, so those rows hold NaN — confirm.
data = (
    pd.concat([train, test])[['excerpt', 'target']]
    .rename(columns={'excerpt': 'txt'})
)

In [None]:
def _words_per_sentence(pieces):
    """Count space-separated tokens in every '.'-delimited piece except the last.

    `pieces` is the list produced by splitting one text on '.'; the final piece
    (whatever follows the last '.') is skipped.
    """
    return [len(piece.split(' ')) for piece in pieces[:-1]]


# One list of per-sentence word counts for every text in `data`.
sent_len_df = data.txt.str.split('.').apply(_words_per_sentence)

In [None]:
def _text_shape_features(texts, sentence_lengths):
    """Assemble simple length/count statistics for each text.

    Args:
        texts: Series of raw text strings.
        sentence_lengths: Series aligned with `texts`, each entry a list of
            per-sentence word counts.

    Returns:
        DataFrame with one column per statistic, indexed like the inputs.
    """
    dot_pieces = texts.str.split('.')
    named_columns = [
        texts.apply(len).rename('symbols_in_text'),
        dot_pieces.apply(len).subtract(1).rename('cnt_of_centences'),
        sentence_lengths.apply(max).rename('max_words_in_sentence'),
        sentence_lengths.apply(min).rename('min_words_in_sentence'),
        # NOTE(review): np.size of the per-sentence list is the number of
        # sentences, so this column appears to duplicate `cnt_of_centences`
        # — confirm whether a different statistic was intended.
        sentence_lengths.apply(np.size).rename('cnt_words_in_sentence'),
        sentence_lengths.apply(np.mean).rename('avg_words_in_sentence'),
        texts.str.split(' ').apply(len).rename('cnt_words'),
    ]
    return pd.concat(named_columns, axis=1)


features = _text_shape_features(data.txt, sent_len_df)

In [None]:
# Display the combined train+test frame (columns at this point: `txt`, `target`).
data

CatBoost allows text features to be used alongside numerical, categorical and other feature types.<br>
However, this is only possible in classification tasks.<br>
So I create several thresholds on the target and train a CatBoost model with a single text feature against each resulting binary target.<br>

In [None]:
# Every column except the label is a model input (only `txt` at this stage).
feature_names = data.columns.drop('target').tolist()

In [None]:
# Split the combined frame back into train inputs / train labels / test inputs
# by checking which ids belong to the original train and test tables.
is_train_row = data.index.isin(train.index)
is_test_row = data.index.isin(test.index)

X = data.loc[is_train_row, feature_names]
y = data.loc[is_train_row, 'target']
X_tst = data.loc[is_test_row, feature_names]

In [None]:
# Cut points on the target at its 20/40/60/80% quantiles; each one defines a
# binary "target > threshold" label further down.
quantile_levels = [0.2, 0.4, 0.6, 0.8]
thresholds = np.quantile(y, quantile_levels)

In [None]:
import catboost

# Settings shared by every threshold/fold: log-loss objective with a very large
# tree budget, so training length is governed by early stopping.
params = dict(loss_function='Logloss', n_estimators=99999999, learning_rate=0.1)
cv_obj = KFold(n_splits=6, random_state=42, shuffle=True)


def _threshold_text_feature(threshold):
    """Build one stacked feature column from a text-only CatBoost model.

    The target is binarised as ``y > threshold``. For each fold of `cv_obj` a
    model is trained on the training part, its `predict` output is stored for
    the held-out rows, and its `predict` output on the test rows is averaged
    over all folds.

    Returns:
        DataFrame with a single column ``thr_<threshold>`` covering the rows
        of `X` followed by the rows of `X_tst`.
    """
    column = f'thr_{threshold}'
    held_out_scores = np.zeros(X.index.size)
    test_scores = np.zeros(X_tst.index.size)

    for trn_idx, val_idx in cv_obj.split(X, y):
        x_trn, x_val = X.iloc[trn_idx], X.iloc[val_idx]
        trn_labels = (y.iloc[trn_idx] > threshold).astype(int)
        val_labels = (y.iloc[val_idx] > threshold).astype(int)

        trn_pool = catboost.Pool(x_trn, label=trn_labels, text_features=['txt'])
        val_pool = catboost.Pool(x_val, label=val_labels, text_features=['txt'])
        booster = catboost.train(
            pool=trn_pool,
            params=params,
            eval_set=[trn_pool, val_pool],
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        held_out_scores[val_idx] = booster.predict(x_val)
        test_scores += booster.predict(X_tst) / cv_obj.n_splits

    return pd.concat([
        pd.DataFrame(held_out_scores, index=X.index, columns=[column]),
        pd.DataFrame(test_scores, index=X_tst.index, columns=[column]),
    ])


# Append one `thr_<threshold>` column to `data` per threshold.
for threshold in thresholds:
    data = data.join(_threshold_text_feature(threshold))

In [None]:
# Attach the hand-made text statistics, then discard the raw text so that only
# numeric columns (plus `target`) remain.
data = data.join(features)
data = data.drop(columns=['txt'])

In [None]:
# Display the frame after the `thr_*` columns and text statistics were joined
# and the raw `txt` column was dropped.
data

In [None]:
def _rows_of(frame, reference, columns):
    """Return `columns` of `frame` for the rows whose index label is in `reference.index`."""
    return frame.loc[frame.index.isin(reference.index), columns]


# Rebuild the model inputs now that `data` holds the engineered columns:
# every column except the label is a feature.
feature_names = data.columns.drop('target').tolist()
X = _rows_of(data, train, feature_names)
y = _rows_of(data, train, 'target')
X_tst = _rows_of(data, test, feature_names)

In [None]:
# Second-stage model: a LightGBM regressor trained per fold of `cv_obj` on the
# engineered columns. `oof` collects the held-out prediction for every training
# row; `predictions` accumulates the fold-averaged prediction for the test rows.
oof = np.zeros(X.index.size)
predictions = np.zeros(X_tst.index.size)
for trn_idx, val_idx in cv_obj.split(X, y):
    x_trn, x_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]
    # Very large tree budget with a small learning rate; early stopping on the
    # evaluation sets decides how many trees are actually used.
    model = lightgbm.LGBMRegressor(n_estimators=9999999, learning_rate=1e-3)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword arguments
    # were removed in LightGBM 4.0 (replaced by callbacks) — confirm the
    # installed version before upgrading.
    model.fit(x_trn, y_trn, eval_set=[(x_trn, y_trn), (x_val, y_val)], early_stopping_rounds=300, verbose=500)
    oof[val_idx] = model.predict(x_val)
    # Average over the splitter's real fold count. `cv_obj` is built with
    # n_splits=6, so a hard-coded divisor of 5 would scale every test
    # prediction by 6/5.
    predictions += model.predict(X_tst) / cv_obj.n_splits
    lightgbm.plot_importance(model)
    plt.show()

In [None]:
# Mean squared error of the out-of-fold predictions against the training targets.
metrics.mean_squared_error(y_true=y, y_pred=oof)

In [None]:
# Two-column submission table: the test index turned back into a regular column,
# followed by the averaged test predictions as `target`.
submission = pd.DataFrame({'target': predictions}, index=X_tst.index).reset_index()

In [None]:
# Write the submission file to the working directory without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)