In [None]:
!pip install ../input/lama-whl/LightAutoML-0.2.14-py3-none-any.whl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

In [None]:
# Load the three competition CSVs from the commonlitreadabilityprize input folder.
# Only `train` is referenced by the cells shown in this notebook; `ss` and
# `test` are loaded but not used further in the visible code.
ss = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
# Run-level settings used by the cells below.
TIMEOUT = 9000 # Time budget in seconds passed to the automl preset (9000 s = 2.5 h)
TARGET_NAME = 'target' # Name of the column used as the regression target

In [None]:
def rmse(x, y):
    """Return the root mean squared error between ``x`` and ``y``.

    Both arguments are forwarded unchanged, ``x`` first, to
    ``mean_squared_error``; the result is the square root of that value.
    """
    mse = mean_squared_error(x, y)
    return np.sqrt(mse)
# Regression task ('reg') scored with the custom rmse callable.
task = Task('reg', metric=rmse)
# Column roles passed to fit_predict: the target column, the raw-text column,
# and the columns listed under 'drop'.
# NOTE(review): 'drop' presumably excludes these columns from modelling —
# confirm against the LightAutoML roles documentation.
roles = {'target': TARGET_NAME,
         'text': ['excerpt'],
         'drop': ['id', 'standard_error', 'url_legal', 'license']}

In [None]:
def preprocess_text(df):
    """Add simple surface-statistic columns computed from ``df['excerpt']``.

    The frame is modified in place and nothing is returned. Every statistic
    is computed on the excerpt after stripping leading/trailing whitespace.

    Columns added, in this order:
        len_tokens: number of pieces when splitting on a single space ``' '``
            (words separated only by a newline count as one piece).
        len: number of characters.
        len_sent: number of pieces when splitting on ``'.'``.
        n_comm: number of pieces when splitting on ``','``.
        d_mean: count (not a mean, despite the name) of space-separated
            pieces for which ``str.isdigit()`` is true.
        u_mean: count (not a mean, despite the name) of space-separated
            pieces for which ``str.isupper()`` is true.

    Args:
        df: DataFrame with a string column named ``'excerpt'``.
    """
    stripped = df['excerpt'].str.strip()
    space_tokens = stripped.str.split(' ')

    def piece_count(separator):
        # Number of fragments each stripped excerpt yields for `separator`.
        return stripped.str.split(separator).apply(len)

    def flagged_per_row(predicate):
        # For every row, how many space-separated pieces satisfy `predicate`.
        return [np.sum([predicate(piece) for piece in pieces])
                for pieces in space_tokens.values]

    df['len_tokens'] = space_tokens.apply(len)
    df['len'] = stripped.apply(len)
    df['len_sent'] = piece_count('.')
    df['n_comm'] = piece_count(',')
    df['d_mean'] = flagged_per_row(lambda piece: piece.isdigit())
    df['u_mean'] = flagged_per_row(lambda piece: piece.isupper())
    
# Add the engineered text-statistic columns to the training frame (mutated in place).
preprocess_text(train)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Configure the TabularNLPAutoML preset for the regression task defined earlier,
# with a time budget of TIMEOUT seconds.
# NOTE(review): the meaning of individual keys below is defined by LightAutoML;
# the comments here only describe what is passed — confirm semantics in its docs.
automl = TabularNLPAutoML(task=task,
                          timeout=TIMEOUT,
                          # Algorithms requested: 'linear_l2' and 'nn'; nested CV switched off.
                          general_params={'nested_cv': False, 'use_algos': [['linear_l2', 'nn']]},
                          # Same local model directory as nn_params['bert_name'] below.
                          text_params={'lang': 'en', 'bert_model': '../input/roberta-base'},
                          # presumably 5-fold cross-validation — TODO confirm
                          reader_params={'cv': 5},
                          linear_pipeline_params={'text_features': 'embed'},
                          # Text-embedding settings: mean pooling, max_length 220, batch size 64.
                          autonlp_params={'model_name': 'pooled_bert',
                                          'transformer_params': {'model_params': {'pooling': 'mean'},
                                                                 'dataset_params': {'max_length': 220},
                                                                 'loader_params': {'batch_size': 64,
                                                                                   'shuffle': False,
                                                                                   'num_workers': 4}
                                                                 }
                                          },
                          # 'nn' settings: lr 3e-5, mean pooling, max_length 220 (matches
                          # dataset_params above), batch size 32, up to 20 epochs;
                          # artifacts are written under './models'.
                          nn_params={'opt_params': {'lr': 3e-5},
                                     'lang': 'en',
                                     'path_to_save': './models',
                                     'bert_name': '../input/roberta-base',
                                     'snap_params': {'k': 1, 'early_stopping': True,
                                                     'patience': 2, 'swa': False},
                                     'init_bias': False,
                                     'pooling': 'mean',
                                     'max_length': 220, 'bs': 32, 'n_epochs': 20,
                                     'use_cont': False,
                                     'use_cat': False,
                                     },
                          )

# Fit on the training frame with the declared column roles.
# presumably the returned predictions are out-of-fold (per the variable name) — confirm
oof_pred = automl.fit_predict(train, roles=roles)
print('')
# RMSE between the true target and the first column of the returned predictions.
print(rmse(train[TARGET_NAME], oof_pred.data[:, 0]))

In [None]:
# Build a LIME text explainer around the fitted automl object.
# NOTE(review): the effect of feature_selection='lasso' and force_order=False is
# defined by LightAutoML's LimeTextExplainer — confirm against its documentation.
from lightautoml.addons.interpretation import LimeTextExplainer
lime = LimeTextExplainer(automl, feature_selection='lasso', force_order=False)

In [None]:
# Positional indices (for train.iloc) of the training rows to explain, in display order.
EXPLAIN_ROW_POSITIONS = (0, 1, 100, 777)


def explain_train_row(position):
    """Show the LIME explanation for one training row and print its target.

    Uses the notebook-level ``train`` frame, ``lime`` explainer and
    ``TARGET_NAME`` constant.

    Args:
        position: Integer position of the row in ``train`` (used with ``iloc``).

    Returns:
        Tuple ``(row, explanation)``: the row selected by
        ``train.iloc[position]`` and the object returned by
        ``lime.explain_instance`` for it.

    Raises:
        IndexError: If ``position`` is outside the bounds of ``train``.
    """
    row = train.iloc[position]
    explanation = lime.explain_instance(row, perturb_column='excerpt')
    explanation.visualize_in_notebook()
    print(row[TARGET_NAME])
    return row, explanation


# Render one explanation per selected row, each followed by its target value.
for position in EXPLAIN_ROW_POSITIONS:
    # `df` and `exp` stay bound at notebook level and end up holding the last
    # explained row and its explanation.
    df, exp = explain_train_row(position)

In [None]:
# Serialize the fitted automl object to 'LAMA_model.pkl' in the current working directory.
# NOTE(review): loading this file back presumably requires the same LightAutoML
# version to be installed — confirm. Only unpickle files from a trusted source.
import pickle
with open('LAMA_model.pkl', 'wb') as f:
    pickle.dump(automl, f)