In [None]:
%env TOKENIZERS_PARALLELISM=false

In [None]:
import sys
sys.path.append('../input/pyreadabilitymetrics')

In [None]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [None]:
from readability import Readability

In [None]:
# Load the CommonLit training set. Later cells read its `excerpt`, `target`
# and `id` columns.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df

In [None]:
def get_read_metric(txt):
    """Collect raw text counts and readability scores for one piece of text.

    Args:
        txt: The text to analyse; passed straight to ``Readability``.

    Returns:
        dict: Eight ``num_*`` counters read from the analyser's private
        ``_statistics`` object, followed by eight ``*_score`` entries, each the
        ``.score`` attribute of the corresponding readability formula.
    """
    analysis = Readability(txt)
    stats = analysis._statistics

    # Raw counters, copied under the same name they have on the statistics object.
    counter_names = (
        'num_syllables',
        'num_poly_syllable_words',
        'num_words',
        'num_sentences',
        'num_letters',
        'num_gunning_complex',
        'num_dale_chall_complex',
        'num_spache_complex',
    )
    res = {name: getattr(stats, name) for name in counter_names}

    # (output key, Readability method) pairs, evaluated in this order.
    # Only `.score` is kept; the grade-level outputs are intentionally not used.
    scorers = (
        ('fk_score', 'flesch_kincaid'),
        ('f_score', 'flesch'),
        ('dc_score', 'dale_chall'),      # Dale-Chall Score
        ('ari_score', 'ari'),
        ('cl_score', 'coleman_liau'),
        ('gf_score', 'gunning_fog'),     # Gunning Fog Scale Level
        ('s_score', 'spache'),
        ('lw_score', 'linsear_write'),
    )
    for key, method_name in scorers:
        res[key] = getattr(analysis, method_name)().score

    return res

# Smoke test: run the extractor on one sentence repeated 100 times.
# NOTE(review): the repetition is presumably needed because the readability
# scorers require a minimum amount of text — confirm against the library.
get_read_metric('When the young people returned to the ballroom...' * 100)

In [None]:
def get_features(df):
    """Build the hand-crafted readability feature table for a data frame.

    Args:
        df: DataFrame with an ``excerpt`` (text) column and a ``target`` column.
            Its index may be arbitrary (filtered, shuffled, non-integer).

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, one row per input row in
        the same order: the ``get_read_metric`` entries, then ``target``, then
        ``mean_words_len`` (mean length of the space-separated tokens).
    """
    def mean_words_len(txt):
        # Tokens are split on single spaces only, so newlines stay inside tokens.
        return np.mean([len(x) for x in txt.split(' ')])

    records = []
    for row in tqdm(df.itertuples(), total=len(df)):
        rec = get_read_metric(row.excerpt)
        rec['target'] = row.target
        records.append(rec)

    dff = pd.DataFrame(records)

    # Assign positionally: `dff` has a fresh RangeIndex while `df` keeps its own
    # index, and assigning a Series would align on labels and yield NaN or
    # mismatched rows whenever the two indexes differ.
    dff['mean_words_len'] = df['excerpt'].apply(mean_words_len).to_numpy()

    return dff

# Hand-crafted readability features (plus `target`) for every training row.
dff = get_features(df)
dff

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [None]:
class MyTestDataset(Dataset):
    """Torch dataset that tokenizes the ``excerpt`` column on demand.

    Each item is the tokenizer output for one excerpt, padded/truncated to
    320 tokens, with every field converted to a ``torch.Tensor``.
    """

    def __init__(self, df, nlp_model):
        """Keep the excerpt/id arrays of ``df`` and load the tokenizer.

        Args:
            df: DataFrame with ``excerpt`` and ``id`` columns.
            nlp_model: Name or path handed to ``AutoTokenizer.from_pretrained``.
        """
        self.excerpt = df['excerpt'].values
        self.id = df['id'].values
        self.tokenizer = AutoTokenizer.from_pretrained(nlp_model)

    def __len__(self):
        """Number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Tokenize excerpt ``idx`` and return its fields as tensors."""
        encoded = self.tokenizer(
            self.excerpt[idx],
            padding='max_length',
            truncation=True,
            max_length=320,
        )

        # Convert in place so the tokenizer's own mapping object is what gets returned.
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field])

        return encoded
    
class FeatureExtractor(nn.Module):
    """Frozen transformer that turns a tokenized batch into one vector per text.

    The vector is the average of the second-to-last hidden-state tensor over
    the sequence dimension. When an ``attention_mask`` is supplied, only
    positions with mask value 1 contribute, so padding tokens no longer dilute
    the embedding of short texts.
    """

    def __init__(self, nlp_model):
        """Load the pretrained model and put it in eval mode.

        Args:
            nlp_model: Name or path handed to ``AutoModel.from_pretrained``.
        """
        super().__init__()

        self.model = AutoModel.from_pretrained(nlp_model)
        self.model.eval()

    def forward(self, **ipt):
        """Return pooled features for a batch, computed without gradients.

        Args:
            **ipt: Keyword inputs forwarded to the wrapped model (e.g.
                ``input_ids``, ``attention_mask``). ``output_hidden_states``
                is always forced to ``True``.

        Returns:
            Tensor with the sequence dimension (dim 1) averaged away; the
            hidden states are assumed to be (batch, seq_len, hidden).
        """
        ipt['output_hidden_states'] = True
        mask = ipt.get('attention_mask')

        with torch.no_grad():
            hidden = self.model(**ipt).hidden_states[-2]

            if mask is None:
                # No mask available: fall back to a plain mean over all positions.
                return hidden.mean(dim=1)

            weights = mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            return (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        
def get_bert_features(df,
                      nlp_model='../input/huggingface-transformers-download/bert-base-cased',
                      batch_size=32,
                      num_workers=4):
    """Compute one pooled transformer embedding per row of ``df``.

    Args:
        df: Non-empty DataFrame accepted by ``MyTestDataset``.
        nlp_model: Model name or path used for both tokenizer and encoder.
        batch_size: Number of excerpts per forward pass.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        pandas.DataFrame with one row per input row (input order, fresh
        0..n-1 index) and columns ``bert_0`` .. ``bert_{d-1}``, where ``d`` is
        the width of the extractor output.
    """
    # Use the GPU when there is one, otherwise run on CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ds = MyTestDataset(df, nlp_model)
    fe = FeatureExtractor(nlp_model).to(device)

    chunks = []
    for batch in tqdm(DataLoader(ds, num_workers=num_workers, batch_size=batch_size)):
        batch = {k: v.to(device) for k, v in batch.items()}
        chunks.append(fe(**batch).cpu().numpy())

    feats = np.concatenate(chunks, axis=0)

    # Column count follows the actual embedding width rather than a fixed 768.
    return pd.DataFrame(feats, columns=[f'bert_{x}' for x in range(feats.shape[1])])

In [None]:
# Transformer embedding features for every training row (columns `bert_*`).
dff_bert = get_bert_features(df)
dff_bert

In [None]:
# Join the readability features and the embedding features column-wise.
# NOTE: this rebinds `dff`, so re-running the cell without re-running the cell
# that created `dff` appends the `bert_*` columns a second time.
dff = pd.concat((dff, dff_bert), axis=1)
dff

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

class ContinuousStratifiedKFold(StratifiedKFold):
    """StratifiedKFold for a continuous target.

    The target is discretised into equal-width bins and the folds are
    stratified on the bin labels instead of the raw values.
    """

    def split(self, X, y, groups=None):
        """Yield the parent class's splits, stratified on binned ``y``.

        The bin count is ``floor(1 + log2(len(y)))``.
        """
        n_bins = int(np.floor(np.log2(len(y)) + 1))
        bin_labels = pd.cut(y, bins=n_bins, labels=False)
        return super().split(X, bin_labels, groups)

# 5-fold cross-validated XGBoost training; one booster per fold is kept in `models`.
kf = ContinuousStratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# specify parameters via map (identical for every fold, so defined once)
param = {'objective':'reg:squarederror', 'tree_method': 'gpu_hist', 'eta': 0.1, 'nthread': 4, 'eval_metric': 'rmse'}
num_round = 1000

models = []
for fold_idx, (train, val) in enumerate(kf.split(dff, dff['target'])):
    print('training fold', fold_idx)
    # kf.split yields positional row numbers, so select with .iloc rather than
    # .loc (which would treat them as index labels).
    df_train = dff.iloc[train]
    df_val = dff.iloc[val]

    # read in data
    dtrain = xgb.DMatrix(df_train.drop('target', axis=1), label=df_train['target'])
    dval = xgb.DMatrix(df_val.drop('target', axis=1), label=df_val['target'])

    bst = xgb.train(param, dtrain, num_round, evals=[(dval, 'val')], verbose_eval=50, early_stopping_rounds=200)

    # xgb.train returns the booster from the LAST round, not the best one, and
    # the native predict() scores with every round. Keep only the rounds up to
    # the best validation score so that both the metric below and the later
    # test-time predictions use the early-stopped model.
    # (Booster slicing requires xgboost >= 1.3.)
    bst = bst[: bst.best_iteration + 1]

    pred = bst.predict(dval)
    mse = ((pred - df_val['target']) ** 2).mean()
    rmse = np.sqrt(mse)

    print('mse:', mse, 'rmse:', rmse)

    models.append(bst)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Importance chart (top 100 features) for `bst`, i.e. the booster left over
# from the last training fold.
fig, ax = plt.subplots(figsize=(20, 40))
xgb.plot_importance(bst, ax=ax, max_num_features=100)

In [None]:
# Load the test set and add a placeholder `target` column: get_features()
# reads `row.target` for every row. The placeholder is dropped again before
# the feature matrix is passed to the models.
df_test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
df_test['target'] = 0
df_test

In [None]:
# Build the test feature table the same way as the training one:
# readability features first, then the embedding columns, joined column-wise.
dff_test = get_features(df_test)
dff_test_bert = get_bert_features(df_test)
dff_test = pd.concat((dff_test, dff_test_bert), axis=1)
dff_test

In [None]:
# Score the test features with every fold model.
# The feature matrix is identical for all models, so build the DMatrix once.
dtest = xgb.DMatrix(dff_test.drop('target', axis=1))

# One row of predictions per model, one column per test row.
preds = np.array([m.predict(dtest) for m in models])
preds

In [None]:
# Submission frame: the test ids plus the per-row mean of the fold models' predictions.
df_rst = df_test[['id']].copy()
df_rst['target'] = preds.mean(axis=0)
df_rst

In [None]:
# Write the submission file without the row-index column.
df_rst.to_csv('submission.csv', index=False)