In [None]:
# Install the text-feature dependencies from files under ../input rather than
# from an index: first every wheel in the packages folder, then
# ReadabilityCalculator 0.2.37 from its own directory (presumably an unpacked
# source tree -- verify against the dataset contents).
!pip install ../input/packages-for-creating-text-features/*.whl
!pip install ../input/packages-for-creating-text-features/ReadabilityCalculator-0.2.37/ReadabilityCalculator-0.2.37



# NOTE(review): seaborn and matplotlib.pyplot are each imported twice below;
# the repeats are harmless but redundant.
from textblob.tokenizers import SentenceTokenizer, WordTokenizer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
import textstat
# Global plotting style. NOTE(review): newer matplotlib releases renamed the
# bundled seaborn styles (e.g. 'seaborn-v0_8-talk') -- confirm this name still
# exists in the installed version.
plt.style.use('seaborn-talk')
from readcalc import readcalc
from sklearn.preprocessing import StandardScaler
import joblib

import spacy
# Module-level spaCy pipeline, loaded once and shared by pos_to_id/count_poss.
sp = spacy.load('en_core_web_sm')

def pos_to_id(pos_name):
    """Map a part-of-speech name (e.g. 'NOUN') to its integer ID.

    The name is looked up in the vocabulary of the module-level pipeline `sp`
    and the `orth` value of the resulting entry is returned. `count_poss` uses
    that value as a key into the dictionary produced by `Doc.count_by`.
    """
    vocab_entry = sp.vocab[pos_name]
    return vocab_entry.orth

content_poss = ['ADJ', 'NOUN', 'VERB', 'ADV']

def count_poss(text, poss_names):
    """Count the tokens of `text` whose part of speech is in `poss_names`.

    Args:
        text: Raw string; it is parsed with the module-level pipeline `sp`.
        poss_names: Iterable of part-of-speech names (e.g. ['PRON', 'NOUN']).

    Returns:
        The summed frequency of the requested parts of speech. A name that
        does not occur in the text contributes 0.
    """
    doc = sp(text)
    freq_by_pos = doc.count_by(spacy.attrs.POS)

    total = 0
    for name in poss_names:
        total += freq_by_pos.get(pos_to_id(name), 0)
    return total


# Smoke test: run count_poss once on a short phrase. The return value is not
# stored or used by the statements that follow.
count_poss('my name is', ['PRON', 'NOUN'])

# !pip download textstat ReadabilityCalculator 
# !pip install *.whl

# textblob tokenizers shared by the feature code below.
word_tokenizer = WordTokenizer()
sent_tokenizer = SentenceTokenizer()

# Word list used for the `difficult_words_n` feature. Only the first line of
# the file is read and split on whitespace -- presumably the file keeps every
# word on a single line; verify against the data file.
with open('../input/clrauxdata/dale-chall-3000-words.txt') as f:
    first_line = f.readlines()[0]
    words = first_line.split()

# word -> True lookup table for fast membership checks.
common_words = dict.fromkeys(words, True)

# Intermediate count columns produced by create_handcrafted_features.
feats_to_drop = [
    'sents_n',
    'words_n',
    'long_words_n',
    'difficult_words_n',
    'content_words_n',
    'prons_n',
    'chars_n',
    'syllables_n',
]

# Document-level feature columns that are scaled and passed to the models.
doc_feats = [
    'chars_per_word',
    'chars_per_sent',
    'syllables_per_word',
    'syllables_per_sent',
    'words_per_sent',
    'long_words_doc_ratio',
    'difficult_words_doc_ratio',
    'prons_doc_ratio',
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'ari',
    'cli',
    'gunning_fog',
    'lix',
    'rix',
    'smog',
    'dcrs',
    'lexical_diversity',
    'content_diversity',
    'lwf',
]

def create_handcrafted_features(df):
    """Add handcrafted readability features for every row's `excerpt`.

    The columns are written onto `df` itself (the frame is modified in place)
    and the same object is returned. Work is done in three stages, each
    followed by a progress print:

      1. raw counts (`*_n` columns) computed from the excerpt text,
      2. per-word / per-sentence ratios derived from those counts,
      3. readability indices from `textstat` and `readcalc`, plus two
         diversity ratios.

    Relies on the module-level names `textstat`, `readcalc`, `common_words`,
    `word_tokenizer`, `count_poss` and `content_poss`.

    NOTE(review): the ratios divide by `words_n` / `sents_n` without guarding
    against zero, so an excerpt with no words or sentences would presumably
    yield inf/NaN values -- confirm inputs are never empty.

    Args:
        df: DataFrame with a string column `excerpt`.

    Returns:
        `df`, with the count, ratio and index columns added or overwritten.
    """
    # --- Stage 1: raw counts per excerpt ---
    df['sents_n'] = df.excerpt.apply(textstat.sentence_count)
    df['words_n'] = df.excerpt.apply(textstat.lexicon_count)
    df['long_words_n'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_words_longer_than_X(6))
    # NOTE(review): despite the column name, this counts tokens that ARE found
    # in `common_words` (lookup is case-sensitive as written). The saved
    # scalers/models consume the derived ratio, so confirm against the
    # training code before changing it.
    df['difficult_words_n'] = df.excerpt.apply(lambda t: sum([bool(common_words.get(word)) for word in word_tokenizer.tokenize(t, include_punc=False)]))
    df['content_words_n'] = df.excerpt.apply(lambda t: count_poss(t, content_poss))
    df['prons_n'] = df.excerpt.apply(lambda t: count_poss(t, ['PRON']))
    # Length of the raw string, so whitespace and punctuation are included.
    df['chars_n'] = df.excerpt.str.len()
    df['syllables_n'] = df.excerpt.apply(textstat.syllable_count)
    print('\tstage 1 finished..')

    # --- Stage 2: ratios derived from the stage-1 counts ---
    df['chars_per_word'] = df.chars_n / df.words_n
    df['chars_per_sent'] = df.chars_n / df.sents_n
    df['syllables_per_word'] = df.syllables_n / df.words_n
    df['syllables_per_sent'] = df.syllables_n / df.sents_n

    df['words_per_sent'] = df.words_n / df.sents_n
    df['long_words_doc_ratio'] = df.long_words_n / df.words_n
    df['difficult_words_doc_ratio'] = df.difficult_words_n / df.words_n
    df['prons_doc_ratio'] = df.prons_n / df.words_n

    print('\tstage 2 finished..')

    # --- Stage 3: readability indices computed directly from the text ---
    df['flesch_reading_ease'] = df.excerpt.apply(textstat.flesch_reading_ease)
    df['flesch_kincaid_grade'] = df.excerpt.apply(textstat.flesch_kincaid_grade)
    df['ari'] = df.excerpt.apply(textstat.automated_readability_index)
    df['cli'] = df.excerpt.apply(textstat.coleman_liau_index)
    df['gunning_fog'] = df.excerpt.apply(textstat.gunning_fog)

    df['lix'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_lix_index())
    # Long words per sentence, built from the stage-1 counts.
    df['rix'] = df.long_words_n / df.sents_n
    df['smog'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_smog_index())
    df['dcrs'] = df.excerpt.apply(textstat.dale_chall_readability_score)

    # NOTE(review): `len(set(df.words_n))` is a single number for the whole
    # frame (how many distinct word counts occur across rows), not a per-text
    # count of unique words, so this value depends on which rows are in `df`.
    # Looks unintended, but the saved scalers/models consume this column --
    # confirm against the training code before changing it.
    df['lexical_diversity'] = len(set(df.words_n)) / df.words_n
    df['content_diversity'] = df.content_words_n / df.words_n
    df['lwf'] = df.excerpt.apply(textstat.linsear_write_formula)

    print('\tstage 3 finished..')
    return df

In [None]:

import os
from pathlib import Path
# Root of the attached model dataset and the folder holding its helper scripts.
in_folder_path = Path('../input/clrp-distil-roberta')
scripts_dir = in_folder_path / 'scripts'

In [None]:

# Execute the project's helper scripts inside this notebook's namespace so the
# names they define become available to the cells below (the inference cell
# uses CLRPDataset, Config, torch, DataLoader and SequentialSampler, none of
# which are imported in the visible cells).
# NOTE(review): exec() runs whatever code those files contain; only point this
# at a scripts folder you trust. Importing them as modules would be safer.
os.chdir(scripts_dir)
try:
    exec(Path("imports.py").read_text())
    exec(Path("config.py").read_text())
    exec(Path("dataset.py").read_text())
    exec(Path("model.py").read_text())
finally:
    # `scripts_dir` is a relative path, so if a script raised while the
    # working directory was still the scripts folder, re-running this cell
    # would fail on os.chdir. Always return to the working directory.
    os.chdir('/kaggle/working')

In [None]:
import joblib 

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

# NOTE(review): torch.load / joblib.load unpickle arbitrary objects; only load
# files from a trusted source.
tokenizer = torch.load('../input/tokenizers/roberta-tokenizer.pt')
models_folder_path = Path(in_folder_path / 'models')
models_preds = []
n_models = 5

# The handcrafted features depend only on the excerpts, not on the model, so
# compute them once instead of once per model. create_handcrafted_features
# writes its columns onto test_df and returns it.
raw_feats_df = create_handcrafted_features(test_df)

for model_num in range(n_models):
    # Each model has its own fitted scaler. Scale into a copy so the raw
    # feature values stay available for the next model's scaler.
    scaler = joblib.load(f'../input/clrauxdata/std_scaler{model_num}.pkl')
    new_df = raw_feats_df.copy()
    new_df[doc_feats] = scaler.transform(raw_feats_df[doc_feats])

    print(f'Inference#{model_num+1}/{n_models}')
    test_ds = CLRPDataset(data=new_df, tokenizer=tokenizer, max_len=Config.max_len, is_test=True)
    # Sequential sampling keeps predictions in the same row order as test_df.
    test_sampler = SequentialSampler(test_ds)
    test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=Config.batch_size)
    model = torch.load(models_folder_path / f'best_model_{model_num}.pt').to(Config.device)

    all_preds = []
    model.eval()

    for batch in test_dataloader:
        sent_id = batch['input_ids'].to(Config.device)
        mask = batch['attention_mask'].to(Config.device)
        doc_features = batch['doc_features'].to(Config.device)
        with torch.no_grad():
            preds = model(sent_id, mask, doc_features)
            all_preds += preds.flatten().cpu().tolist()

    # One list of per-row predictions per model.
    models_preds.append(all_preds)

In [None]:
# Stack the per-model prediction lists into one array (one row per model).
models_preds = np.array(models_preds)
print(models_preds.shape)
print(models_preds)

# Ensemble by averaging over the model axis.
all_preds = np.mean(models_preds, axis=0)
print(all_preds.shape)

# Pair each test id with its averaged prediction and write the submission.
submission_columns = {'id': test_df.id, 'target': all_preds}
result_df = pd.DataFrame(submission_columns)

result_df.to_csv('submission.csv', index=False)
result_df.head(10)