In [None]:
# Offline installation: every package is installed from local files under ../input.
# stdout of each command is redirected to /dev/null to keep the cell output short
# (stderr is not redirected, so errors are still shown).
!pip install ../input/lama-whl/LightAutoML-0.2.14-py3-none-any.whl > /dev/null
# Wheels for the text-feature helper packages.
!pip install ../input/packages-for-creating-text-features/*.whl > /dev/null
# ReadabilityCalculator is installed from an unpacked source directory.
!pip install ../input/packages-for-creating-text-features/ReadabilityCalculator-0.2.37/ReadabilityCalculator-0.2.37 > /dev/null
# Copy the textstat and pyphen source archives into the working directory and unpack them.
!cp ../input/textstat-pre/dist/textstat-0.7.1.tar . > /dev/null
!cp ../input/pyphen-gz/pyphen-0.11.0.tar . > /dev/null
!tar -xvf textstat-0.7.1.tar > /dev/null
!tar -xvf pyphen-0.11.0.tar > /dev/null
# Build and install pyphen first, then textstat
# (presumably because textstat depends on pyphen — TODO confirm).
!cd pyphen-0.11.0 && python setup.py build > /dev/null && python setup.py install > /dev/null
!cd textstat-0.7.1 && python setup.py build > /dev/null && python setup.py install > /dev/null

In [None]:
# !python setup.py build > /dev/null
# !python setup.py install > /dev/null

In [None]:
import textstat
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import re
from sklearn.feature_extraction.text import TfidfVectorizer

import transformers
import torch
from transformers import BertTokenizer

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold

import lightgbm as lgb

from fastprogress.fastprogress import  progress_bar
 
from sklearn.metrics import mean_squared_error
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

In [None]:
# Load the competition files: sample submission, training set and test set.
ss = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
# Display the smallest and the largest value of the target column.
train_df.target.min(), train_df.target.max()

In [None]:
TIMEOUT = 15_000 # Time budget in seconds, passed as `timeout` to TabularNLPAutoML
TARGET_NAME = 'target' # Target column name; used in `roles` and when scoring the predictions

In [None]:
def rmse(x, y):
    """Return the root mean squared error between ``x`` and ``y``."""
    squared_error = mean_squared_error(x, y)
    return np.sqrt(squared_error)


# Regression task evaluated with the custom RMSE metric defined above.
task = Task('reg', metric=rmse)

# Column roles handed to the AutoML run: the target, the raw text column
# and the columns that are dropped.
roles = {
    'target': TARGET_NAME,
    'text': ['excerpt'],
    'drop': ['id', 'standard_error', 'url_legal', 'license'],
}

# preprocess

In [None]:
def preprocess(data):
    """Normalise every excerpt into a space-joined string of lemmatised content words.

    For each value of ``data['excerpt']`` the text is reduced to ASCII letters,
    lower-cased, tokenised with NLTK, stripped of English stop words and
    lemmatised with the WordNet lemmatizer.

    Args:
        data: object indexable by ``'excerpt'`` that yields the raw texts
            (a DataFrame in this notebook).

    Returns:
        list[str]: one cleaned string per excerpt, in input order.
    """
    # Build these once: the original rebuilt the stop-word set for every single
    # token and the lemmatizer for every excerpt, which dominated the runtime.
    stop_words = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    excerpt_processed = []
    for e in progress_bar(data['excerpt']):
        # replace everything that is not an ASCII letter by a space, then lower-case
        e = re.sub("[^a-zA-Z]", " ", e).lower()

        # tokenize, drop stop words, lemmatize
        tokens = nltk.word_tokenize(e)
        tokens = [lemma.lemmatize(word) for word in tokens if word not in stop_words]

        excerpt_processed.append(" ".join(tokens))

    return excerpt_processed


train_df['excerpt_preprocessed'] = preprocess(train_df)
#test_df["excerpt_preprocessed"] = preprocess(test_df)

# Handcrafted features from Kaggle notebooks

In [None]:
from textblob.tokenizers import SentenceTokenizer, WordTokenizer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
#import textstat
plt.style.use('seaborn-talk')
from readcalc import readcalc
from sklearn.preprocessing import StandardScaler
import joblib

import spacy
sp = spacy.load('en_core_web_sm')

def pos_to_id(pos_name):
    """Return the ``orth`` id stored in the spaCy vocabulary for ``pos_name``."""
    lexeme = sp.vocab[pos_name]
    return lexeme.orth

# Part-of-speech tags that are counted as content words.
content_poss = ['ADJ', 'NOUN', 'VERB', 'ADV']

def count_poss(text, poss_names):
    """Count the tokens of ``text`` whose part-of-speech is one of ``poss_names``.

    The text is parsed with the loaded spaCy pipeline ``sp``; tags that do not
    occur in the text contribute 0.
    """
    doc = sp(text)
    freq_by_pos = doc.count_by(spacy.attrs.POS)
    return sum(freq_by_pos.get(pos_to_id(name), 0) for name in poss_names)


# Quick sanity check of count_poss on a short phrase.
count_poss('my name is', ['PRON', 'NOUN'])

# !pip download textstat ReadabilityCalculator 
# !pip install *.whl

# TextBlob tokenizers. NOTE(review): in the visible notebook they are only
# referenced from commented-out code — confirm they are still needed.
sent_tokenizer = SentenceTokenizer()
word_tokenizer = WordTokenizer()

# with open('../input/clrauxdata/dale-chall-3000-words.txt') as f:
#     words = f.readlines()[0].split()
    
# common_words = dict(zip(words, [True] * len(words)))
# # df.sent_cnt.plot(kind='kde')

# Raw count columns written by create_handcrafted_features; the commented-out
# call below that function drops them once the ratio features are derived.
feats_to_drop = ['sents_n', 'words_n', 'long_words_n',
                 #'difficult_words_n',
                 'content_words_n', 'prons_n', 'chars_n', 'syllables_n']


# Names of the document-level features.
# NOTE(review): create_handcrafted_features writes most of these columns with a
# trailing underscore (e.g. 'chars_per_word_'), and this list is not referenced
# anywhere else in the visible notebook — confirm before using it to select columns.
doc_feats = ['chars_per_word', 'chars_per_sent', 'syllables_per_word',
       'syllables_per_sent', 'words_per_sent', 'long_words_doc_ratio',
       'difficult_words_doc_ratio', 'prons_doc_ratio', 'flesch_reading_ease',
       'flesch_kincaid_grade', 'ari', 'cli', 'gunning_fog', 'lix', 'rix',
       'smog', 'dcrs', 'lexical_diversity', 'content_diversity', 'lwf']

def _lexical_diversity(text):
    """Return distinct words / all words of ``text`` (0.0 when it has no words)."""
    import string  # local import: only this helper needs it

    # strip punctuation and lower-case so that "The" and "the," count as one word
    words = text.translate(str.maketrans('', '', string.punctuation)).lower().split()
    return len(set(words)) / len(words) if words else 0.0


def create_handcrafted_features(df):
    """Add hand-crafted readability features computed from ``df.excerpt``.

    The frame is modified in place and also returned. Three groups of columns
    are written:

    1. raw counts (``*_n``: sentences, words, long words, content words,
       pronouns, characters, syllables);
    2. ratios of those counts per word / per sentence;
    3. readability indices from ``textstat`` and ``readcalc`` plus diversity ratios.

    Most derived columns carry a trailing underscore; ``prons_doc_ratio`` and
    ``gunning_fog`` do not, and their names are kept as they were.

    Args:
        df: DataFrame with a string column ``excerpt``.

    Returns:
        The same DataFrame with the feature columns added.
    """
    df['sents_n'] = df.excerpt.apply(textstat.sentence_count)
    df['words_n'] = df.excerpt.apply(textstat.lexicon_count)
    df['long_words_n'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_words_longer_than_X(6))
    #df['difficult_words_n'] = df.excerpt.apply(lambda t: sum([bool(common_words.get(word)) for word in word_tokenizer.tokenize(t, include_punc=False)]))
    df['content_words_n'] = df.excerpt.apply(lambda t: count_poss(t, content_poss))
    df['prons_n'] = df.excerpt.apply(lambda t: count_poss(t, ['PRON']))
    df['chars_n'] = df.excerpt.str.len()
    df['syllables_n'] = df.excerpt.apply(textstat.syllable_count)
    print('\tstage 1 finished..')

    df['chars_per_word_'] = df.chars_n / df.words_n
    df['chars_per_sent_'] = df.chars_n / df.sents_n
    df['syllables_per_word_'] = df.syllables_n / df.words_n
    df['syllables_per_sent_'] = df.syllables_n / df.sents_n

    df['words_per_sent_'] = df.words_n / df.sents_n
    df['long_words_doc_ratio_'] = df.long_words_n / df.words_n
    #df['difficult_words_doc_ratio'] = df.difficult_words_n / df.words_n
    df['prons_doc_ratio'] = df.prons_n / df.words_n

    print('\tstage 2 finished..')

    df['flesch_reading_ease_'] = df.excerpt.apply(textstat.flesch_reading_ease)
    df['flesch_kincaid_grade_'] = df.excerpt.apply(textstat.flesch_kincaid_grade)
    df['ari_'] = df.excerpt.apply(textstat.automated_readability_index)
    df['cli_'] = df.excerpt.apply(textstat.coleman_liau_index)
    df['gunning_fog'] = df.excerpt.apply(textstat.gunning_fog)

    df['lix_'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_lix_index())
    df['rix_'] = df.long_words_n / df.sents_n
    df['smog_'] = df.excerpt.apply(lambda t: readcalc.ReadCalc(t).get_smog_index())
    df['dcrs_'] = df.excerpt.apply(textstat.dale_chall_readability_score)

    # Per-excerpt type/token ratio. The previous expression,
    # len(set(df.words_n)) / df.words_n, used the number of distinct word
    # COUNTS in the whole frame as numerator, i.e. one constant for every row.
    df['lexical_diversity_'] = df.excerpt.apply(_lexical_diversity)
    df['content_diversity_'] = df.content_words_n / df.words_n
    df['lwf_'] = df.excerpt.apply(textstat.linsear_write_formula)

    print('\tstage 3 finished..')
    return df


# train_df = create_handcrafted_features(train_df)
# train_df.drop(feats_to_drop, inplace=True, axis=1)

# TextStat

In [None]:
def text_2_statistics(data):
    """Compute a set of ``textstat`` scores for every excerpt.

    Args:
        data: object indexable by ``'excerpt'`` that yields the texts.

    Returns:
        dict: score name -> list with one value per excerpt, in input order.
    """
    # score name -> callable; the insertion order is the key order of the result
    # and the order in which the scores are computed for each excerpt
    scorers = {
        'flesch_reading_ease': textstat.flesch_reading_ease,
        'smog_index': textstat.smog_index,
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
        'coleman_liau_index': textstat.coleman_liau_index,
        'automated_readability_index': textstat.automated_readability_index,
        'dale_chall_readability_score': textstat.dale_chall_readability_score,
        'difficult_words': textstat.difficult_words,
        'linsear_write_formula': textstat.linsear_write_formula,
        'gunning_fog': textstat.gunning_fog,
        'text_standard': lambda txt: textstat.text_standard(txt, float_output=True),
        'fernandez_huerta': textstat.fernandez_huerta,
        'szigriszt_pazos': textstat.szigriszt_pazos,
        'gutierrez_polini': textstat.gutierrez_polini,
        'crawford': textstat.crawford,
    }

    statistics_dict = {score_name: [] for score_name in scorers}
    for excerpt in progress_bar(data['excerpt']):
        for score_name, scorer in scorers.items():
            statistics_dict[score_name].append(scorer(excerpt))

    return statistics_dict


# Compute the scores for the training set and store each one as a column.
statistics_dict = text_2_statistics(train_df)
for stat_name, stat_values in statistics_dict.items():
    train_df[stat_name] = stat_values
            
    

#train_txt_stat = pd.DataFrame(statistics_dict)




# TF-IDF

In [None]:
# TF-IDF weights of the 1000 most frequent terms of the preprocessed excerpts,
# stored as one column per term.
vectorizer = TfidfVectorizer(max_features=1000)

train_bags = vectorizer.fit_transform(train_df['excerpt_preprocessed'].values).toarray()

# scikit-learn replaced get_feature_names() by get_feature_names_out();
# use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_terms = list(vectorizer.get_feature_names_out())
else:
    tfidf_terms = list(vectorizer.get_feature_names())

# A vocabulary term with the same name as one of these columns (e.g. the word
# "target") would silently overwrite it; such terms get a 'tfidf_' prefix instead.
protected_columns = {TARGET_NAME, 'excerpt_preprocessed', *roles['text'], *roles['drop']}
tfidf_columns = [f'tfidf_{term}' if term in protected_columns else term
                 for term in tfidf_terms]

train_bag_of_words_df = pd.DataFrame(train_bags, columns=tfidf_columns)

for col in train_bag_of_words_df.columns:
    train_df[col] = train_bag_of_words_df[col].values

del train_bag_of_words_df
# train_df.head()

    
#  -------------------

def count_words_in_sentences(data):
    """Return the number of whitespace-separated words of every preprocessed excerpt.

    Args:
        data: object indexable by ``'excerpt_preprocessed'`` that yields strings.

    Returns:
        list[int]: one word count per excerpt, in input order.
    """
    return [len(text.split()) for text in progress_bar(data['excerpt_preprocessed'])]

train_df['excerpt_word_counts_by_preprocessed'] = count_words_in_sentences(train_df)


# NLTK features

In [None]:
from typing import List, Dict, Union

import nltk
import numpy as np
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree



In [None]:
!ls 

In [None]:


def get_named_entities(text: str) -> List[str]:
    """Extract named entities from ``text`` with NLTK's ``ne_chunk``.

    The text is tokenised, POS-tagged and chunked. Consecutive entity subtrees
    are merged into a single space-joined entity string.

    Args:
        text: raw input text.

    Returns:
        The entities in order of appearance (duplicates are kept); an empty
        list when the text contains none.
    """
    continuous_chunk = []
    current_chunk = []

    for node in ne_chunk(pos_tag(word_tokenize(text))):
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        elif current_chunk:
            continuous_chunk.append(" ".join(current_chunk))
            current_chunk = []

    # Flush an entity that ends the text. Only do so when one is pending:
    # appending unconditionally added a spurious "" entity to almost every
    # result and inflated every entity count derived from it.
    if current_chunk:
        continuous_chunk.append(" ".join(current_chunk))

    return continuous_chunk


# POS tags that get their own feature columns.
_raw_tags = frozenset(
    {
        'LS', 'TO', 'VBN', "''",
        'WP', 'UH', 'VBG', 'JJ',
        'VBZ', '--', 'VBP', 'NN',
        'DT', 'PRP', ':', 'WP$',
        'NNPS', 'PRP$', 'WDT',
        '(', ')', '.', ',', '``',
        '$', 'RB', 'RBR', 'RBS',
        'VBD', 'IN', 'FW', 'RP',
        'JJR', 'JJS', 'PDT', 'MD',
        'VB', 'WRB', 'NNP', 'EX',
        'NNS', 'SYM', 'CC', 'CD', 'POS'
    }
)

# Coarse tag groups: 'g' + the first two characters of a raw tag.
_general_tags = frozenset(
    {'gVB', 'gNN', 'gPR', 'gWP', 'gRB', 'gJJ'}
)

# All tags in a fixed order. Iterating the frozensets directly yields an order
# that depends on string hash randomisation, so the order of the generated
# feature columns changed from one kernel session to the next; sorting makes
# it reproducible.
_tagset = (
    *sorted(_raw_tags),
    *sorted(_general_tags)
)


def generate_text_features(text: str) -> Dict[str, Union[int, float]]:
    """Compute POS-tag, length and named-entity features for ``text``.

    For every tag in ``_tagset`` four features are produced:

    * ``TOTAL_<tag>``: share of the tag among all tokens of the text;
    * ``MAX_<tag>`` / ``MIN_<tag>`` / ``MEAN_<tag>``: largest, smallest and
      average share of the tag within a single sentence.

    In addition: sentence and word counts, mean/std of words per sentence and
    of word length, the number of distinct tags seen, and named entities per
    sentence. Tags that are not part of ``_tagset`` are ignored for the
    per-tag shares. Empty text yields zeros instead of raising.

    Args:
        text: raw input text.

    Returns:
        dict mapping feature name to its numeric value.
    """
    total_count = dict.fromkeys(_tagset, 0)
    tokenized_text = nltk.word_tokenize(text)
    # guard: text without any token must not raise ZeroDivisionError
    inv_text_len = 1 / len(tokenized_text) if tokenized_text else 0
    for word, pos in nltk.pos_tag(tokenized_text):
        # a tag missing from _tagset is skipped instead of raising KeyError
        if pos in total_count:
            total_count[pos] += inv_text_len
        general_tag = f'g{pos[:2]}'
        if general_tag in _general_tags:
            total_count[general_tag] += inv_text_len

    max_in_sent = dict.fromkeys(_tagset, 0)
    # Start the minimum at +inf: starting at 0 (as before) pinned every MIN_*
    # feature to 0, because the per-sentence shares are never negative.
    min_in_sent = dict.fromkeys(_tagset, float('inf'))
    mean_in_sent = dict.fromkeys(_tagset, 0)
    general_tags = set()
    tags = set()

    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    num_words = []
    words_len = []

    for sentence in map(nltk.word_tokenize, sentences):
        cur_sentence_stat = dict.fromkeys(_tagset, 0)
        num_words.append(len(sentence))
        inv_sent_len = 1 / len(sentence) if sentence else 0
        for word, pos in nltk.pos_tag(sentence):
            words_len.append(len(word))
            if pos in cur_sentence_stat:
                cur_sentence_stat[pos] += inv_sent_len
            tags.add(pos)
            general_tag = f'g{pos[:2]}'
            if general_tag in _general_tags:
                general_tags.add(general_tag)
                cur_sentence_stat[general_tag] += inv_sent_len
        for tag in _tagset:
            max_in_sent[tag] = max(max_in_sent[tag], cur_sentence_stat[tag])
            min_in_sent[tag] = min(min_in_sent[tag], cur_sentence_stat[tag])
            mean_in_sent[tag] += cur_sentence_stat[tag] / num_sentences

    if not num_sentences:
        # no sentence was seen, so there is no minimum: report 0 rather than inf
        min_in_sent = dict.fromkeys(_tagset, 0)

    res = {}
    for k, v in total_count.items():
        res[f'TOTAL_{k}'] = v
    for k, v in max_in_sent.items():
        res[f'MAX_{k}'] = v
    for k, v in min_in_sent.items():
        res[f'MIN_{k}'] = v
    for k, v in mean_in_sent.items():
        res[f'MEAN_{k}'] = v

    num_words = np.array(num_words)
    words_len = np.array(words_len)
    res['NUM_SENTENCES'] = len(num_words)
    # mean()/std() of an empty array would give NaN plus a RuntimeWarning
    res['MEAN_NUM_WORDS'] = num_words.mean() if num_words.size else 0.0
    res['STD_NUM_WORDS'] = num_words.std() if num_words.size else 0.0
    res['NUM_WORDS'] = len(words_len)
    res['MEAN_WORD_LEN'] = words_len.mean() if words_len.size else 0.0
    res['STD_WORD_LEN'] = words_len.std() if words_len.size else 0.0
    res['TAGS_UNIQUE'] = len(tags)
    res['GENERAL_TAGS_UNIQUE'] = len(general_tags)

    named_entities = get_named_entities(text)
    if num_sentences:
        res['NAMED_ENTITIES_PER_SENTENCE'] = len(named_entities) / num_sentences
        res['UNIQUE_NAMED_ENTITIES_PER_SENTENCE'] = len(set(named_entities)) / num_sentences
    else:
        res['NAMED_ENTITIES_PER_SENTENCE'] = 0.0
        res['UNIQUE_NAMED_ENTITIES_PER_SENTENCE'] = 0.0
    return res


def max_word_lenght(sentence):
    """Return the length of the longest whitespace-separated word of ``sentence``.

    Returns 0 for an empty or whitespace-only string (previously ``max()`` of
    an empty sequence raised ``ValueError``).
    """
    return max((len(word) for word in sentence.split()), default=0)


def get_all_nltk_feats(text):
    """Collect all NLTK-based features of ``text`` in a one-row column mapping.

    Combines ``generate_text_features`` with the named-entity count and the
    maximum word length.

    Returns:
        dict: feature name -> single-element list holding the value.
    """
    feats = generate_text_features(text)
    feats['number_get_named_entities'] = len(get_named_entities(text))
    feats['max_word_lenght'] = max_word_lenght(text)
    return {name: [value] for name, value in feats.items()}
    
# txt = 'Say hello to my little friend, Bro! I love you, Sarra!'
# nltk_feats = get_all_nltk_feats(txt)
# nltk_feats




In [None]:
#txt = 'Say hello to my little friend, Bro! I love you, Sarra!'
#nltk_feats = count_part_of_speechs(txt)

# Collect one flat record per excerpt and build the frame once. Growing the
# frame with DataFrame.append inside the loop copied it on every iteration,
# left every row with index label 0, and DataFrame.append no longer exists in
# pandas 2.x.
nltk_feature_rows = []
for txt in progress_bar(train_df['excerpt']):
    nltk_feats_dict = get_all_nltk_feats(txt)
    # get_all_nltk_feats wraps every value in a one-element list; unwrap it
    nltk_feature_rows.append({name: values[0] for name, values in nltk_feats_dict.items()})

nltk_feats_df = pd.DataFrame(nltk_feature_rows)

# Copy the features into the training frame by position.
for col in nltk_feats_df.columns:
    train_df[col] = nltk_feats_df[col].values


train_df.head()
    

# K-Best filtering

In [None]:
# from sklearn.feature_selection import SelectKBest, f_regression

# nltk_feats_df.fillna(0, inplace=True)


# feature_names = list(nltk_feats_df.columns.values)

# kb = SelectKBest(f_regression, k=60)
# kb.fit(nltk_feats_df, train_df['target'])


# mask = kb.get_support() #list of booleans
# new_features = [] # The list of your K best features

# for bool, feature in zip(mask, feature_names):
#     if bool:
#         new_features.append(feature)

# nltk_feats_df = pd.DataFrame(kb.transform(nltk_feats_df), columns=new_features)
# # nltk_feats_df = pd.DataFrame(kb.transform(X_test))


 

In [None]:
nltk_feats_df['target'] = train_df['target']

corr = abs(nltk_feats_df.corr())
 

import seaborn as sns
%matplotlib inline

# calculate the correlation matrix
#corr = abs(train_df.corr())


from matplotlib.pyplot import figure

figure(figsize=(10, 32), dpi=100)

# plot the heatmap
sns.heatmap(corr, 
        xticklabels=['target'],
        yticklabels=corr.columns)


In [None]:
def preprocess_text(df):
    """Add simple surface-level count columns derived from ``df['excerpt']``.

    The frame is modified in place; nothing is returned. All counts are taken
    on the excerpt with leading/trailing whitespace stripped:

    * ``len_tokens``: number of pieces when splitting on a single space;
    * ``len``: number of characters;
    * ``len_sent``: number of pieces when splitting on ``'.'``;
    * ``n_comm``: number of pieces when splitting on ``','``;
    * ``d_mean``: number of space-separated pieces that are all digits;
    * ``u_mean``: number of space-separated pieces that are upper-case.
    """
    stripped = df['excerpt'].str.strip()
    space_tokens = stripped.str.split(' ')

    df['len_tokens'] = space_tokens.apply(len)
    df['len'] = stripped.apply(len)
    df['len_sent'] = stripped.str.split('.').apply(len)
    df['n_comm'] = stripped.str.split(',').apply(len)

    digit_counts, upper_counts = [], []
    for tokens in space_tokens.values:
        digit_counts.append(np.sum([tok.isdigit() for tok in tokens]))
        upper_counts.append(np.sum([tok.isupper() for tok in tokens]))
    df['d_mean'] = digit_counts
    df['u_mean'] = upper_counts
    
# Add the surface-level count columns to the training frame (modifies it in place).
preprocess_text(train_df)

In [None]:
# It is important to check the number here!

print(train_df.shape)
train_df.head()

In [None]:
# Configure the LightAutoML text preset: the listed algorithms form a single
# level, with a roberta-base checkpoint loaded from ../input for both the
# text embeddings and the neural network.
automl = TabularNLPAutoML(task=task,
                          timeout=TIMEOUT,
                          general_params={'nested_cv': False, 'use_algos': [['linear_l2','nn', 'lgb', 'lgb_tuned', 'cb',]]},
                          text_params={'lang': 'en', 'bert_model': '../input/roberta-base'},
                          reader_params={'cv': 5},
                          selection_params={'mode': 1},
                          linear_pipeline_params={'text_features': 'embed'},
                          autonlp_params={'model_name': 'pooled_bert',
                                          'transformer_params': {'model_params': {'pooling': 'mean'},
                                                                 'dataset_params': {'max_length': 220}, # changed max_length (it was 220)
                                                                 'loader_params': {'batch_size': 64,
                                                                                   'shuffle': False,
                                                                                   'num_workers': 4}
                                                                 }
                                          },
                          nn_params={'opt_params': {'lr': 3e-5},
                                     'lang': 'en',
                                     'path_to_save': './models',
                                     'bert_name': '../input/roberta-base',
                                     'snap_params': {'k': 1, 'early_stopping': True,
                                                     'patience': 2, 'swa': False},
                                     'init_bias': False,
                                     'pooling': 'mean',
                                     'max_length': 220, 'bs': 32, 'n_epochs': 20, # changed max_length (it was 220)
                                     'use_cont': False,
                                     'use_cat': False,
                                     },
                          )

# Fit on the training frame; the returned predictions are presumably
# out-of-fold (as the variable name suggests) — TODO confirm.
oof_pred = automl.fit_predict(train_df, roles=roles)
print('')
# RMSE of the first prediction column against the true target.
print(rmse(train_df[TARGET_NAME], oof_pred.data[:, 0]))

In [None]:
from lightautoml.addons.interpretation import LimeTextExplainer
# LIME text explainer wrapped around the fitted AutoML model.
lime = LimeTextExplainer(automl, feature_selection='lasso', force_order=False)

In [None]:
# Explain the prediction for training row 0 by perturbing its 'excerpt' text,
# render the explanation and print the row's true target.
# Note: `df` holds a single row here, not a whole frame.
df = train_df.iloc[0]
exp = lime.explain_instance(df, perturb_column='excerpt')
exp.visualize_in_notebook()
print(df[TARGET_NAME])

In [None]:
# Same explanation as for the previous row, now for training row 1.
df = train_df.iloc[1]
exp = lime.explain_instance(df, perturb_column='excerpt')
exp.visualize_in_notebook()
print(df[TARGET_NAME])

In [None]:
# df = train_df.iloc[100]
# exp = lime.explain_instance(df, perturb_column='excerpt')
# exp.visualize_in_notebook()
# print(df[TARGET_NAME])

In [None]:
# Explain the prediction for training row 777 by perturbing its 'excerpt' text,
# render the explanation and print the row's true target.
df = train_df.iloc[777]
exp = lime.explain_instance(df, perturb_column='excerpt')
exp.visualize_in_notebook()
print(df[TARGET_NAME])

In [None]:
# df = train_df.iloc[2222]
# exp = lime.explain_instance(df, perturb_column='excerpt')
# exp.visualize_in_notebook()
# print(df[TARGET_NAME])

In [None]:
import pickle
# Persist the fitted AutoML object to the working directory.
# Only unpickle this file from a trusted source: loading a pickle executes code.
with open('LAMA_model.pkl', 'wb') as f:
    pickle.dump(automl, f)

In [None]:
!rm text* -r
!rm pyp* -r

In [None]:
!ls