In [None]:
!pip install ../input/textstat/Pyphen-0.10.0-py3-none-any.whl
!pip install ../input/textstat/textstat-0.7.0-py3-none-any.whl

In [None]:
!pip install ../input/sentencetransformer/sentence-transformers-1.0.4

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style='darkgrid')

import textstat
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter('ignore')

import sys
sys.path.append('../input/readability-package')

import readability
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
import string
import spacy

import optuna
import scipy

# from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
import sklearn.manifold

# optuna.logging.disable_default_handler()

In [None]:
def clean_text(text):
    """Return ``text`` with every newline character removed.

    Newlines are deleted, not replaced by a space, so the last word of one
    line ends up joined directly to the first word of the next line.
    """
    return ''.join(text.split('\n'))

In [None]:
# Random seed reused by the PCA and the LightGBM parameters further down.
SEED = 567

# CSV with the excerpts plus the fold / target / out-of-fold columns read by later cells.
TRAIN = '../input/clrp-model-selection-from-oof-score/stacking_oof.csv'

df = pd.read_csv(TRAIN)
# Strip newline characters from every excerpt before any feature is computed.
df['excerpt'] = df['excerpt'].apply(clean_text)

# Feature Engineering

## Textstat
Credit: https://www.kaggle.com/gunesevitan/commonlit-readability-prize-eda

In [None]:
# df['n_words'] = df['excerpt'].apply(textstat.lexicon_count)
# df['n_unique_words'] = df['excerpt'].apply(lambda x: len(set(str(x).split())))

# df['mean_word_len'] = df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))
# df['max_word_len'] = df['excerpt'].apply(lambda x: np.max([len(word) for word in str(x).split()]))
# df['min_word_len'] = df['excerpt'].apply(lambda x: np.min([len(word) for word in str(x).split()]))

# Output column -> textstat function; every function is applied to each excerpt.
textstat_metrics = {
    'n_sentence': textstat.sentence_count,
    'n_syllable': textstat.syllable_count,
    'flesch_reading_ease': textstat.flesch_reading_ease,
    'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    'smog_index': textstat.smog_index,
    'automated_readability_index': textstat.automated_readability_index,
    'coleman_liau_index': textstat.coleman_liau_index,
    'linsear_write_formula': textstat.linsear_write_formula,
}
for column, metric in textstat_metrics.items():
    df[column] = df['excerpt'].apply(metric)

# df['gunning_fog'] = df['excerpt'].apply(textstat.gunning_fog)
# df['dale_chall'] = df['excerpt'].apply(textstat.dale_chall_readability_score)
# df['n_difficult_words'] = df['excerpt'].apply(textstat.difficult_words)
# df['consensus'] = df['excerpt'].apply(textstat.text_standard)

## Readability
Credit: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline

In [None]:
def readability_measurements(passage: str):
    """
    Collect 24 measures for ``passage`` from ``readability.getmeasures``.

    The returned list is ordered as: 3 values from 'sentence info',
    9 from 'readability grades', 6 from 'word usage' and 6 from
    'sentence beginnings'.
    """
    measures = readability.getmeasures(passage, lang='en')

    # (section, keys) pairs listed in the exact order of the returned values.
    layout = (
        ('sentence info',
         ('characters_per_word', 'syll_per_word', 'words_per_sentence')),
        ('readability grades',
         ('Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex',
          'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex')),
        ('word usage',
         ('tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition',
          'nominalization')),
        ('sentence beginnings',
         ('pronoun', 'interrogative', 'article', 'subordination', 'conjunction',
          'preposition')),
    )

    return [measures[section][key] for section, keys in layout for key in keys]


In [None]:
# Column labels, in the same order as the list returned by readability_measurements.
readability_columns = [
    "chars_per_word", "syll_per_word", "words_per_sent",
    "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
    "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
    "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b",
]

# One list of measures per excerpt, turned into one row per excerpt.
readability_rows = df["excerpt"].apply(readability_measurements).tolist()
scores_df = pd.DataFrame(readability_rows, columns=readability_columns)

In [None]:
# Append the readability columns to df; re-running this cell appends them a second time.
df = pd.concat([df, scores_df], axis=1)

## Spacy
Credit: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline

In [None]:
def spacy_features(df: pd.DataFrame):
    """
    Build one spaCy document vector per excerpt using the en_core_web_lg model.

    Returns a numpy array with one row per entry of ``df.excerpt``.
    Resources credited by the original author:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners
    """
    nlp = spacy.load('en_core_web_lg')

    doc_vectors = []
    # disable_pipes() is called without any pipe names, as in the original code.
    with nlp.disable_pipes():
        for excerpt in df.excerpt:
            doc_vectors.append(nlp(excerpt).vector)

    return np.array(doc_vectors)

def get_spacy_col_names():
    """Return the 300 column names 'spacy_0' ... 'spacy_299'."""
    return [f"spacy_{idx}" for idx in range(300)]

In [None]:
# spacy_df = pd.DataFrame(spacy_features(df), columns=get_spacy_col_names())
# df = pd.concat([df, spacy_df], axis=1)

## Tags
Credit: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline

In [None]:
def pos_tag_features(passage: str):
    """
    Count how often each of 31 part-of-speech tags occurs in ``passage``.

    The passage is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the result is a list of counts in the order of ``pos_tags`` below. Tags that
    are not in that list are ignored.
    """
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

    tagged_tokens = pos_tag(word_tokenize(passage))
    # Keep only the tag of each (token, tag) pair.
    observed = [pair[1] for pair in tagged_tokens]

    return [observed.count(label) for label in pos_tags]

In [None]:
# Column labels, in the same order as the counts returned by pos_tag_features.
pos_columns = [
    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
    "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
    "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB",
]

# One list of tag counts per excerpt, appended to df as new columns.
pos_rows = df["excerpt"].apply(pos_tag_features).tolist()
pos_df = pd.DataFrame(pos_rows, columns=pos_columns)
df = pd.concat([df, pos_df], axis=1)

## Other
Credit: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline

In [None]:
def generate_other_features(passage: str):
    """
    Compute punctuation counts and simple length / vocabulary statistics.

    Parameters
    ----------
    passage : str
        Text to describe.

    Returns
    -------
    list
        ``[periods, commas, semis, exclaims, questions, num_char, num_words,
        unique_words, word_diversity, longest_word, avg_len_word]``.

    Notes
    -----
    ``num_words`` comes from ``textstat.lexicon_count`` while the other word
    statistics are based on splitting the passage on single spaces, so the two
    word notions can differ. ``word_diversity`` is ``0.0`` when ``num_words``
    is zero (e.g. an empty passage) instead of raising ``ZeroDivisionError``.
    """
    # punctuation count
    periods = passage.count(".")
    commas = passage.count(",")
    semis = passage.count(";")
    exclaims = passage.count("!")
    questions = passage.count("?")

    # Split once and reuse the tokens for every space-based statistic.
    tokens = passage.split(" ")

    # Some other stats
    num_char = len(passage)
    num_words = textstat.lexicon_count(passage)
    unique_words = len(set(tokens))
    # Guard the ratio: a passage without countable words would divide by zero.
    word_diversity = unique_words / num_words if num_words else 0.0

    word_len = [len(w) for w in tokens]
    longest_word = np.max(word_len)
    avg_len_word = np.mean(word_len)

    return [periods, commas, semis, exclaims, questions,
            num_char, num_words, unique_words, word_diversity,
            longest_word, avg_len_word]

In [None]:
# Column labels, in the same order as the list returned by generate_other_features.
other_columns = [
    "periods", "commas", "semis", "exclaims", "questions",
    "num_char", "num_words", "unique_words", "word_diversity",
    "longest_word", "avg_len_word",
]

# One list of statistics per excerpt, appended to df as new columns.
other_rows = df["excerpt"].apply(generate_other_features).tolist()
other_df = pd.DataFrame(other_rows, columns=other_columns)

df = pd.concat([df, other_df], axis=1)

## Line Breaks
Credit:
https://www.kaggle.com/ppjanka/2021-commonlitreadability-final

In [None]:
# df['n_linebreaks'] = df['excerpt'].str.split('\n').transform(len)
# df['mean_sentences_per_lineBreak'] = df['n_sentence'] / df['n_linebreaks']

# sentences_per_lineBreak = df.excerpt.str.split('\n').transform(lambda x : [len(y.split('.')) for y in x])
# df['min_sentences_per_lineBreak'] = sentences_per_lineBreak.apply(min)
# df['max_sentences_per_lineBreak'] = sentences_per_lineBreak.apply(max)
# df['std_sentences_per_lineBreak'] = sentences_per_lineBreak.apply(np.std)

# words_per_lineBreak = df.excerpt.str.split('\n').transform(lambda x : [len(y.split(' ')) for y in x])
# df['min_words_per_lineBreak'] = words_per_lineBreak.apply(min)
# df['max_words_per_lineBreak'] = words_per_lineBreak.apply(max)
# df['mean_words_per_lineBreak'] = words_per_lineBreak.apply(np.mean)
# df['std_words_per_lineBreak'] = words_per_lineBreak.apply(np.std)

## Sentence Transformer
Credit: https://www.kaggle.com/thedrcat/commonlit-what-are-we-reading-about

In [None]:
from sklearn import decomposition

# Excerpts to embed, taken as a column of df.
texts = df.loc[:, 'excerpt']

# Sentence-transformer model loaded from a local input directory.
model = SentenceTransformer('../input/sentence-transformer-models/stsb-roberta-large')
embeddings = model.encode(texts)

# Fit a PCA on the embeddings; n_components is not set here, the later cell
# keeps only the first `max_embs` transformed columns.
pca = decomposition.PCA(random_state=SEED)
pca.fit(embeddings)

In [None]:
# Number of leading PCA components kept as features.
max_embs = 10

# Project the embeddings with the fitted PCA and keep the first `max_embs` columns.
embeddings_ = pca.transform(embeddings)
emb_columns = [f'embedding_{i}' for i in range(max_embs)]
emb_df = pd.DataFrame(embeddings_[:, :max_embs], columns=emb_columns)

df = pd.concat([df, emb_df], axis=1)

## OOF Target Prediction

In [None]:
# Names of the eight out-of-fold prediction columns, oof_0 ... oof_7.
cols_oof = [f'oof_{i}' for i in range(8)]
# Those columns selected from df; used for the row-wise OOF statistics.
oofs = df.loc[:, cols_oof]

In [None]:
# Row-wise (axis=1) summary statistics over the eight OOF prediction columns.
df['oof_mean'] = np.mean(oofs, axis=1)
df['oof_std'] = np.std(oofs, axis=1)
df['oof_max'] = np.max(oofs, axis=1)
df['oof_min'] = np.min(oofs, axis=1)
# Spread between the largest and smallest prediction of a row.
df['oof_range'] = df['oof_max'] - df['oof_min']
# Ratio of the two spread measures; undefined for a row whose range is 0.
df['oof_std_over_range'] = df['oof_std'] / df['oof_range']

## Additional Features

In [None]:
# Ratios built from columns created in the textstat and "Other" feature cells.
df['unique_words_per_sentence'] = df['unique_words'] / df['n_sentence']
df['longest_over_avg_word'] = df['longest_word'] / df['avg_len_word']
# Row-wise shape statistics of the eight OOF predictions.
# NOTE(review): only `import scipy` appears in the import cell; this presumably works
# because another import loads scipy.stats — confirm on a fresh kernel.
df['oof_skew'] = scipy.stats.skew(oofs, axis=1)
df['oof_kurtosis'] = scipy.stats.kurtosis(oofs, axis=1)

## Feature Selection

In [None]:
# Feature-group lists; each list holds column names of df.
fe_pred = ['standard_error', 'target']  # wish: add the stacking OOF to the features as the target

# fe_oof = [f'oof_{i}' for i in range(8)] + ['oof_stacking'] + ['oof_mean', 'oof_std', 'oof_max', 'oof_min', 'oof_range']
# remove oof_stacking for avoiding leakage
fe_oof = [f'oof_{i}' for i in range(8)] + ['oof_mean', 'oof_std', 'oof_max', 'oof_min', 'oof_range', 'oof_std_over_range']
fe_text = ['n_sentence', #  'n_words', 'n_unique_words', 'mean_word_len', 'max_word_len', 
           'n_syllable', 'flesch_reading_ease', 'flesch_kincaid_grade', 'smog_index', 
           'automated_readability_index', 'coleman_liau_index', 'linsear_write_formula']

# Same names, in the same order, as the columns of scores_df.
fe_read = ["chars_per_word", "syll_per_word", "words_per_sent",
                                          "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
                                          "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
                                          "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"]

# The cell that would add the spacy_* columns to df is commented out in this notebook.
fe_spacy = get_spacy_col_names()

# Same names, in the same order, as the columns of pos_df.
fe_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
                                       "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                                       "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

# Same names, in the same order, as the columns of other_df.
fe_other = ["periods", "commas", "semis", "exclaims", "questions",
                                         "num_char", "num_words", "unique_words", "word_diversity",
                                         "longest_word", "avg_len_word"]

# The cell that would create these line-break columns is commented out in this notebook.
fe_lb = ['n_linebreaks', 'mean_sentences_per_lineBreak', 'min_sentences_per_lineBreak', 'max_sentences_per_lineBreak', 'std_sentences_per_lineBreak', 
         'min_words_per_lineBreak', 'max_words_per_lineBreak', 'mean_words_per_lineBreak', 'std_words_per_lineBreak']

fe_emb = [f'embedding_{i}' for i in range(max_embs)]

fe_add = ['unique_words_per_sentence', 'longest_over_avg_word', 'oof_skew', 'oof_kurtosis']

# Columns used by the VIF table and the first heatmap; fe is reassigned in later cells.
fe = fe_pred + fe_text #+ fe_oof

# VIF Calculation

In [None]:
# Variance inflation factor of every column currently listed in `fe`.
# The design matrix is extracted once and reused for each column index.
vif_inputs = df.loc[:, fe].values
vif_df = pd.DataFrame({
    'feature': fe,
    'vif': [vif(vif_inputs, col_idx) for col_idx in range(len(fe))],
})

vif_df

In [None]:
# fe_text
# Correlation heatmap of the columns currently in `fe`
# (fe_pred + fe_text when the notebook is run top to bottom).
fig, ax = plt.subplots(figsize=(12, 10))
corr_fe = df.loc[:, fe].corr()
sns.heatmap(data=corr_fe, annot=True, ax=ax)
plt.show()

In [None]:
# fe_oof
# Correlation heatmap: fe_pred together with the OOF columns (fe_oof).
fig, ax = plt.subplots(figsize=(14, 10))
corr_oof = df.loc[:, fe_pred + fe_oof].corr()
sns.heatmap(data=corr_oof, annot=True, ax=ax)
plt.show()

In [None]:
# fe_read
# Correlation heatmap: fe_pred together with the readability columns (fe_read).
fig, ax = plt.subplots(figsize=(16, 10))
corr_read = df.loc[:, fe_pred + fe_read].corr()
sns.heatmap(data=corr_read, annot=True, ax=ax)
plt.show()

In [None]:
# fe_tags
# Correlation heatmap: fe_pred together with the POS-tag count columns (fe_tags).
fig, ax = plt.subplots(figsize=(16, 10))
corr_tags = df.loc[:, fe_pred + fe_tags].corr()
sns.heatmap(data=corr_tags, annot=True, ax=ax)
plt.show()

In [None]:
# fe_other
# Correlation heatmap: fe_pred together with fe_other and fe_add.
fig, ax = plt.subplots(figsize=(16, 10))
corr_other = df.loc[:, fe_pred + fe_other + fe_add].corr()
sns.heatmap(data=corr_other, annot=True, ax=ax)
plt.show()

In [None]:
# fe_emb
# Correlation heatmap: fe_pred together with the PCA embedding columns (fe_emb).
fig, ax = plt.subplots(figsize=(16, 10))
corr_emb = df.loc[:, fe_pred + fe_emb].corr()
sns.heatmap(data=corr_emb, annot=True, ax=ax)
plt.show()

# LGBM Training / Prediction for standard_error

In [None]:
import optuna.integration.lightgbm as lgb
import lightgbm as lgb_original

In [None]:
def show_importance(fe, opt):
    """Plot a horizontal bar chart of gain-based feature importances.

    Parameters
    ----------
    fe : list
        Feature names, one per importance value.
    opt : object
        Anything exposing ``feature_importance(importance_type='gain')``.
    """
    importance_df = pd.DataFrame({
        'feature': fe,
        'importance': opt.feature_importance(importance_type='gain'),
    })

    plt.figure(figsize=(14, 16))
    sns.barplot(data=importance_df, y='feature', x='importance')
    plt.title('Feature Importance')
    plt.show()
    
def RMSE_(y_pred, y_gt):
    """Return the square root of ``mean_squared_error(y_pred, y_gt)``.

    The two arguments are forwarded to ``mean_squared_error`` in the order given.
    """
    return np.sqrt(mean_squared_error(y_pred, y_gt))

In [None]:
class OptLGBM:
    """Per-fold LightGBM tuning and refitting on the columns of a DataFrame.

    ``optimize`` trains one model per fold through ``lgb.train`` (``lgb`` is the
    ``optuna.integration.lightgbm`` alias imported in this notebook) and stores
    the resulting scores, parameters and best iterations. ``retrain`` then fits
    one plain LightGBM model per fold with the stored parameters, saves it to
    disk and writes its predictions for the held-out fold into ``df``.

    The DataFrame passed in is stored by reference and modified in place: the
    column ``'opt_fe_oof'`` and the column named by ``retrain(col=...)`` are
    added to it. It must contain a ``fold`` column; folds 0-4 are iterated.
    """

    def __init__(self, df, fe, opt_fe='standard_error'):
        """Store the inputs and add an empty ``'opt_fe_oof'`` column to ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Data holding the feature columns, the target column and ``fold``.
        fe : list
            Names of the feature columns.
        opt_fe : str
            Name of the column to predict.
        """
        self.df = df
        self.fe = fe
        self.opt_fe = opt_fe
        
        # One entry is appended per fold by optimize().
        self.best_scores = []
        self.best_params = []
        self.best_models = []
        
        # Filled fold by fold with the validation predictions from optimize().
        self.df['opt_fe_oof'] = np.nan
         
    def optimize(self, num_boost_round=1000, early_stopping_rounds=500):
        """Train one model per fold and record its score, params and best iteration.

        For each fold, the rows of the other folds form the training set and the
        fold's own rows form the validation set, which is also used for early
        stopping. Validation predictions are written to ``df['opt_fe_oof']`` and
        a feature-importance plot is shown for every fold. Prints the per-fold
        RMSE and the RMSE over all rows.
        """
        for fold in range(5):
            train_df = self.df.loc[self.df.fold!=fold, self.fe + [self.opt_fe]].reset_index(drop=True)
            val_df = self.df.loc[self.df.fold==fold, self.fe + [self.opt_fe]].reset_index(drop=True)

            train_ds = lgb.Dataset(train_df.loc[:, self.fe], train_df.loc[:, self.opt_fe])
            val_ds = lgb.Dataset(val_df.loc[:, self.fe], val_df.loc[:, self.opt_fe])

            params = {
                'objective': 'regression',
                'metric': 'rmse',
                'verbosity': -1,
                'boosting_type': 'gbdt',
                'seed': SEED,
#                 'device': 'gpu'
            }

            opt = lgb.train(params, 
                            train_ds, 
                            valid_sets=val_ds, 
                            num_boost_round=num_boost_round, 
                            verbose_eval=False, 
                            early_stopping_rounds=early_stopping_rounds,
                            show_progress_bar=False)

            print(f'fold {fold}: ', opt.best_score['valid_0']['rmse'])
            self.best_scores.append(opt.best_score['valid_0']['rmse'])
            self.best_params.append(opt.params)
            # Despite the attribute name, this stores the best iteration number, not a model.
            self.best_models.append(opt.best_iteration)
            
            preds = opt.predict(val_df.loc[:, self.fe], num_iteration=opt.best_iteration)
            self.df.loc[self.df.fold==fold, 'opt_fe_oof'] = preds
            
            self._show_importance(opt)
            
        print('CV Score: ', RMSE_(self.df[self.opt_fe], self.df['opt_fe_oof']))
        
    def _show_importance(self, opt):
        """Plot the feature importances of ``opt`` for this object's feature list."""
        show_importance(fe=self.fe, opt=opt)
        
    def retrain(self, col='se'):
        """Refit one model per fold with the stored params, save it and predict the fold.

        Requires ``optimize`` to have been run first, because ``best_params`` is
        indexed by fold. Each model is saved as ``lgb_{col}_fold{fold}.pkl`` and its
        predictions for the held-out fold are written to ``df[col]``. Prints the
        RMSE over all rows.

        NOTE(review): ``lgb_original.train`` is called without ``num_boost_round``
        and ``best_models`` (the best iterations) is not read here — confirm the
        number of boosting rounds used is the intended one.
        """
        self.df[col] = np.nan
        
        for fold in range(5):
            train_df = self.df.loc[self.df.fold!=fold, self.fe + [self.opt_fe]].reset_index(drop=True)
            val_df = self.df.loc[self.df.fold==fold, self.fe + [self.opt_fe]].reset_index(drop=True)

            train_ds = lgb.Dataset(train_df.loc[:, self.fe], train_df.loc[:, self.opt_fe])
            val_ds = lgb.Dataset(val_df.loc[:, self.fe], val_df.loc[:, self.opt_fe])
            
            params = self.best_params[fold]
            
            model = lgb_original.train(params, 
                            train_ds, 
                            valid_sets=val_ds, 
                            verbose_eval=False)
            
            # NOTE(review): written with save_model, so presumably a LightGBM model
            # file rather than a pickle despite the .pkl suffix — confirm.
            model.save_model(f'lgb_{col}_fold{fold}.pkl')
            self.df.loc[self.df.fold==fold, col] = model.predict(val_df.loc[:, self.fe])
            
        print('CV Score (retrained): ', RMSE_(self.df[self.opt_fe], self.df[col]))

## Optimize / Retrain

In [None]:
# Features for the standard_error model: all engineered groups except fe_pred, fe_spacy and fe_lb.
fe = fe_text + fe_oof + fe_read + fe_tags + fe_other + fe_add + fe_emb

# Tune per fold, then refit with the stored parameters; retrain writes its
# held-out predictions into df['se'] (df is modified in place).
opt = OptLGBM(df=df, fe=fe, opt_fe='standard_error')
opt.optimize(num_boost_round=1500, early_stopping_rounds=500)
opt.retrain(col='se')

In [None]:
def oof_vs_target(df, x='standard_error', y='oof', line_range=(0.4, 0.7)):
    """Scatter ``df[y]`` against ``df[x]``, coloured by fold, with a ``y = x`` line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``x`` and ``y`` and a ``fold`` column.
    x, y : str
        Columns plotted on the horizontal and vertical axis.
    line_range : tuple of (float, float) or None, default (0.4, 0.7)
        End points of the ``y = x`` reference line. The default keeps the span
        that was previously hard-coded. Pass ``None`` to derive the span from
        the smallest and largest value found in the two plotted columns.
    """
    if line_range is None:
        # Cover everything that is plotted on either axis.
        low = min(df[x].min(), df[y].min())
        high = max(df[x].max(), df[y].max())
    else:
        low, high = line_range

    # Points of the identity line.
    temp_df = pd.DataFrame()
    temp_df['x'] = np.linspace(low, high, 10)
    temp_df['y'] = temp_df['x']

    plt.figure(figsize=(8, 8))
    sns.scatterplot(data=df, x=x, y=y, label=f'{y} vs target', hue='fold', palette='bright')
    sns.lineplot(data=temp_df, x='x', y='y', color='orange')
    plt.title('OOF Prediction vs Target')
    plt.legend()
    plt.show()

In [None]:
# Held-out 'se' predictions (vertical axis) against the true standard_error.
oof_vs_target(opt.df, x='standard_error', y='se')

In [None]:
# Held-out 'se' predictions (vertical axis) against the 'oof_stacking' column.
oof_vs_target(opt.df, x='oof_stacking', y='se')

In [None]:
# True standard_error (vertical axis) against the true target.
oof_vs_target(opt.df, x='target', y='standard_error')

# Stacking by LGBM for Target

In [None]:
# remove fe_text, fe_add
fe = fe_oof + fe_read + fe_tags + fe_other + ['se']

opt = OptLGBM(df=df, fe=fe, opt_fe='target')
opt.optimize(num_boost_round=1500)
opt.retrain(col='oof_with_se')

# Stacking with BERTs + LGBM

In [None]:
from sklearn import linear_model

# One entry per base model. In the cells visible here only 'model_type' is read,
# to select the matching oof_<model_type> column of df.
CV_PATHS = [
    {'model_type': 0,
     'path': '../input/clrp-robertalarge-attentions-mask-act',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 1,
     'path': '../input/clrp-robertalarge-conv1d-attentions-mask',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 2,
     'path': '../input/clrp-robertabase-from-colab',
     'model_name': 'CLRPModelColab'},
    {'model_type': 3,
     'path': '../input/clrp-electralarge-attentions-mask-act',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 4,
     'path': '../input/clrp-xlnetlarge-attentions-mask',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 5,
     'path': '../input/clrp-electralarge-attentions-conv1d',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 6,
     'path': '../input/clrp-robertalarge-meanpooling',
     'model_name': 'CLRPModelLarge'},
    {'model_type': 7,
     'path': '../input/clrp-funnellarge-attentions-act',
     'model_name': 'CLRPModelLarge'},
]

In [None]:
# Reload the per-fold boosters written by opt.retrain(col='oof_with_se') and
# predict each fold's own rows into df['oof_lgb'].
df['oof_lgb'] = np.nan
    
for fold in range(5):
    filename = f'lgb_oof_with_se_fold{fold}.pkl'
    # `lgb` is the optuna.integration.lightgbm alias imported earlier in this notebook.
    model = lgb.Booster(model_file=filename)
    # `fe` is the global feature list set in the target-stacking cell (it includes 'se').
    df.loc[df.fold==fold, 'oof_lgb'] = model.predict(df.loc[df.fold==fold, fe])

In [None]:
targets = df['target'].values

# Stacking inputs: one OOF column per entry of CV_PATHS, followed by the LGBM OOF column.
stack_columns = [f"oof_{cv_path['model_type']}" for cv_path in CV_PATHS] + ['oof_lgb']

# Shape (n_inputs, n_rows); transposed when handed to the linear model.
features = np.array([df[column].values for column in stack_columns])

# Stacking by Linear Model. The model is fitted on all rows, and the score
# printed below is computed on those same rows.
lm = linear_model.LinearRegression(fit_intercept=True)
lm.fit(features.T, targets)

# Rebuild the prediction as a weighted sum of the input columns plus the intercept.
df['oof_final'] = 0.0
for weight, column in zip(lm.coef_, stack_columns):
    df['oof_final'] += weight * df[column]
df['oof_final'] += lm.intercept_

print('CV score (stacking): ', RMSE_(df['target'], df['oof_final']))
oof_vs_target(df, x='target', y='oof_final')
print("model weight: ", lm.coef_)
print("bias: ", lm.intercept_)

In [None]:
# Write df, including every feature and prediction column added above, without the index.
df.to_csv('oof_df.csv', index=False)