# CommonLit Random Forest Baseline - Advanced Features - Training

- I call this an "advanced features" baseline because:
    - Takes some features from: 
        - https://www.kaggle.com/konradb/linear-baseline-with-cv
        - https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline?scriptVersionId=61834534
        - https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
        - Amazing work by:
            - https://www.kaggle.com/konradb
            - https://www.kaggle.com/anaverageengineer
            - https://www.kaggle.com/ruchi798
            - https://www.kaggle.com/ravishah1
    - Includes additional features
        - https://medium.com/analytics-vidhya/visualising-text-complexity-with-readability-formulas-c86474efc730
        - TF-IDF
        - LDA
        - Tag Count
        - Difficult Word features using data from Sagemaker Ground Truth labeling job
            - https://www.kaggle.com/yeayates21/commonlit-train-test-vocab
            - https://www.kaggle.com/yeayates21/commonlit-sagemaker-ground-truth-data-eda
    - Wraps everything in a sklearn custom transformer
- This is not the be-all and end-all of advanced features.  This is just 1 version.
- Uses XGBoost (`XGBRegressor`) as the regressor, even though the title says Random Forest.
- For a similar but thinner pipeline without advanced features, see https://www.kaggle.com/yeayates21/commonlit-random-forest-na-ve-baseline

# Imports

In [None]:
import pandas as pd
import numpy as np
import re
import os
import gc
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, truncnorm, randint
import cloudpickle
import spacy
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk import pos_tag
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Training data; later cells read its `excerpt` and `target` columns.
train_csv_path = "../input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Additional training data; its `text` column is renamed to `excerpt` so it
# can be stacked with the competition data in a later cell.
df2 = pd.read_csv("../input/mjycommonlitdata/bbc_commonlit_ssl.csv").rename(
    columns={"text": "excerpt"}
)
df2.head()

# Feature Engineering Transformer

In [None]:
class CustomFeatureEngineeringTransfomer(BaseEstimator, TransformerMixin):
    """Turn raw text excerpts into a numeric feature table.

    ``fit`` learns a TF-IDF vocabulary and an LDA topic model on the training
    texts.  ``transform`` returns one row per text containing, in this order:
    hand-crafted readability / punctuation / POS-tag / word-difficulty
    features, the spaCy document vector (``f0``, ``f1``, ...), the TF-IDF
    weights (one column per vocabulary term) and the LDA topic weights
    (``LDA0``, ``LDA1``, ...).

    Parameters
    ----------
    tfidf_stop_words, tfidf_max_df, tfidf_min_df, tfidf_max_features
        Forwarded to ``TfidfVectorizer`` under the names ``stop_words``,
        ``max_df``, ``min_df`` and ``max_features``.
    tfidf_ngram_max : int
        Upper bound of the n-gram range; the range used is
        ``(1, tfidf_ngram_max)``.
    vloc : str
        Path of the word-frequency CSV (columns ``word`` and ``count``).
        An empty string selects the default Kaggle input path.
    ldanc : int
        Number of LDA components.
    smgtloc : str
        Path of the word-difficulty CSV (columns ``source``,
        ``Easy_Word_Conf``, ``Modr_Word_Conf``, ``Diff_Word_Conf``,
        ``NotW_Word_Conf``).  An empty string selects the default path.
    """

    # Data locations used when the matching constructor argument is empty.
    _DEFAULT_VLOC = "../input/english-word-frequency/unigram_freq.csv"
    _DEFAULT_SMGTLOC = "../input/mjycommonlitdata/CommonLit_SagemakerGroundTruth_WordDifficulty.csv"

    # POS tags that each get their own count column.
    _TAG_COLS = ('UH', ')', 'RP', 'WP', "''", 'POS', '.', 'NN', 'WRB', ',', '$', 'TO', ':', 'JJ', 'VB', 'VBN', 'RBS', 'NNS',
                 'NNP', 'PDT', 'JJS', 'NNPS', 'IN', 'CD', 'VBP', '(', 'PRP$', 'WDT', 'RBR', 'WP$', 'VBD', 'JJR', 'FW', 'RB',
                 'CC', 'DT', 'PRP', 'VBZ', 'MD', 'EX', 'VBG')

    # (output column, character counted) for the punctuation features.
    _PUNCT_COLS = (('nof_questions', '?'), ('nof_exclamations', '!'), ('nof_colons', ':'),
                   ('nof_semicolons', ';'), ('nof_commas', ','), ('nof_hyphens', '-'))

    # (short name used in count/ratio columns, average-score column, CSV column)
    _SGT_LEVELS = (('easy', 'EasyWordScore', 'Easy_Word_Conf'),
                   ('modr', 'ModrWordScore', 'Modr_Word_Conf'),
                   ('diff', 'DiffWordScore', 'Diff_Word_Conf'),
                   ('notw', 'NotwWordScore', 'NotW_Word_Conf'))

    def __init__(self, tfidf_stop_words='english', tfidf_ngram_max=3, tfidf_max_df=0.9, tfidf_min_df=0.01,
                 tfidf_max_features=1000, vloc='', ldanc=30, smgtloc=''):
        super().__init__()
        self.tfidf_stop_words = tfidf_stop_words
        self.tfidf_ngram_max = tfidf_ngram_max
        self.tfidf_max_df = tfidf_max_df
        self.tfidf_min_df = tfidf_min_df
        self.tfidf_max_features = tfidf_max_features
        # Honour caller-supplied paths; fall back to the Kaggle defaults otherwise.
        self.vloc = vloc if vloc else self._DEFAULT_VLOC
        self.smgtloc = smgtloc if smgtloc else self._DEFAULT_SMGTLOC
        self.ldanc = ldanc
        # Unfitted placeholders; fit() rebuilds both from the current parameters.
        self.tv = self._make_vectorizer()
        self.lda = self._make_lda()

    def _make_vectorizer(self):
        """Build an unfitted TfidfVectorizer from the current tfidf_* parameters."""
        return TfidfVectorizer(stop_words=self.tfidf_stop_words, ngram_range=(1, self.tfidf_ngram_max),
                               max_df=self.tfidf_max_df, min_df=self.tfidf_min_df,
                               max_features=self.tfidf_max_features)

    def _make_lda(self):
        """Build an unfitted LatentDirichletAllocation with ``ldanc`` components."""
        return LatentDirichletAllocation(n_components=self.ldanc)

    def _tfidf_feature_names(self):
        """Return the fitted TF-IDF vocabulary terms in column order."""
        # get_feature_names() was replaced by get_feature_names_out() in newer
        # scikit-learn releases; support whichever one is available.
        if hasattr(self.tv, 'get_feature_names_out'):
            return list(self.tv.get_feature_names_out())
        return list(self.tv.get_feature_names())

    @staticmethod
    def _syllable_count(word):
        """Estimate syllables as the number of vowel groups in ``word``.

        Heuristic from
        https://stackoverflow.com/questions/46759492/syllable-count-in-python
        Returns 0 for an empty string, a single space or ``None``; otherwise
        at least 1.
        """
        if not word or word == ' ':
            return 0
        word = word.lower()
        vowels = "aeiouy"
        count = 1 if word[0] in vowels else 0
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        # A trailing "e" is treated as silent.  This adjustment is applied once,
        # after the scan (it used to run inside the loop, cancelling every
        # vowel group of a word that ends in "e").
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count = 1
        return count

    @staticmethod
    def _count_tags(tagged_words):
        """Return ``{tag: occurrences}`` for a list of ``(word, tag)`` pairs."""
        tag_count = {}
        for _word, tag in tagged_words:
            tag_count[tag] = tag_count.get(tag, 0) + 1
        return tag_count

    @staticmethod
    def _sgt_txt_cln(text):
        """Split ``text`` on non-word characters and drop tokens containing digits."""
        return [w for w in re.split(r'\W+', text) if not any(c.isdigit() for c in w)]

    def fit(self, X, y=None):
        """Fit the TF-IDF vectorizer and the LDA model on the texts in ``X``.

        ``y`` is ignored.  Returns ``self``.
        """
        # set_params() (used by the hyper-parameter search) only updates the
        # plain attributes, so the models must be rebuilt here for the tfidf_*
        # and ldanc values to take effect.
        self.tv = self._make_vectorizer()
        self.lda = self._make_lda()
        self.tv.fit(X)
        self.lda.fit(self.tv.transform(X))
        return self

    def transform(self, X, y=None):
        """Compute the feature table for the texts in ``X``.

        ``X`` is an iterable of strings (array, Series or list); ``y`` is
        ignored.  Returns a DataFrame with one row per text on a fresh
        0..n-1 index.  Infinite values caused by zero denominators (for
        example an excerpt without any '.') are returned as NaN.
        """
        # Work on a plain list so every block below shares the same positional
        # index, whatever index X carried.
        texts = list(X)
        fedf = pd.DataFrame({'text': texts})
        text = fedf['text']
        words = [t.split(' ') for t in texts]
        word_lens = [[len(w) for w in ws] for ws in words]

        # ---- basic counts -------------------------------------------------
        fedf['nof_char'] = [len(t) for t in texts]
        fedf['nof_words'] = [len(ws) for ws in words]
        fedf['w2c'] = fedf['nof_words'] / fedf['nof_char']
        # sentences are approximated by the number of periods
        fedf['nof_sentences'] = [t.count('.') for t in texts]
        fedf['nof_quotes'] = [t.count('"') / 2 for t in texts]
        # the heuristic is applied to the whole excerpt as a single string
        fedf['nof_syllables'] = [self._syllable_count(t) for t in texts]
        # "polysyllabic" here means two or more estimated syllables
        fedf['nof_polysyllabic'] = [sum(1 for w in ws if self._syllable_count(w) >= 2) for ws in words]
        fedf['nof_pauses'] = [t.count(',') for t in texts]

        # ---- readability formulas -----------------------------------------
        words_per_sentence = fedf['nof_words'] / fedf['nof_sentences']
        syllables_per_word = fedf['nof_syllables'] / fedf['nof_words']

        # Flesch reading ease
        # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        fedf['fsa'] = 206.835 - 1.015 * words_per_sentence
        fedf['fsb'] = -84.6 * syllables_per_word
        fedf['fleisch_score'] = fedf['fsa'] + fedf['fsb']

        # Flesch-Kincaid grade level
        fedf['fsa2'] = words_per_sentence
        fedf['fsb2'] = syllables_per_word
        fedf['fleisch_score2'] = 0.39 * fedf['fsa2'] + 11.8 * fedf['fsb2'] - 15.59

        # Automated Readability Index (the constant is subtracted in the formula)
        fedf['auto_read_idx'] = 4.71 * (fedf['nof_char'] / fedf['nof_words']) + 0.5 * fedf['fsa2'] - 21.43

        # Smog Index
        fedf['smog_idx'] = 1.0430 * np.sqrt(fedf['nof_polysyllabic'] * (30 / fedf['nof_sentences'])) + 3.1291

        # ---- word size statistics -----------------------------------------
        fedf['nof_unique_words'] = [len(set(ws)) for ws in words]
        fedf['max_word_size'] = [max(ls) for ls in word_lens]
        fedf['avg_word_size'] = [np.mean(ls) for ls in word_lens]
        fedf['med_word_size'] = [np.median(ls) for ls in word_lens]
        fedf['txt_diversity'] = fedf['nof_unique_words'] / fedf['nof_words']

        # ---- spaCy document vectors ---------------------------------------
        nlp = spacy.load('en_core_web_lg')
        # NOTE(review): disable_pipes() is called without pipe names, as in the
        # original code -- confirm which components this disables for the
        # installed spaCy version.
        with nlp.disable_pipes():
            train_vectors = np.array([nlp(t).vector for t in texts])
        spdf = pd.DataFrame(train_vectors)
        spdf.columns = ['f' + str(ii) for ii in range(train_vectors.shape[1])]

        # ---- vocabulary complexity ----------------------------------------
        # (low values mean more complex words, high values mean more commonly
        # used English words)
        easy_vocab = pd.read_csv(self.vloc)
        total_count = easy_vocab['count'].sum()
        word_freq = easy_vocab.set_index('word')['count'].to_dict()
        sub_val = 1  # frequency assumed for words missing from the list
        freq_per_text = [
            [word_freq.get(w, sub_val) for w in re.sub(r'([^a-zA-Z ]+?)', '', t).lower().split(' ')]
            for t in texts
        ]
        fedf['vocab_complexity1'] = [sum(freqs) for freqs in freq_per_text]
        # normalized by number of total words in text and total count in 'easy_vocab'
        fedf['vocab_complexity1_norm'] = fedf['vocab_complexity1'] / (fedf['nof_words'] + total_count)

        # a word counts as complex when its corpus frequency is below this threshold
        fedf['nof_complex_words'] = [sum(1 for f in freqs if f < 369180) for freqs in freq_per_text]

        # Gunning Fog
        fedf['gunning_fog'] = 0.4 * (words_per_sentence + 100 * (fedf['nof_complex_words'] / fedf['nof_words']))

        # proper noun estimate: upper-case characters minus sentence count
        # (we assume each sentence starts with a capital letter)
        fedf['proper_noun_est'] = text.map(lambda t: sum(1 for c in t if c.isupper())) - fedf['nof_sentences']

        # ---- punctuation counts -------------------------------------------
        for col, char in self._PUNCT_COLS:
            fedf[col] = [t.count(char) for t in texts]

        # ---- sentiment and noun phrases (TextBlob) ------------------------
        polarity, subjectivity, noun_phrases = [], [], []
        for t in texts:
            blob = TextBlob(t)
            sentiment = blob.sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
            noun_phrases.append(len(blob.noun_phrases))
        fedf['polarity'] = polarity
        fedf['subjectivity'] = subjectivity
        fedf['nof_noun_phrases'] = noun_phrases

        # ---- POS tag counts -----------------------------------------------
        tag_counts = [self._count_tags(pos_tag(t.split())) for t in texts]
        for tag in self._TAG_COLS:
            fedf[tag] = [tc.get(tag, 0) for tc in tag_counts]

        # ---- Sagemaker Ground Truth word-difficulty features --------------
        sgt = pd.read_csv(self.smgtloc).set_index('source')
        conf_tables = [sgt[csv_col].to_dict() for _, _, csv_col in self._SGT_LEVELS]
        n_levels = len(conf_tables)
        # One (n_words, n_levels) array of confidences per text; unknown words
        # get 0 for every level.
        word_conf = [
            np.array([[tbl.get(w, 0) for tbl in conf_tables] for w in self._sgt_txt_cln(t)],
                     dtype=float).reshape(-1, n_levels)
            for t in texts
        ]

        # average confidence per level
        for k, (_, score_col, _) in enumerate(self._SGT_LEVELS):
            fedf[score_col] = [conf[:, k].mean() for conf in word_conf]

        # number of words whose confidence for the level exceeds 0.9
        for k, (short, _, _) in enumerate(self._SGT_LEVELS):
            fedf['nof_%s_words' % short] = [int((conf[:, k] > .9).sum()) for conf in word_conf]

        # version 2: each word is assigned to its highest-confidence level
        level_counts = [np.bincount(conf.argmax(axis=1), minlength=n_levels) for conf in word_conf]
        for k, (short, _, _) in enumerate(self._SGT_LEVELS):
            count_col = 'nof_%s_words_v2' % short
            fedf[count_col] = [int(counts[k]) for counts in level_counts]
            fedf['%s_words_v2_ratio' % short] = fedf[count_col] / fedf['nof_words']

        # complex language v1
        fedf['complex_lang_v1'] = fedf['nof_complex_words'] / fedf['nof_words']

        # straight avg complex language
        fedf['strgt_avg_complex_lang'] = fedf['diff_words_v2_ratio'] * 0.5 + fedf['complex_lang_v1'] * 0.5

        # ---- sentence statistics (by words and by characters) -------------
        sentences = [t.split('.') for t in texts]
        sent_word_lens = [[len(s.split(' ')) for s in ss] for ss in sentences]
        sent_char_lens = [[len(s) for s in ss] for ss in sentences]
        fedf['avg_sent_wrd_len'] = [np.mean(ls) for ls in sent_word_lens]
        fedf['max_sent_wrd_len'] = [np.max(ls) for ls in sent_word_lens]
        fedf['avg_sent_str_len'] = [np.mean(ls) for ls in sent_char_lens]
        fedf['max_sent_str_len'] = [np.max(ls) for ls in sent_char_lens]

        # ---- tfidf ----------------------------------------------------------
        tv_res = self.tv.transform(texts)
        tvdf = pd.DataFrame(tv_res.toarray(), columns=self._tfidf_feature_names())

        # ---- lda ------------------------------------------------------------
        lda_res = self.lda.transform(tv_res)
        ladf = pd.DataFrame(lda_res, columns=["LDA" + str(x) for x in range(lda_res.shape[1])])

        # ---- final join -----------------------------------------------------
        # Drop the raw text before joining so a TF-IDF term that happens to be
        # called "text" is not removed along with it.
        out = pd.concat([fedf.drop(columns=['text']), spdf, tvdf, ladf], axis=1)
        # Zero denominators above yield +/-inf; expose them as missing values.
        return out.replace([np.inf, -np.inf], np.nan)

# Train Validation Split

In [None]:
# Hold out 25% of the competition data for validation; the rest is stacked
# with the external data to form a single training set.
df1, valid = train_test_split(df, test_size=0.25, random_state=42)

model_cols = ['excerpt', 'target']
train = pd.concat([df1[model_cols], df2[model_cols]])

# plain numpy arrays of texts and targets
x_train, y_train = train['excerpt'].values, train['target'].values
x_valid, y_valid = valid['excerpt'].values, valid['target'].values

# Hyperparameter Search Pipeline

In [None]:
%%time

cfet = CustomFeatureEngineeringTransfomer(tfidf_stop_words='english')
xgb = XGBRegressor()
pipe = Pipeline([('cfet', cfet), ('xgb', xgb)])
distributions = {'cfet__tfidf_max_df': uniform(.6, .05),
                 'cfet__tfidf_min_df': uniform(0.02, .3),
                 'cfet__tfidf_max_features': randint(10,600),
                 'cfet__tfidf_ngram_max': [3],
                 'cfet__ldanc': randint(5,75),
                 'xgb__n_estimators': randint(5,400),
                 'xgb__max_depth': randint(1,5)}
reg = RandomizedSearchCV(pipe, distributions, random_state=4582, n_iter=32, cv=3, 
                         scoring='neg_mean_squared_error', n_jobs=-1, verbose=2, error_score='raise')
search = reg.fit(x_train, y_train) # best model is search.best_estimator_

# Save Model

In [None]:
# Serialize the best pipeline; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("comlit_xgb_model6.pkl", "wb") as model_file:
    cloudpickle.dump(search.best_estimator_, model_file)

# Results

#### Best Score and Parameter Set

In [None]:
# The search was scored with 'neg_mean_squared_error', so best_score_ holds the
# negated MSE; flip the sign so the value reported as "mse" is the actual MSE.
best_cv_mse = -search.best_score_
print("Best model scored mse of {} using the following hyperparameters: {}".format(best_cv_mse, search.best_params_))
print("")
print("CV RMSE: ", np.sqrt(best_cv_mse))
print("")

#### Holdout Score

In [None]:
# Score the best pipeline on the held-out validation split.
best_pipeline = search.best_estimator_
validation_predictions = best_pipeline.predict(x_valid)
holdout_mse = mean_squared_error(y_valid, validation_predictions)
print("Holdout RMSE: ", np.sqrt(holdout_mse))
print("")

# Feature Importance

In [None]:
# Pair each importance with the column it belongs to.  The names must come
# from the transformer that was fitted inside the best pipeline: a freshly
# fitted one (different data, default LDA size) can produce a different set
# and number of TF-IDF / LDA columns than the model was trained on.
best_pipeline = search.best_estimator_
importances = best_pipeline.named_steps['xgb'].feature_importances_
cfet = best_pipeline.named_steps['cfet']

# A single row is enough to reveal the output column layout.
features = list(cfet.transform(df['excerpt'].values[:1]).columns)
if len(features) != len(importances):
    raise ValueError("Got {} feature names for {} importances".format(len(features), len(importances)))

fidf = pd.DataFrame({'feature': features, 'importance': importances})
fidf.sort_values(by=['importance'], ascending=False, inplace=True)
fidf.head(20)

# Inference

(just for QA, we'll run inference in another notebook for private LB speed)

In [None]:
# Test set; later cells read its `id` and `excerpt` columns.
test_csv_path = "../input/commonlitreadabilityprize/test.csv"
dft = pd.read_csv(test_csv_path)
dft.head()

In [None]:
# Predict a target for every test excerpt with the best pipeline from the search.
test_excerpts = dft['excerpt']
target = search.best_estimator_.predict(test_excerpts)

In [None]:
# Store the predictions next to the test rows and preview the result.
dft = dft.assign(target=target)
dft.head()

In [None]:
# Write the two submission columns, without the DataFrame index.
submission = dft[['id', 'target']]
submission.to_csv("submission.csv", index=False)

# Notebook Complete

In [None]:
# Marker printed once every cell above has run.
print("Program complete!")