## Introduction
The goal of this competition is to predict the text difficulty for a given text. The text difficulty here ranges from **-3.67 to 1.71**, the latter one being most difficult.
In this notebook I do an **Exploratory Data Analysis** and build a baseline model using **TensorFlow and RoBERTa** (Hugging Face). Hope you will learn something new from this notebook.

<img src="https://media.giphy.com/media/WoWm8YzFQJg5i/giphy.gif" alt="Paris" class="center" >


### Importing important libraries

In [None]:
# Install Pyphen and textstat from wheel files under ../input (local paths, not an index).
# Pyphen is installed first — presumably because textstat depends on it; confirm.
!pip install ../input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install ../input/textstat-pypi/textstat-0.7.0-py3-none-any.whl


In [None]:
import pandas as pd
import numpy as np
import tokenizers
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer,WordNetLemmatizer
import matplotlib.colors as mcolors
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import gensim,pyLDAvis
from collections import Counter
import pyLDAvis.gensim_models as gensimvis
from nltk.tokenize import word_tokenize
from warnings import filterwarnings
import tensorflow as tf
import tensorflow.keras.backend as K
from transformers import *
from sklearn.model_selection import train_test_split,KFold
from tqdm import tqdm
from tensorflow.keras.layers import Dense, Input,GlobalAveragePooling1D,Dropout,Average
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from IPython.core.display import display, HTML
import xgboost as xgb
# Directory prefix from which the inference functions load the saved per-fold
# network weights and XGBoost boosters.
ROOT = '../input/tf-meta-features-bert/'

In [None]:
class color:
    """ANSI escape sequences used to colour text printed by the notebook."""
    BOLD = '\033[1m\033[93m'  # two escape codes back to back (1m followed by 93m)
    END = '\033[0m'           # reset sequence
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    
# Global plotting / warning / display configuration for the notebook.
sns.set(style='whitegrid')

filterwarnings("ignore")

# Remove the max-width limit from the notebook container and its input/output areas.
for css_class in ("container", "output_result", "output_area", "input_area"):
    display(HTML(f"<style>.{css_class} {{ max-width:100% !important; }}</style>"))

pyLDAvis.enable_notebook()

## Read Data

In [None]:
# Number of training rows to read. Kept at 300 (the value previously hard-coded
# in the read_csv call) so the notebook runs quickly; set to None to load the
# whole training file.
N_TRAIN_ROWS = 300

df_train = pd.read_csv("../input/commonlitreadabilityprize/train.csv", nrows=N_TRAIN_ROWS)
df_test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

print(f"Train data has {df_train.shape[0]} rows and {df_train.shape[1]} columns")
print(f"Test data has {df_test.shape[0]} rows and {df_test.shape[1]} columns")

In [None]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
df_train.head()

## EDA

### Let's look at the target distribution first.

In [None]:
# sns.displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind and
# its size is ignored. Size the plot through displot's height/aspect instead.
sns.displot(df_train['target'], kde=True, height=6, aspect=1.5)
plt.title("Target Distribution", size=20)
plt.show()

- This looks like a normal distribution with values ranging from ~ -3 to ~ -1. Let's look at the minimum and maximum value of the target here.

In [None]:
target_min = df_train['target'].min()
target_max = df_train['target'].max()
# Close every highlighted span with color.END so the BOLD escape codes do not
# stay active for whatever is printed afterwards.
print(f"The min value of target is {color.BOLD} {target_min} {color.END} and max value of target is {color.BOLD} {target_max} {color.END}")

#### Standard Error

There is a column named **standard_error**. What is this?
Multiple coders rate each text and they might disagree on its readability. This is a measure of the spread of scores among the multiple raters for each text.


In [None]:
# sns.displot creates its own figure, so no plt.figure(...) call is needed (it
# would only produce an empty extra figure); size is set via height/aspect.
sns.displot(df_train['standard_error'], kde=True, height=6, aspect=1.5)
# The plotted column is standard_error, so label it as such.
plt.title("Standard Error Distribution", size=20)
plt.show()

We can observe that the standard error is very left skewed; this implies that there are many instances in which the coders (people who rate the text) have disagreed.

### Binning and segment Analysis

In [None]:
# Split the observed target range into `bins` equal-width intervals of width `w`;
# `vals` holds the upper edge of the first, second and third interval.
bins = 3
target_lo = df_train['target'].min()
w = (df_train['target'].max() - target_lo)/bins
vals = (target_lo + w, target_lo + 2*w, target_lo + 3*w)


In [None]:
def do_binning(x,vals=vals):
    """Map a target value to a segment label.

    Returns "Low" when ``x <= vals[0]``, "Medium" when
    ``vals[0] < x <= vals[1]`` and "High" in every other case.
    The default ``vals`` is whatever the module-level ``vals`` held when this
    function was defined.
    """
    if x <= vals[0]:
        return "Low"
    if vals[0] < x <= vals[1]:
        return "Medium"
    return "High"

In [None]:
# Label every row with its target segment ("Low" / "Medium" / "High").
df_train['segment'] = df_train['target'].map(do_binning)

In [None]:
# Count the rows per segment once and reuse the result for both axes.
segment_counts = df_train['segment'].value_counts()
x = segment_counts.index
y = segment_counts.values

plt.figure(figsize=(8, 5))
# Pass the data by keyword: current seaborn releases no longer accept x and y
# as positional arguments.
sns.barplot(x=x, y=y)
plt.title("Number of excerpts per segment")
plt.xlabel("Segment")
plt.ylabel("Count")
plt.show()

## Text Level Analysis

### WordCloud

In [None]:
stop=set(stopwords.words('english'))

def preprocess(df):
    """Tokenise, filter and lemmatise every text in ``df['excerpt']``.

    For each excerpt the text is split with ``word_tokenize``, stop words are
    dropped, tokens of length <= 2 are dropped, and the remaining tokens are
    lemmatised with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column of strings.

    Returns
    -------
    list[list[str]]
        One list of lemmatised tokens per excerpt, in row order.
    """
    lem = WordNetLemmatizer()
    corpus = []
    for excerpt in df['excerpt']:
        # Compare in lower case: the stop-word set is lower-case, so a
        # case-sensitive test would let "The", "And", ... through.
        tokens = [w for w in word_tokenize(excerpt) if w.lower() not in stop]
        corpus.append([lem.lemmatize(w) for w in tokens if len(w) > 2])
    return corpus


In [None]:
def show_wordcloud(data):
    """Generate a word cloud from an iterable of token lists.

    All tokens are joined into one space-separated string, which is fed to a
    ``WordCloud`` configured with a white background, the module-level ``stop``
    set as stop words and a fixed ``random_state``. Returns the generated
    ``WordCloud`` object.
    """
    text = " ".join(token for tokens in data for token in tokens)

    cloud = WordCloud(
        background_color='white',
        stopwords=stop,
        max_words=150,
        max_font_size=50,
        scale=3,
        random_state=1,
        width=800,
        height=800,
    )
    return cloud.generate(text)


# Draw one word cloud per target segment, side by side.
fig,ax = plt.subplots(1,3,figsize=(16,6))

for axis, segment in zip(ax, ['Low','Medium','High']):
    segment_rows = df_train.query(f'segment=="{segment}"')
    wordcloud = show_wordcloud(preprocess(segment_rows))
    axis.imshow(wordcloud)
    axis.set_title(segment, fontweight='bold')
    axis.axis('off')

plt.show()
    



## Baseline

## Tokenizer 

Here I am using `ByteLevelBPETokenizer`.

In [None]:
# Fixed sequence length produced by batch_encode_texts.
MAX_LEN = 256
# Directory holding the RoBERTa vocab / merges files.
PATH = '../input/tf-roberta/'

# Byte-level BPE tokenizer built from the vocab and merges files under PATH.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab=f'{PATH}vocab-roberta-base.json',
    merges=f'{PATH}merges-roberta-base.txt',
    lowercase=True,
    add_prefix_space=True,
)

In [None]:
def batch_encode_texts(texts,tokenizer,batch_size=256):
    """Encode texts into fixed-length token-id and attention-mask arrays.

    Every text is tokenised without special tokens, truncated to
    ``MAX_LEN - 2`` tokens, wrapped as ``[0] + ids + [2]`` and right-padded
    with zeros up to ``MAX_LEN``. The mask is 1 for the wrapped tokens and 0
    for the padding.

    Parameters
    ----------
    texts : sequence of str
        Texts to encode (list, numpy array or pandas Series).
    tokenizer :
        Object exposing ``encode_batch(list_of_str, add_special_tokens=...)``
        whose results have ``ids`` and ``attention_mask`` lists.
    batch_size : int
        Number of texts passed to ``encode_batch`` at once.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(ids, masks)``; for non-empty input each has shape
        ``(len(texts), MAX_LEN)``.
    """
    # encode_batch takes a list; materialise once so that numpy arrays and
    # pandas Series are accepted as well and slicing below is positional.
    texts = list(texts)
    body_len = MAX_LEN - 2  # room left after the two wrapping tokens

    all_ids = []
    all_masks = []

    for start in tqdm(range(0,len(texts),batch_size)):
        encodings = tokenizer.encode_batch(texts[start:start+batch_size],add_special_tokens=False)
        for enc in encodings:
            ids = [0] + enc.ids[:body_len] + [2]
            mask = [1] + enc.attention_mask[:body_len] + [1]
            pad = MAX_LEN - len(ids)
            # NOTE(review): padding uses id 0, the same id that is prepended as
            # the first token; RoBERTa vocabularies usually reserve a separate
            # pad id — confirm before retraining. Kept so that the output stays
            # identical to what existing saved weights were trained on.
            all_ids.append(ids + [0]*pad)
            all_masks.append(mask + [0]*pad)

    return np.array(all_ids),np.array(all_masks)

    
    

### Meta-Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
import spacy
from textstat.textstat import textstatistics, legacy_round
import textstat

In [None]:
# Load the common-word list (assumed to hold one word per line). Each line is
# stripped: readlines() would keep the trailing "\n", and "word\n" never equals
# a token produced by str.split().
with open("../input/english-common-words/20k.txt",'r') as file:
    common_words = [line.strip() for line in file]

In [None]:
# English stop words as an array, used with np.intersect1d in get_meta_features.
stops = np.array(stopwords.words('english'))

# Common-word list (assumed to hold one word per line). Lines are stripped:
# readlines() keeps the trailing "\n", so the raw lines would never match a
# whitespace-split token and the `common_words` feature would always be 0.
with open("../input/english-common-words/20k.txt",'r') as file:
    common_words = [line.strip() for line in file]

# NOTE(review): the 'en' shortcut name is not accepted by newer spaCy releases,
# which expect a full package name such as 'en_core_web_sm' — confirm against
# the spaCy version installed in the target environment.
NLP = spacy.load('en')


def make_sentences(text):
    """Run ``text`` through the module-level ``NLP`` pipeline and return its
    sentences (the items of ``doc.sents``) as a list."""
    return list(NLP(text).sents)
  
    
def get_meta_features(df,col,tfidf=None):
    """Build the meta-feature matrix for the texts in ``df[col]``.

    Twelve per-text statistics are computed, stored as new columns on ``df``
    (the frame is modified in place) and concatenated with the dense TF-IDF
    representation of the texts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts; receives the statistic columns.
    col : str
        Name of the text column.
    tfidf : TfidfVectorizer, optional
        Vectorizer to use. If it is already fitted (has ``vocabulary_``) it is
        only applied, so the TF-IDF columns line up with the data it was fitted
        on. An unfitted vectorizer is fitted on ``df[col]`` in place and can
        then be passed again for another frame. When omitted, a new vectorizer
        is created and fitted on ``df[col]`` (the previous behaviour) — note
        that matrices built this way from different frames do not share columns.

    Returns
    -------
    numpy.ndarray
        One row per text: the twelve statistics followed by the TF-IDF values.
    """
    if tfidf is None:
        tfidf = TfidfVectorizer()
    already_fitted = hasattr(tfidf, 'vocabulary_')

    # One textstatistics instance for all rows instead of one per row.
    syllable_counter = textstatistics()

    df['word_count'] = df[col].map(lambda x : len(x.split()))
    df['unique_words'] = df[col].map(lambda x : len(np.unique(x.split())))
    df['char_len'] = df[col].str.len()
    # np.intersect1d returns unique matches, so these two are counts of distinct
    # stop / common words present in the text.
    df['stop_words'] = df[col].map(lambda x : len(np.intersect1d(x.split(),stops)))
    df['sentence_count'] = df[col].map(lambda x : len(make_sentences(x)))
    df['common_words'] = df[col].map(lambda x : len(np.intersect1d(x.split(),common_words)))
    df['syllable_count'] = df[col].map(syllable_counter.syllable_count)
    # NOTE(review): despite its name this is words per syllable
    # (word_count / syllable_count). Left unchanged because previously trained
    # boosters presumably saw the feature in this form — confirm before renaming
    # or inverting.
    df['average_syllable_word'] = df['word_count']/df['syllable_count']
    df['average_sentence_len'] = df['word_count']/df['sentence_count']
    df['flesch_reading_ease'] = df[col].map(textstat.flesch_reading_ease)
    df['smog_index'] = df[col].map(textstat.smog_index)
    df['difficult_words'] = df[col].map(textstat.difficult_words)

    texts = df[col].values
    if already_fitted:
        vectors = tfidf.transform(texts).toarray()
    else:
        vectors = tfidf.fit_transform(texts).toarray()

    count_columns = ['word_count','unique_words','char_len','stop_words','sentence_count',
                     'common_words','syllable_count','average_syllable_word','average_sentence_len',
                     'flesch_reading_ease','smog_index','difficult_words']
    count_features = df[count_columns].values
    return np.hstack([count_features,vectors])
    

## Model
This is a very simple model that averages the 768-dimensional output and passes it through a linear layer to produce the output.

In [None]:
def commonlit_model(max_len=256):
    """Build and compile the RoBERTa-based regression model.

    The model takes token ids and attention masks of length ``max_len``, runs
    them through a pretrained ``TFRobertaModel``, averages the first output over
    the sequence axis, and feeds the pooled vector through five parallel
    ``Dropout -> Dense(1)`` branches (dropout rates 0.1 ... 0.5) whose outputs
    are averaged.

    Parameters
    ----------
    max_len : int
        Length of the two integer input sequences.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam (learning rate 1e-5), mean-squared-error loss
        and a root-mean-squared-error metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    attention_masks = Input(shape=(max_len,), dtype=tf.int32, name="attention_masks")

    config = RobertaConfig.from_pretrained('../input/tf-roberta/config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained('../input/tf-roberta/pretrained-roberta-base.h5',config=config)
    output = bert_model(input_word_ids,attention_mask = attention_masks)[0]

    pool = GlobalAveragePooling1D()(output)

    # One Dropout + Dense(1) branch per dropout rate; each branch has its own
    # Dense layer. (A previously declared but never applied Dense(32) layer was
    # dropped; it was never called, so it contributed no weights.)
    dense=[]
    for p in np.linspace(0.1,0.5,5):
        x=Dropout(p)(pool)
        x=Dense(1)(x)
        dense.append(x)

    out = Average()(dense)
    model = Model(inputs=[input_word_ids,attention_masks], outputs=out)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated and
    # rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

In [None]:
# Value passed to .prefetch() when the tf.data pipelines are built.
AUTO = tf.data.experimental.AUTOTUNE
# Batch size shared by the training, validation and test datasets.
BATCH_SIZE =16

### KFold Model

I am using 10 fold split, only using the first 5 folds.


In [None]:
SEED=42

def train_nn(df_train,n_splits,epochs=3):
    """Train the RoBERTa model with K-fold cross-validation.

    For every fold a fresh ``commonlit_model`` is trained on the training split
    and validated on the held-out split. The weights with the lowest validation
    RMSE are written to ``commonlit_fold{fold}.h5`` in the working directory.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with an ``excerpt`` text column and a ``target`` column.
    n_splits : int
        Number of folds for ``KFold`` (shuffled, seeded with ``SEED``).
    epochs : int
        Training epochs per fold.

    Returns
    -------
    (list[float], list[float])
        Per-fold training RMSE and validation RMSE, each averaged over epochs.
    """
    train_ids,train_masks = batch_encode_texts(df_train['excerpt'].values,tokenizer)
    target = df_train['target'].values

    kfold = KFold(n_splits=n_splits,shuffle=True,random_state=SEED)
    train_metrics,valid_metrics=[],[]

    for fold,(train_idx,valid_idx) in enumerate(kfold.split(df_train)):

        # Release the previous fold's model before building a new one.
        K.clear_session()

        x_train_ids,x_train_masks,y_train = train_ids[train_idx],train_masks[train_idx],target[train_idx]
        x_valid_ids,x_valid_masks,y_valid = train_ids[valid_idx],train_masks[valid_idx],target[valid_idx]

        # The model has exactly two inputs (ids, masks); the previous pipeline
        # also passed an undefined `train_meta` array. Shuffle before repeat so
        # every epoch is a full permutation of the training split.
        train_dataset = (
            tf.data.Dataset
            .from_tensor_slices(((x_train_ids,x_train_masks),y_train))
            .shuffle(len(train_idx))
            .repeat()
            .batch(BATCH_SIZE)
            .prefetch(AUTO)
        )

        valid_dataset = (
            tf.data.Dataset
            .from_tensor_slices(((x_valid_ids,x_valid_masks),y_valid))
            .batch(BATCH_SIZE)
            .cache()
            .prefetch(AUTO)
        )

        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            f'commonlit_fold{fold}.h5', monitor='val_root_mean_squared_error', verbose=0, save_best_only=True,
            save_weights_only=True, mode='min', save_freq='epoch')

        model = commonlit_model(max_len=MAX_LEN)

        # The training dataset repeats forever, so the epoch length has to be
        # given explicitly.
        n_steps = max(1, len(train_idx)// BATCH_SIZE)

        print(color.OKCYAN,f"Training fold {fold}...")
        # fit() (not evaluate()) returns the History object read below and is
        # what actually trains the model and triggers the checkpoint callback.
        train_history = model.fit(
            train_dataset,
            steps_per_epoch=n_steps,
            epochs=epochs,
            validation_data=valid_dataset,
            callbacks=[checkpoint],
        )

        fold_train_rmse = np.mean(train_history.history['root_mean_squared_error'])
        fold_valid_rmse = np.mean(train_history.history['val_root_mean_squared_error'])
        print(color.OKBLUE,f"Average train RMSE of fold {fold} = {fold_train_rmse:.4f}")
        print(color.OKBLUE,f"Average validation RMSE of fold {fold} = {fold_valid_rmse:.4f}")

        train_metrics.append(fold_train_rmse)
        valid_metrics.append(fold_valid_rmse)

    return train_metrics,valid_metrics
        
       
            
       
    




In [None]:
def train_xgb(df_train,n_splits,params=None):
    """Train XGBoost regressors on the meta features with K-fold CV.

    For each fold a booster is trained with early stopping on the held-out
    split, dumped with joblib to ``xgb_fold{fold}`` in the working directory
    (the file name ``inference_xgb`` loads), and its train / validation RMSE at
    the best iteration is recorded. A summary table and the mean out-of-fold
    RMSE are displayed.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with an ``excerpt`` text column and a ``target`` column. The
        meta-feature columns are added to it by ``get_meta_features``.
    n_splits : int
        Number of folds for ``KFold`` (shuffled, seeded with ``SEED``).
    params : dict, optional
        Parameters passed to ``xgb.train``. They must produce an ``rmse``
        evaluation metric. Defaults to squared-error regression with RMSE.
    """
    # joblib is imported at module level only further down the notebook.
    import joblib

    if params is None:
        params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'seed': SEED}

    train_meta_features = get_meta_features(df_train,'excerpt')
    # Previously `target` was read without ever being defined in this scope.
    target = df_train['target'].values

    skf = KFold(n_splits=n_splits,shuffle=True,random_state=SEED)
    best_iterations=[]
    oof_rmses,train_rmses=[],[]

    for fold,(train_idx,valid_idx) in enumerate(skf.split(df_train)):

        dtrain = xgb.DMatrix(train_meta_features[train_idx],target[train_idx])
        dvalid = xgb.DMatrix(train_meta_features[valid_idx],target[valid_idx])

        evals_result = dict()
        booster = xgb.train(params,
                            dtrain,
                            evals=[(dtrain, 'train'), (dvalid, 'valid')],
                            num_boost_round=300,
                            early_stopping_rounds=20,
                            evals_result=evals_result,
                            verbose_eval=False)

        # Persist the fold model so inference can load it.
        joblib.dump(booster, f'xgb_fold{fold}')

        best_iteration = np.argmin(evals_result['valid']['rmse'])
        best_iterations.append(best_iteration)
        oof_rmses.append(evals_result['valid']['rmse'][best_iteration])
        train_rmses.append(evals_result['train']['rmse'][best_iteration])

    evals_df = pd.DataFrame({
        'fold': range(1, skf.n_splits+1),
        'best_iteration': best_iterations,
        'oof_rmse': oof_rmses,
        'train_rmse': train_rmses,
    })

    display(evals_df)
    print('mean oof rmse = {}'.format(np.mean(oof_rmses)))


    


## Inference

In [None]:
import joblib


In [None]:
def inference_nn(df_test,n_splits=7):
    """Predict with the fold-wise neural networks and average the results.

    The excerpts are encoded, one model is built, and for each fold in
    ``range(n_splits)`` the weights ``ROOT + 'commonlit_fold{fold}.h5'`` are
    loaded and the prediction divided by ``n_splits`` is added to the running
    total. Returns an array of shape ``(len(df_test), 1)``.
    """
    test_ids, test_masks = batch_encode_texts(df_test['excerpt'].values, tokenizer)

    # Placeholder targets (all zeros) so each dataset element is an
    # (inputs, target) pair.
    y_test = np.zeros((len(df_test)))
    inputs_and_targets = ((test_ids, test_masks), y_test)
    test_dataset = tf.data.Dataset.from_tensor_slices(inputs_and_targets).batch(BATCH_SIZE)

    preds_nn = np.zeros((len(test_ids), 1))
    model = commonlit_model()
    for fold in range(n_splits):
        weights_path = ROOT+f'commonlit_fold{fold}.h5'
        model.load_weights(weights_path)
        print(color.OKGREEN,f"Inference fold {fold}...")
        fold_preds = model.predict(test_dataset, verbose=1)
        preds_nn += fold_preds / n_splits

    return preds_nn

 
def inference_xgb(df_test,n_splits=7):
    """Predict with the fold-wise XGBoost boosters and average the results.

    Meta features are computed for ``df_test``; for each fold in
    ``range(n_splits)`` the booster stored at ``ROOT + 'xgb_fold{fold}'`` is
    loaded with joblib and its prediction divided by ``n_splits`` is added to
    the running total. Returns a 1-D array with one value per test row.
    """
    # NOTE(review): get_meta_features fits its TF-IDF vectorizer on the frame it
    # is given, so the TF-IDF columns produced here come from the test texts and
    # may not match the columns the boosters were trained on — confirm.
    test_meta_features = get_meta_features(df_test, 'excerpt')
    dtest = xgb.DMatrix(test_meta_features)

    preds_xgb = np.zeros((len(test_meta_features)))
    for fold in tqdm(range(n_splits)):
        # joblib.load unpickles the file; only load booster files you trust.
        booster = joblib.load(ROOT+f'xgb_fold{fold}')
        preds_xgb += booster.predict(dtest) / n_splits

    return preds_xgb



    

In [None]:
# Fold-averaged predictions of the neural network (both calls use the default n_splits).
preds_nn = inference_nn(df_test)
# Fold-averaged predictions of the XGBoost models on the meta features.
preds_xgb = inference_xgb(df_test)


In [None]:
# Blend: unweighted element-wise average of the two models' flattened predictions.
stacked_preds = [preds_nn.flatten(), preds_xgb.flatten()]
preds = np.average(stacked_preds, axis=0)

In [None]:
# Fill the sample submission's `target` column with the blended predictions
# and write the result to the working directory.
sample_submission_path = "../input/commonlitreadabilityprize/sample_submission.csv"
submission_path = 'submission.csv'

sub = pd.read_csv(sample_submission_path)
sub['target'] = preds
sub.to_csv(submission_path, index=False)

### Work in Progress! I hope you liked it :) 