This notebook demonstrates usage examples of the following modules/packages:
* Matplotlib and seaborn for plots
* Some NLTK module functions for text cleaning, splitting and lemmatisation
* Universal Sentence Encoder for text vectorization
* XGBRegressor model training on data folds
* Optuna for XGBRegressor hyperparameters optimization

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna

import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub

np.random.seed(42)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition train and test tables with identical reader options
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "/kaggle/input/commonlitreadabilityprize/train.csv",
        "/kaggle/input/commonlitreadabilityprize/test.csv",
    )
)
# Column dtypes, non-null counts and exact ("deep") memory footprint of the train table
train.info(memory_usage="deep")

In [None]:
# Same summary for the test table, using the exact ("deep") memory footprint
test.info(memory_usage="deep")

In [None]:
# Preview the first rows of the training table
train.head()

# **Data analysis**

In [None]:
# Summary statistics of the `target` column
train["target"].describe()

In [None]:
# Histogram of the `target` column of the training table
fig, ax = plt.subplots(figsize=(7, 4))
target_hist_style = dict(bins=20, edgecolor="black", color="steelblue")
axis_label_font = dict(fontsize=12)
ax.hist(train["target"], **target_hist_style)
ax.set_title("Target distribution", fontsize=15)
ax.set_xlabel("Target", **axis_label_font)
ax.set_ylabel("Amount of observations", **axis_label_font)
plt.show();

In [None]:
# Summary statistics of the `standard_error` column
train["standard_error"].describe()

In [None]:
# Histogram of the `standard_error` column of the training table
fig, ax = plt.subplots(figsize=(7, 4))
error_hist_style = dict(bins=20, edgecolor="black", color="palevioletred")
axis_label_font = dict(fontsize=12)
ax.hist(train["standard_error"], **error_hist_style)
ax.set_title("Standard error distribution", fontsize=15)
ax.set_xlabel("Standard error", **axis_label_font)
ax.set_ylabel("Amount of observations", **axis_label_font)
plt.show();

To analyse the data further we need to extract more information from the text excerpts. The function below calculates some numeric parameters that I think may be important for predicting the target, such as the text length decrease after deleting stop words, the mean sentence length, the amount of quotes (i.e. dialogues), the mean word and lemma length, etc. The function returns only these parameters; the transformed and preprocessed text itself is not returned.

In [None]:
def get_text_data_parameters(data, stop_words):
    """
    Calculates some numeric parameters of paragraphs of the given series object.

    Parameters
    ----------
    data : iterable of str
        Raw text excerpts (e.g. a pandas Series of strings). It is only read,
        never modified.
    stop_words : collection of str
        Lower-case words that are dropped before lemmatisation.

    Returns
    -------
    pandas.DataFrame
        One row per excerpt, in input order, with a fresh 0..n-1 index and
        the columns:

        * ``text_shortage``     - length of the cleaned text (letters only,
          lower-cased, stop words removed, lemmatised, lemmas joined by single
          spaces) divided by the length of the raw excerpt;
        * ``num_quotes``        - number of ``"`` characters divided by 2;
        * ``num_sentences``     - number of sentences found by ``sent_tokenize``;
        * ``sent_length``       - number of word tokens per sentence;
        * ``mean_word_length``  - raw excerpt length (spaces and punctuation
          included) divided by the number of word tokens;
        * ``mean_lemma_length`` - cleaned text length (joining spaces
          included) divided by the number of lemmas.

    Notes
    -----
    A ratio whose denominator is zero (an empty excerpt, an excerpt without
    letters, or an excerpt consisting of stop words only) is reported as NaN
    instead of raising ``ZeroDivisionError``.
    """

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as a missing value rather than crashing the whole run
        return numerator / denominator if denominator else np.nan

    # Set membership keeps the stop-word filter fast even if a list was passed in
    stop_words = frozenset(stop_words)
    # The lemmatizer does not depend on the row, so it is created once for all rows
    lemmatizer = nltk.WordNetLemmatizer()

    records = []
    for text in data:
        # The original, raw text paragraph length
        initial_length = len(text)
        # Using nltk tokenizer to split a text into sentences to determine their amount
        num_sent = len(sent_tokenize(text))
        # Replacing every non-letter symbol with a space, lower-casing and
        # splitting the text into words using nltk tokenizer
        words = word_tokenize(re.sub("[^a-zA-Z]", " ", text).lower())
        num_words = len(words)
        # Dropping stop words and lemmatising what is left
        lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        # Text length after cleaning and lemmatisation
        processed_length = len(" ".join(lemmas))

        records.append({
            "text_shortage": _ratio(processed_length, initial_length),
            # Amount of quotes divided by 2 to determine if there is any dialogue
            "num_quotes": text.count('"') / 2,
            "num_sentences": num_sent,
            "sent_length": _ratio(num_words, num_sent),
            "mean_word_length": _ratio(initial_length, num_words),
            "mean_lemma_length": _ratio(processed_length, len(lemmas)),
        })

    # Creating a dataframe containing all calculated parameters
    columns = ["text_shortage", "num_quotes",
               "num_sentences", "sent_length",
               "mean_word_length", "mean_lemma_length"]

    return pd.DataFrame(records, columns=columns)

In [None]:
# English stop-word list from the NLTK corpus, stored as a set for fast membership tests
stop_words = set(stopwords.words("english"))

In [None]:
# Engineered numeric parameters of every training excerpt (one row per excerpt)
text_params = get_text_data_parameters(train["excerpt"].copy(), stop_words)
text_params.head()

Let's take a look at these parameters.

In [None]:
# Text parameters and the target side by side, one row per training excerpt
df = pd.concat([text_params, train["target"]], axis=1)

# Pairwise scatter plots with a histogram of every column on the diagonal
pair_grid = sns.pairplot(
    data=df,
    diag_kws={"bins": 15, "color": "lightcoral"},
    plot_kws={"color": "seagreen"},
)
pair_grid.fig.suptitle("Target and text parameters pairplots", fontsize=15, y=1.03)
plt.show();

In [None]:
# Pairwise correlations of the text parameters and the target, rounded for annotation
corr = df.corr().round(2)

# Boolean mask hiding the diagonal and the upper-right triangle, which only
# duplicate the lower-left part of the symmetric matrix
mask = np.triu(np.ones_like(corr, dtype=bool))

# Making a plot
plt.figure(figsize=(8, 8))
annotation_style = {"weight": "bold", "fontsize": 13}
ax = sns.heatmap(
    corr,
    annot=True,
    mask=mask,
    cmap="RdBu",
    linewidths=1,
    annot_kws=annotation_style,
)

ax.set_title("Target and text parameters heatmap", fontsize=15, y=1.03)
plt.show();

# **Data preparation**

In [None]:
def scale_train_text_params(df):
    """
    Scales every column of the given dataframe with its own MinMaxScaler.

    The input dataframe is left untouched: scaling happens on a copy, so
    calling the function again with the same frame fits the scalers on the
    same unscaled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric parameters to scale, one parameter per column.

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The scaled copy of ``df`` and the fitted scalers, ordered like
        ``df.columns`` (the i-th scaler belongs to the i-th column).
    """
    scaled = df.copy()
    scalers = []

    for column in scaled.columns:
        scaler = MinMaxScaler()
        # Scalers expect a 2D (n_samples, 1) array; the result is flattened back to a column
        column_values = scaled[column].to_numpy().reshape(-1, 1)
        scaled[column] = scaler.fit_transform(column_values).ravel()
        scalers.append(scaler)

    return scaled, scalers

def scale_test_text_params(df, scalers):
    """
    Scales every column of the given dataframe with an already fitted scaler.

    Scalers are matched to columns by position, so ``df`` must have the same
    column order as the frame the scalers were fitted on. The input dataframe
    is left untouched: scaling happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric parameters to scale, one parameter per column.
    scalers : sequence
        Fitted scalers exposing ``transform``, one per column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The scaled copy of ``df``.

    Raises
    ------
    ValueError
        If the number of scalers differs from the number of columns.
    """
    if len(scalers) != len(df.columns):
        raise ValueError(
            f"Expected one scaler per column: got {len(scalers)} scalers "
            f"for {len(df.columns)} columns"
        )

    scaled = df.copy()
    for column, scaler in zip(scaled.columns, scalers):
        # Scalers expect a 2D (n_samples, 1) array; the result is flattened back to a column
        column_values = scaled[column].to_numpy().reshape(-1, 1)
        scaled[column] = np.asarray(scaler.transform(column_values)).ravel()

    return scaled

def vectorize_text_data(X_train, text_params, vectorizer):
    """
    Builds the model input matrix from texts and their numeric parameters.

    The texts are passed to ``vectorizer`` in one call and the result is
    appended column-wise to the right of the ``text_params`` values, so the
    text parameters occupy the first columns of the returned array. The shapes
    of the vectorized texts and of the final matrix are printed.
    """
    embeddings = vectorizer(X_train)
    print(f"Vectorized text shape: {embeddings.shape}")

    param_values = text_params.to_numpy()
    features = np.concatenate([param_values, embeddings], axis=1)
    print(f"Concatenated data shape: {features.shape}")

    return features

In [None]:
# Loading Universal Sentence Encoder with TensorFlow Hub from the local Kaggle input directory
vectorizer = hub.load("/kaggle/input/universalsentenceencoderv4tf20/")

In [None]:
# Scaling text parameters. A copy is scaled and stored under its own name, so the
# frame produced by get_text_data_parameters is not overwritten with scaled values
# and re-running this cell does not fit the scalers on already scaled data.
train_text_params_scaled, scalers = scale_train_text_params(text_params.copy())
# Vectorizing lower-cased text data and concatenating it with text parameters
X = vectorize_text_data(train["excerpt"].str.lower(), train_text_params_scaled, vectorizer)
y = train["target"].copy()

# **Optuna**

In [None]:
# Calculating edges of target bins to be used for stratified split
n_target_bins = 10
target_bin_edges = np.histogram_bin_edges(train["target"], bins=n_target_bins)
# Open both outermost edges so that every target value, including the minimum
# and the maximum, falls inside one of the bins
target_bin_edges[[0, -1]] = [-np.inf, np.inf]
target_bins = pd.cut(train["target"], target_bin_edges, labels=np.arange(n_target_bins))
target_bins

In [None]:
def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
    """
    A function to train a model using different hyperparameter combinations provided by Optuna.
    RMSE of validation data predictions is returned to estimate hyperparameters effectiveness.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameter values for this run.
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Features and target used for early stopping and for the returned score.

    Returns
    -------
    float
        RMSE of the model predictions on the validation data.
    """
    # A set of hyperparameters to optimize by optuna.
    # Single-value categorical suggestions keep the fixed settings in the trial
    # parameters, so they are reported together with the tuned ones.
    xgb_params = {
                 "n_estimators": trial.suggest_categorical('n_estimators', [4000]),
                 "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.8),
                 "max_depth": trial.suggest_int("max_depth", 2, 30),
                 "booster": trial.suggest_categorical('booster', ["gbtree"]),
                 "tree_method": trial.suggest_categorical('tree_method', ["auto"]),

                 "reg_lambda": trial.suggest_float('reg_lambda', 0.00001, 0.9),
                 "reg_alpha": trial.suggest_float('reg_alpha', 0.00001, 0.9),
                 "random_state": trial.suggest_categorical('random_state', [42]),
                 "n_jobs": trial.suggest_categorical('n_jobs', [4]),
                    }

    # Model loading and training.
    # Only the validation fold is evaluated during boosting: it is the set early
    # stopping looks at, and the training-set metric was computed every round
    # without ever being shown or read.
    # NOTE(review): passing `eval_metric` / `early_stopping_rounds` to fit() is
    # deprecated in newer XGBoost releases - confirm against the installed version.
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)

    # Out of fold predictions
    oof = model.predict(X_valid)
    # Best boosting round found by early stopping
    print(f"Number of boosting rounds: {model.best_iteration}")

    return np.sqrt(mean_squared_error(y_valid, oof))

The code below runs the hyperparameter optimization. It is commented out to save runtime.

In [None]:
# %%time
# # Splitting data into train and valid folds using target bins for stratification
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_idx, valid_idx in split.split(X, target_bins):
#     X_train, X_valid = X[train_idx], X[valid_idx]
#     y_train, y_valid = y[train_idx], y[valid_idx]
# # Setting optuna verbosity to show only warning messages
# # If the line below stays commented out, the results of each iteration will be shown
# # optuna.logging.set_verbosity(optuna.logging.WARNING)

# study = optuna.create_study(direction='minimize')
# study.optimize(lambda trial: train_model_optuna(trial, X_train, X_valid,
#                                                     y_train, y_valid),
#                n_trials = 500)

# # Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

# **XGBRegressor model training**

In [None]:
def train_with_folds(X, y, X_test, target_bins, params, n_splits=10):
    """
    Trains one XGBRegressor per stratified fold and averages the fold models.

    The train data is split into ``n_splits`` folds stratified by
    ``target_bins``. A model is trained on each split with early stopping on
    the held-out fold, then predicts the held-out fold and the test data.
    The RMSE of every fold and the overall out-of-fold RMSE are printed.

    Parameters
    ----------
    X : numpy.ndarray
        Train features, one row per observation.
    y : array-like
        Train target. Converted to a numpy array so that fold indices are
        always positional, whatever index a pandas Series carries.
    X_test : numpy.ndarray
        Test features with the same columns as ``X``.
    target_bins : array-like
        Bin label of every train observation, used for stratification.
    params : dict
        Keyword arguments for ``XGBRegressor``.
    n_splits : int, default 10
        Number of folds.

    Returns
    -------
    tuple (numpy.ndarray, numpy.ndarray)
        Test predictions and feature importances, both averaged over the folds.
    """
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X.shape[0],))
    model_preds = 0
    model_fi = 0
    for num, (train_idx, valid_idx) in enumerate(skf.split(X, target_bins)):
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        model = XGBRegressor(**params)
        # Only the validation fold is evaluated during boosting: it is the set
        # early stopping looks at, and the training-set metric was never read.
        # NOTE(review): passing `eval_metric` / `early_stopping_rounds` to fit() is
        # deprecated in newer XGBoost releases - confirm against the installed version.
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric="rmse",
                  early_stopping_rounds=100,
                  verbose=False)
        # Every fold model contributes an equal share to the averaged outputs
        model_preds += model.predict(X_test) / n_splits
        model_fi += model.feature_importances_ / n_splits
        oof_preds[valid_idx] = model.predict(X_valid)
        print(f"Fold {num} RMSE: {np.sqrt(mean_squared_error(y_valid, oof_preds[valid_idx]))}")
    overall_rmse = np.sqrt(mean_squared_error(y, oof_preds))
    print(f"Overall RMSE: {overall_rmse}")

    return model_preds, model_fi

In [None]:
# Hyperparameter values optimized by Optuna, passed to XGBRegressor as keyword arguments
xgb_params = dict(
    n_estimators=988,
    learning_rate=0.026709466947908544,
    max_depth=3,
    booster='gbtree',
    tree_method='auto',
    reg_lambda=0.128405467661076,
    reg_alpha=0.43102397027400285,
    random_state=42,
    n_jobs=4,
)

In [None]:
# Preprocessing test data exactly like the train data.
# The test parameters get their own name so the train `text_params` frame
# (whose columns are used for plotting later) is not replaced by test values.
test_text_params = scale_test_text_params(
    get_text_data_parameters(test["excerpt"], stop_words), scalers
)
# The train excerpts were lower-cased before vectorization, so the test excerpts
# have to be lower-cased too for both sets to be encoded from equally prepared text
X_test = vectorize_text_data(test["excerpt"].str.lower(), test_text_params, vectorizer)

In [None]:
%%time
# Cross-validated training with the tuned hyperparameters; yields the test
# predictions and the feature importances collected over the folds
preds, feature_importances = train_with_folds(X, y, X_test, target_bins, xgb_params)

Let's check the importances of all features first, and then those of the engineered features only.

In [None]:
# Importance of every model input, in the column order of the feature matrix
fig, ax = plt.subplots(figsize=(16, 5))
sns.lineplot(data=pd.Series(feature_importances), ax=ax)
ax.set(title="Feature importance", xlabel="Feature number", ylabel="Importance")
plt.show();

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
# The engineered text parameters are the first columns of the feature matrix
# (the text vectors are appended after them), so their importances are the
# leading entries; the count is taken from the frame instead of being hard-coded
n_text_params = len(text_params.columns)
ax = sns.lineplot(x=text_params.columns, y=pd.Series(feature_importances[:n_text_params]))
ax.set_title("Feature importance")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
# Rotate the long parameter names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show();

# **Submission**

In [None]:
# Fill the sample submission with the averaged fold predictions and save it
submission = pd.read_csv(
    "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    low_memory=False,
).assign(target=preds)
submission.to_csv('submission.csv', index=False)
submission