In [None]:
import re
import os
import gc
gc.enable()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import tensorflow as tf
import tensorflow_addons as tfa
import logging
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import *
from sklearn.linear_model import *
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow_addons.optimizers import AdamW, RectifiedAdam
from transformers import AutoTokenizer, AutoConfig, TFAutoModel, TFFunnelBaseModel
tf.get_logger().setLevel(logging.ERROR)
from glob import glob

### Calculate RMSE using mixed means

In [None]:
# Load the competition data.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Split the training rows once by whether `url_legal` is missing; each group
# gets its own mean target.
no_url_mask = train.url_legal.isnull()
mean_no_url = train.loc[no_url_mask, 'target'].mean()
mean_url    = train.loc[~no_url_mask, 'target'].mean()

print(f'Mean w/o url_legal: {mean_no_url:.5f}')
print(f'Mean w/  url_legal: {mean_url:.5f}\n')

# "Mixed means" baseline: predict the mean of the group a row belongs to.
# The same null mask that defined the group means is used here, so `pred`
# cannot disagree with them (a per-value type check could).
train['pred'] = np.where(no_url_mask, mean_no_url, mean_url)

has_no_url = train[no_url_mask]
has_url    = train[~no_url_mask]


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Takes the square root of `mean_squared_error` explicitly instead of
    relying on its `squared=False` keyword, which newer scikit-learn
    releases no longer accept.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


print(f'RMSE w/ mixed means: {_rmse(train.target, train.pred):.5f}')
print(f'RMSE w/ simple mean: {_rmse(train.target, np.full(len(train), train.target.mean())):.5f}\n')
print(f'RMSE w/o url_legal mean: {_rmse(has_no_url.target, has_no_url.pred):.5f}')
print(f'RMSE w/  url_legal mean: {_rmse(has_url.target, has_url.pred):.5f}')

# Target distribution of each group (trailing `;` hides the plot object repr).
sns.displot(has_no_url.target);
sns.displot(has_url.target);

In [None]:
# Configurations
# Notebook-wide constants; the later cells read these as module-level globals.
BATCH_SIZE = 4  # NOTE(review): not referenced in the visible cells
SEED = 1  # passed to seed_everything() before inference
VERBOSE = 2  # NOTE(review): not referenced in the visible cells
FOLDS = 5  # number of folds; presumably matches the number of saved checkpoints -- TODO confirm
MAX_LEN = 256  # token length for padding/truncation and for the model's input shape
AUTO = tf.data.AUTOTUNE  # NOTE(review): not referenced in the visible cells

In [None]:
# Get the trained model we want to use
# MODEL is a local dataset directory; it is handed to both
# AutoTokenizer.from_pretrained (here) and TFAutoModel.from_pretrained
# (inside build_base_model).
MODEL = '../input/download-distilbert-base-uncased'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL)
# Sorted list of saved .h5 weight files; evaluate() loads them one at a time.
model_paths = sorted(glob('../input/clr-exp387/*.h5'))

In [None]:
# Seed every random number generator this notebook relies on
def seed_everything(seed):
    """Apply `seed` to Python's `random`, NumPy and TensorFlow.

    Also stores `str(seed)` in the PYTHONHASHSEED environment variable.

    Parameters
    ----------
    seed : int
        Value handed to each seeding function.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Tokenize a list of texts with a transformers tokenizer
def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    """Encode `texts` with `tokenizer` and return the input ids as an array.

    Parameters
    ----------
    texts : list
        Texts passed straight to `tokenizer.batch_encode_plus`.
    tokenizer : object
        Tokenizer exposing `batch_encode_plus`.
    maxlen : int
        Forwarded as `max_length`; padding is set to 'max_length' and
        truncation is enabled.

    Returns
    -------
    numpy.ndarray
        `np.array` of the encoding's 'input_ids'. Any other fields of the
        encoding are discarded.
    """
    encode_options = dict(
        padding = 'max_length',
        truncation = True,
        max_length = maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    return np.array(encoded['input_ids'])

# Encode the test sentences into token ids
def encode_test(x_test, MAX_LEN, tokenizer=None):
    """Convert `x_test` to a list and encode it with `regular_encode`.

    Parameters
    ----------
    x_test : object with a `.tolist()` method
        The texts to encode (the caller passes a DataFrame column).
    MAX_LEN : int
        Forwarded to `regular_encode` as `maxlen`.
    tokenizer : object, optional
        Forwarded to `regular_encode` unchanged. The default `None` is
        passed through as-is, so callers are expected to supply one.

    Returns
    -------
    Whatever `regular_encode` returns for these texts.
    """
    texts = x_test.tolist()
    return regular_encode(texts, tokenizer, maxlen = MAX_LEN)

# Build the Keras model: pretrained transformer + single sigmoid unit
def build_base_model(max_len, MODEL):
    """Wrap a pretrained transformer in a Keras model with one sigmoid output.

    Parameters
    ----------
    max_len : int
        Length of the int32 'input_word_ids' input.
    MODEL : str
        Passed to `TFAutoModel.from_pretrained`.

    Returns
    -------
    tf.keras.Model
        Takes the token ids as input. The first element of the transformer's
        output is sliced at position 0 along axis 1 (the first token of each
        sequence) and fed to a `Dense(1, activation='sigmoid')` layer.
    """
    backbone = TFAutoModel.from_pretrained(MODEL)
    token_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    hidden_states = backbone(token_ids)[0]
    # Keep only the first token's vector for every sequence in the batch.
    first_token = hidden_states[:, 0, :]
    head = tf.keras.layers.Dense(1, activation = 'sigmoid', dtype = 'float32')
    score = head(first_token)
    return tf.keras.models.Model(inputs = [token_ids], outputs = [score])

# Function to run inference with every saved checkpoint and average the results
def evaluate(MODEL, MAX_LEN, TOKENIZER):
    """Predict on the test set with each checkpoint and average the outputs.

    No training happens here: for every path in the module-level
    `model_paths`, a fresh model is built, the saved weights are loaded, and
    its predictions on the encoded test excerpts are added to a running
    average.

    Parameters
    ----------
    MODEL : str
        Passed to `build_base_model`.
    MAX_LEN : int
        Token length used for encoding the excerpts and for the model input.
    TOKENIZER : object
        Tokenizer passed to `encode_test`.

    Returns
    -------
    pandas.DataFrame
        The sample-submission frame with `target` replaced by the averaged
        prediction. The same frame is also written to 'has_url.csv'.

    Raises
    ------
    ValueError
        If `model_paths` is empty, because there would be nothing to average.
    """
    n_models = len(model_paths)
    if n_models == 0:
        raise ValueError('model_paths is empty: no checkpoints to predict with')

    test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
    x_test = encode_test(test_df['excerpt'], MAX_LEN, tokenizer=TOKENIZER)

    seed_everything(SEED)
    predictions = np.zeros(len(test_df))

    for fold, model_path in enumerate(model_paths):
        print('\n')
        print('-'*50)
        print(f'Predicting with fold {fold + 1}')
        # Drop the previous fold's graph/state before building the next model.
        K.clear_session()
        model = build_base_model(MAX_LEN, MODEL)
        model.load_weights(model_path)
        # Divide by the number of checkpoints actually loaded so the result
        # is a true mean even if it differs from the configured fold count.
        predictions += model.predict(x_test)[:,0] / n_models
        del model

    sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
    sub.target = predictions
    sub.to_csv('has_url.csv', index=False)
    return sub

In [None]:
%%time
# Run inference with the saved checkpoints. This rebinds `test` (earlier the
# raw test.csv frame) to the frame returned by evaluate(): the sample
# submission with `target` holding the model output.
test = evaluate(MODEL, MAX_LEN, TOKENIZER)

In [None]:
# Attach the test.csv columns to the predictions (left join on `id`).
# NOTE: this cell rebinds `test`, so it is not idempotent -- run it once per
# evaluate() call.
test = test.merge(
    pd.read_csv('../input/commonlitreadabilityprize/test.csv'),
    on=['id'],
    how='left',
)
# Threshold the model output at 0.5: rows above it get the mean target of
# training rows with a url_legal value, all others get the mean of rows without.
test['target'] = np.where(test.target > 0.5, mean_url, mean_no_url)
test[['id', 'target']].to_csv('submission.csv', index=False)
test.head(7)