In [None]:
import os
import json
import regex as re
from functools import lru_cache
import numpy as np
# The GPT-2 implementation uses TF 1
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [None]:
# List every file under the Kaggle input directory so the dataset and model
# paths used by the later cells can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# From https://github.com/modelblocks/modelblocks-release/blob/master/resource-gpt2/scripts/encoder.py

In [None]:
"""Byte pair encoding utilities"""

@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode character table used by the BPE code.

    The returned dict has one entry for each of the 256 byte values. Bytes in
    the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
    same code point. Every other byte (whitespace, control characters, ...)
    is mapped to a character at code point 256 and upwards, in increasing byte
    order, so that the BPE code never sees whitespace/control characters.
    The result is cached, so repeated calls return the same dict object.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    printable_set = set(printable)
    # bytes that need a stand-in character, in increasing order
    shifted = [byte for byte in range(2**8) if byte not in printable_set]
    byte_values = printable + shifted
    code_points = printable + [2**8 + offset for offset in range(len(shifted))]
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

def get_pairs(word):
    """Return set of adjacent symbol pairs in a word.

    Word is represented as a sequence of symbols (symbols being
    variable-length strings), e.g. a tuple of strings.

    An empty or single-symbol word has no adjacent pairs, so an empty set is
    returned in both cases (an empty word previously raised IndexError).
    """
    # zip stops at the shorter argument, so this yields (word[i], word[i+1])
    # for every valid i and nothing at all for words shorter than two symbols.
    return set(zip(word, word[1:]))

class Encoder:
    """Byte-level BPE tokenizer that converts text to vocabulary ids and back.

    Parameters
    ----------
    encoder : dict
        Maps each BPE token string to its integer id.
    bpe_merges : sequence of tuple
        Merge rules; the position of a rule in the sequence is its rank and
        lower-ranked rules are applied first.
    errors : str
        Error handling mode passed to ``bytearray.decode`` in :meth:`decode`.
    """
    def __init__(self, encoder, bpe_merges, errors='replace'):
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # memoizes bpe() results per pre-token
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the BPE merges to one pre-token.

        Returns the resulting symbols joined by single spaces. A token that
        has no symbol pairs is returned unchanged. Results of merged tokens
        are stored in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # pick the pair with the lowest merge rank; unknown pairs rank as +inf
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    # Only ValueError is caught so that unrelated errors
                    # (and KeyboardInterrupt) are no longer swallowed.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Convert a string into a list of vocabulary ids.

        The text is split with ``self.pat``; each piece is UTF-8 encoded,
        mapped byte by byte through ``self.byte_encoder``, merged with
        :meth:`bpe` and looked up in ``self.encoder``. A BPE token missing
        from ``self.encoder`` raises KeyError.
        """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Convert an iterable of vocabulary ids back into a string.

        Ids are mapped to token strings, the characters are mapped back to
        bytes through ``self.byte_decoder``, and the bytes are decoded as
        UTF-8 using the ``errors`` mode given at construction.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

def get_encoder(model_name, models_dir):
    """Load a model's vocabulary and BPE merge table and build an Encoder.

    Reads ``encoder.json`` and ``vocab.bpe`` from
    ``<models_dir>/<model_name>/``. Both files are opened as UTF-8 so the
    result does not depend on the platform's default encoding.

    Returns
    -------
    Encoder
        Built from the loaded vocabulary and merge rules.
    """
    model_dir = os.path.join(models_dir, model_name)
    with open(os.path.join(model_dir, 'encoder.json'), 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    # [1:-1] skips the first line and the last split element (presumably a
    # header line and the empty string after the trailing newline).
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

# From https://github.com/modelblocks/modelblocks-release/blob/master/resource-gpt2/scripts/model.py

In [None]:
def default_hparams_dict():
    """Return a new dict holding the default model hyperparameters.

    Keys: n_vocab, n_ctx, n_embd, n_head, n_layer. A fresh dict is built on
    every call, so callers may update it freely.
    """
    defaults = dict(
        n_vocab=0,
        n_ctx=1024,
        n_embd=768,
        n_head=12,
        n_layer=12,
    )
    return defaults
#    return HParams(
#        n_vocab=0,
#        n_ctx=1024,
#        n_embd=768,
#        n_head=12,
#        n_layer=12,
#    )

def shape_list(x):
    """Return the shape of x as a list, one entry per dimension.

    Dimensions that are known statically are returned as plain Python ints;
    dimensions that are unknown (None) are returned as the matching element
    of the dynamic tf.shape(x) tensor.
    """
    static_dims = x.shape.as_list()
    runtime_shape = tf.shape(x)
    dims = []
    for axis, size in enumerate(static_dims):
        dims.append(runtime_shape[axis] if size is None else size)
    return dims

def softmax(x, axis=-1):
    """Softmax of x along `axis`; the maximum is subtracted before exponentiating."""
    shifted = x - tf.reduce_max(x, axis=axis, keepdims=True)
    exponentials = tf.exp(shifted)
    normaliser = tf.reduce_sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normaliser

def gelu(x):
    """Tanh-based GELU: 0.5*x*(1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))."""
    tanh_arg = np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3))
    return 0.5*x*(1+tf.tanh(tanh_arg))

def norm(x, scope, *, axis=-1, epsilon=1e-5):
    """Normalize along `axis` to zero mean / unit variance, then scale and shift.

    The gain 'g' (initialised to 1) and offset 'b' (initialised to 0) are
    variables created under `scope`; both have the size of x's last dimension.
    """
    with tf.variable_scope(scope):
        width = x.shape[-1].value
        gain = tf.get_variable('g', [width], initializer=tf.constant_initializer(1))
        offset = tf.get_variable('b', [width], initializer=tf.constant_initializer(0))
        mean = tf.reduce_mean(x, axis=axis, keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keepdims=True)
        normalised = (x - mean) * tf.rsqrt(variance + epsilon)
        return normalised*gain + offset

def split_states(x, n):
    """Split the last dimension of x (size m) into two dimensions [n, m // n]."""
    *leading, last = shape_list(x)
    target_shape = leading + [n, last//n]
    return tf.reshape(x, target_shape)

def merge_states(x):
    """Collapse the last two dimensions of x into one of size a*b."""
    *leading, rows, cols = shape_list(x)
    target_shape = leading + [rows*cols]
    return tf.reshape(x, target_shape)

def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    """Affine map of the last dimension of x to `nf` features.

    Creates the variables 'w' (shape [1, nx, nf], normal init with
    `w_init_stdev`) and 'b' (shape [nf], zero init) under `scope`. x is
    flattened to 2-D for the matmul and the leading dimensions are restored
    afterwards.
    """
    with tf.variable_scope(scope):
        *leading, n_in = shape_list(x)
        weight = tf.get_variable('w', [1, n_in, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        bias = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        flat_input = tf.reshape(x, [-1, n_in])
        flat_weight = tf.reshape(weight, [-1, nf])
        flat_output = tf.matmul(flat_input, flat_weight)+bias
        return tf.reshape(flat_output, leading+[nf])

def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.

    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    The result has shape [nd, ns] and is cast to `dtype`.
    """
    row_idx = tf.range(nd)[:,None]
    col_idx = tf.range(ns)
    # entry (i, j) is kept when i >= j - ns + nd
    visible = row_idx >= col_idx - ns + nd
    return tf.cast(visible, dtype)


def attn(x, scope, n_state, *, past, hparams_dict):
    """Masked multi-head self-attention layer.

    Parameters
    ----------
    x : tensor of rank 3, [batch, sequence, features]
    scope : variable scope under which 'c_attn' and 'c_proj' are created
    n_state : feature width of the layer; must be divisible by
        hparams_dict["n_head"]
    past : None, or a rank-5 tensor [batch, 2, heads, sequence, features]
        holding previously computed keys and values
    hparams_dict : dict; only "n_head" is read here

    Returns
    -------
    (a, present) where `a` is the output of the 'c_proj' projection and
    `present` stacks the keys and values computed for x (before they are
    joined with `past`) along axis 1.
    """
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams_dict["n_head"] == 0
    if past is not None:
        assert past.shape.ndims == 5  # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]

    def split_heads(x):
        # From [batch, sequence, features] to [batch, heads, sequence, features]
        return tf.transpose(split_states(x, hparams_dict["n_head"]), [0, 2, 1, 3])

    def merge_heads(x):
        # Reverse of split_heads
        return merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        # keep w where the mask is 1, replace it by -1e10 where the mask is 0
        w = w*b - tf.cast(1e10, w.dtype)*(1-b)
        return w

    def multihead_attn(q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        # scale the scores by 1/sqrt(size of the last dimension of v)
        w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        a = tf.matmul(w, v)
        return a

    with tf.variable_scope(scope):
        # one projection produces queries, keys and values, split along the feature axis
        c = conv1d(x, 'c_attn', n_state*3)
        q, k, v = map(split_heads, tf.split(c, 3, axis=2))
        # `present` only holds the keys/values of the current input
        present = tf.stack([k, v], axis=1)
        if past is not None:
            # prepend the cached keys/values along the sequence axis
            pk, pv = tf.unstack(past, axis=1)
            k = tf.concat([pk, k], axis=-2)
            v = tf.concat([pv, v], axis=-2)
        a = multihead_attn(q, k, v)
        a = merge_heads(a)
        a = conv1d(a, 'c_proj', n_state)
        return a, present


def mlp(x, scope, n_state, *, hparams_dict):
    """Two-layer feed-forward block: 'c_fc' to n_state features, GELU, 'c_proj' back.

    The output has the same last-dimension size as x. `hparams_dict` is
    accepted but not read.
    """
    with tf.variable_scope(scope):
        width = x.shape[-1].value
        hidden = conv1d(x, 'c_fc', n_state)
        activated = gelu(hidden)
        return conv1d(activated, 'c_proj', width)


def block(x, scope, *, past, hparams_dict):
    """One transformer layer: attention and MLP, each applied to a normalised
    input and added back to x as a residual.

    Returns (x, present), where `present` is what attn() returns for this layer.
    """
    with tf.variable_scope(scope):
        width = x.shape[-1].value
        attn_input = norm(x, 'ln_1')
        attn_out, present = attn(attn_input, 'attn', width, past=past, hparams_dict=hparams_dict)
        x = x + attn_out
        mlp_input = norm(x, 'ln_2')
        mlp_out = mlp(mlp_input, 'mlp', width*4, hparams_dict=hparams_dict)
        x = x + mlp_out
        return x, present

def past_shape(*, hparams_dict, batch_size=None, sequence=None):
    """Shape of the key/value cache:
    [batch_size, n_layer, 2, n_head, sequence, n_embd // n_head]."""
    n_head = hparams_dict["n_head"]
    per_head_width = hparams_dict["n_embd"] // n_head
    return [batch_size, hparams_dict["n_layer"], 2, n_head, sequence, per_head_width]

def expand_tile(value, size):
    """Add a new leading axis of the given size by repeating `value`."""
    tensor = tf.convert_to_tensor(value, name='value')
    # repeat `size` times along the new axis, once along every existing axis
    multiples = [size] + [1]*tensor.shape.ndims
    return tf.tile(tf.expand_dims(tensor, axis=0), multiples)

def positions_for(tokens, past_length):
    """Position indices past_length, past_length+1, ... for each column of
    `tokens`, repeated once per row of `tokens`."""
    n_rows = tf.shape(tokens)[0]
    n_cols = tf.shape(tokens)[1]
    offsets = past_length + tf.range(n_cols)
    return expand_tile(offsets, n_rows)


def model(hparams_dict, X, past=None, scope='model', reuse=False):
    """Build the transformer language model graph.

    Parameters
    ----------
    hparams_dict : dict with "n_ctx", "n_embd", "n_vocab" and "n_layer"
        (plus whatever block()/attn() read from it)
    X : rank-2 tensor of token ids, [batch, sequence]
    past : None, or the cached keys/values of earlier tokens; it is
        unstacked along axis 1 into one entry per layer
    scope : variable scope name for all model variables
    reuse : passed to tf.variable_scope

    Returns
    -------
    dict with
        'present' : the per-layer `present` tensors stacked along axis 1
        'logits'  : [batch, sequence, n_vocab] scores, computed with the
                    token embedding matrix `wte` as the output projection
    """
    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        # position and token embedding tables
        wpe = tf.get_variable('wpe', [hparams_dict["n_ctx"], hparams_dict["n_embd"]],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams_dict["n_vocab"], hparams_dict["n_embd"]],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        # positions continue after the tokens already held in `past`
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams_dict["n_layer"]
        assert len(pasts) == hparams_dict["n_layer"]
        # NOTE: the loop variable reuses (shadows) the name of the `past` argument
        for layer, past in enumerate(pasts):
            h, present = block(h, 'h%d' % layer, past=past, hparams_dict=hparams_dict)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss.  Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams_dict["n_embd"]])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams_dict["n_vocab"]])
        results['logits'] = logits
        return results

# From https://raw.githubusercontent.com/modelblocks/modelblocks-release/master/resource-gpt2/scripts/surprisal.py

In [None]:
# maybe needs to be None?
BATCH_SIZE = 1

def get_per_subword_surprisal(*, corpus, hparams_dict, encoder):
    """Build the graph that scores every subword of `corpus` with the model.

    Parameters
    ----------
    corpus : list of int
        Subword ids of one window of text.
    hparams_dict : dict
        Model hyperparameters, forwarded to model() and past_shape().
    encoder : Encoder
        Only used to look up the id of '<|endoftext|>', which is fed to the
        model as the context for the first subword.

    Returns
    -------
    Tensor of shape [BATCH_SIZE, len(corpus)] holding, for each subword, the
    negative natural-log probability the model assigns to it given the
    start token and all preceding subwords of `corpus`.

    The code assumes BATCH_SIZE == 1 (only row 0 of the logits is read).
    """
    start_token = encoder.encoder['<|endoftext|>']
    context = tf.fill([BATCH_SIZE, 1], start_token)

    def step(hparams_dict, tokens, past=None):
        # one forward pass over `tokens`, reusing the keys/values in `past`
        lm_output = model(hparams_dict=hparams_dict, X=tokens, past=past, reuse=tf.AUTO_REUSE)
        logits = lm_output['logits'][:, :, :hparams_dict["n_vocab"]]
        presents = lm_output['present']
        presents.set_shape(past_shape(hparams_dict=hparams_dict, batch_size=BATCH_SIZE))
        return {
            'logits': logits,
            'presents': presents,
        }

    with tf.name_scope('get_per_word_surprisal'):
        # word is a list of integers (encoded chunks)
        def body(corpus, i, past, prev, surprisals):
            # chunk should be a scalar here
            chunk = corpus[i]
            next_outputs = step(hparams_dict, prev, past=past)
            # dimension is batch_size x vocab_size
            logits = next_outputs['logits'][:, -1, :]
            # log_softmax is used instead of log(softmax(...)): the latter
            # underflows to log(0) = -inf for very unlikely subwords.
            log_probs = tf.nn.log_softmax(logits)
            surp = tf.math.scalar_mul(-1, log_probs[0, chunk])
            # TODO assuming here that batch size is 1.
            # find a better solution
            return [
                corpus,
                tf.add(i, 1),
                next_outputs['presents'] if past is None else tf.concat([past, next_outputs['presents']], axis=-2),
                tf.reshape(chunk, [1, 1]),
                tf.concat([surprisals, tf.reshape(surp, [1, 1])], axis=1),
            ]

        corpus = tf.constant(corpus)
        i = tf.constant(0)
        # The first step is run outside the loop: it has no `past` yet and is
        # fed the start token, and it produces the initial loop variables.
        corpus, i, past, prev, surprisals = body(corpus, i, None, context, tf.constant([[]]))

        corpus_length = corpus.shape[0].value

        def cond(corpus, i, past, prev, surprisals):
            return tf.less(i, corpus_length)

        _, _, _, _, surprisals = tf.while_loop(
            cond=cond, body=body,
            loop_vars=[corpus, i, past, prev, surprisals],
            # `past` and `surprisals` grow by one position per iteration, so
            # their growing dimension is left unspecified (None)
            shape_invariants=[
                corpus.get_shape(),
                i.get_shape(),
                tf.TensorShape(past_shape(hparams_dict=hparams_dict, batch_size=BATCH_SIZE)),
                tf.TensorShape([BATCH_SIZE, None]),
                tf.TensorShape([BATCH_SIZE, None]),
            ],
            back_prop=False,
        )

        return surprisals

# From https://github.com/modelblocks/modelblocks-release/blob/master/resource-gpt2/scripts/per_word_surprisal.py

In [None]:
BATCH_SIZE = 1
DELIMITER = '!ARTICLE'


def get_delimited_texts(lineitems_fn):
    '''Returns the texts in the delim.lineitems file. Each text
    is preceded by a line with a special delimiter token. Output is a
    list of strings, where each string is one complete text: the stripped
    lines of that text joined by single spaces.

    Raises AssertionError if the first line of the file is not the
    delimiter.'''
    texts = list()

    # `with` closes the file even when the delimiter check below fails
    with open(lineitems_fn) as f:
        firstline = f.readline()
        assert firstline.strip() == DELIMITER

        curr_text = list()
        for l in f:
            sentence = l.strip()
            if sentence == DELIMITER:
                # a new delimiter ends the text collected so far
                texts.append(' '.join(curr_text))
                curr_text = list()
            else:
                curr_text.append(sentence)
    # don't forget the last text
    texts.append(' '.join(curr_text))

    return texts


def get_subword_windows(encoded_text, context_size):
    '''Splits encoded_text, a list of subword IDs, into a list of
    overlapping windows of subwords, each of length <= context_size. These
    windows are what get fed into GPT-2.

    Consecutive windows start context_size/2 subwords apart. The input
    list is not modified. An empty input yields a single empty window.

    Raises ValueError if context_size is odd or not positive (a
    non-positive size would otherwise never shrink the remaining text and
    loop forever).'''

    # Need an even context_size for splitting the subwords into windows
    if context_size % 2 == 1:
        raise ValueError("context_size must be an even number")
    if context_size <= 0:
        raise ValueError("context_size must be a positive number")
    # size of overlap between windows is context_size/2
    stride = context_size // 2
    windows = list()
    start = 0
    while len(encoded_text) - start > context_size:
        windows.append(encoded_text[start:start + context_size])
        start += stride
    # whatever is left (at most context_size subwords) is the last window
    windows.append(encoded_text[start:])
    return windows


def combine_window_surprisals(window_surprisals, context_size):
    '''Given the surprisal measurements for a list of overlapping windows,
    returns the recombined list of subword surprisals.

    The result is a new list; the lists inside window_surprisals are left
    untouched. An empty window_surprisals gives an empty list.'''
    if not window_surprisals:
        return []
    overlap = context_size // 2
    # use all subword surprisals from the first window (copied, so the
    # caller's list is not extended in place). For subsequent
    # windows, the first context_size/2 terms will overlap with the
    # previous window, so throw them out
    subword_surprisals = list(window_surprisals[0])
    for w in window_surprisals[1:]:
        subword_surprisals.extend(w[overlap:])
    return subword_surprisals


def get_per_char_surprisal(subwords, subword_surps):
    """Spread each subword's surprisal evenly over its characters.

    subword_surps[i] is the surprisal for subwords[i]; both sequences must
    have the same length. Returns (chars, char_surps): the characters of all
    subwords in order, and for each character its subword's surprisal
    divided by the subword's length.
    """
    assert len(subwords) == len(subword_surps)
    chars = []
    char_surps = []
    for piece, piece_surp in zip(subwords, subword_surps):
        share = piece_surp/len(piece)
        chars.extend(piece)
        char_surps.extend([share] * len(piece))
    assert len(chars) == len(char_surps)
    return chars, char_surps


def roll_subword_surprisal(subwords, subword_surps, words, line_idx):
    """Sum per-subword surprisals into one surprisal per whitespace-delimited word.

    Parameters
    ----------
    subwords : list of str
        Decoded subwords of the text, in order.
    subword_surps : list of float
        subword_surps[i] is the surprisal of subwords[i].
    words : list of str
        The words of the same text; their characters must appear in the
        subwords in the same order, separated only by whitespace.
    line_idx : int
        Index of the text, used only in the error message.

    Returns
    -------
    list with one surprisal per word. Whitespace that precedes a word is
    counted towards that word.

    Raises
    ------
    AssertionError
        If the characters of `words` do not line up with the characters of
        `subwords` (including when the subwords run out early).
    """
    chars, char_surps = get_per_char_surprisal(subwords, subword_surps)
    n_chars = len(chars)
    word_surps = list()
    char_index = 0
    for w in words:
        word_surp = 0
        # roll surprisals from spaces before the word into the word
        while char_index < n_chars and chars[char_index].isspace():
            word_surp += char_surps[char_index]
            char_index += 1
        for c in w:
            # The bounds check prevents comparing against a stale character
            # once every subword character has been consumed.
            assert char_index < n_chars and c == chars[char_index], (
                'text {}: character {!r} of word {!r} does not match subword '
                'character {!r} at position {}'.format(
                    line_idx, c, w,
                    chars[char_index] if char_index < n_chars else '<end of subwords>',
                    char_index))
            word_surp += char_surps[char_index]
            char_index += 1
        word_surps.append(word_surp)
    return word_surps


def get_surprisal(
    texts,
    model_name='124M',
    models_dir='models',
    context_size=1024
):
    """Compute per-word surprisals for each text.

    Parameters
    ----------
    texts : list of str
        One string per text.
    model_name : str
        Name of the model sub-folder inside `models_dir`.
    models_dir : str
        Folder containing the model sub-folders; `~` and environment
        variables are expanded.
    context_size : int
        Maximum number of subwords per window; longer texts are split into
        overlapping windows (see get_subword_windows).

    Returns
    -------
    (all_word_surprisals, all_words) : two lists with one entry per text;
    all_words[i] is text i split on whitespace and all_word_surprisals[i]
    holds the surprisal of each of those words.

    A new graph and session are created, and the checkpoint restored, for
    every window, so run time grows with the number of windows.
    """
    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    model_dir = os.path.join(models_dir, model_name)
    hparams_dict = default_hparams_dict()
    with open(os.path.join(model_dir, 'hparams.json')) as f:
        override_params = json.load(f)
    hparams_dict.update(override_params)

    # The encoder and the checkpoint path do not depend on the text, so load
    # them once instead of once per text / per window. Reusing one encoder
    # also keeps its BPE cache warm across texts.
    enc = get_encoder(model_name, models_dir)
    ckpt = tf.train.latest_checkpoint(model_dir)

    all_words = list()
    all_word_surprisals = list()
    for line_idx, text in enumerate(texts):
        print('line_idx', line_idx)

        # get sequence of subword encodings
        enc_text = enc.encode(text)
        subwords = [enc.decode([x]) for x in enc_text]
        words = text.split()

        # split subword sequence into windows of size <= context_size
        windows = get_subword_windows(enc_text, context_size)

        # feed each window into GPT-2, get per-subword surprisals
        window_surprisals = list()
        for window in windows:
            # fresh graph per window: the window is baked into the graph as a constant
            with tf.Session(graph=tf.Graph()) as sess:

                output = get_per_subword_surprisal(
                    corpus=window, hparams_dict=hparams_dict,
                    encoder=enc
                )

                saver = tf.train.Saver()
                saver.restore(sess, ckpt)

                # out has dimension batch_size x num_subwords
                # containing per-subword surprisals
                assert BATCH_SIZE == 1
                out = sess.run(output)
                surps = list(out[0])
            window_surprisals.append(surps)
        subword_surprisals = combine_window_surprisals(
                                 window_surprisals, context_size)

        # "roll" surprisals together to get per-word surprisal
        word_surprisals = roll_subword_surprisal(
                              subwords, subword_surprisals, words, line_idx)

        all_words.append(words)
        all_word_surprisals.append(word_surprisals)

    return all_word_surprisals, all_words


def per_word_surprisal(
    lineitems,
    model_name='124M',
    models_dir="/kaggle/input/gpt2simple-all-models-not-finetuned/models",
    context_size=1024
):
    """
    Use GPT-2 to calculate per-word surprisal for a provided lineitems file

    Parameters
    ----------
    lineitems :
        path to delimited lineitems file (*.delim.lineitems)

    model_name :
        which model to use

    models_dir :
        path to parent folder containing model subfolders
        (i.e. contains the <model_name> folder)

    context_size :
        the maximum context size allowed by the model (n_ctx).
        If the length of the input text exceeds context_size, the text is
        split into overlapping windows to calculate surprisal

    Returns
    -------
    The result of get_surprisal for the texts in the file: a pair
    (all_word_surprisals, all_words) with one entry per text.
    """

    # prepare each text as a single string to feed into GPT-2
    texts = get_delimited_texts(lineitems)

    # Forward the caller's arguments; previously the default values were
    # hard-coded here, so model_name, models_dir and context_size were ignored.
    return get_surprisal(
        texts=texts,
        model_name=model_name,
        models_dir=models_dir,
        context_size=context_size)

    

In [None]:
# Full natural stories dataset (~10K words) -- takes 30-60 minutes
#lineitems = "/kaggle/input/d/ceclark/gpt2surp/naturalstories.delim.lineitems"
# (The small 1-sentence example, which takes less than a minute, is the
# commented-out `lineitems` path in the next cell.)
# Parent folder that contains one sub-folder per model
models_dir = "/kaggle/input/gpt2simple-all-models-not-finetuned/models"
# Name of the model sub-folder inside models_dir to load
model_name = "124M"

In [None]:
# lineitems = "/kaggle/input/d/ceclark/gpt2surp/naturalstories.mini.delim.lineitems"

# surps, sents = per_word_surprisal(lineitems, model_name, models_dir)

# for i, sent in enumerate(sents):
#     for j, word in enumerate(sent):
#         print(word, round(surps[i][j], 4), sep='\t')

In [None]:
# texts = [
#     'Earth is the planet we live on. It is the third planet from the Sun. It is the only planet known to have life on it. The Earth formed around 4.5 billion years ago. It is one of four rocky planets on the inside of the Solar System. The other three are Mercury, Venus and Mars. The large mass of the Sun makes the Earth move around it, just as the mass of the Earth makes the Moon move around it. The Earth also turns round in space, so different parts face the Sun at different times. The Earth goes around the Sun once (one "year") for every 365 times it turns all the way around (one "day"). The Moon goes around the Earth about every 27 days. As the Earth goes round the Sun at the same time, the changing light of the Moon takes about 29 days to go from dark to bright to dark again. That is where the idea of "month" came from. However, now most months have 30 or 31 days so they fit into one year.'
# ]
# surps, text_lines = get_surprisal(texts, 
#     model_name, models_dir)

# print(surps)
# print(text_lines)

In [None]:

import pandas as pd
import numpy as np

import unicodedata

# data = [{'name': 'saylı'}, {'name': 'öhdəliyi'}, {'name': 'said—\n‘I say—about that charm—Jane—come out. We ought to talk about it, a'}]
# df = pd.DataFrame.from_dict(data, orient='columns')
# df['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())

# Only the first 1000 rows of the training file are read (nrows=1000);
# the test file is read in full.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv', nrows=1000)
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')



In [None]:
# train['excerpt'] = train['excerpt'].str.replace('\n', ' ')
# train['excerpt'] = train['excerpt'].str.replace('‘', "'")

# Reduce every excerpt to plain ASCII: NFKD-decompose it, then drop every
# character that cannot be encoded as ASCII (presumably so that the
# character-by-character alignment between words and decoded subwords in
# roll_subword_surprisal holds -- TODO confirm).
train['text'] = train['excerpt'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())


# print(train.iloc[338]['excerpt'])

train_texts = train['text'].values.tolist()

# Runs the model over every excerpt; `surps` holds one list of per-word
# surprisals per excerpt and `text_lines` the matching lists of words.
surps, text_lines = get_surprisal(train_texts, model_name, models_dir)

# One score per excerpt: the mean of its per-word surprisals
train_surps = [np.mean(x) for x in surps]

train['surprisal'] = train_surps

# Save the frame, including the new 'text' and 'surprisal' columns
train.to_csv('train_surp.csv')


In [None]:
# train.iloc[18]['excerpt']

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Joint plot with a regression fit of mean surprisal against the 'target' column
sns.jointplot(data=train, x='target', y='surprisal', kind='reg', height=10)
plt.show()
