In [None]:
import os
import tensorflow as tf
import pandas as pd
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import time


# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.seed``)."""

    # Seed handed to the random number generator set-up.
    seed = 34

    # Size of the small training subset used for the demonstration run.
    nof_rows = 10

    # Pretrained checkpoint to load; moving to a larger model helps -> e.g. "gpt2-large".
    model_name = "gpt2"

    # For the sake of a speedy demonstration, keep one possible output per input.
    # An obvious extension is generating more candidates and selecting the one
    # most similar (in readability score) to the input.
    nof_outputs = 1

    # Presumably a length cap; not referenced by the cells visible here -- TODO confirm.
    MAX_LEN = 70

# Seed TensorFlow's global random generator with the configured value.
tf.random.set_seed(CFG.seed)

# Functions

In [None]:
# quick score for comparison
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable,
    a trailing ``e`` is treated as silent, and any non-empty input is credited
    with at least one syllable.

    Args:
        word: Text to analyse; matching is case-insensitive.

    Returns:
        int: Estimated syllable count, or ``0`` for an empty string.
    """
    # Lower-case first so that capitalised vowels ("Idea") are recognised.
    word = word.lower()
    if not word:
        # Nothing to count; also avoids indexing into an empty string.
        return 0

    vowels = "aeiouy"
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a consecutive run opens a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # The silent-e adjustment applies once per word, not once per vowel group.
    if word.endswith("e"):
        count -= 1

    # Every non-empty word has at least one syllable.
    return max(count, 1)

def fleisch(passage):
    """Return a quick, approximate reading-ease score for ``passage``.

    The score is ``206 - words_per_sentence - 84 * syllables_per_word``, which
    appears to be a rounded variant of the Flesch Reading Ease formula. It is
    meant for comparing texts with each other, not as a calibrated measure.

    Args:
        passage: Text to score.

    Returns:
        float: The score; higher means fewer words per sentence and fewer
        syllables per word. An empty passage yields a finite value.
    """
    # Split on any whitespace run (newlines included) and drop surrounding
    # punctuation so tokens such as "made." are still seen as ending in "e".
    tokens = (token.strip(".,;:!?\"'()[]") for token in passage.split())
    words = [token for token in tokens if token]
    nof_words = len(words)

    # Count every sentence terminator, not just full stops.
    nof_sent = sum(passage.count(mark) for mark in ".!?")

    # Syllables are estimated word by word: syllable_count applies per-word
    # rules, so feeding it the whole passage at once gives meaningless totals.
    nof_syl = sum(syllable_count(word) for word in words)

    # The 0.1 smoothing keeps the ratio finite when no terminator is present.
    words_per_sentence = (0.1 + nof_words) / (0.1 + nof_sent)
    # Guard the denominator so a passage without words does not divide by zero.
    syllables_per_word = nof_syl / max(nof_words, 1)

    return 206 - words_per_sentence - 84 * syllables_per_word

def generate_paragraph(xinput):
    """Sample a continuation of ``xinput`` of roughly the same length.

    Relies on the module-level ``tokenizer`` and ``GPT2`` objects. The prompt
    is encoded, about as many new tokens as the prompt contains are sampled
    (top-k / top-p sampling with temperature), and only the newly generated
    part is returned.

    Args:
        xinput: Prompt text.

    Returns:
        str: The generated continuation without the prompt, truncated to at
        most ``len(xinput)`` characters; ``''`` when ``xinput`` is blank.
    """
    lix = len(xinput)
    if not xinput.strip():
        # Nothing to condition the model on.
        return ''

    input_ids = tokenizer.encode(xinput, return_tensors='tf')

    # generate() measures max_length in tokens (prompt included), so the budget
    # is computed from the token count rather than the character count.
    max_positions = GPT2.config.n_positions
    if input_ids.shape[1] >= max_positions:
        # Keep the tail of an over-long prompt so there is room left to generate.
        input_ids = input_ids[:, -(max_positions // 2):]
    n_prompt = input_ids.shape[1]
    max_length = min(2 * n_prompt, max_positions)

    # num_return_sequences=CFG.nof_outputs could be passed here to sample
    # several candidates per prompt.
    sample_outputs = GPT2.generate(input_ids,
                                   do_sample=True,
                                   max_length=max_length,
                                   temperature=.85,
                                   top_k=50,
                                   top_p=0.85)

    # Drop the prompt by token position; slicing the decoded text by character
    # offset breaks whenever decoding does not reproduce the prompt verbatim.
    new_tokens = sample_outputs[0][n_prompt:]
    xoutput = tokenizer.decode(new_tokens, skip_special_tokens=True)[:lix]

    return xoutput

# Model

In [None]:
# Tokenizer for the configured checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(CFG.model_name)

# TensorFlow language-model head for the same checkpoint; the tokenizer's
# end-of-sequence id is passed as the padding id.
GPT2 = TFGPT2LMHeadModel.from_pretrained(
    CFG.model_name,
    pad_token_id=tokenizer.eos_token_id,
)


In [None]:
# Read the training data CSV into a DataFrame.
xtrain = pd.read_csv(
    '../input/commonlitreadabilityprize/train.csv',
    encoding='utf8',
)


In [None]:
# # Disabled experiment: process a single paragraph sentence by sentence (horribly slow at the moment)

# xinput = xtrain['excerpt'][0]

# xoutput = ''

# nof_sentences = len(xinput.split('.'))

# for ii in range(nof_sentences):

#     input_sequence = xinput.split('.')[ii]

#     len_inp = len(input_sequence)

#     input_ids = tokenizer.encode(input_sequence, return_tensors='tf')

#     len_inp = len(input_sequence)

#     # only process non-empty sentences
#     if len_inp:
#         ## topk + top p
#         sample_outputs = GPT2.generate(input_ids, do_sample = True,  max_length = 2*len_inp, temperature = .7, top_k = 50, top_p = 0.85, num_return_sequences = 3)

#         print('input: ' + input_sequence)

#         fin_score = 10 ** 10
#         fin_output = ''

#         for i, sample_output in enumerate(sample_outputs):
#             outtext = tokenizer.decode(sample_output, skip_special_tokens = True)[(len_inp + 1): (2 * len_inp  + 1) ]
#             dist = np.abs(fleisch(input_sequence) - fleisch(outtext))
#             if dist < fin_score:
#                 fin_score = dist
#                 fin_output = outtext
#             print('')

#         print('output: ' + fin_output)
#         xoutput += '. ' + fin_output
#         print('---')

In [None]:
# Time the augmentation pass.
start_time = time.time()

# For testing purposes, run on a small subset: exactly the first CFG.nof_rows rows.
# Positional .iloc is used because label-based .loc[0:n] includes both endpoints
# and would return n + 1 rows. The explicit copy makes xaug an independent frame,
# so adding a column does not write into a slice of xtrain.
xaug = xtrain.iloc[:CFG.nof_rows].copy()

# Generate one continuation per excerpt.
xaug['new_xrc'] = xaug['excerpt'].apply(generate_paragraph)

print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Write the augmented subset to disk, leaving out the index column.
xaug.to_csv(
    'train_augmented.csv',
    index=False,
)