In [None]:
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install sentencepiece

# 'UN-summarizing' Keywords to Generate Sample Sentences 

### Experiments in generating coherent sentences from keywords.

### This notebook applies previous approaches to T5-large with more samples.

# Preprocess Data

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
df = pd.read_csv('/content/drive/MyDrive/NLP_stuff/keyword_to_text.csv')

In [None]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,keyword,text
82212,82212,impervious,"But since I knew that witch of a girl, St. Ant..."
46702,46702,then,It then was about ten o’clock in the morning.
71411,71411,shall,And I shall do as I please.
18562,18562,liberty,They had brought away nothing but their own hi...
40868,40868,savant,"I can’t go before, because I am going to a rec..."
72236,72236,gossip,"This, according to gossip, was the proclamatio..."
58046,58046,busily,"In a corner of the hut sat a young man, a mino..."
84149,84149,prescription,"He wrote a prescription, and with pleasant wor..."
19732,19732,saliva,Bees and wasps use this saliva in building the...
59605,59605,into,"At once she turned into a donkey, dropped the ..."


In [None]:
# Draw the 40,000-row working subset with a fixed seed so the train/validate/test
# files (and therefore the fine-tuned model) can be reproduced on a re-run.
dff = df.sample(40000, random_state=42).copy()
dff.head()

Unnamed: 0.1,Unnamed: 0,keyword,text
32198,32198,crook,In this story you will read of the further adv...
41773,41773,misunderstanding,The account of it which the bereaved mother ga...
23102,23102,straight,Instead of standing straight and tall like a s...
34280,34280,reckoning,When re-ordering or reckoning up the duration ...
37428,37428,hearty,An hour later I had packed my kit and was read...


In [None]:
# Shuffle once with a fixed seed, then cut 60% / 20% / 20% by position.
# Plain iloc slices give the same partitions as np.split(frame, [i, j]) without
# routing a DataFrame through NumPy's array-splitting machinery.
shuffled = dff.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
validate_end = int(.8 * len(shuffled))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [None]:
print(len(train),len(test),len(validate))

24000 8000 8000


In [None]:
# index=False: the row index carries no information for training, and writing it
# adds yet another "Unnamed: 0"-style column each time the data is re-read.
train.to_csv('english_books_train.csv', index=False)
validate.to_csv('english_books_validate.csv', index=False)
test.to_csv('english_books_test.csv', index=False)

Load to Hugging Face Dataset Format

In [None]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': '/content/english_books_train.csv', 'test': '/content/english_books_test.csv', 'validation':'/content/english_books_validate.csv'})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0aa2e6c99c245548/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0aa2e6c99c245548/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset['train'][221]

{'Unnamed: 0': 77168,
 'Unnamed: 0.1': 77168,
 'keyword': 'anywhere',
 'text': 'It was a terrible labor for Indian to look anywhere from his present position, because, as Dewey explained, he had to see over his stomach.'}

### Original Model

In [None]:
import random

# Sort the distinct keywords before sampling: iteration order of a set of strings
# can change between interpreter runs, which would defeat the seed below.
unique_keywords = sorted(df['keyword'].dropna().unique())
print(len(unique_keywords))

# Seeded sampling WITHOUT replacement, so the 50 evaluation keywords are distinct
# and identical on every run (random.choices may return the same word twice).
word_list = random.Random(42).sample(unique_keywords, k=50)

15131


In [None]:
for word in word_list:
  print(word)

peaceable
wealthy
continuation
brokenly
wrangler
mica
detector
impartially
pervading
predilection
suffocating
resolute
wetting
misrepresentation
discrimination
language
kidnap
terrain
brilliant
passive
firmness
libelous
overexertion
paleness
blamed
recuperate
unavailable
specimen
smile
mess
foreshadow
avalanche
collapsed
mead
vehemently
maternal
engaged
delineation
darkened
crop
rhetorical
playfully
squeamish
tinder
fiend
absentminded
tangible
nomadic
nibble
querulously


In [None]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer

In [None]:
model_name = "t5-base"

tokenizer, model = T5Tokenizer.from_pretrained(model_name), T5ForConditionalGeneration.from_pretrained(model_name)

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
def generate_samples_from_list(word_list, model, tokenizer, min_length=20,
                               num_beams=5, repetition_penalty=10.0,
                               length_penalty=1.0, prompt_prefix="summarize: "):
    """Generate one piece of text per keyword with a seq2seq model.

    Each keyword is prefixed with ``prompt_prefix``, tokenized, and passed to
    ``model.generate`` using beam search; the first returned sequence is decoded.

    Args:
        word_list: Iterable of keyword strings.
        model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
            If it has a ``device`` attribute, inputs are moved to that device.
        tokenizer: Callable returning ``input_ids`` / ``attention_mask`` for a
            string, and exposing ``decode``.
        min_length: Minimum generated length, forwarded to ``generate``.
        num_beams: Beam width, forwarded to ``generate``.
        repetition_penalty: Forwarded to ``generate``.
        length_penalty: Forwarded to ``generate``.
        prompt_prefix: Text prepended to every keyword before tokenization.

    Returns:
        pandas.DataFrame with integer column labels ``0`` (keyword) and ``1``
        (generated text), one row per keyword. The columns are present even when
        ``word_list`` is empty.
    """
    # Build inputs on the same device as the model so this also works after the
    # model has been moved to an accelerator; fall back to CPU otherwise.
    device = getattr(model, "device", "cpu")

    rows = []
    for word in tqdm(word_list):
        encoded_input = tokenizer(prompt_prefix + word)
        # unsqueeze(0) adds the batch dimension for a single example.
        input_ids = torch.LongTensor(encoded_input['input_ids']).unsqueeze(0).to(device)
        attention_mask = torch.LongTensor(encoded_input['attention_mask']).unsqueeze(0).to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                min_length=min_length,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                early_stopping=True,
            )

        # Only the first returned sequence is kept, so only that one is decoded.
        text = tokenizer.decode(
            generated_ids[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        rows.append([word, text])

    # Explicit column labels keep frame[1] valid even for an empty word_list.
    return pd.DataFrame(rows, columns=[0, 1])

In [None]:
sample_df = generate_samples_from_list(word_list, model, tokenizer)

100%|██████████| 50/50 [01:16<00:00,  1.52s/it]


In [None]:
sample_df

Unnamed: 0,0,1
0,settle,"settle on a fee of up to $2,000 for each day s..."
1,lightening,Lightening is a term used to describe the proc...
2,mournful,mournful to hear of the loss of a loved one or...
3,suspicious,"Surveillant, l'investigation est effectuée sur..."
4,unresisting,"unresisting and awe-inspiring, the new york times"
5,fleetingly,fleetingly: frenziedly: fleetingly: fleetingly...
6,whare,"whareland: whareland, whareland and whareland"
7,surrender,surrendered to the u.s. supreme court on wedne...
8,fervent,"fervent, unflinchingly sweet and full of life...."
9,handy,Handyman: a handyman’s dictionary is here to h...


In [None]:
sample_df.to_csv('/content/drive/MyDrive/NLP_stuff/sample_df.csv')

### There are some coherent sentence fragments here, but the output is largely repetitive words. Do larger models work better?

In [None]:
model_name = "t5-large"

tokenizer, model = T5Tokenizer.from_pretrained(model_name), T5ForConditionalGeneration.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
sample_df_2 = generate_samples_from_list(word_list, model, tokenizer)

100%|██████████| 50/50 [04:09<00:00,  5.00s/it]


In [None]:
sample_df_2

Unnamed: 0,0,1
0,settle,"settle for less than a dollar per minute, and ..."
1,lightening,"lightening has arrived in the philippines, and..."
2,mournful,"mournful katie brennan, who died last year at ..."
3,suspicious,"i'm not sure what to do next, but it would be ..."
4,unresisting,unresisting idaho girl gets her way to the top of
5,fleetingly,"fleetingly, i had the pleasure of meeting you ..."
6,whare,whare is the most important thing to consider ...
7,surrender,sen. marco rubio calls for an end to the capit...
8,fervent,fervent fan of sportsmanship and the great out...
9,handy,handy to have on hand for when you're out and ...


In [None]:
sample_df_2.to_csv('/content/drive/MyDrive/NLP_stuff/sample_df_2.csv')

In [None]:
### This is the model that was finetuned on less data.

model_name = "caffsean/t5-base-finetuned-keyword-to-text-generation"

tokenizer, model = T5Tokenizer.from_pretrained(model_name), T5ForConditionalGeneration.from_pretrained(model_name)

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
sample_df_3 = generate_samples_from_list(word_list, model, tokenizer)

100%|██████████| 50/50 [01:16<00:00,  1.54s/it]


In [None]:
sample_df_3

Unnamed: 0,0,1
0,settle,"Then he had to settle down with his wife, who ..."
1,lightening,"The lightening of the air was a good thing, as..."
2,mournful,"He was mournful, but he did not know what to d..."
3,suspicious,He was suspicious of the way he had been treat...
4,unresisting,"He was unresisting, and he could not be more p..."
5,fleetingly,"He shook his head fleetingly, and began to cry..."
6,whare,The whares of the sea were a few hundred yards...
7,surrender,"Then he surrendered to his father, who was in ..."
8,fervent,He was fervent in his desire to make the most ...
9,handy,It’s a handy thing to do when you are out and ...


In [None]:
sample_df_3.to_csv('/content/drive/MyDrive/NLP_stuff/sample_df_3.csv')

## The results are pretty good so far. Let's see if we can improve them with further fine-tuning.

# Fine-Tuning with Hugging Face

### This time we're using T5-large with 24,000 training samples (the 60% training split of the 40,000 sampled rows)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!apt install git-lfs
!git config --global credential.helper store

In [None]:
import transformers

print(transformers.__version__)

4.21.2


In [None]:
from datasets import load_metric

metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
model_checkpoint = 't5-large'

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer("Sample of text")

{'input_ids': [12474, 13, 1499, 1], 'attention_mask': [1, 1, 1, 1]}

In [None]:
max_input_length = 512
max_target_length = 128


def preprocess(examples):
    """Tokenize a batch of keyword/sentence pairs for seq2seq training.

    ``examples`` is a mapping with a ``"keyword"`` list and a ``"text"`` list.
    Every keyword is prefixed with ``"summarize: "`` and tokenized as the model
    input (truncated to ``max_input_length``); the sentences are tokenized as
    targets (truncated to ``max_target_length``) and their ``input_ids`` are
    stored under ``"labels"`` in the returned encoding.
    """
    prompts = []
    for keyword in examples["keyword"]:
        prompts.append("summarize: " + keyword)

    batch = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Encode the target sentences with the tokenizer in target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["text"],
            max_length=max_target_length,
            truncation=True,
        )

    batch["labels"] = target_encoding["input_ids"]
    return batch

In [None]:
preprocess(dataset['train'][4:6])

{'input_ids': [[21603, 10, 3, 10666, 9889, 1], [21603, 10, 10543, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[621, 3, 9, 3, 10666, 9889, 6179, 3, 88, 877, 139, 8, 2851, 18, 3082, 11, 3, 7, 144, 323, 44, 8, 953, 6, 68, 8, 6398, 11, 5949, 13, 542, 28495, 15, 26, 376, 5, 1], [71, 2968, 768, 10543, 2650, 27969, 2158, 524, 13668, 47, 840, 44, 350, 15432, 51, 9184, 15, 6, 3, 9, 874, 18, 6890, 1262, 45, 1546, 3225, 5472, 1468, 402, 6, 8, 422, 3309, 666, 8, 13243, 5, 1]]}

In [None]:
tokenized_data = dataset.map(preprocess, batched=True)

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
tokenized_data['train']['labels'][0]

[1029,
 48,
 833,
 8,
 4151,
 16829,
 141,
 2496,
 6786,
 18,
 21182,
 2286,
 6,
 6237,
 21,
 334,
 6178,
 13,
 1591,
 11,
 3,
 29297,
 9425,
 14637,
 5,
 1]

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Turn (predictions, labels) token ids into ROUGE scores plus mean length.

    Both id arrays are decoded to text, split into one sentence per line, and
    scored with the module-level ``metric``. Each score's mid F-measure is
    multiplied by 10, and the mean count of non-padding tokens per prediction
    is added as ``"gen_len"``. All values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    def _one_sentence_per_line(texts):
        # The ROUGE scorer expects sentences separated by newlines.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 entries in the labels cannot be decoded; substitute the pad token id.
    label_ids = np.where(labels != -100, labels, pad_id)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    rouge = metric.compute(
        predictions=_one_sentence_per_line(pred_texts),
        references=_one_sentence_per_line(label_texts),
        use_stemmer=True,
    )

    # Scores are reported on a 0-10 scale here (a factor of 100 is the more
    # common choice); this only changes the displayed numbers.
    scores = {}
    for name, aggregate in rouge.items():
        scores[name] = aggregate.mid.fmeasure * 10

    # Mean number of non-padding tokens per generated sequence.
    scores["gen_len"] = np.mean(
        [np.count_nonzero(pred != pad_id) for pred in predictions]
    )

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
batch_size = 8  # reduced from 16 to 8 to address a memory issue

# Base name of the checkpoint (the part after any "org/" prefix). Using it for
# the output directory keeps that directory a single path component even when
# the checkpoint id contains a slash.
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetune-keyword-to-text-generation",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=False,  # fp16 was switched off because of conversion problems
    push_to_hub=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM

custom_model = "caffsean/t5-large-finetune-keyword-to-text-generation"

In [None]:
model = T5ForConditionalGeneration.from_pretrained(custom_model)

Downloading config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

In [None]:
# Reload the running comparison table and reuse its keywords, so the new model
# is evaluated on the same words as the results already stored in the table.
comp_df = pd.read_csv('/content/drive/MyDrive/NLP_stuff/text_gen_comparison.csv')
word_list = comp_df['keyword'].tolist()

In [None]:
sample_df_5 = generate_samples_from_list(word_list, model, tokenizer)

100%|██████████| 50/50 [03:46<00:00,  4.54s/it]


In [None]:
comp_df['result_finetuned_large_plus'] = sample_df_5[1]

In [None]:
# index=False: this file is read back with pd.read_csv and re-saved; writing the
# index would prepend one more "Unnamed: ..." column on every round trip.
comp_df.to_csv('/content/drive/MyDrive/NLP_stuff/text_gen_comparison.csv', index=False)

# Conclusion

### We can see that as the model size and the amount of fine-tuning increase, the generated outputs become increasingly sentence-like. I'll be experimenting with text-generation parameters such as beam search and temperature to further refine the quality of the output.

### NOTE: Interestingly, we do see a large male bias in this output. Perhaps that relates to the fact that the training data was derived from public domain literature that is often quite antiquated. Does this cause a male bias in the results? How could we research this and/or ameliorate this problem?

In [None]:
# Display label -> comp_df column holding that model's generated text.
model_columns = [
    ("t5_base", "result_t5_base"),
    ("t5_large", "result_t5_large"),
    ("finetuned-base", "result_finetuned_base"),
    ("finetuned-large", "result_finetuned_large"),
    ("finetuned-large-plus", "result_finetuned_large_plus"),
]

# One printed block per keyword: the keyword, then each model's output on its own line.
for row_pos in range(len(comp_df)):
    lines = [f"{comp_df['keyword'].iloc[row_pos]}: \n"]
    for label, column in model_columns:
        lines.append(f" {label}: {comp_df[column].iloc[row_pos]} \n")
    print("".join(lines))

settle: 
 t5_base: settle on a fee of up to $2,000 for each day spent in the United States. 
 t5_large: settle for less than a dollar per minute, and you'll have more peace of mind 
 finetuned-base: Then he had to settle down with his wife, who was in charge of the house 
 finetuned-large: Then he went to settle down in the house of his father, and there was no 
 finetuned-large-plus: It was a long time before I could settle down and think of anything else to do. 

lightening: 
 t5_base: Lightening is a term used to describe the process of lightening and darken 
 t5_large: lightening has arrived in the philippines, and it's going to 
 finetuned-base: The lightening of the air was a good thing, as it had been for many 
 finetuned-large: There was a lightening in the air, but it did not last long; and 
 finetuned-large-plus: Then a flash of lightening struck, and the air was filled with smoke. 

mournful: 
 t5_base: mournful to hear of the loss of a loved one or two who have passed away 

In [None]:
def evaluate_sentences(df, column_names):
  """Score columns of generated text on two surface-level sentence checks.

  For every column in ``column_names`` this computes the percentage of rows in
  ``df`` whose value (a) starts with an uppercase character and (b) ends with
  a full stop ('.').

  Cells that are not non-empty strings (e.g. NaN read from an empty CSV field,
  or '') fail both checks but still count towards the denominator, ``len(df)``.
  An empty ``df`` yields 0.0 for every column.

  Args:
    df: DataFrame containing the text columns.
    column_names: Labels of the columns to evaluate.

  Returns:
    DataFrame indexed by ``column_names`` with float columns
    ``properly_capitalized`` and ``ends_with_full_stop`` (percentages, 0-100).
  """
  total_rows = len(df)

  def _percentage(count):
    # Guard the empty-frame case instead of dividing by zero.
    return count / total_rows * 100 if total_rows else 0.0

  capital_counts = []
  full_stop_counts = []
  for col in column_names:
    # Keep only values that can actually be inspected as text.
    sentences = [value for value in df[col] if isinstance(value, str) and value]
    capital_counts.append(_percentage(sum(1 for s in sentences if s[0].isupper())))
    full_stop_counts.append(_percentage(sum(1 for s in sentences if s.endswith('.'))))

  return pd.DataFrame(
      {
          'properly_capitalized': capital_counts,
          'ends_with_full_stop': full_stop_counts,
      },
      index=column_names,
  )


In [None]:
list(comp_df.columns[2:])

['result_t5_base',
 'result_t5_large',
 'result_finetuned_base',
 'result_finetuned_large',
 'result_finetuned_large_plus']

In [None]:
# Select the generated-text columns by their "result_" prefix rather than by
# position, so extra leading columns (such as a re-saved index) cannot shift
# the selection onto non-result columns.
result_columns = [col for col in comp_df.columns if col.startswith('result_')]
evaluate_sentences(comp_df, result_columns)

Unnamed: 0,properly_capitalized,ends_with_full_stop
result_t5_base,38.0,22.0
result_t5_large,0.0,10.0
result_finetuned_base,100.0,36.0
result_finetuned_large,100.0,28.0
result_finetuned_large_plus,100.0,42.0


## Analysis:

### On these simple criteria, it does appear that the quality of the sentences improves with more epochs, more data, and larger models. Even so, the model still produces incomplete sentences. This might be easily addressed with a two stage pipeline that utilizes GPT for text completion. Overall, this approach seems feasible for generating text for the sake of teaching vocabulary. 