In [None]:
!pip install transformers

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
import os
import glob

In [None]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f68e6de0b70>

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

In [None]:
from google.colab import files
uploaded = files.upload()

Saving dataset_unlimited.csv to dataset_unlimited.csv


In [None]:
import io
df = pd.read_csv(io.BytesIO(uploaded['dataset_unlimited.csv']))

In [None]:
clauses = df['Clause']
max_length = max([len(tokenizer.encode(clause)) for clause in clauses])

In [None]:
class ClauseDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = ClauseDataset(clauses, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
import gc
gc.collect()

88

In [None]:
torch.cuda.empty_cache()

In [None]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [None]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 3329
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3329


Step,Training Loss
100,0.7925
200,0.3305
300,0.2895
400,0.2782
500,0.3083
600,0.2759
700,0.2822
800,0.2813
900,0.3039
1000,0.2515




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3329, training_loss=0.2835805462659318, metrics={'train_runtime': 4321.0153, 'train_samples_per_second': 0.77, 'train_steps_per_second': 0.77, 'total_flos': 3943054554083328.0, 'train_loss': 0.2835805462659318, 'epoch': 1.0})

In [None]:
torch.save(model.state_dict(), 'nwp.pth')

# download checkpoint file
files.download('nwp.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [None]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0:   in conjunction with the Initial Term a $1 Million written royalty free payment  with Effective Thirteen days from the time Initial Net B Income = Initial Net revenue at or during any subsequent term after any thirty days without obtaining written consent  may now or may during the same sixty day period  be applied as follows in (1) upon obtaining (a) an aggregate   total and total sale by  any  vendor at an  least  two retail locations for such an unit price at the above. "Net Sales", each "sale as related a sales consideration", the e-MAIL URL the Co-Be the following  link, (c) one or more "bounty"  auctions by each seller; as a condition and subject  any  other items which are not used primarily
1:   - Endorsement made under such Exchange on a Time and under penalty of five dollar million for each dollar thousand pounds on the following page  [INFORMATION INC., 533 KENDREE'S CORONERA CO./VETROIS-SIMA PERSANCHTE BOTAC, 1516-18 DINETERING ST., SUEDEVENSBURG ATH-M5 3  -------------

In [None]:
input_text_full = "Either party may terminate immediately upon written notice if the other party (i) ceases to function"
input_text_cropped = "In consideration of the mutual obligations specified in this Agreement, the"

In [None]:
generated = tokenizer(f"<|startoftext|> {input_text_cropped}", return_tensors="pt").input_ids.cuda()

In [None]:
sample_outputs_n_w_p = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=8, top_p=0.95, temperature=1.9, num_return_sequences=20)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 13, but ``max_length`` is set to 8. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [None]:
for i, sample_output_1 in enumerate(sample_outputs_n_w_p):
    print("{}: tokenizer.decode(sample_output_1, skip_special_tokens=True)))

NameError: ignored