In [None]:
!pip install transformers

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
import os
import glob

In [None]:
# Seed PyTorch's global random number generator so random operations that
# rely on it are repeatable when the notebook is run top to bottom.
torch.manual_seed(42)

<torch._C.Generator at 0x7f4176a5ccb0>

In [None]:
# Tokenizer: GPT-2 medium vocabulary with explicit start, end and padding tokens.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', **special_tokens)

# Model: pretrained GPT-2 medium language-model head, moved to the GPU.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.cuda()

# Resize the embedding matrix to the tokenizer's vocabulary size so the
# special tokens added above have embedding rows.
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Embedding(50259, 1024)

In [None]:
from google.colab import files
# Upload the dataset through Colab; the result is later indexed by file name
# to obtain the uploaded file's raw bytes.
uploaded = files.upload()

Saving dataset_unlimited_with_labels.csv to dataset_unlimited_with_labels.csv


In [None]:
import io

# Wrap the uploaded bytes in an in-memory buffer so pandas can parse them as CSV.
csv_bytes = uploaded['dataset_unlimited_with_labels.csv']
csv_buffer = io.BytesIO(csv_bytes)
df = pd.read_csv(csv_buffer)

In [None]:
clauses = df['Clause']

# Measure the length of the text that is actually fed to the model: every
# clause is wrapped in a start token, a <<label>> ... <</label>> pair and an
# end token before tokenization. Measuring the bare clause under-counts, and
# because the dataset truncates to `max_length`, the longest examples would
# lose their closing tag and <|endoftext|> marker.
max_length = max(
    len(tokenizer.encode('<|startoftext|> ' + f'<<{label}>>' + clause + f'<</{label}>> <|endoftext|>'))
    for clause, label in zip(clauses, df['type'])
)

# Never exceed the tokenizer's advertised input limit; over-long examples are
# truncated by the dataset instead of producing inputs the model cannot take.
max_length = min(max_length, tokenizer.model_max_length)

In [None]:
class ClauseDataset(Dataset):
    """Tokenized clauses wrapped in start/end tokens and label tags.

    Each clause is rendered as
    ``<|startoftext|> <<label>>clause<</label>> <|endoftext|>`` and passed to
    ``tokenizer`` with truncation and padding to ``max_length``.

    Args:
        txt_list: iterable of clause strings.
        labels: iterable of labels, paired positionally with ``txt_list``.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: length every example is truncated/padded to.
    """

    def __init__(self, txt_list, labels, tokenizer, max_length):
        # Kept for interface compatibility; nothing in this class fills it.
        self.labels = []

        encoded_examples = []
        for clause_text, clause_label in zip(txt_list, labels):
            opening = '<|startoftext|> ' + f'<<{clause_label}>>'
            closing = f'<</{clause_label}>> <|endoftext|>'
            encoded_examples.append(
                tokenizer(opening + clause_text + closing, truncation=True,
                          max_length=max_length, padding="max_length")
            )

        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_examples]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_examples]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

In [None]:
# Encode every clause together with its `type` label.
dataset = ClauseDataset(clauses, df['type'], tokenizer, max_length=max_length)

# 90 % of the examples go to training; the remainder is held out for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
import gc
# Force a garbage-collection pass before training; the number shown as the
# cell output is gc.collect()'s return value.
gc.collect()

50

In [None]:
# Release cached, currently unused CUDA memory held by PyTorch before training.
torch.cuda.empty_cache()

In [None]:
# Fine-tuning configuration: a single epoch with batch size 1 per device,
# checkpoints under ./results, logs under ./logs, no external reporting.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.05,
    logging_steps=100,
    save_steps=5000,
)

In [None]:
def collate_clauses(batch):
    """Stack ``(input_ids, attention_mask)`` pairs into a causal-LM batch.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs, as
            returned by ``ClauseDataset.__getitem__``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        ``labels`` is a copy of ``input_ids`` in which every padded position
        (attention mask 0) is replaced by -100.
    """
    input_ids = torch.stack([example[0] for example in batch])
    attention_mask = torch.stack([example[1] for example in batch])

    # Every example is padded to a fixed length, so using the raw input ids
    # as labels would make the loss mostly about predicting <|pad|>.
    # -100 is the label value the language-model loss ignores.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Fine-tune the model; the TrainOutput returned by train() is the cell output.
Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=collate_clauses).train()

***** Running training *****
  Num examples = 3329
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3329


Step,Training Loss
100,1.0373
200,0.3458
300,0.2939
400,0.2867
500,0.316
600,0.2808
700,0.2897
800,0.2849
900,0.311
1000,0.2579




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3329, training_loss=0.29560328575539924, metrics={'train_runtime': 2428.683, 'train_samples_per_second': 1.371, 'train_steps_per_second': 1.371, 'total_flos': 3943054554083328.0, 'train_loss': 0.29560328575539924, 'epoch': 1.0})

In [None]:
# Persist only the model parameters (state dict), not the whole model object.
weights_path = 'nwp.pth'
torch.save(model.state_dict(), weights_path)

# Send the saved weights file to the browser as a download from Colab.
files.download(weights_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Start of the clause the model is asked to continue.
input_text_cropped = "The term of this Agreement"

# Number of single-space-separated pieces in the text, plus a margin of 3.
# This counts words, not tokenizer tokens.
word_count = input_text_cropped.count(" ") + 1
max_length_s_o = word_count + 3

In [None]:
# Build the generation prompt: start token, an opening label tag, then the
# beginning of the clause.
# NOTE(review): this prompt puts a space between the label tag and the clause
# text, whereas the template in ClauseDataset joins them with no space.
# Confirm which form matches the training data.
prompt = f"<|startoftext|> <<Agency Agreement>> {input_text_cropped}"
encoded_prompt = tokenizer(prompt, return_tensors="pt")
generated = encoded_prompt.input_ids.cuda()

In [None]:
# Sample 20 continuations of the prompt.
#
# `max_length` counts the prompt tokens as well as the generated ones. The
# earlier budget was derived from the word count of the cropped text alone,
# which is shorter than the tokenized prompt (it ignores the start token and
# the label tag). Here the limit is the real prompt length plus a margin of
# 3 new tokens.
#
# `pad_token_id` is passed explicitly so generation pads with the tokenizer's
# own <|pad|> token instead of falling back to the end-of-text token.
sample_outputs_n_w_p = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.9,
    num_return_sequences=20,
    max_length=generated.shape[-1] + 3,
    pad_token_id=tokenizer.pad_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 11, but ``max_length`` is set to 8. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [None]:
import re

In [None]:
# Compiled once instead of on every loop iteration.
# NOTE(review): this pattern is never applied to the decoded text, so the
# label tag stays in the printed output. Confirm whether stripping was intended.
pattern  = re.compile(r'<.*?>')

# Print each sampled continuation with its index; special tokens are dropped
# during decoding.
for i, sample_output_n_w_p in enumerate(sample_outputs_n_w_p):
  decoded_text = tokenizer.decode(sample_output_n_w_p, skip_special_tokens=True)
  to_print = f"{i}: {decoded_text}"
  print(to_print)

0:  <<Agency Agreement>> The term of this Agreement shall
1:  <<Agency Agreement>> The term of this Agreement may
2:  <<Agency Agreement>> The term of this Agreement ("
3:  <<Agency Agreement>> The term of this Agreement shall
4:  <<Agency Agreement>> The term of this Agreement shall
5:  <<Agency Agreement>> The term of this Agreement and
6:  <<Agency Agreement>> The term of this Agreement,
7:  <<Agency Agreement>> The term of this Agreement will
8:  <<Agency Agreement>> The term of this Agreement ("
9:  <<Agency Agreement>> The term of this Agreement comm
10:  <<Agency Agreement>> The term of this Agreement shall
11:  <<Agency Agreement>> The term of this Agreement,
12:  <<Agency Agreement>> The term of this Agreement ("
13:  <<Agency Agreement>> The term of this Agreement comm
14:  <<Agency Agreement>> The term of this Agreement will
15:  <<Agency Agreement>> The term of this Agreement (
16:  <<Agency Agreement>> The term of this Agreement has
17:  <<Agency Agreement>> The term of th