In [None]:
%pip install transformers

In [3]:
import numpy as np
import pandas as pd
import re
import random

import torch
from tqdm.notebook import tqdm

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import warnings
warnings.filterwarnings('ignore')

import transformers

from transformers import GPT2Tokenizer

import textwrap

In [None]:
# Tokenizer for the 'gpt2' checkpoint; the models loaded further down use the
# same checkpoint name, so the token ids produced here match their vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Read the corpus and treat every non-blank line as one training sample.
with open('IT_Advices_Proverbs_En.txt', encoding='utf8') as f:
    # Blank lines (e.g. the one produced by a trailing newline) would otherwise
    # become empty samples that turn into all-padding rows further down.
    text = [line for line in f.read().split('\n') if line.strip()]

# Shuffle with a dedicated, seeded generator so the train/test split derived
# from this order is the same on every "Restart & Run All", without touching
# the state of the global `random` module.
random.Random(42).shuffle(text)

# Show the size and a small sample instead of dumping the whole corpus.
print(len(text), 'lines; sample:', text[:5])


In [7]:
# Tokenise every line, remember the longest sequence, and split the (already
# shuffled) lines into train and test: the first 80% go to train.
train = []
test = []
max_length = 0
index_max_length = 0

# Samples whose index is below this bound go to the training split.
split_at = len(text) * .8

for idx, value in enumerate(text):
  tokens = np.array(tokenizer.encode(value, add_special_tokens=True), dtype=int)

  # `>=` keeps the index of the *last* sequence that reaches the maximum length.
  curr_len = len(tokens)
  if curr_len >= max_length:
    max_length = curr_len
    index_max_length = idx

  # Strict `<`: with `<=` one extra sample lands in train whenever
  # len(text) * .8 happens to be a whole number.
  if idx < split_at:
    train.append(tokens)
  else:
    test.append(tokens)

# `train` / `test` stay plain lists of 1-D arrays: the sequences have different
# lengths, and np.array() on such a ragged list raises ValueError on
# NumPy >= 1.24. Padding() only needs len() and iteration, which lists provide.

print('len(train), len(test): ', len(train), len(test))
print('max_length, index_max_length: ', max_length, index_max_length)


len(train), len(test):  151 37
max_length, index_max_length:  69 18


In [8]:
def Padding(review_int, seq_len):
    '''
    Left-pad with 0's, or truncate, every token sequence to exactly seq_len entries.

    Parameters
    ----------
    review_int : sequence of 1-D integer sequences (lists or arrays), one per sample.
    seq_len : int, number of columns of the result.

    Returns
    -------
    numpy.ndarray of dtype int and shape (len(review_int), seq_len). Sequences
    shorter than seq_len are right-aligned with zeros in front; longer sequences
    keep only their first seq_len tokens.
    '''
    features = np.zeros((len(review_int), seq_len), dtype = int)
    for i, review in enumerate(review_int):
        # Truncate first so over-long rows are handled explicitly; previously
        # such rows either raised NameError or silently reused the previous
        # row's padded values.
        kept = np.asarray(review)[:seq_len]
        if len(kept):
            # Right-align the tokens; the leading cells keep their zero padding.
            features[i, seq_len - len(kept):] = kept

    return features

# Bring both splits to a common width: the length of the longest tokenised line.
train, test = (Padding(split, max_length) for split in (train, test))

# Sanity check: overall shape and one padded row.
print(train.shape)
print(train[105])

(151, 69)
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
  3987   470  9585  3511   784   318   257  7989   286  3788  2478  8998
   379  8868 29693   286  3788  7572    11 13586   340   351 12531   507
   393  1262  1366  3487  1634   284  3368 49052    13]


In [17]:
from transformers import GPT2LMHeadModel

# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW` and is
# missing from recent transformers releases; fall back to the PyTorch class so
# the name `AdamW` is always defined for the optimizer cell.
# NOTE: torch's AdamW defaults to weight_decay=0.01, transformers' to 0.0.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

# Untouched copy of the pretrained weights: it is never passed to the optimizer
# and serves as the "before fine-tuning" baseline when generating text.
model_init = GPT2LMHeadModel.from_pretrained(
    'gpt2',
    output_attentions = False,
    output_hidden_states = False,
)

# Second copy of the same checkpoint: this is the one that gets fine-tuned.
model = GPT2LMHeadModel.from_pretrained(
    'gpt2',
    output_attentions = False,
    output_hidden_states = False,
)

# Move both models to the selected device (trailing `;` hides the module repr).
model.to(device);
model_init.to(device);

In [18]:
# Use PyTorch's maintained AdamW (the transformers one is deprecated).
# weight_decay=0.0 reproduces the default of the deprecated class; torch's own
# default would be 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5, eps = 1e-8, weight_decay = 0.0)

batch_size = 8
epochs = 100

# Ceiling division: number of batches needed to cover every sample. The former
# `len // batch_size + 1` produced one extra, empty batch whenever the number
# of samples was an exact multiple of batch_size.
n_train = (len(train) + batch_size - 1) // batch_size
n_test = (len(test) + batch_size - 1) // batch_size
print(n_train, n_test)

# One scheduler step per training batch: the learning rate decays linearly to
# zero over the whole run, with no warm-up phase.
total_steps = n_train * epochs
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

def accuracy(y_true, logits):
    """Next-token accuracy of a causal language model on one batch.

    Parameters
    ----------
    y_true : integer tensor of shape (batch, seq) holding the input token ids.
    logits : tensor of shape (batch, seq, vocab) as returned by the model.

    Returns
    -------
    0-d numpy array: fraction of positions where the prediction made at step t
    equals the true token at step t + 1. Every position is counted, including
    any padding present in ``y_true``.
    """
    predicted = torch.argmax(logits, dim=2)
    # Shift along the *sequence* axis (dim 1). Slicing dim 0, as before,
    # compared each sample with the predictions of a different sample and
    # produced NaN for a batch holding a single sample.
    hits = (y_true[:, 1:] == predicted[:, :-1]).float()
    return torch.mean(hits).detach().cpu().numpy()

19 5


In [11]:
def prep_tensors(x, i, batch_size=batch_size):
    """Return the i-th batch of ``x`` as an integer tensor on ``device``.

    Parameters
    ----------
    x : 2-D array of token ids, one row per sample.
    i : zero-based batch index.
    batch_size : rows per batch. The default is the value the global
        ``batch_size`` had when this function was defined.

    Returns
    -------
    torch.LongTensor holding rows ``i*batch_size`` to ``(i+1)*batch_size - 1``
    of ``x``; the last batch may contain fewer rows (or none).
    """
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    # Build the tensor once, directly on the target device. The previous code
    # wrapped the tensor in torch.tensor() a second time, which copies again
    # and triggers PyTorch's copy-construct warning.
    return torch.tensor(x[start_idx:end_idx], dtype=torch.long, device=device)

# Smoke-test the batching helper on batch 17 of the training split.
preped = prep_tensors(x=train, i=17)
print('preped shape: ', preped.shape)

preped shape:  torch.Size([8, 69])


In [None]:

# Fine-tune `model` for `epochs` passes over the training split, evaluating on
# the test split after every pass.
# NOTE(review): the zero padding added by Padding() is fed to the model as
# ordinary tokens and labels; consider an attention mask / ignored labels.
for epoch in range(1, epochs+1):
    print(f'epoch {epoch}/{epochs} : training')

    train_loss = []
    train_acc = []
    model.train()

    pbar = tqdm(range(n_train))
    for i in pbar:
        batch_ids = prep_tensors(train, i)
        if batch_ids.shape[0] == 0:
            # Trailing batch with no rows (sample count divisible by the batch
            # size): nothing to learn from, and it would only distort the means.
            continue

        model.zero_grad()
        # Passing the inputs as labels makes the model compute the language
        # modelling loss itself. Fields are read by name so this keeps working
        # however many entries the output object carries.
        outputs = model(batch_ids, labels=batch_ids)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss.append(loss.item())
        train_acc.append(accuracy(batch_ids, logits))
        pbar.set_description(f'acc {np.mean(train_acc):.4f} loss {np.mean(train_loss):.4f}', refresh=True)


    print(f'epoch {epoch}/{epochs} : validation')
    model.eval()
    val_acc = []
    val_loss = []
    pbar = tqdm(range(n_test))
    for i in pbar:
        batch_ids = prep_tensors(test, i)
        if batch_ids.shape[0] == 0:
            continue

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(batch_ids, labels=batch_ids)
        loss, logits = outputs.loss, outputs.logits

        val_loss.append(loss.item())
        val_acc.append(accuracy(batch_ids, logits))
        pbar.set_description(f'acc {np.mean(val_acc):.4f} loss {np.mean(val_loss):.4f}', refresh=True)

In [20]:
# Baseline: sample continuations from the model that was not trained.

prompt = tokenizer.encode('Design must be', return_tensors='pt').to(device)

# Beam-sample 7 continuations of at most 40 tokens each.
out = model_init.generate(
    input_ids=prompt,
    max_length=40,
    num_beams=5,
    do_sample=True,
    temperature=.7,
    top_k=10,
    top_p=0.95,
    no_repeat_ngram_size=2,
    num_return_sequences=7,
).cpu().numpy()

for out_ in out:
  # Wrap to 120 columns and collapse double spaces.
  wraped = textwrap.fill(tokenizer.decode(out_), 120).replace("  ", " ")
  # Cut after the last full stop so the text ends on a complete sentence;
  # when there is no full stop at all, simply append one.
  head, dot, tail = wraped.rpartition('.')
  final_out_text = (head if dot else wraped) + '.'
  print(final_out_text, end='\n------------------\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Design must be used in conjunction with the following: (a) The name of the person to whom the application is made; and
(b) the name, address, and telephone number of.
------------------
Design must be used in conjunction with the following: (a) The name and address of the person to whom the application
is made; and (b) the name, address, and telephone.
------------------
Design must be used in conjunction with the following: (a) The name and address of the person to whom the application
is made; and (b) the name, address, and telephone.
------------------
Design must be used in conjunction with the following: (a) The name and address of the person to whom the application
is made; and (b) the name, address and telephone number.
------------------
Design must be used in conjunction with the following: (1) The name of the person to whom the application is made; and
(2) the name, address, and telephone number of.
------------------
Design must be approved by the Secretary of Health and Hum

In [21]:
# Fine-tuned model: same prompt and sampling settings as the untrained run.

prompt = tokenizer.encode('Design must be', return_tensors='pt').to(device)

out = model.generate(
    input_ids=prompt,
    max_length=40,
    num_beams=5,
    do_sample=True,
    temperature=.7,
    top_k=10,
    top_p=0.95,
    no_repeat_ngram_size=2,
    num_return_sequences=7,
).cpu().numpy()

for out_ in out:
  decoded = tokenizer.decode(out_)
  wraped = textwrap.fill(decoded, 120)
  wraped = wraped.replace("  ", " ")
  # Keep the text up to and including its last '.'; if it contains none,
  # terminate it with one instead.
  last_dot = wraped.rfind('.')
  if last_dot == -1:
    final_out_text = wraped + '.'
  else:
    final_out_text = wraped[:last_dot + 1]
  print(final_out_text, end='\n------------------\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Design must be kept in mind when it comes to the design of a computer system.
------------------
Design must be implemented in a way that is easy to read, maintain, and maintain.
------------------
Design must be implemented in a way that is easy to read, maintain, and maintain.
------------------
Design must be kept in mind when designing software, because it is often the first step in the development of a new
product or service.
------------------
Design must be implemented in a way that is easy to read, maintain, and maintain.
------------------
Design must be kept in mind when designating a part of a system as a whole, because part-whole systems can vary widely
in their functionality. Also, parts may not always be interchangeable.
------------------
Design must be kept in mind when it comes to the design of a computer system.
------------------


In [None]:
def generate(prompt, len_gen=20, temperature=.7):
    """Sample ``len_gen`` tokens from the global ``model`` after ``prompt``.

    Parameters
    ----------
    prompt : str, text the continuation starts from.
    len_gen : int, number of tokens to sample.
    temperature : float > 0; the logits are divided by it before sampling, so
        smaller values concentrate the distribution on the likeliest tokens.

    Returns
    -------
    str : the decoded prompt followed by the sampled tokens.

    Raises
    ------
    ValueError : if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    generated = tokenizer.encode(prompt)
    context = torch.tensor([generated]).to(device)
    past = None

    # Pure inference: without no_grad every step would extend the autograd
    # graph through the cached keys/values and memory would grow with len_gen.
    with torch.no_grad():
        for _step in tqdm(range(len_gen)):
            # Read the output fields by name rather than unpacking .values(),
            # which only works when the output has exactly two entries.
            outputs = model(context, past_key_values=past)
            past = outputs.past_key_values

            # Only the logits of the last position decide the next token.
            next_logits = outputs.logits[..., -1, :] / temperature
            token = torch.distributions.Categorical(logits=next_logits).sample()

            generated += token.tolist()
            # With the cache carrying the history, only the new token is fed next.
            context = token.unsqueeze(0)

    return tokenizer.decode(generated)

In [None]:
# Sample a 200-token continuation and wrap it to 120 columns for display.
prompt = 'Design must be'
print(textwrap.fill(generate(prompt, len_gen=200, temperature=.8), width=120))