In [3]:
from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

def build_inputs(persona, history, reply):
    """Turn a persona, a dialogue history and a reply into tagged model inputs.

    The special tokens are read from the module-level names ``bos``, ``eos``,
    ``speaker1`` and ``speaker2``.

    Args:
        persona: iterable of token lists; all of them are flattened into the
            first utterance, right after ``bos``.
        history: list of token lists, one per previous utterance.
        reply: token list for the reply; ``eos`` is appended to it.

    Returns:
        A tuple ``(words, segments, position, sequence)`` where ``sequence`` is
        the list of tagged utterances, ``words`` is ``sequence`` flattened,
        ``segments`` holds one speaker token per word and ``position`` is
        ``0 .. len(words) - 1``.
    """
    # Utterance 0: <bos> followed by every persona sentence, flattened.
    header = [bos]
    for sentence in persona:
        header.extend(sentence)

    # Remaining utterances: the history, then the reply closed by <eos>.
    turns = history + [reply + [eos]]
    total = len(turns) + 1  # the header counts as an utterance too

    # Prefix every non-header utterance with a speaker tag chosen by parity.
    sequence = [header]
    for offset, utterance in enumerate(turns):
        tag = speaker2 if (total - offset) % 2 else speaker1
        sequence.append([tag] + utterance)

    # Flatten into word tokens and emit one segment token per word; the
    # segment alternates with the utterance index, starting at speaker1.
    words = []
    segments = []
    for idx, utterance in enumerate(sequence):
        segment_token = speaker2 if idx % 2 else speaker1
        words.extend(utterance)
        segments.extend([segment_token] * len(utterance))

    position = list(range(len(words)))
    return words, segments, position, sequence

words, segments, position, sequence = build_inputs(persona, history, reply)

# >>> print(sequence)  # Our inputs looks like this:
# [['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.'],
#  ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
#  ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.'],
#  ['<speaker1>', 'great', 'to', 'hear', '<eos>']]


In [6]:
for sen in sequence:
    print(sen)

['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.']
['<speaker1>', 'hello', 'how', 'are', 'you', '?']
['<speaker2>', 'i', 'am', 'fine', 'thanks', '.']
['<speaker1>', 'great', 'to', 'hear', '<eos>']


In [10]:
# Prepare our language modeling targets: keep only the reply segment, -1 on the rest.
# Layout: one -1 for every token of every utterance before the reply, one more -1
# for the reply's leading speaker tag, then the ids of the remaining reply tokens.
# Requires `tokenizer` to exist before this cell runs; it had not been created when
# this cell was executed, which is the NameError recorded below.
lm_targets = ([-1] * sum(len(s) for s in sequence[:-1])) \
             + [-1] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])

NameError: name 'tokenizer' is not defined

In [11]:
sequence[:-1] # input is masked
# <speaker2> is masked
# calculate loss only on the response


[['<bos>',
  'i',
  'like',
  'playing',
  'football',
  '.',
  'i',
  'am',
  'from',
  'NYC',
  '.'],
 ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
 ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.']]

In [14]:
sequence[-1][:]

['<speaker1>', 'great', 'to', 'hear', '<eos>']

In [1]:
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode some inputs
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /Users/shensq/.pytorch_pretrained_bert/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /Users/shensq/.pytorch_pretrained_bert/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


In [2]:
# Load pre-trained model (weights)
model = GPT2Model.from_pretrained('gpt2')
model.eval()


# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    # past can be used to reuse precomputed hidden state in a subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    hidden_states_2, past = model(tokens_tensor_2, past=past)

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at /Users/shensq/.pytorch_pretrained_bert/4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /Users/shensq/.pytorch_pretrained_bert/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "vocab_size": 50257
}



In [3]:
tokenizer.convert_ids_to_tokens([8241])

['Who']

In [4]:
pos_tensor = torch.arange(6).reshape(1,-1)

In [5]:
hidden_states_1, past = model(tokens_tensor_1,pos_tensor)

In [6]:
import pickle

In [11]:
ls ../data_processed/

x_flat


In [12]:
handler = open('../data_processed/x_flat','rb')

In [13]:
x_flat = pickle.load(handler)

In [14]:
x_flat

["Such as like? I don't know what you mean. I'm not sure.",
 'Well maybe your daughter if you kind of told her what you were trying to do, she could be a support for you.',
 "Well we could probably be a support for eachother because she's concerned about her weight. So if we teamed up and said ok, two nights this week we're going to just have salads, I think that would be a good thing.",
 "OK, so if you had your daughter's support, that it would be easier to change your behaviour and routine.",
 "We'll a team, and we could probably acomplish that",
 "Sure, success would be nice but I've never really had much success so I'm not very confident.",
 'OK, is there anything in your life that would make it easier for you to make those choices?',
 "Such as like? I don't know what you mean. I'm not sure.",
 'Well maybe your daughter if you kind of told her what you were trying to do, she could be a support for you.',
 "Well we could probably be a support for eachother because she's concerned ab

In [15]:
tokenizer.convert_ids_to_tokens([0,1,2])

['!', '"', '#']

In [16]:
tokenizer.convert_tokens_to_ids(['<eos>'])

[0]

In [17]:
tokenizer.convert_tokens_to_ids(['<bos>'])

[0]

In [18]:
tokenizer.convert_tokens_to_ids(['<|endoftext|>'])

[50256]

In [19]:
help(tokenizer)

Help on GPT2Tokenizer in module pytorch_pretrained_bert.tokenization_gpt2 object:

class GPT2Tokenizer(builtins.object)
 |  GPT-2 BPE tokenizer. Peculiarities:
 |      - Byte-level BPE
 |  
 |  Methods defined here:
 |  
 |  __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  bpe(self, token)
 |  
 |  convert_ids_to_tokens(self, ids, skip_special_tokens=False)
 |      Converts a sequence of ids in BPE tokens using the vocab.
 |  
 |  convert_tokens_to_ids(self, tokens)
 |      Converts a sequence of tokens into ids using the vocab.
 |  
 |  decode(self, tokens)
 |  
 |  encode(self, text)
 |  
 |  save_vocabulary(self, vocab_path)
 |      Save the tokenizer vocabulary and merge files to a directory.
 |  
 |  set_special_tokens(self, special_tokens)
 |      Add a list of additional tokens to the encoder.
 |      The additional tokens are indexed

# May.30th. Train GPT-2

In [20]:
folder = '../data_processed//'
# Open the pickles with context managers so the file handles are closed as soon
# as the data is loaded (the bare open(...) calls were never closed).
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(folder+'x_flat','rb') as f:
    x_flat = pickle.load(f)
with open(folder+'y_all_join','rb') as f:
    y_all_join = pickle.load(f)


## Convert tf checkpoint to pytorch

In [21]:
from pytorch_pretrained_bert import convert_gpt2_checkpoint_to_pytorch

In [22]:
GPT2_DIR='/Users/shensq/Documents/LIT_ai_counseling/gpt2/models/345M'
# GPT2_DIR='/Users/shensq/Google Drive/2019Winter/retrieval/run1'

In [24]:
convert_gpt2_checkpoint_to_pytorch.convert_gpt2_checkpoint_to_pytorch(GPT2_DIR,GPT2_DIR+'/hparams.json','/Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M')

Converting TensorFlow checkpoint from /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/345M
Loading TF weight model/h0/attn/c_attn/b with shape [3072]
Loading TF weight model/h0/attn/c_attn/w with shape [1, 1024, 3072]
Loading TF weight model/h0/attn/c_proj/b with shape [1024]
Loading TF weight model/h0/attn/c_proj/w with shape [1, 1024, 1024]
Loading TF weight model/h0/ln_1/b with shape [1024]
Loading TF weight model/h0/ln_1/g with shape [1024]
Loading TF weight model/h0/ln_2/b with shape [1024]
Loading TF weight model/h0/ln_2/g with shape [1024]
Loading TF weight model/h0/mlp/c_fc/b with shape [4096]
Loading TF weight model/h0/mlp/c_fc/w with shape [1, 1024, 4096]
Loading TF weight model/h0/mlp/c_proj/b with shape [1024]
Loading TF weight model/h0/mlp/c_proj/w with shape [1, 4096, 1024]
Loading TF weight model/h1/attn/c_attn/b with shape [3072]
Loading TF weight model/h1/attn/c_attn/w with shape [1, 1024, 3072]
Loading TF weight model/h1/attn/c_proj/b with shape [1024]
Loading T

Loading TF weight model/h20/ln_1/g with shape [1024]
Loading TF weight model/h20/ln_2/b with shape [1024]
Loading TF weight model/h20/ln_2/g with shape [1024]
Loading TF weight model/h20/mlp/c_fc/b with shape [4096]
Loading TF weight model/h20/mlp/c_fc/w with shape [1, 1024, 4096]
Loading TF weight model/h20/mlp/c_proj/b with shape [1024]
Loading TF weight model/h20/mlp/c_proj/w with shape [1, 4096, 1024]
Loading TF weight model/h21/attn/c_attn/b with shape [3072]
Loading TF weight model/h21/attn/c_attn/w with shape [1, 1024, 3072]
Loading TF weight model/h21/attn/c_proj/b with shape [1024]
Loading TF weight model/h21/attn/c_proj/w with shape [1, 1024, 1024]
Loading TF weight model/h21/ln_1/b with shape [1024]
Loading TF weight model/h21/ln_1/g with shape [1024]
Loading TF weight model/h21/ln_2/b with shape [1024]
Loading TF weight model/h21/ln_2/g with shape [1024]
Loading TF weight model/h21/mlp/c_fc/b with shape [4096]
Loading TF weight model/h21/mlp/c_fc/w with shape [1, 1024, 4096

Save configuration file to /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M/config.json


In [422]:
pwd

'/Users/shensq/Google Drive/2019Winter/retrieval'

In [91]:
# Path to the pytorch checkpoint
# '/Users/shensq/Google Drive/2019Winter/retrieval/gpt_345M/'

'/Users/shensq/Google Drive/2019Winter/retrieval'

## Load from checkpoint

In [25]:
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode some inputs
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /Users/shensq/.pytorch_pretrained_bert/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /Users/shensq/.pytorch_pretrained_bert/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


In [26]:
model = GPT2Model.from_pretrained('/Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M')
model.eval()

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M/pytorch_model.bin
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M/config.json
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_vocab": 50257,
  "vocab_size": 50257
}



GPT2Model(
  (wte): Embedding(50257, 1024)
  (wpe): Embedding(1024, 1024)
  (h): ModuleList(
    (0): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (1): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (2): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
      )
    )
    (3): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
      )
      (ln_2): BertLayerNorm()
   

In [38]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0): Block(
        (ln_1): BertLayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
        )
        (ln_2): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
        )
      )
      (1): Block(
        (ln_1): BertLayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
        )
        (ln_2): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
        )
      )
      (2): Block(
        (ln_1): BertLayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
        )
        (ln_2): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
        )
      )
      (3): Block(
        (ln_1): BertLayerNorm()
    

In [47]:
from torch import optim

In [49]:
optimizer =  optim.SGD(model.transformer.wte.parameters(),lr=0.1)

In [45]:
model.transformer.wte.parameters()

<generator object Module.parameters at 0x14a577938>

In [27]:
# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    # past can be used to reuse precomputed hidden state in a subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    hidden_states_2, past = model(tokens_tensor_2, past=past)

In [28]:
model = GPT2LMHeadModel.from_pretrained('/Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M')

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M/pytorch_model.bin
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M/config.json
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_vocab": 50257,
  "vocab_size": 50257
}



In [29]:
# Predict all tokens
with torch.no_grad():
    predictions_1, past = model(tokens_tensor_1)
    # past can be used to reuse precomputed hidden state in a subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    predictions_2, past = model(tokens_tensor_2, past=past)

# get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])

In [30]:
hidden_states_1.shape

torch.Size([1, 6, 1024])

In [31]:
predictions_1.shape

torch.Size([1, 6, 50257])

In [35]:
tokenizer = GPT2Tokenizer.from_pretrained('/Users/shensq/Google Drive/2019Winter/retrieval/gpt_345M_origin/')

INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file /Users/shensq/Google Drive/2019Winter/retrieval/gpt_345M_origin/vocab.json
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file /Users/shensq/Google Drive/2019Winter/retrieval/gpt_345M_origin/merges.txt


In [36]:
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
tokenizer.set_special_tokens(SPECIAL_TOKENS)

INFO:pytorch_pretrained_bert.tokenization_gpt2:Special tokens {'<bos>': 50257, '<eos>': 50258, '<speaker1>': 50259, '<speaker2>': 50260, '<pad>': 50261}


# Parse Data

In [166]:
folder = 'multiturns_data/'
# Open the pickles with context managers so the file handles are closed as soon
# as the data is loaded (the bare open(...) calls were never closed).
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(folder+'x_flat','rb') as f:
    x_flat = pickle.load(f)
with open(folder+'y_all_join','rb') as f:
    y_all_join = pickle.load(f)

In [149]:
tokenizer.convert_ids_to_tokens([2])

['#']

In [223]:
for i,y in enumerate(y_cleaned):
    
    tmp =tokenizer.convert_tokens_to_ids(y.split())

In [215]:
import re
# Ordered (pattern, replacement) rules used by clean_text; compiled once.
# Whole-word contractions are anchored with \b so they cannot fire inside a
# longer word, and suffix rules require a preceding word character so an
# opening quote such as 'dad' is not mistaken for a contraction.
_CONTRACTION_RULES = [
    (re.compile(r"\bit's\b"), "it is"),
    (re.compile(r"\bi'm\b"), "i am"),
    (re.compile(r"\bhe's\b"), "he is"),
    (re.compile(r"\bshe's\b"), "she is"),
    (re.compile(r"\bthat's\b"), "that is"),
    (re.compile(r"\bwhat's\b"), "what is"),
    (re.compile(r"\bwhere's\b"), "where is"),
    # Any remaining 's is split off as its own token.
    (re.compile(r"(?<=\w)'s\b"), " 's"),
    (re.compile(r"(?<=\w)'ll\b"), " will"),
    (re.compile(r"(?<=\w)'ve\b"), " have"),
    (re.compile(r"(?<=\w)'re\b"), " are"),
    (re.compile(r"(?<=\w)'d\b"), " would"),
    (re.compile(r"\bdon't\b"), "do not"),
    (re.compile(r"\bwon't\b"), "will not"),
    (re.compile(r"\bcan't\b"), "can not"),
]


def clean_text(text):
    """Lower-case ``text`` and expand common English contractions.

    Curly apostrophes are first normalised to ``'``; the rules in
    ``_CONTRACTION_RULES`` are then applied in order. Punctuation is left
    untouched.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower().replace("’", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [219]:
y_cleaned = [clean_text(y) for y in y_all_join]

In [258]:
# Use the tokenizer's own BPE encoding. Looking up whitespace-split words with
# convert_tokens_to_ids bypasses BPE, and the earlier cells show strings that are
# not vocabulary entries (e.g. '<eos>') coming back as id 0.
y_encoded = [tokenizer.encode(y) for y in y_cleaned]

In [260]:
x_cleaned = [clean_text(x) for x in x_flat]
# Use the tokenizer's own BPE encoding. Looking up whitespace-split words with
# convert_tokens_to_ids bypasses BPE, and the earlier cells show strings that are
# not vocabulary entries (e.g. '<eos>') coming back as id 0.
x_encoded = [tokenizer.encode(x) for x in x_cleaned]

In [269]:
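# Need 3 special marker tokens plus an end-of-sequence token; existing vocabulary ids are reused:
#   '#' as <ref start>        -> id 3
#   '$' as <speaker1>         -> id 4
#   '%' as <speaker2>         -> id 5
#   '<|endoftext|>' as <eos>  -> id 50256
# NOTE: tokenizer.convert_ids_to_tokens([2]) returned '#' earlier, so the character/id
# pairing listed here may be off by one; verify it. The dataset code relies on the ids.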

In [378]:
import torch
from torch.utils.data import Dataset,DataLoader
from torch.autograd import Variable

class GptDataset(Dataset):
    """Multi-turn dialogue contexts paired with a reply, laid out for LM training.

    Sample ``i`` is built from the ``num_turns`` consecutive context utterances
    ``x_encoded[i * num_turns : (i + 1) * num_turns]`` and the reply
    ``y_encoded[i]``. Every context utterance is prefixed with a speaker marker
    that alternates, starting with ``SPEAKER1``; the reply is prefixed with
    ``REF_START`` and closed with ``EOS``.

    Args:
        x_encoded: list of token-id lists, ``num_turns`` context utterances per sample.
        y_encoded: list of token-id lists, one reply per sample.
        num_turns: number of context utterances per sample.
    """

    # Marker ids spliced into the sequences; meanings follow the notebook's notes.
    REF_START = 3   # start of the reply
    SPEAKER1 = 4
    SPEAKER2 = 5
    EOS = 50256     # id of '<|endoftext|>'
    IGNORE = -1     # label value for positions excluded from the LM targets

    def __init__(self, x_encoded, y_encoded, num_turns=5):
        self.x_encoded = x_encoded
        self.y_encoded = y_encoded
        self.num_turns = num_turns

    def __getitem__(self, index):
        """Build one sample.

        Returns:
            ``(x, type_x, position_x, lm_x)``: four 1-D tensors of equal length
            holding the token ids, the per-token segment ids, the positions
            ``0 .. len - 1`` and the LM labels. Labels are ``IGNORE``
            everywhere except on the reply tokens.
        """
        x, type_x = [], []
        first = index * self.num_turns
        for turn in range(self.num_turns):
            # Even turns are speaker 1, odd turns speaker 2; the segment id
            # always matches the marker that prefixes the utterance.
            speaker = self.SPEAKER2 if turn % 2 else self.SPEAKER1
            utterance = self.x_encoded[first + turn]
            x += [speaker] + utterance
            type_x += [speaker] * (len(utterance) + 1)
        # The whole context is excluded from the LM targets.
        lm_x = [self.IGNORE] * len(x)

        reply = self.y_encoded[index]
        x += [self.REF_START] + reply + [self.EOS]
        type_x += [self.REF_START] * (len(reply) + 2)
        # Only the reply tokens are targets; both the start marker and EOS are masked.
        # NOTE(review): masking EOS means its prediction is never trained; confirm intended.
        lm_x += [self.IGNORE] + reply + [self.IGNORE]

        position_x = list(range(len(x)))

        # torch.Tensor(...) builds tensors of the default (floating-point) type;
        # kept so the returned tensor type is unchanged for existing callers.
        return (torch.Tensor(x), torch.Tensor(type_x),
                torch.Tensor(position_x), torch.Tensor(lm_x))

    def __len__(self):
        """Number of samples, i.e. the number of replies."""
        return len(self.y_encoded)

In [379]:
USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor
def collate_fn(data):
    """Pad a list of samples into batch tensors (dynamic padding).

    Each sample is a 4-tuple of equal-length 1-D tensors
    ``(token_ids, type_ids, position_ids, lm_labels)``. Samples are ordered by
    length, longest first, and every field is right-padded to the longest
    sample in the batch.

    Token, type and position ids are padded with 0. LM labels are padded with
    -1, the same value the samples use for positions excluded from the loss,
    so padded positions are not treated as targets.

    Args:
        data: list of 4-tuples as described above.

    Returns:
        ``(token_ids, type_ids, position_ids, lm_labels, lengths)``: four long
        tensors of shape ``(batch_size, max_len)`` placed on the GPU when CUDA
        is available, and the list of unpadded lengths in batch order.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if not data:
        raise ValueError("collate_fn received an empty batch")

    def merge(sequences, pad_value=0):
        # Copy each sequence into a long tensor pre-filled with pad_value.
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.full((len(sequences), max(lengths)), pad_value, dtype=torch.long)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq[:end]
        return padded_seqs, lengths

    # Longest first; sorted() leaves the caller's list untouched.
    ordered = sorted(data, key=lambda sample: len(sample[0]), reverse=True)

    # Separate the four per-sample fields.
    src_seqs, trg_seqs, pos_seqs, lm_seqs = zip(*ordered)

    # Merge sequences (from tuple of 1-D tensors to one 2-D tensor each).
    src_seqs, src_lengths = merge(src_seqs)
    trg_seqs, _ = merge(trg_seqs)
    pos_seqs, _ = merge(pos_seqs)
    lm_seqs, _ = merge(lm_seqs, pad_value=-1)

    # Move every tensor to the same device (previously only two were moved).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return (src_seqs.to(device), trg_seqs.to(device), pos_seqs.to(device),
            lm_seqs.to(device), src_lengths)

In [380]:
gpt_data = GptDataset(x_encoded,y_encoded)

In [381]:
data_loader = DataLoader(dataset=gpt_data,batch_size=4,shuffle=True,drop_last=True,collate_fn=collate_fn)

In [382]:
counter=0
for x,type_x,pos_x,lm_x,x_len in data_loader:
    if counter>0:
        break
    counter+=1

In [412]:

lm_loss = model(x, position_ids=pos_x, token_type_ids=type_x, lm_labels=lm_x)

In [417]:
# Split the parameters into a weight-decayed group and a non-decayed group.
param_optimizer = list(model.named_parameters())
# The model printed earlier names its layer norms ln_1 / ln_2, so the
# 'LayerNorm.*' markers alone never match and those weights would be decayed;
# the 'ln_' marker covers them. The original markers are kept for compatibility.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'ln_']
optimizer_grouped_parameters = [
    # Everything not matched by a no_decay marker is regularised.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # Biases and layer-norm parameters are left undecayed.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [376]:
pos_x.shape

torch.Size([4, 242])

In [377]:
lm_x.shape

torch.Size([4, 242])

In [385]:
predictions_1.shape

torch.Size([4, 146, 50257])

In [387]:
lm_x.shape

torch.Size([4, 146])

In [388]:
type_x.shape

torch.Size([4, 146])

In [390]:
type(type_x)

torch.Tensor

In [395]:
lm_x

tensor([[   -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,  

In [431]:
x_flat[0:5]

["Such as like? I don't know what you mean. I'm not sure.",
 'Well maybe your daughter if you kind of told her what you were trying to do, she could be a support for you.',
 "Well we could probably be a support for eachother because she's concerned about her weight. So if we teamed up and said ok, two nights this week we're going to just have salads, I think that would be a good thing.",
 "OK, so if you had your daughter's support, that it would be easier to change your behaviour and routine.",
 "We'll a team, and we could probably acomplish that"]

In [None]:
Such as like? I don't know what you mean. I'm not sure. Well maybe your daughter if you kind of told her what you were trying to do, she could be a support for you.' Well we could probably be a support for eachother because she's concerned about her weight. So if we teamed up and said ok, two nights this week we're going to just have salads, I think that would be a good thing. OK, so if you had your daughter's support, that it would be easier to change your behaviour and routine. We'll a team, and we could probably acomplish that

In [None]:
y_cleaned = [clean_text(y) for y in y_all_join]
# Use the tokenizer's own BPE encoding. Looking up whitespace-split words with
# convert_tokens_to_ids bypasses BPE, and the earlier cells show strings that are
# not vocabulary entries (e.g. '<eos>') coming back as id 0.
y_encoded = [tokenizer.encode(y) for y in y_cleaned]