I found out why this happens.

GPT-2 originally didn't have a dedicated padding token, since it was trained on sequences of equal length. The maintainers of pytorch-pretrained-BERT have worked around this by letting you register special tokens that get their own vocabulary indices.

This should fix the problem:

    # Add the <pad> token to the vocabulary
    SPECIAL_TOKENS = ["<pad>"]
    tokenizer.set_special_tokens(SPECIAL_TOKENS)

    # Set the number of special tokens in the model
    model.set_num_special_tokens(len(SPECIAL_TOKENS))

    # Get the <pad> token's index
    pad_idx = tokenizer.convert_tokens_to_ids(['<pad>'])[0]
    
    # Tokenize with the GPT-2 tokenizer, then use Keras's pad_sequences to pad with pad_idx
    x = []
    for i in tqdm(range(len(x_train))):
        x.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x_train[i])[:MAX_LEN]))
    
    x_train = sequence.pad_sequences(x, maxlen=MAX_LEN, padding='post', value=pad_idx)
    x_train = torch.tensor(x_train, dtype=torch.int32)

I also made a kernel where I preprocess the data and save it to disk [here](https://www.kaggle.com/bkkaggle/jigsaw-preprocessing-gpt2-1)

#### Resources
- https://github.com/huggingface/pytorch-pretrained-BERT/issues/573
- https://github.com/huggingface/pytorch-pretrained-BERT/issues/577
- https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

In [None]:
%%time
# Copy the pytorch-pretrained-BERT snapshot (3fc63f1) from ../input into the working directory.
!cp -r ../input/jigsaw-pytorch-pretrained-bert/repository/huggingface-pytorch-pretrained-BERT-3fc63f1/ ./
# Install the library from that local copy.
!pip install ./huggingface-pytorch-pretrained-BERT-3fc63f1/.

In [None]:
%%time
# Borrows a lot of code from https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version
# Fold index for this run. NOTE(review): not referenced in the cells visible
# here; presumably the value stored under the "fold" key that save_checkpoint
# reads -- confirm.
FOLD = 0

import os
import sys
import random
import glob
import gc
import requests
import pickle
import csv

import numpy as np
import pandas as pd

import mlcrate as mlc

import os

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from tqdm._tqdm_notebook import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils import data
from torch.nn.utils.rnn import pad_sequence
import torch.utils.checkpoint as checkpoint

from keras.preprocessing import text, sequence

# from apex import amp

import spacy
from spacy.lang.en import English

import matplotlib.pyplot as plt

from pytorch_pretrained_bert import BertTokenizer, GPT2Tokenizer

# disable progress bars when submitting
def is_interactive():
    """Report whether the code appears to be running interactively.

    This is a heuristic: the session counts as interactive exactly when the
    ``SHLVL`` environment variable is absent.

    Returns:
        bool: ``True`` if ``SHLVL`` is not set, ``False`` otherwise.
    """
    shell_level = os.environ.get('SHLVL')
    return shell_level is None

# Outside an interactive session, swap tqdm for a pass-through so loops run
# without drawing progress bars.
if not is_interactive():
    def nop(it, *_ignored_args, **_ignored_kwargs):
        """Stand-in for tqdm: hand back the iterable untouched."""
        return it

    tqdm = nop

# Seed used as the default argument of seed_everything().
SEED = 4242

def seed_everything(SEED=SEED):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy, and PyTorch
    (via ``torch.manual_seed`` and ``torch.cuda.manual_seed``), and switches
    cuDNN's ``deterministic`` flag on.

    Args:
        SEED: Seed value to apply everywhere. Defaults to the module-level
            ``SEED`` as it was when this function was defined.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # The individual seeders are independent of one another, so apply them in turn.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(SEED)

    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, using the default SEED.
seed_everything()

# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_n_params(model):
    """Return the total number of scalar parameters held by ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: Sum of the element counts of every parameter tensor; ``0`` when
        the model has no parameters.
    """
    # ``numel()`` is the product of a tensor's dimensions, so it replaces the
    # hand-rolled nested loop -- whose local ``nn`` also shadowed the
    # ``torch.nn as nn`` alias imported at the top of the file.
    return sum(p.numel() for p in model.parameters())

# from https://github.com/floydhub/save-and-resume
def save_checkpoint(state):
    """Serialize ``state`` to ``./checkpoint-<fold>.pt.tar`` with ``torch.save``.

    The write is unconditional; deciding whether a checkpoint is worth
    keeping (e.g. only on a new best score) is left to the caller.

    Args:
        state: Mapping to save. It must contain a ``"fold"`` entry, which is
            interpolated into the output file name.
    """
    print(" Saving checkpoint")

    fold = state["fold"]
    torch.save(state, f'./checkpoint-{fold}.pt.tar')

def initialize(model, fold):
    """Restore ``model``'s weights from the checkpoint saved for ``fold``.

    Args:
        model: Module whose ``load_state_dict`` receives the stored weights.
        fold: Fold identifier; the file read is ``./checkpoint-<fold>.pt.tar``.

    Returns:
        The same ``model`` object, with the checkpoint's weights loaded.
    """
    path = f'./checkpoint-{fold}.pt.tar'

    # Deserialize onto the CPU so a checkpoint written on a GPU machine still
    # loads where CUDA is unavailable; load_state_dict then copies the values
    # into the model's existing parameters on whatever device they live on.
    # The local is named ``state`` (not ``checkpoint``) so it does not shadow
    # the ``torch.utils.checkpoint`` alias imported at the top of the file.
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state['model'])

    print(f' Loaded checkpoint {path} | Trained for {state["epoch"] + 1} epochs')

    return model

In [None]:
# Run configuration.
# NOTE(review): of these, only MAX_LEN is used in the cells visible here;
# WORKERS, SPLITS, NUM_WORDS and BATCH_SIZE are presumably carried over from
# the training kernel this code was adapted from -- confirm before removing.
WORKERS = 0

SPLITS = 5
# Maximum number of tokens kept per comment, and the length every sequence is padded to.
MAX_LEN = 220
NUM_WORDS = 100000

BATCH_SIZE = 512

In [None]:
# Load the competition's training CSV into a DataFrame.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')

In [None]:
# train = train.loc[:1000]

In [None]:
# Take the 'comment_text' column as an array; these are the texts tokenized below.
x_train = train['comment_text'].values

In [None]:
# Load the pretrained 'gpt2' tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Encode every comment as GPT-2 token ids, keeping at most MAX_LEN tokens each.
x = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(comment)[:MAX_LEN])
    for comment in tqdm(x_train)
]

# Pad each sequence at the end up to MAX_LEN and store the result as an int32 tensor.
# NOTE(review): no `value=` is passed to pad_sequences here, unlike the
# `value=pad_idx` snippet in the write-up at the top of this notebook --
# confirm which padding id the downstream model expects.
x_train = sequence.pad_sequences(x, maxlen=MAX_LEN, padding='post')
x_train = torch.tensor(x_train, dtype=torch.int32)

In [None]:
# Write the padded token-id tensor to disk with the highest pickle protocol.
with open('x_train_gpt.pkl', 'wb') as out_file:
    pickle.dump(x_train, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Delete the library sources that were copied into the working directory for installation.
!rm -rf huggingface-pytorch-pretrained-BERT-3fc63f1