In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
!pip install performer-pytorch

## Performer Model

In [None]:
# Try enwik dataset
from performer_pytorch import PerformerLM
# Calculates loss
from performer_pytorch.autoregressive_wrapper import AutoregressiveWrapper

import random
#import tqdm
from tqdm.notebook import tqdm
import gzip
import numpy as np
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

# --- run configuration -------------------------------------------------------
# Sequence sizes
SEQ_LEN = 4096                 # window length fed to the model / used as max_seq_len
GENERATE_LENGTH = 2048         # number of tokens requested from model.generate

# Optimisation
LEARNING_RATE = 1e-4           # learning rate handed to Adam
BATCH_SIZE = 4                 # batch_size of every DataLoader
GRADIENT_ACCUMULATE_EVERY = 4  # backward passes per optimizer step
NUM_BATCHES = 10               # outer training iterations; a full run would use int(1e5)

# Reporting cadence (in outer training iterations)
VALIDATE_EVERY = 100
GENERATE_EVERY = 500

# helpers
def cycle(loader):
    """Yield the items of ``loader`` forever, re-iterating it each time it runs out."""
    while True:
        yield from loader

def decode_token(token):
    """Map one integer token to a one-character string.

    The token is treated as a code point and passed through ``chr``
    (e.g. 97 -> 'a'). Values of 32 or below are clamped to 32, so every
    control character is rendered as a plain space.
    """
    code_point = token if token > 32 else 32
    return str(chr(code_point))

def decode_tokens(tokens):
    """Decode an iterable of integer tokens into a single string via ``decode_token``."""
    return ''.join(decode_token(token) for token in tokens)


# Build the Performer language model and wrap it so that calling it with
# return_loss=True yields the autoregressive training loss.
model = AutoregressiveWrapper(
    PerformerLM(
        num_tokens=22000,  # As many tokens as we tokenize to (21128 if we use transformers voc)
        max_seq_len=SEQ_LEN,
        dim=512,
        depth=6,
        heads=8,
        local_attn_heads=(8, 8, 8, 6, 4, 2),  # Attention Heads per layer
        nb_features=256,
        causal=True,
        reversible=True,
        use_scalenorm=True,
    )
)

# Move all parameters to the GPU.
model.cuda()


## News (新闻) Data Preparation

In [None]:
news_df = pd.read_csv("../input/chinese-official-daily-news-since-2016/chinese_news.csv")

# Concat all content together.
# Rows with missing content are skipped so that the literal text "nan" is not
# mixed into the training corpus.
# TODO check if concatting is how it's done in practice
full_text = ''.join(news_df["content"].dropna().astype(str))

# Byte-level encoding: every UTF-8 byte becomes one token.
# Note: In reality we need better encoding for CN chars, as eg 中 is encoded as [228 184 173])
# We will probably just use a vocab.txt with 40K CN chars, like for BERT
# The string is encoded explicitly because np.fromstring's binary mode is
# deprecated; .copy() gives a writable array, which torch.from_numpy expects.
X = np.frombuffer(full_text.encode("utf-8"), dtype=np.uint8).copy()

# Split on the number of tokens (bytes) in X. len(full_text) counts characters,
# which is far smaller for Chinese text and would not give a 90/10 split.
trX, vaX = np.split(X, [int(len(X) * 0.9)])
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

class TextSamplerDataset(Dataset):
    """Serve random windows of ``seq_len + 1`` tokens cut from one long 1-D tensor."""

    def __init__(self, data, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.data = data

    def __len__(self):
        # Nominal size: how many seq_len-sized windows fit into the data.
        return self.data.size(0) // self.seq_len

    def __getitem__(self, index):
        # `index` is ignored; every call draws a fresh random start offset.
        window = self.seq_len + 1
        start = torch.randint(0, self.data.size(0) - window, (1,)).item()
        # Cast to int64 and move the sample to the GPU before returning it.
        return self.data[start:start + window].long().cuda()
    
# Wrap both splits in random-window samplers ...
train_dataset, val_dataset = (
    TextSamplerDataset(split, SEQ_LEN) for split in (data_train, data_val)
)
# ... and in loaders that restart automatically when exhausted.
train_loader, val_loader = (
    cycle(DataLoader(dataset, batch_size=BATCH_SIZE))
    for dataset in (train_dataset, val_dataset)
)

# Adam over all parameters of the wrapped language model.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Alternative - Tokenize with huggingface; Runs in 2min

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

print("Vocab Length: ", len(tokenizer.get_vocab()))

# Example: tokenize a single character and look up its vocabulary id.
token_ex = tokenizer.tokenize("中")
id_ex = tokenizer.convert_tokens_to_ids(token_ex)
print(token_ex, id_ex)

# Load & Tokenize data

news_df = pd.read_csv("../input/chinese-official-daily-news-since-2016/chinese_news.csv")
# Concat all content together.
# Rows with missing content are skipped so that the literal text "nan" is not
# mixed into the training corpus.
# TODO check if concatting is how it's done in practice
full_text = ''.join(news_df["content"].dropna().astype(str))

# Keep the tokenizer ids as the training data instead of falling back to raw
# UTF-8 bytes. The vocabulary has far more than 256 entries, so the ids need a
# wider dtype than uint8.
token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(full_text))
X = np.asarray(token_ids, dtype=np.int64)

# 90/10 split measured in tokens (len(X)), not in characters of full_text.
trX, vaX = np.split(X, [int(len(X) * 0.9)])
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

# NOTE: this repeats the TextSamplerDataset definition from the byte-level data
# preparation cell; running this cell rebinds (shadows) that earlier class.
class TextSamplerDataset(Dataset):
    """Serve random windows of ``seq_len + 1`` tokens cut from one long 1-D tensor."""

    def __init__(self, data, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.data = data

    def __len__(self):
        # Nominal size: how many seq_len-sized windows fit into the data.
        return self.data.size(0) // self.seq_len

    def __getitem__(self, index):
        # `index` is ignored; every call draws a fresh random start offset.
        window = self.seq_len + 1
        start = torch.randint(0, self.data.size(0) - window, (1,)).item()
        # Cast to int64 and move the sample to the GPU before returning it.
        return self.data[start:start + window].long().cuda()
    
# Wrap both splits in random-window samplers ...
train_dataset, val_dataset = (
    TextSamplerDataset(split, SEQ_LEN) for split in (data_train, data_val)
)
# ... and in loaders that restart automatically when exhausted.
train_loader, val_loader = (
    cycle(DataLoader(dataset, batch_size=BATCH_SIZE))
    for dataset in (train_dataset, val_dataset)
)

# Adam over all parameters of the wrapped language model.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Training

In [None]:

# Language-model training: NUM_BATCHES optimizer steps, each built from
# GRADIENT_ACCUMULATE_EVERY backward passes.
for i in tqdm(range(NUM_BATCHES), desc='training'):
    model.train()

    # Gradients from each micro-batch add up in .grad (the loss is not rescaled).
    for __ in range(GRADIENT_ACCUMULATE_EVERY):
        loss = model(next(train_loader), return_loss = True)
        loss.backward()

    # Reports the loss of the last micro-batch only.
    print(f'training loss: {loss.item()}')

    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optim.step()
    optim.zero_grad()

    if i % VALIDATE_EVERY == 0:
        model.eval()
        with torch.no_grad():
            loss = model(next(val_loader), return_loss = True)
            print(f'validation loss: {loss.item()}')

    if i % GENERATE_EVERY == 0 and i != 0:
        model.eval()
        # Drop the last token of a validation sample and use the rest as prompt.
        inp = random.choice(val_dataset)[:-1]
        prime = decode_tokens(inp)
        # Show the prompt followed by a separator line. %-formatting is applied
        # explicitly; passing the tuple as a second print argument would emit
        # the raw format string and the tuple instead.
        print('%s \n\n %s' % (prime, '*' * 100))

        sample = model.generate(inp, GENERATE_LENGTH)
        output_str = decode_tokens(sample)
        print(output_str)

## Discard the LM head & train the body on a classification (CL) task

In [None]:
# Keep an independent copy of the network held by the autoregressive wrapper
# (`model.net`, presumably the PerformerLM built above) so it can be modified
# without touching `model`. Nothing is written to disk here.
from copy import deepcopy
transfer = deepcopy(model.net)
transfer

In [None]:
# Freeze some layers if nec.
#for param in transfer.parameters():
#    param.requires_grad = False
    
# Replace the output projection `to_out` of the copied network with a 3-class
# classification head. The old layer is presumably Linear(512, 22000), matching
# the `dim` and `num_tokens` the model was built with; the new one is
# Linear(512, 3), one logit per news category.
transfer.to_out = torch.nn.Linear(512, 3, bias=True)    
transfer

In [None]:
def preprocess(sents, max_seq_len, tokenizer):
    """Turn sentences into fixed-length lists of token ids.

    Each sentence is tokenized with ``tokenizer.tokenize``, cut to at most
    ``max_seq_len`` tokens, converted with ``tokenizer.convert_tokens_to_ids``
    and right-padded with 0 up to ``max_seq_len``.

    Args:
        sents: iterable of strings.
        max_seq_len: length of every returned id list.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A list with one id list of length ``max_seq_len`` per input sentence.
    """
    encoded = []
    for sentence in sents:
        # Slicing is a no-op for sequences that are already short enough.
        pieces = tokenizer.tokenize(sentence)[:max_seq_len]
        ids = tokenizer.convert_tokens_to_ids(pieces)

        # Zero-pad on the right up to the fixed length.
        padded = ids + [0] * (max_seq_len - len(ids))
        assert len(padded) == max_seq_len

        encoded.append(padded)

    return encoded


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# Load & Tokenize data

news_df = pd.read_csv("../input/chinese-official-daily-news-since-2016/chinese_news.csv")

# Drop ~5 NAN content rows
news_df = news_df.dropna()

# One fixed-length (SEQ_LEN) row of token ids per article.
# The ids are vocabulary indices (the vocabulary has ~21k entries), so they do
# not fit into uint8: that cast would wrap or overflow. int32 holds every id and
# is accepted by torch.from_numpy.
X = np.array(preprocess(news_df["content"], SEQ_LEN, tokenizer), dtype=np.int32)

# 详细全文 > 0
# 国内 > 1
# 国际 > 2

def convert_cl(str_lab):
    """Map a news tag string to its integer class id (0, 1 or 2).

    An unrecognised tag is reported on stdout and yields ``None``.
    """
    known_labels = (("详细全文", 0), ("国内", 1), ("国际", 2))
    for label, class_id in known_labels:
        if str_lab == label:
            return class_id
    print("Unknown Label: ", str_lab)
    

# Integer class id for every article tag.
Y = np.array(list(map(convert_cl, news_df["tag"])), dtype=np.uint8)

# First 90% of the rows (in file order) for training, the remainder for validation.
trX, vaX = np.split(X, [int(0.9 * len(X))])
trY, vaY = np.split(Y, [int(0.9 * len(Y))])

# Torch views over the numpy splits.
data_trainX, data_valX = map(torch.from_numpy, (trX, vaX))
data_trainY, data_valY = map(torch.from_numpy, (trY, vaY))

# Quick sanity output: total sizes, training sizes, one training label.
print(len(X), len(Y))
print(len(trX), len(trY))
print(trY[1])


class TextClassficationDataset(Dataset):
    """Pair each row of token ids in ``dataX`` with its label in ``dataY``."""

    def __init__(self, dataX, dataY, seq_len):
        super().__init__()
        self.dataX, self.dataY, self.seq_len = dataX, dataY, seq_len

    def __len__(self):
        # One sample per row of dataX.
        return self.dataX.size(0)

    def __getitem__(self, index):
        # Cut the row to seq_len tokens; cast both tensors to int64 and move them to the GPU.
        tokens = self.dataX[index][:self.seq_len].long().cuda()
        label = self.dataY[index].long().cuda()
        return (tokens, label)
    
# Datasets and endlessly-cycling loaders for the classification task.
train_dataset = TextClassficationDataset(data_trainX, data_trainY, SEQ_LEN)
val_dataset   = TextClassficationDataset(data_valX, data_valY, SEQ_LEN)
train_loader  = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
val_loader    = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))

# optimizer
# The network trained on this task is `transfer` (the copied body with its new
# 3-class head), so the optimizer must own transfer's parameters. An optimizer
# built over `model.parameters()` would never update the classifier.
optim = torch.optim.Adam(transfer.parameters(), lr=LEARNING_RATE)

In [None]:
# Additional preparation
# Cross-entropy loss applied to the classifier logits in the training loop below.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Fine-tune `transfer` on the classification task.
transfer.cuda()

for i in tqdm(range(NUM_BATCHES), desc='training'):
    transfer.train()

    # Gradients from each micro-batch add up in .grad (the loss is not rescaled).
    for __ in range(GRADIENT_ACCUMULATE_EVERY):
        x, y = next(train_loader)
        out = transfer(x)
        # Take first token as CLS
        # NOTE(review): the network was created with causal=True, so position 0
        # presumably only sees the first token. Confirm this pooling is intended.
        out = out[:, 0, :]

        loss = criterion(out, y)
        loss.backward()

    # Reports the loss of the last micro-batch only.
    print(f'training loss: {loss.item()}')

    # Clip the gradients of the network being trained (`transfer`, not `model`).
    torch.nn.utils.clip_grad_norm_(transfer.parameters(), 0.5)
    optim.step()
    optim.zero_grad()

    if i % VALIDATE_EVERY == 0:
        # Put the evaluated network itself into eval mode.
        transfer.eval()
        with torch.no_grad():
            x, y = next(val_loader)
            out = transfer(x)
            out = out[:, 0, :]
            loss = criterion(out, y)
            print(f'validation loss: {loss.item()}')