In [1]:
import io
import os
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset, SequentialSampler
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (set_seed,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          BertConfig,
                          BertTokenizer,
                          BertForSequenceClassification)
from helper import b_metrics

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

seed = 1520
set_seed(seed)



In [2]:
label2tag = {
    0: "news_story",
    1: "news_culture",
    2: "news_entertainment",
    3: "news_sports",
    4: "news_finance",
    5: "news_house",
    6: "news_car",
    7: "news_edu",
    8: "news_tech",
    9: "news_military",
    10: "news_travel",
    11: "news_world",
    12: "stock",
    13: "news_agriculture",
    14: "news_game",
}
tag2label = {v: k for k, v in label2tag.items()}

val_ratio = 0.1

# Average 30, 
# n_words = [len(i) for i in texts]
# avg_words = sum(n_words)/len(n_words)
# print(f"Average words = {avg_words}")
max_length = 25

n_labels = len(tag2label)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  and should_run_async(code)


In [3]:
tokenizer = BertTokenizer.from_pretrained('model/vocab.txt')
config = BertConfig.from_pretrained('model/config.json')
model = BertForSequenceClassification.from_pretrained('model/pytorch_model.bin', config=config, num_labels=n_labels)
model.cuda()

  and should_run_async(code)


TypeError: __init__() got an unexpected keyword argument 'num_labels'

In [None]:
def preprocessing(input_text, max_length, tokenizer):
    '''
    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
        - input_ids: list of token ids
        - token_type_ids: list of token type ids
        - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens = True,
        max_length = max_length,
        # truncation = True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
)

In [None]:
path = "./data/toutiao_cat_data.txt"

texts = []
labels = []

token_ids = []
attention_masks = []

with open(path) as f:
    for line in f.readlines():
        split = line.split("_!_")
        texts.append(split[3])
        labels.append(int(split[1]))
    
    texts = texts[:10000]
    labels = labels[:10000]

for sample in tqdm(texts):
    encoding_dict = preprocessing(sample, max_length, tokenizer)
    token_ids.append(encoding_dict["input_ids"])
    attention_masks.append(encoding_dict["attention_mask"])

token_ids = torch.cat(token_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [None]:
for item in token_ids[:10]:
    print(len(item))

In [None]:
# Calculating average words in a sentence.
n_words = [len(i) for i in texts]
n_words = np.array(n_words)
print(np.mean(n_words), np.std(n_words))

In [None]:
batch_size = 4
epochs = 5

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_ids[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_ids[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def train(epoch, dataloader, optimizer, scheduler, device):
    # Take global model.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    # Total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    with tqdm(dataloader, total=len(dataloader), unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch}/{epochs}")
        for batch in tepoch:

            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass
            # TODO: Check what's the return structure looks like.
            train_output = model(
                b_input_ids, 
                token_type_ids = None, 
                attention_mask = b_input_mask, 
                labels = b_labels)
            # Backward pass
            train_output.loss.backward()
            optimizer.step()
            scheduler.step()
            # Update tracking variables
            loss = train_output.loss.item()
            total_loss += loss

            tepoch.set_postfix(loss=loss)
    
    avg_epoch_loss = total_loss / len(dataloader)

    return avg_epoch_loss

def validation(dataloader, device):
    global model

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    #total loss for this epoch.
    total_loss = 0

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluatdion.
    model.eval()

    with tqdm(dataloader, total=len(dataloader), unit="batch") as tepoch:
        for batch in tepoch:

            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                eval_output = model(
                    b_input_ids, 
                    token_type_ids = None, 
                    attention_mask = b_input_mask, 
                )
            logits = eval_output.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate validation metrics
            b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
            val_accuracy.append(b_accuracy)
            # Update precision only when (tp + fp) !=0; ignore nan
            if b_precision != 'nan': val_precision.append(b_precision)
            # Update recall only when (tp + fn) !=0; ignore nan
            if b_recall != 'nan': val_recall.append(b_recall)
            # Update specificity only when (tn + fp) !=0; ignore nan
            if b_specificity != 'nan': val_specificity.append(b_specificity)

    
    return val_accuracy, val_precision, val_recall, val_specificity

In [None]:
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(
    model.parameters(),
    lr = 2e-5, # default is 5e-5, our notebook had 2e-5
    eps = 1e-8 # default is 1e-8.
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Metrics
all_loss = {
    "train_loss": [],
    "valid_loss": []
}
all_acc = {
    "train_acc": [],
    "valid_acc": []
}
all_precision = {
    "train_precision": [],
    "valid_precision": []
}
all_recall = {
    "train_recall": [],
    "valid_recall": []
}
all_specificity = {
    "train_specificity": [],
    "valid_specificity": []
}

for epoch in range(1, epochs+1):
    train_loss = train(epoch, train_dataloader, optimizer, scheduler, device)

    val_accuracy, val_precision, val_recall, val_specificity = validation(
        validation_dataloader,
        device
    )

    epoch_loss = train_loss / epoch*len(train_dataloader)
    epoch_acc = sum(val_accuracy)/len(val_accuracy)
    epoch_precision = sum(val_precision)/len(val_precision)
    epoch_recall = sum(val_recall)/len(val_recall)
    epoch_specificity = sum(val_specificity)/len(val_specificity)

    all_loss['train_loss'].append(epoch_loss) 
    all_acc['valid_acc'].append(epoch_acc)
    all_precision['valid_precision'].append(epoch_precision)
    all_recall['valid_recall'].append(epoch_recall)
    all_specificity['valid_specificity'].append(epoch_specificity)

    print('\n\t - Train loss: {:.4f}'.format(epoch_loss))
    print('\t - Validation Accuracy: {:.4f}'.format(epoch_acc))
    print('\t - Validation Precision: {:.4f}'.format(epoch_precision) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(epoch_recall) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(epoch_specificity) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


plot_dict(all_loss, use_xlabel="Epochs", use_ylabel="Value", use_linestyles=['-', '--'])
plot_dict(all_acc, use_xlabel="Epochs", use_ylabel="Value", use_linestyles=['-', '--'])
plot_dict(all_precision, use_xlabel="Epochs", use_ylabel="Value", use_linestyles=['-', '--'])
plot_dict(all_recall, use_xlabel="Epochs", use_ylabel="Value", use_linestyles=['-', '--'])
plot_dict(all_specificity, use_xlabel="Epochs", use_ylabel="Value", use_linestyles=['-', '--'])