Tez can be found here: https://github.com/abhishekkrthakur/tez

Please give it some love by starring the repo!

In [None]:
import sys
sys.path.append("../input/tez-lib/")

In [None]:
import torch

import pandas as pd
import torch.nn as nn

from scipy import stats
from tez import Tez, TezConfig
from tez.callbacks import EarlyStopping
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

In [None]:
class args:
    """Hyperparameters for this notebook, read as plain class attributes."""

    # Pretrained checkpoint name handed to the transformers Auto* loaders.
    model = "anferico/bert-for-patents"

    # Every tokenized pair is padded/truncated to this many tokens.
    max_len = 32

    # Optimisation settings.
    learning_rate = 2e-5
    epochs = 5
    batch_size = 32
    accumulation_steps = 1

In [None]:
class PhraseDataset:
    """Indexable dataset that tokenizes ``(context + " " + anchor, target)`` pairs.

    Args:
        anchor: Indexable collection of anchor phrases.
        target: Indexable collection of target phrases.
        context: Indexable collection of context strings.
        score: Indexable collection of numeric similarity labels.
        tokenizer: Object exposing ``encode_plus`` whose result provides
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length passed to the tokenizer as ``max_length``.

    The four data collections are assumed to be aligned and of equal length;
    only ``anchor`` is consulted for ``__len__``.
    """

    # (key in the returned sample, key in the tokenizer output)
    _ENCODING_FIELDS = (
        ("ids", "input_ids"),
        ("mask", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    def __init__(self, anchor, target, context, score, tokenizer, max_len):
        self.anchor, self.target = anchor, target
        self.context, self.score = context, score
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (length of ``anchor``)."""
        return len(self.anchor)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of tensors.

        Returns:
            dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and a
            float tensor ``score``.
        """
        columns = (self.anchor, self.context, self.target, self.score)
        anchor, context, target, score = (column[item] for column in columns)

        # First segment is the context followed by the anchor; the target is
        # the second segment of the pair.
        encoding = self.tokenizer.encode_plus(
            f"{context} {anchor}",
            target,
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
        )

        sample = {
            out_key: torch.tensor(encoding[enc_key], dtype=torch.long)
            for out_key, enc_key in self._ENCODING_FIELDS
        }
        sample["score"] = torch.tensor(score, dtype=torch.float)
        return sample

In [None]:

class PhraseModel(nn.Module):
    """Pretrained transformer encoder with a single-output regression head.

    The encoder's ``pooler_output`` goes through dropout and a linear layer
    that produces one value per sample. Training uses mean-squared error and
    reports the Pearson correlation between predictions and targets.

    Args:
        model_name: Checkpoint name/path passed to ``AutoConfig`` / ``AutoModel``.
        learning_rate: Learning rate given to AdamW in ``optimizer_scheduler``.
        num_train_steps: Total number of scheduler steps for the linear decay.
        steps_per_epoch: Stored on the instance; not read inside this class.
    """

    def __init__(self, model_name, learning_rate, num_train_steps, steps_per_epoch):
        super().__init__()
        self.learning_rate = learning_rate
        self.model_name = model_name
        self.num_train_steps = num_train_steps
        self.steps_per_epoch = steps_per_epoch

        config = AutoConfig.from_pretrained(model_name)
        # NOTE(review): "add_pooling_layer" and "num_labels" are not read
        # anywhere in this class; kept as-is in case the backbone uses them.
        config.update(
            {
                "output_hidden_states": True,
                "add_pooling_layer": True,
                "num_labels": 1,
            }
        )
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def monitor_metrics(self, outputs, targets):
        """Return ``{"pearsonr": tensor}`` for a batch of predictions/targets.

        Args:
            outputs: Prediction tensor; flattened before the correlation.
            targets: Target tensor; flattened before the correlation.

        Returns:
            dict with a scalar tensor placed on the same device as ``targets``.
        """
        # Use the torch.device object rather than get_device(): the latter
        # returns -1 for CPU tensors, which is not a valid ``device=`` value.
        device = targets.device
        # Cast to float32 first so half-precision outputs (mixed-precision
        # training) are not handed to SciPy as float16.
        outputs = outputs.detach().float().cpu().numpy().ravel()
        targets = targets.detach().float().cpu().numpy().ravel()
        pearsonr = stats.pearsonr(outputs, targets)
        return {"pearsonr": torch.tensor(pearsonr[0], device=device)}

    def optimizer_scheduler(self):
        """Build AdamW plus a linear-decay schedule without warmup.

        Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get no
        weight decay; all others use a decay of 0.01.

        Returns:
            Tuple ``(optimizer, scheduler)``.
        """
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        opt = torch.optim.AdamW(optimizer_parameters, lr=self.learning_rate)
        sch = get_linear_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=self.num_train_steps,
        )
        return opt, sch

    def forward(self, ids, mask, token_type_ids, score=None):
        """Run the encoder and regression head.

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor.
            token_type_ids: Segment id tensor.
            score: Optional target tensor. When omitted (inference), no loss
                or metrics are computed.

        Returns:
            Tuple ``(output, loss, metrics)``. ``loss`` is ``None`` and
            ``metrics`` is an empty dict when ``score`` is ``None``.
        """
        # Keyword arguments avoid depending on the backbone's positional order.
        transformer_out = self.transformer(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        output = self.dropout(transformer_out.pooler_output)
        output = self.output(output)

        if score is None:
            return output, None, {}

        # Flatten both sides to 1-D so their shapes always match.
        loss = nn.MSELoss()(output.reshape(-1), score.reshape(-1))
        metrics = self.monitor_metrics(output, score)
        return output, loss, metrics

In [None]:
# --- Fold-independent setup ------------------------------------------------
# None of the following depends on the fold index, so it is done once instead
# of being repeated on every iteration of the fold loop.

# Maps the first character of the raw context code to a descriptive title.
context_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

df = pd.read_csv("../input/uspppm-folds/train_folds.csv")
df.context = df.context.apply(lambda x: context_mapping[x[0]])

tokenizer = AutoTokenizer.from_pretrained(args.model)

# --- Per-fold training -----------------------------------------------------
for fold_ in range(10):
    # Rows whose "kfold" equals the current fold are held out for validation.
    train_df = df[df["kfold"] != fold_].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold_].reset_index(drop=True)

    train_dataset = PhraseDataset(
        anchor=train_df.anchor.values,
        target=train_df.target.values,
        context=train_df.context.values,
        score=train_df.score.values,
        tokenizer=tokenizer,
        max_len=args.max_len,
    )
    valid_dataset = PhraseDataset(
        anchor=valid_df.anchor.values,
        target=valid_df.target.values,
        context=valid_df.context.values,
        score=valid_df.score.values,
        tokenizer=tokenizer,
        max_len=args.max_len,
    )

    # Total number of scheduler steps over all epochs.
    num_train_steps = int(len(train_dataset) / args.batch_size / args.accumulation_steps * args.epochs)
    steps_per_epoch = len(train_dataset) / args.batch_size

    model = PhraseModel(
        model_name=args.model,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        steps_per_epoch=steps_per_epoch,
    )

    # convert model to Tez model
    model = Tez(model)

    # Stop when validation Pearson correlation has not improved for 2 epochs;
    # the weights are written to a per-fold file.
    es = EarlyStopping(
        monitor="valid_pearsonr",
        model_path=f"model_f{fold_}.bin",
        patience=2,
        mode="max",
        save_weights_only=True,
    )

    config = TezConfig(
        training_batch_size=args.batch_size,
        validation_batch_size=2 * args.batch_size,
        gradient_accumulation_steps=args.accumulation_steps,
        epochs=args.epochs,
        step_scheduler_after="batch",
        fp16=True,
    )

    # just like keras ;)
    model.fit(
        train_dataset,
        valid_dataset=valid_dataset,
        config=config,
        callbacks=[es],
    )

    # remove break to train all 10 folds
    break