### This notebook analyzes what `Tez` does under the hood in the original notebook from @Abhishek Thakur

(I was a little disappointed not to find any documentation for the Tez library, but thankfully its source code is clean and readable.)

In [None]:
# this cell is exactly the same as in the original notebook: the familiar PyTorch/Transformers routines
import sys
sys.path.append("../input/tez-lib/")

import torch

import pandas as pd
import torch.nn as nn

from scipy import stats
from tez import Tez, TezConfig
from tez.callbacks import EarlyStopping
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

class args:
    """Static settings for this notebook, read as plain class attributes."""

    # Directory of the pretrained checkpoint; used for both the tokenizer
    # and the transformer backbone.
    model = "../input/anferico-bert-for-patents/"

    # Every encoded text pair is padded/truncated to this many tokens.
    max_len = 32

    # The remaining values are not referenced by the code visible in this
    # notebook.
    batch_size = 64
    accumulation_steps = 1
    epochs = 5
    learning_rate = 2e-5
    
class PhraseDataset:
    """Indexable dataset that tokenizes one (context + anchor, target) pair per item.

    ``anchor``, ``target`` and ``context`` are parallel indexable sequences
    of strings; ``tokenizer`` must provide ``encode_plus``.
    """

    def __init__(self, anchor, target, context, tokenizer, max_len):
        self.anchor, self.target, self.context = anchor, target, context
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # The anchor sequence defines the number of samples.
        return len(self.anchor)

    def __getitem__(self, item):
        """Return the ``ids`` / ``mask`` / ``token_type_ids`` long tensors for one sample."""
        anchor, context, target = (
            self.anchor[item],
            self.context[item],
            self.target[item],
        )

        # First segment is "<context> <anchor>", second segment is the target.
        first_segment = context + " " + anchor
        encoded = self.tokenizer.encode_plus(
            first_segment,
            target,
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
        )

        # Output key -> key in the tokenizer's result.
        key_pairs = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in key_pairs
        }
    
class PhraseModel(nn.Module):
    """Pretrained transformer followed by dropout and a one-unit linear head.

    Args:
        model_name: Name or path handed to ``AutoConfig.from_pretrained`` and
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

        config = AutoConfig.from_pretrained(model_name)
        config.update(
            {
                "output_hidden_states": True,
                # NOTE(review): stored on the config as-is; whether the
                # backbone reads this key is not visible here -- confirm.
                "add_pooling_layer": True,
                "num_labels": 1,
            }
        )
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the backbone and project its pooled output to a single value.

        Returns:
            A 3-tuple ``(output, 0, {})``. The trailing ``0`` and ``{}`` are
            placeholders kept for the Tez template; the pure-PyTorch
            ``predict`` in this notebook discards them.
        """
        # Pass the inputs by keyword: the positional order of ``forward``
        # differs between architectures that ``AutoModel`` can resolve to,
        # so positional passing could silently bind a tensor to the wrong
        # parameter.
        transformer_out = self.transformer(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(transformer_out.pooler_output)
        output = self.output(pooled)
        return output, 0, {}
    
# Load the competition test set.
df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# First character of a context code -> human-readable section title.
context_mapping = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)


def _context_title(code):
    """Look up the section title for a context code by its first character."""
    return context_mapping[code[0]]


# Replace each context code with its section title.
df["context"] = df["context"].apply(_context_title)

tokenizer = AutoTokenizer.from_pretrained(args.model)
test_dataset = PhraseDataset(
    anchor=df["anchor"].values,
    target=df["target"].values,
    context=df["context"].values,
    tokenizer=tokenizer,
    max_len=args.max_len,
)

model = PhraseModel(model_name=args.model)

### Analyzing Tez-specific code

In [None]:

# `Tez` keeps two important attributes, both printed below:
# the PyTorch module (`model`) and the settings object (`config`).
tez_model = Tez(model)
print(type(tez_model.model), tez_model.config)

In [None]:
# According to the author's reading of the Tez source, `load` below:
#   1. stores the config:     tez_model.config = ...
#   2. moves the model:       model.to(torch.device("cuda:0"))
#   3. restores the weights:  model.load_state_dict(torch.load(model_path, map_location='cuda'))
model_path = "../input/uspppm-tez-models/model_f0.bin"
config = TezConfig(
    test_batch_size=64,
    device="cuda",
)
tez_model.load(model_path, config=config, weights_only=True)
print(tez_model.config)
# One important observation: TezConfig carries many default settings, which we may want to control explicitly.


In [None]:
# Finally, run prediction (this looks like a clean API).
# According to the author's reading of the Tez source, `predict` does the following:
# 1. builds a data loader: data_loader = DataLoader(dataset,  batch_size=batch_size, num_workers=-1, sampler=sampler, collate_fn=collate_fn, pin_memory=False,)
# 2. simply calls the PyTorch model on each batch coming from data_loader
# NOTE: `preds_iter` is reassigned in the pure-PyTorch section below without being iterated in between.
preds_iter = tez_model.predict(test_dataset)

### Using Pure Pytorch

In [None]:
# Plain PyTorch version: move the model to the GPU and restore the weights.
from torch.utils.data import DataLoader

model_path = "../input/uspppm-tez-models/model_f0.bin"
model.to(torch.device("cuda:0"))
state_dict = torch.load(model_path, map_location="cuda")
model.load_state_dict(state_dict)
def predict(model, dataset, batch_size, device=None):
    """Yield the model's predictions for ``dataset`` one batch at a time.

    Args:
        model: ``nn.Module`` whose forward accepts the collated batch dict as
            keyword arguments and returns a 3-tuple; only the first element
            is used (the other two exist for the Tez template).
        dataset: Anything ``DataLoader`` accepts whose samples collate into a
            dict of tensors.
        batch_size: Number of samples per batch.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-free model), so
            the inputs always land where the model already lives.

    Yields:
        numpy.ndarray: the first element of the model output for each batch,
        moved to the CPU.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=0,
        sampler=None,
        collate_fn=None,
        pin_memory=False,
    )

    model.eval()

    for data in data_loader:
        batch = {key: value.to(device) for key, value in data.items()}
        # Keep the no_grad scope around the forward pass only. Yielding from
        # inside it would leave gradient tracking disabled in the caller's
        # code between iterations.
        with torch.no_grad():
            output, _, _ = model(**batch)
        yield output.detach().cpu().numpy()

test_batch_size = 64
final_preds = []
preds_iter = predict(model, test_dataset, test_batch_size)
for preds in preds_iter:
    # Clamp each batch in place to the [0, 1] range, then flatten it into
    # the running list of scores.
    too_low, too_high = preds < 0, preds > 1
    preds[too_low] = 0
    preds[too_high] = 1
    final_preds += preds.reshape(-1).tolist()

In [None]:
# Fill the sample submission with the predicted scores and write it out.
sample_submission = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# Use bracket assignment: attribute-style assignment (`frame.score = ...`)
# only updates a column that already exists and otherwise sets a plain
# Python attribute, which `to_csv` would silently leave out.
sample_submission["score"] = final_preds
sample_submission.to_csv("submission.csv", index=False)

In [None]:
sample_submission.head() # I have checked that the result is identical to the original notebook's output