## Simple PyTorch Inference
### For the training notebook, see: https://www.kaggle.com/code/bibhabasumohapatra/train-us-patent-part-1-simple-pytorch

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from itertools import chain

In [None]:
class config:
    """Settings shared by the dataset, model and inference cells.

    The tokenizer is loaded once, while the class body executes, from the
    same local directory that holds the pretrained model.
    """

    # Fall back to CPU when no GPU is visible so the notebook still runs.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Local directory with the pretrained model and tokenizer files.
    model = "../input/bert-for-patents/bert-for-patents"
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Every encoded pair is padded/truncated to this many tokens.
    max_len = 128
    # NOTE(review): the remaining values are not read by the inference code
    # in this notebook -- presumably carried over from the training notebook.
    folds = 5
    train_batch_size = 16
    valid_batch_size = 16
    epochs = 6
    lr = 2e-5


# Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset of (text, target) string pairs tokenized on access.

    Each sample is encoded as a sentence pair with ``config.tokenizer`` and
    padded or truncated to ``config.max_len`` tokens.
    """

    def __init__(self, text, targets):
        """Store the two parallel, index-aligned sequences of strings."""
        self.text, self.targets = text, targets

    def __len__(self):
        """Return the number of samples (length of ``text``)."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors.

        Returns:
            dict: ``"ids"``, ``"mask"`` and ``"token_type_ids"``, each a
            ``torch.long`` tensor built from the tokenizer output.
        """
        encoding = config.tokenizer.encode_plus(
            self.text[item],
            self.targets[item],
            padding="max_length",
            max_length=config.max_len,
            truncation=True,
        )

        # (key in the returned sample, key in the tokenizer output)
        field_names = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        return {
            out_key: torch.tensor(encoding[src_key], dtype=torch.long)
            for out_key, src_key in field_names
        }

# Model

In [None]:
class PhraseModel(nn.Module):
    """Pretrained transformer followed by dropout and a one-unit linear head."""

    def __init__(self):
        """Load the transformer from ``config.model`` and build the head."""
        super().__init__()

        # Config overrides; credits
        # https://www.kaggle.com/code/abhishek/tez-training-phrase-matching
        overrides = {
            "output_hidden_states": True,
            "add_pooling_layer": True,
            "num_labels": 1,
        }
        backbone_config = AutoConfig.from_pretrained(config.model)
        backbone_config.update(overrides)

        # These attribute names determine the state-dict keys, so they must
        # stay as they are for saved checkpoints to load.
        self.transformer = AutoModel.from_pretrained(config.model, config=backbone_config)
        self.dropout = nn.Dropout(backbone_config.hidden_dropout_prob)
        self.output = nn.Linear(backbone_config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return the linear head's output for the pooled transformer output.

        The three tensors are passed positionally to the transformer in the
        order ``ids, mask, token_type_ids``. No activation is applied here.
        """
        pooled = self.transformer(ids, mask, token_type_ids).pooler_output
        return self.output(self.dropout(pooled))

In [None]:
def inference_fn(data_loader, model, device):
    """Run ``model`` over every batch and collect sigmoid-activated outputs.

    The model is switched to evaluation mode for the duration of the call so
    that dropout layers are disabled (``torch.inference_mode`` alone only
    turns off gradient tracking). Its previous training/eval mode is restored
    before returning.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with
            ``'ids'``, ``'mask'`` and ``'token_type_ids'`` tensors.
        model: ``nn.Module`` called as
            ``model(ids=..., mask=..., token_type_ids=...)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        list: One entry per sample, in loader order, as produced by
        ``tolist()`` on each batch of outputs (so each entry is itself a
        list when the model output has more than one dimension).
    """
    final_outputs = []

    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for data in data_loader:
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

                outputs = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids,
                )

                # Squash the raw outputs into (0, 1) and move them to
                # plain Python lists on the host.
                outputs = torch.sigmoid(outputs).detach().cpu().numpy().tolist()
                final_outputs.extend(outputs)
    finally:
        # Leave the caller's model in the mode it was in before the call.
        model.train(was_training)

    return final_outputs

# manage CSV file and folds

In [None]:
# First letter of a context code -> title used as text for that context.
context_mapping = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)

df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# Only the leading character of the context code selects the title.
df["context"] = df["context"].map(lambda code: context_mapping[code[0]])
# Model input text is "<context title> <anchor>".
df["text"] = df["context"] + " " + df["anchor"]
# The source columns are no longer needed once merged into "text".
df = df.drop(["context", "anchor"], axis=1)
## folds from https://www.kaggle.com/code/abhishek/phrase-matching-folds


In [None]:
# Show the first rows of the prepared frame as the cell output.
df.head()

## Main

In [None]:
# Build the model and restore the saved checkpoint weights.
model = PhraseModel()
model.to(config.device)

# map_location remaps the stored tensors onto the configured device, so a
# checkpoint saved on a different device can still be loaded here.
state_dict = torch.load(
    "../input/train-us-patent-part-1-simple-pytorch/model-epoch4-fold-0.pth",
    map_location=config.device,
)
model.load_state_dict(state_dict)
# Evaluation mode disables dropout so predictions are deterministic.
model.eval()

test_dataset = CustomDataset(text=df.text.values, targets=df.target.values)
# shuffle=False keeps predictions aligned with the row order of df.
test_loader = DataLoader(test_dataset, batch_size=config.valid_batch_size, shuffle=False)

outputs = inference_fn(test_loader, model, config.device)
# Flatten the per-sample lists into one flat list of scores.
outputs = list(chain.from_iterable(outputs))

# Clamp every score to the closed interval [0, 1].
final_preds = [min(max(pred, 0), 1) for pred in outputs]

In [None]:
# Fill the sample submission with the predictions and write it out.
submission = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# Use the clipped predictions; previously the unclipped `outputs` were
# written and the clipping result was never used.
submission['score'] = final_preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Show the first rows of the submission frame as the cell output.
submission.head()