In [None]:
import os
import gc
import math
import time
import random
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from dataclasses import dataclass, field
from typing import List, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

import warnings 
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: integer seed handed to every generator (default 42).
    """
    # Exported as a string for anything that reads PYTHONHASHSEED from the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The same seed goes to each generator in turn.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Turn off cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(seed=2019)

In [None]:
@dataclass(frozen=True)
class CFG:
    """Run-wide configuration; the visible code reads these as class attributes (e.g. ``CFG.lr``)."""

    # DataLoader worker processes.
    num_workers: Optional[int] = 4
    # Hugging Face checkpoint name used for the config, the model and the tokenizer.
    model_name: Optional[str] = 'roberta-large'
    # Training epochs per fold.
    epochs: Optional[int] = 5
    # Not referenced by the visible code.
    T_max: Optional[int] = 10
    # T_0 / eta_min of the CosineAnnealingWarmRestarts scheduler.
    T_0: Optional[int] = 10
    lr: Optional[float] = 2e-5
    min_lr: Optional[float] = 1e-6
    batch_size: Optional[int] = 32
    # Default weight decay handed to the optimizer constructor.
    weight_decay: Optional[float] = 1e-6
    # Micro-batches accumulated before each optimizer step.
    gradient_accumulation_steps: Optional[int] = 1
    # Max norm passed to clip_grad_norm_.
    max_grad_norm: Optional[int] = 1000
    # Token length every encoded pair is padded / truncated to.
    max_len: Optional[int] = 128
    seed: Optional[int] = 2019
    # Not referenced by the visible code.
    target_size: Optional[int] = 1
    # Width of the regression head's output.
    num_targets: Optional[int] = 1
    n_folds: Optional[int] = 5
    # Mixed-precision training switch.
    fp16: Optional[bool] = True
    # Enables the Weights & Biases logging cells.
    wandb: Optional[bool] = True
    _wandb_kernel: Optional[str] = 'santos'
    competition: Optional[str] = 'PPPM'
    # Built from model_name (not a second hard-coded checkpoint string) so the
    # tokenizer can never drift out of sync with the model being fine-tuned.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:

    import wandb

    # Log in with the Kaggle secret when it is available; otherwise fall back to
    # an anonymous run. Only ordinary exceptions are caught so that Ctrl-C /
    # SystemExit are not silently swallowed the way a bare `except:` would.
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except Exception:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return a dict of every attribute of ``f`` whose name does not start with ``__``."""
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    # One W&B run for the whole notebook, named and grouped by the model checkpoint.
    run = wandb.init(project='PPPM-Public', 
                     name=CFG.model_name,
                     config=class2dict(CFG),
                     group=CFG.model_name,
                     job_type="train",
                     anonymous=anony)

In [None]:
# Competition input directory and the three CSV tables read from it.
PATH = '../input/us-patent-phrase-to-phrase-matching'
train, test, sub = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# (rows, columns) of the train, test and sample-submission tables.
train.shape, test.shape, sub.shape

In [None]:
def create_folds(data, num_splits):
    """Shuffle ``data`` and assign each row a stratified cross-validation fold.

    The continuous ``score`` column is cut into equal-width bins and those bin
    ids are used as the stratification label for ``StratifiedKFold``.

    Args:
        data: DataFrame with a ``score`` column. It is not modified.
        num_splits: number of folds.

    Returns:
        A shuffled copy of ``data`` (fresh 0..n-1 index) with an extra integer
        ``fold`` column holding the validation fold of each row.
    """
    # Shuffle into a new frame *before* adding any column, so the caller's
    # DataFrame is left untouched. Row order depends on the global NumPy RNG
    # state because no random_state is passed to sample().
    data = data.sample(frac=1).reset_index(drop=True)
    data["fold"] = -1

    # Number of equal-width score bins: floor(1 + log2(n)).
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Kept as a standalone Series; it never needs to live on the frame.
    bins = pd.cut(data["score"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    for f, (_, v_) in enumerate(kf.split(X=data, y=bins.values)):
        # v_ are positional indices, which equal the labels after reset_index.
        data.loc[v_, 'fold'] = f

    return data

In [None]:
#train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
   
# Title for each single-letter key; the first character of a context value selects it.
context_mapping = {
        "A": "Human Necessities",
        "B": "Operations and Transport",
        "C": "Chemistry and Metallurgy",
        "D": "Textiles",
        "E": "Fixed Constructions",
        "F": "Mechanical Engineering",
        "G": "Physics",
        "H": "Electricity",
        "Y": "Emerging Cross-Sectional Technologies",
}

# Values that are already titles are left alone, so re-running this cell is a
# no-op. Without the guard a second run would map "Human Necessities" through
# its first letter "H" to "Electricity", or raise KeyError for titles whose
# first letter is not a key.
_context_titles = set(context_mapping.values())
train["context"] = train["context"].apply(
    lambda x: x if x in _context_titles else context_mapping[x[0]]
)

In [None]:
# Shuffled copy of the training table with a `fold` column for cross-validation.
df = create_folds(train, num_splits=CFG.n_folds)

In [None]:
# Preview the first rows of the folded training table.
df.head()

In [None]:
class PhraseDataset:
    """Map-style dataset that tokenizes (context + anchor, target) pairs on access.

    Args:
        anchor: indexable sequence of anchor phrases.
        target: indexable sequence of target phrases.
        context: indexable sequence of context strings.
        score: indexable sequence of numeric labels.
        tokenizer: object exposing ``encode_plus(text, text_pair, padding=...,
            max_length=..., truncation=...)`` and returning a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: length every encoded pair is padded / truncated to.
    """

    def __init__(self, anchor, target, context, score, tokenizer, max_len):
        self.anchor = anchor
        self.target = target
        self.context = context
        self.score = score
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples (length of the anchor sequence)."""
        return len(self.anchor)

    def __getitem__(self, item):
        """Encode example ``item`` and return ``ids``, ``mask`` and ``score`` tensors."""
        anchor = self.anchor[item]
        context = self.context[item]
        target = self.target[item]
        score = self.score[item]

        # Use the tokenizer this dataset was constructed with rather than the
        # global CFG.tokenizer, so the constructor argument is actually honoured.
        encoded_text = self.tokenizer.encode_plus(
            context + " " + anchor,
            target,
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
        )
        input_ids = encoded_text["input_ids"]
        attention_mask = encoded_text["attention_mask"]

        return {
            "ids": torch.tensor(input_ids, dtype=torch.long),
            "mask": torch.tensor(attention_mask, dtype=torch.long),
            "score": torch.tensor(score, dtype=torch.float),
        }

In [None]:
def train_fn(model, train_dataloader, optimizer, scheduler, loss_fn=None, fp16=False):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: module called as ``model(ids, mask)``.
        train_dataloader: yields dicts with ``ids``, ``mask`` and ``score`` tensors.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: LR scheduler; stepped once per optimizer step.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        fp16: when True, run the forward pass under autocast and use loss scaling.

    Returns:
        Mean (un-accumulated) batch loss over the dataloader.
    """
    model.train()

    # A disabled scaler turns scale / unscale_ / update into no-ops and makes
    # step() a plain optimizer.step(), so a single code path serves both modes.
    scaler = torch.cuda.amp.GradScaler(enabled=fp16)
    accum_steps = CFG.gradient_accumulation_steps

    train_loss = 0

    for step, data in enumerate(train_dataloader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['score'].to(device, dtype=torch.float)

        with torch.cuda.amp.autocast(enabled=fp16):
            output = model(ids, mask)
            loss = loss_fn(output.squeeze(), targets.squeeze())

        # Report the loss before it is divided for accumulation.
        train_loss += loss.item()

        if accum_steps > 1:
            loss = loss / accum_steps

        scaler.scale(loss).backward()

        if (step + 1) % accum_steps == 0:
            # Gradients must be unscaled before clipping, otherwise the norm
            # threshold is compared against loss-scaled values. Clipping is done
            # once per optimizer step, on the fully accumulated gradient.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            optimizer.zero_grad()

    return train_loss/len(train_dataloader)


def valid_fn(model, valid_dataloader, loss_fn=None):
    """Evaluate ``model`` on ``valid_dataloader`` without gradient tracking.

    Args:
        model: module called as ``model(ids, mask)``; switched to eval mode here.
        valid_dataloader: yields dicts with ``ids``, ``mask`` and ``score`` tensors.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.

    Returns:
        ``(mean_batch_loss, predictions)`` where ``predictions`` is a flat NumPy
        array of sigmoid-activated model outputs in dataloader order.
    """
    model.eval()

    running_loss = 0
    batch_preds = []

    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attn_mask = batch['mask'].to(device, dtype=torch.long)
            labels = batch['score'].to(device, dtype=torch.float)

            logits = model(input_ids, attn_mask)
            running_loss += loss_fn(logits.squeeze(), labels.squeeze()).item()

            # The loss is computed on raw outputs; stored predictions are sigmoid-activated.
            batch_preds.append(logits.sigmoid().detach().cpu().numpy().ravel())

    mean_loss = running_loss / len(valid_dataloader)
    return mean_loss, np.concatenate(batch_preds)

In [None]:
class PatentModel(torch.nn.Module):
    """Transformer encoder with a mean-pooled, multi-sample-dropout linear head.

    The encoder checkpoint is ``CFG.model_name``; the head maps the pooled
    hidden state to ``CFG.num_targets`` outputs.
    """

    def __init__(self):
        super().__init__()
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(CFG.model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
            }
        )

        self.transformer = AutoModel.from_pretrained(CFG.model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout rates for multi-sample dropout; they share one linear head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, CFG.num_targets)

    def forward(self, ids, mask):
        """Return the head's output for a batch of token ids and attention masks.

        Args:
            ids: token-id tensor passed to the encoder as ``input_ids``.
            mask: tensor passed to the encoder as ``attention_mask``.

        Returns:
            Average of the linear head applied to five differently-dropped
            copies of the pooled hidden state. In eval mode dropout is the
            identity, so this equals a single pass through the head.
        """
        transformer_out = self.transformer(input_ids=ids, attention_mask=mask)
        last_hidden_states = transformer_out[0]
        # NOTE(review): this mean runs over every position on dim 1, padding
        # included; a mask-weighted mean may be preferable. Left unchanged so
        # existing checkpoints keep producing the same outputs.
        pooled = self.dropout(torch.mean(last_hidden_states, 1))

        logits1 = self.output(self.dropout1(pooled))
        logits2 = self.output(self.dropout2(pooled))
        logits3 = self.output(self.dropout3(pooled))
        logits4 = self.output(self.dropout4(pooled))
        logits5 = self.output(self.dropout5(pooled))

        # Return the averaged logits. Previously a trailing
        # `logits = self.output(last_hidden_states)` overwrote this average,
        # which made the five dropout branches dead code.
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [None]:
def _build_phrase_loader(frame, shuffle, drop_last):
    """Wrap one split's rows in a PhraseDataset and a DataLoader configured from CFG."""
    dataset = PhraseDataset(
        frame.anchor.values,
        frame.target.values,
        frame.context.values,
        frame.score.values,
        CFG.tokenizer,
        CFG.max_len,
    )
    return DataLoader(dataset,
                      batch_size=CFG.batch_size,
                      shuffle=shuffle,
                      num_workers=CFG.num_workers, pin_memory=True, drop_last=drop_last)


def run_fold(df, fold=0, seed=42):
    """Train and validate one cross-validation fold.

    Rows with ``df.fold == fold`` form the validation split and all others the
    training split. After every epoch the Pearson correlation between
    validation targets and predictions is computed; whenever it improves, the
    model weights are saved to ``<model_name>_patent_model_<fold>.pth``.

    Args:
        df: DataFrame with ``anchor``, ``target``, ``context``, ``score`` and ``fold`` columns.
        fold: index of the validation fold.
        seed: seed applied through ``seed_everything`` before anything is built.

    Returns:
        Array of shape ``(n_valid, 2)``: column 0 holds the predictions of the
        best-scoring epoch (the one whose checkpoint was saved), column 1 the targets.
    """
    # -inf rather than 0 so a fold whose scores are all <= 0 still saves a checkpoint.
    best_score = -np.inf
    best_preds = None

    seed_everything(seed)

    df_train = df.loc[df.fold != fold].reset_index(drop=True)
    df_valid = df.loc[df.fold == fold].reset_index(drop=True)

    valid_targets = df_valid['score'].values

    train_loader = _build_phrase_loader(df_train, shuffle=True, drop_last=True)
    valid_loader = _build_phrase_loader(df_valid, shuffle=False, drop_last=False)

    model = PatentModel()
    model.to(device)

    criterion = nn.BCEWithLogitsLoss(reduction="mean")

    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. The previous
    # list was ["bias", "LayerNorm.bias"], whose second entry is already covered
    # by "bias" and which left LayerNorm weights in the decayed group.
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

    # NOTE(review): each group sets its own weight_decay, which presumably takes
    # precedence over the CFG.weight_decay default passed here - confirm which
    # value is intended.
    optimizer = AdamW(optimizer_parameters, lr=CFG.lr, weight_decay=CFG.weight_decay)

    # NOTE(review): train_fn steps the scheduler once per optimizer step, so T_0
    # is counted in steps, not epochs - confirm CFG.T_0 is meant that way.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1
    )

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_loss = train_fn(model, train_loader, optimizer, scheduler, loss_fn=criterion, fp16=CFG.fp16)
        valid_loss, valid_preds = valid_fn(model, valid_loader, loss_fn=criterion)
        score = stats.pearsonr(valid_targets, valid_preds)[0]
        elapsed = time.time() - start_time

        print(f'Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}  avg_val_loss: {valid_loss:.4f}')
        print(f'Epoch {epoch+1} - pearson score: {score:.4f} time: {elapsed:.0f}s')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": train_loss, 
                       f"[fold{fold}] avg_val_loss": valid_loss,
                       f"[fold{fold}] score": score})
        if score > best_score:
            print(f'Validation Score Improved {best_score} ---> :{score} Save Model!!!')
            best_score = score
            # Keep the predictions that belong to the checkpoint being saved.
            best_preds = valid_preds
            torch.save(model.state_dict(), f'{CFG.model_name.replace("-", "_")}_patent_model_{fold}.pth')

    # If no epoch ever registered as an improvement (e.g. every score was NaN),
    # fall back to the last epoch's predictions.
    if best_preds is None:
        best_preds = valid_preds

    oof_preds = np.concatenate((best_preds.reshape(-1, 1), valid_targets.reshape(-1, 1)), axis=1)

    # Drop the model and release cached GPU memory before the next fold.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return oof_preds

In [None]:
def train_model(train, seed):
    """Run every fold in order and stack the per-fold out-of-fold arrays.

    Args:
        train: DataFrame forwarded unchanged to ``run_fold``.
        seed: seed forwarded to every ``run_fold`` call.

    Returns:
        The ``run_fold`` results for folds ``0 .. CFG.n_folds - 1`` concatenated
        along the first axis, in fold order.
    """
    fold_outputs = [
        run_fold(train, fold_idx, seed)
        for fold_idx in range(CFG.n_folds)
    ]
    return np.concatenate(fold_outputs)

In [None]:
# Entry point: train all folds, then save the stacked out-of-fold array to disk.
if __name__ == '__main__':
    oof_preds =  train_model(df, CFG.seed) 
    # Written to patent_oof.npy in the current working directory.
    np.save('patent_oof.npy', oof_preds)