# Library

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup,AdamW
from transformers import AutoModel,AutoModelForSequenceClassification,AutoTokenizer

scaler = torch.cuda.amp.GradScaler() # Gradient scaler for mixed-precision speed-ups on the GPU. NOTE(review): not referenced by the training functions visible in this file.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Automatically pick the GPU when CUDA is available, otherwise the CPU.
device

# Config

In [None]:
# Hyperparameters and paths shared by every cell below.
# NOTE(review): 'wd', 'valid_step' and 'dropout_rate' are not read by any
# code visible in this file - confirm whether they are still needed.
config = dict(
    lr=2e-5,            # optimizer learning rate
    wd=0.01,
    batch_size=32,      # used for both the train and validation loaders
    valid_step=10,
    max_len=128,        # tokenizer padding / truncation length
    epochs=5,
    nfolds=5,           # number of cross-validation folds
    seed=42,
    dropout_rate=0.2,
    model_dir="../input/roberta-transformers-pytorch/roberta-base",
)

# One checkpoint directory per fold: ./model0, ./model1, ...
for i in range(config['nfolds']):
    os.makedirs(f'./model{i}', exist_ok=True)

# Seed

In [None]:
def random_seed(SEED):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        SEED: Integer seed applied to all generators.
    """
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick a different convolution
    # algorithm from run to run, which undoes the determinism requested on
    # the line above - it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs with the configured seed before any shuffling or model creation.
random_seed(config["seed"])

# Data Import

In [None]:
OUTPUT_DIR = './'

# Raw competition tables plus the CPC code -> title lookup table.
train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
submit = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
titles = pd.read_csv("../input/cpc-codes/titles.csv")

# Train split: attach the title whose `code` equals the row's `context`,
# build the model input "anchor[SEP]target[SEP]title", keep only needed columns.
# NOTE(review): merge() defaults to an inner join, so rows whose context has
# no matching code would be dropped - confirm every context is in titles.
train = train.merge(titles, left_on='context', right_on='code')
train['text'] = (
    train['anchor'] + '[SEP]' + train['target'] + '[SEP]' + train['title']
)
train = train[["id", "anchor", "target", "context", "score", "text"]]

# Test split: same preparation, without the `score` column.
test = test.merge(titles, left_on='context', right_on='code')
test['text'] = (
    test['anchor'] + '[SEP]' + test['target'] + '[SEP]' + test['title']
)
test = test[["id", "anchor", "target", "context", "text"]]

In [None]:
# Preview the first rows of the prepared train and test frames.
display(train.head())
display(test.head())

In [None]:
# Count missing values per column in the train frame.
train.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
test.isnull().sum()

In [None]:
# Show the titles table loaded from cpc-codes/titles.csv.
titles

# Kfold

In [None]:
def make_folds(data, num_splits, num_bins=5):
    """Return a shuffled copy of ``data`` with a stratified ``fold`` column.

    The continuous ``score`` target is cut into ``num_bins`` equal-width bins
    and those bins are used as the stratification label, so each fold gets a
    similar score distribution.

    Args:
        data: DataFrame containing a numeric ``score`` column. It is not
            modified; a shuffled copy is returned instead.
        num_splits: Number of folds.
        num_bins: Number of equal-width score bins used for stratification.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an integer ``fold``
        column holding the index of the fold in which each row is validation
        data.
    """
    # Shuffle first: sample() returns a new frame, so the caller's DataFrame
    # never gets a stray "fold" column written into it. No random_state is
    # passed, so the order depends on NumPy's global RNG state.
    data = data.sample(frac=1).reset_index(drop=True)

    # Discretise the target so StratifiedKFold has class-like labels.
    bins = pd.cut(data["score"], bins=num_bins, labels=False).values

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=config['seed'])

    # -1 marks "unassigned"; every row is overwritten by exactly one fold id.
    fold = np.full(len(data), -1, dtype=np.int64)
    for f, (_, valid_idx) in enumerate(kf.split(X=data, y=bins)):
        fold[valid_idx] = f
    data["fold"] = fold

    return data

# Add the "fold" column that train_loop uses to split train / validation rows.
train = make_folds(train,config['nfolds'])

# DataSet

In [None]:
class USPPPMDataset(Dataset):
    """Dataset that tokenises the ``text`` column once, at construction time.

    Each item is ``(input_ids, attention_mask)`` and, when ``include_labels``
    is true, additionally a float32 label taken from the ``score`` column.
    """

    def __init__(self, df, model_name, include_labels=True):
        """Tokenise ``df["text"]`` with the tokenizer found at ``model_name``.

        Args:
            df: DataFrame with a ``text`` column (and ``score`` when labels
                are requested).
            model_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
            include_labels: Whether items should carry the ``score`` label.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels
        self.text = df["text"].tolist()

        # Every sequence is padded / truncated to config["max_len"] tokens,
        # so all rows have the same length and batch without a collate_fn.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=config["max_len"],
            truncation=True,
            return_attention_mask=True,
        )

        if include_labels:
            self.labels = df["score"].values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx`` (label last, when enabled)."""
        features = tuple(
            torch.tensor(self.encoded[key][idx])
            for key in ('input_ids', 'attention_mask')
        )
        if not self.include_labels:
            return features

        target = torch.tensor(self.labels[idx]).float()
        return (*features, target)

# Model

In [None]:
class USPPPMModel(nn.Module):
    """Sequence-classification backbone with a single sigmoid-squashed output.

    The wrapped model is created with ``num_labels=1``; its logit is passed
    through a sigmoid, so ``forward`` returns one value in (0, 1) per sample.
    """

    def __init__(self, model_name):
        """Load the pretrained backbone from ``model_name`` (name or path)."""
        super().__init__()

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return predictions with one entry per sample in the batch.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Attention mask tensor matching ``input_ids``.
        """
        model_output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Drop only the trailing label dimension. A bare squeeze() would also
        # drop the batch dimension when the batch holds a single sample,
        # giving a 0-d tensor that no longer lines up with the labels and
        # cannot be concatenated with the other batches afterwards.
        output = self.sigmoid(model_output.logits).squeeze(-1)

        return output

In [None]:
# class USPPPMModel(nn.Module):
#     def __init__(self,model_name):
#         super().__init__()

#         self.model = AutoModel.from_pretrained(model_name)
        
#         self.linear = nn.Linear(768, 1)

#     def forward(self,  input_ids, attention_mask):
#         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#         output = torch.sum(self.linear(outputs[0]),dim=1).squeeze()
#         return output

In [None]:
def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"``.

    The seconds part is truncated to a whole number.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "{:d}m {:d}s".format(whole_minutes, int(leftover_seconds))


def timeSince(since, percent):
    """Describe elapsed time and the projected time remaining.

    Args:
        since: Start timestamp, as returned by ``time.time()``.
        percent: Fraction of the work already done; must be non-zero.

    Returns:
        A string of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed time / fraction completed.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return "{} (remain {})".format(asMinutes(elapsed), asMinutes(remaining))

# Train Function

In [None]:
def train_fn(train_loader, model, loss_fn, optimizer, epoch, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable with a length, yielding
            ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress output.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch losses, or NaN when the loader yields no batch.
    """
    start = time.time()
    total_loss = 0.0
    num_batches = len(train_loader)
    steps_done = 0

    # switch to train mode
    model.train()

    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        y_preds = model(input_ids, attention_mask)
        loss = loss_fn(y_preds, labels)

        # record loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        steps_done = step + 1

        if step % 100 == 0 or step == (num_batches - 1):
            print(
                f"Epoch: [{epoch}][{step}/{num_batches}] "
                f"Elapsed {timeSince(start, float(step + 1) / num_batches):s} "
                f"Loss: {total_loss/(step+1):.4f} "
            )

    # An empty loader used to fail with UnboundLocalError on `step`; report
    # "no loss available" explicitly instead.
    if steps_done == 0:
        return float("nan")
    return total_loss / steps_done

In [None]:
def valid_fn(valid_loader, model, loss_fn, device):
    """Evaluate ``model`` on ``valid_loader`` without updating it.

    Args:
        valid_loader: Iterable with a length, yielding
            ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, predictions)``: the mean of the per-batch losses and a
        1-D NumPy array with the model outputs for every sample, in loader
        order. For an empty loader this is ``(nan, empty array)``.
    """
    start = time.time()
    total_loss = 0.0
    num_batches = len(valid_loader)
    steps_done = 0
    preds = []

    # switch to evaluation mode
    model.eval()

    with torch.no_grad():
        for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            y_preds = model(input_ids, attention_mask)
            total_loss += loss_fn(y_preds, labels).item()
            steps_done = step + 1

            # Store the outputs exactly as the loss saw them. The model
            # already applies its sigmoid in forward(), so a second
            # .sigmoid() here would squash the predictions into roughly
            # [0.5, 0.73]. reshape(-1) keeps a batch of one as a 1-element
            # vector so np.concatenate below cannot hit a 0-d array.
            preds.append(y_preds.reshape(-1).to("cpu").numpy())

            if step % 100 == 0 or step == (num_batches - 1):
                print(
                    f"EVAL: [{step}/{num_batches}] "
                    f"Elapsed {timeSince(start, float(step + 1) / num_batches):s} "
                    f"Loss: {total_loss/(step+1):.4f} "
                )

    # Empty loader: nothing to average or concatenate.
    if steps_done == 0:
        return float("nan"), np.empty(0, dtype=np.float32)

    predictions = np.concatenate(preds)
    return total_loss / steps_done, predictions

In [None]:
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two 1-D sequences.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to expose `scipy.stats` on every SciPy version unless some other package
    # happens to have imported it first.
    from scipy import stats

    return stats.pearsonr(y_true, y_pred)[0]

In [None]:
def train_loop(train, fold):
    """Train and validate one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set and
    all other rows the training set. After every epoch the model is scored
    with ``get_score`` on the validation set; whenever the score improves,
    the model weights and validation predictions are saved to
    ``OUTPUT_DIR/model{fold}/model{fold}.pth``.

    Args:
        train: DataFrame with ``fold``, ``text`` and ``score`` columns.
        fold: Index of the fold to hold out for validation.

    Returns:
        The validation rows (index reset) with an added ``preds`` column
        holding the predictions of the best-scoring epoch.

    Raises:
        RuntimeError: If no epoch was run, so there are no predictions.
    """
    print(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    is_valid = train["fold"] == fold
    train_folds = train[~is_valid].reset_index(drop=True)
    valid_folds = train[is_valid].reset_index(drop=True)

    train_dataset = USPPPMDataset(train_folds, config["model_dir"])
    valid_dataset = USPPPMDataset(valid_folds, config["model_dir"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = USPPPMModel(config["model_dir"])
    model.to(device)

    # NOTE(review): config["wd"] is defined but not passed here, so the
    # optimizer runs with its default weight decay, and no LR scheduler is
    # attached. Left as-is to keep training behaviour unchanged - confirm
    # whether that is intended.
    optimizer = AdamW(model.parameters(), lr=config["lr"])

    loss_fn = nn.MSELoss()

    # ====================================================
    # Loop
    # ====================================================
    checkpoint_path = OUTPUT_DIR + f"model{fold}/model{fold}.pth"
    best_score = -1
    best_preds = None
    preds = None

    for epoch in range(config["epochs"]):
        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, loss_fn, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_fn, device)
        valid_labels = valid_folds["score"].values

        # scoring
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time
        print(f"Epoch {epoch} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch} - Score: {score}")

        if score > best_score:
            best_score = score
            # Keep the best predictions in memory so they never have to be
            # read back from disk.
            best_preds = preds
            print(f"Epoch {epoch} - Save Best Score: {best_score:.4f} ")
            torch.save({"model": model.state_dict(), "preds": preds}, checkpoint_path)

    if best_preds is None:
        if preds is None:
            raise RuntimeError("train_loop ran no epochs; check config['epochs']")
        # No epoch beat the initial threshold (e.g. the score was NaN because
        # the predictions were constant). Previously this either crashed on a
        # missing checkpoint or silently loaded a stale one from an earlier
        # run; fall back to the last epoch and still write a checkpoint.
        print(f"fold {fold} - no epoch improved the score; keeping the last epoch")
        best_preds = preds
        torch.save({"model": model.state_dict(), "preds": preds}, checkpoint_path)

    valid_folds["preds"] = best_preds

    return valid_folds

In [None]:
def get_result(result_df):
    """Print and return the score of ``preds`` against ``score``.

    Args:
        result_df: DataFrame with ``preds`` and ``score`` columns.

    Returns:
        The value computed by ``get_score(labels, preds)``.
    """
    predictions = result_df["preds"].values
    targets = result_df["score"].values
    score = get_score(targets, predictions)
    print(f"Score: {score:<.5f}")
    return score

# Train

In [None]:
# Training: run every fold, score it, and keep its out-of-fold predictions.
oof_frames = []
scores = []
for fold in range(config["nfolds"]):
    _oof_df = train_loop(train, fold)
    print(f"========== fold: {fold} result ==========")
    score = get_result(_oof_df)
    scores.append(score)
    oof_frames.append(_oof_df)

# Stitch the per-fold validation frames together. Previously oof_df was
# created empty and never filled, so saving it would have written nothing.
oof_df = pd.concat(oof_frames, ignore_index=True) if oof_frames else pd.DataFrame()

# CV result
print(f"CV: {np.mean(scores)}")

# # Save OOF result
# oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)