In [None]:
# Install the Hugging Face libraries this notebook imports (`datasets`, `transformers`).
!pip install datasets transformers
# !pip install -U scikit-learn  # sklearn is not needed for prediction

In [None]:
import os
import pandas as pd
import numpy as np
#from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the training CSV into `df`.
df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')

In [None]:
# Display the training frame as the cell output.
df

In [None]:
# Load the title lookup table; its `code` column is matched against `context`
# and its `title` column is used to build the model input text.
df_context = pd.read_csv('../input/patent-title/titles.csv')
df_context

In [None]:
# Left-join the title lookup onto the training rows (context == code) and keep
# only the columns that are used afterwards.
df = df.merge(
    df_context,
    left_on='context',
    right_on='code',
    how='left',
)[['id', 'anchor', 'target', 'context', 'title', 'score']]
df

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the '../input/debertav3small' directory.
tokenizer = AutoTokenizer.from_pretrained('../input/debertav3small')

In [None]:
# First text segment fed to the model: target phrase, the tokenizer's separator
# token, then the lower-cased title. The anchor is supplied separately as the
# second segment when TrainDataset tokenizes a row.
df['input'] = df['target'] + tokenizer.sep_token + df['title'].apply(str.lower)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state pins the split.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    """Map-style dataset yielding tokenized (input, anchor) pairs with a float label.

    Reads the ``input``, ``anchor`` and ``score`` columns of ``df`` and encodes
    each row on access with the module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Text columns are stored as numpy string arrays; scores keep their dtype.
        self.label = df['score'].values
        self.anchor = df['anchor'].values.astype(str)
        self.input = df['input'].values.astype(str)

    def __len__(self):
        return len(self.input)

    def __getitem__(self, item):
        # Encode `input` as the first segment and `anchor` as the second,
        # padded/truncated to a fixed length of 100 tokens.
        encoded = tokenizer(
            self.input[item],
            self.anchor[item],
            max_length=100,
            padding='max_length',
            truncation=True,
        )
        sample = dict(encoded)
        sample['label'] = torch.as_tensor(self.label[item], dtype=torch.float)
        return sample

In [None]:
from datasets import load_metric
# Load the metric script from a local path with the 'stsb' configuration.
# NOTE(review): presumably this reports a 'pearson' entry, since 'pearson' is
# later used as metric_for_best_model — confirm against the metric script.
metric = load_metric('../input/evalglue/glue.py', 'stsb')
metric

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the sequence-classification model from the local directory, requesting
# a single output label (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained('../input/debertav3small', num_labels=1)

In [None]:
from transformers import TrainingArguments, Trainer, get_scheduler, AdamW, get_cosine_schedule_with_warmup

In [None]:
# Training configuration. Checkpoints are written under 'model_test' every
# 100 steps, and the best checkpoint by `metric_name` is reloaded at the end.
metric_name = 'pearson'
batch_size = 128
args = TrainingArguments(
       'model_test',
        evaluation_strategy='steps',
#         evaluation_strategy='epoch',
#         save_strategy='epoch',
        learning_rate = 2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size*2,
        num_train_epochs=1,
        logging_steps=100,
        save_steps=100,
#     This always raised an error (the default is Adam + linear warmup).
#     NOTE(review): lr_scheduler_type presumably expects a scheduler name such
#     as 'cosine' rather than a scheduler object — confirm against the
#     TrainingArguments documentation before re-enabling.
#         lr_scheduler_type=get_cosine_schedule_with_warmup(
# #             name='cosine',
#             optimizer=AdamW(model.parameters()),
#             num_warmup_steps=200,
#             num_training_steps=1140),
        # NOTE(review): 400 warmup steps may exceed the total number of
        # optimizer steps for one epoch at this batch size — confirm.
        warmup_steps=400,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        save_total_limit=1
)

In [None]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, as handed over by ``Trainer``.

    Returns:
        Whatever ``metric.compute`` returns for the flattened predictions and
        labels.
    """
    pred, label = eval_pred
    # The model is built with num_labels=1, so predictions are expected to
    # arrive as an (n, 1) array. Flatten both sides so the metric receives one
    # scalar per example.
    pred = np.asarray(pred).reshape(-1)
    label = np.asarray(label).reshape(-1)
    return metric.compute(predictions=pred, references=label)

In [None]:
# Wrap both splits in TrainDataset and hand everything to the Trainer.
val_dataset = TrainDataset(val_df)
train_dataset = TrainDataset(train_df)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training with the Trainer configured in this notebook.
trainer.train()

## Inference

In [None]:
# Build the test inputs the same way the training inputs are built.
test_df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
# how='left' (as used for the training frame) keeps every test row. The default
# inner join would silently drop rows whose context has no entry in the title
# table, leaving the submission short.
test_df = test_df.merge(df_context, how='left', left_on='context', right_on='code')
test_df = test_df[['id', 'anchor', 'target', 'context', 'title']]
# Lower-case only the title, exactly as in training. Lower-casing the whole
# string would also rewrite the separator token (e.g. "[SEP]" -> "[sep]"), so
# the tokenizer would no longer treat it as a special token.
test_df['input'] = test_df['target'] + tokenizer.sep_token + test_df['title'].apply(str.lower)
test_df

In [None]:
class TestDataset(Dataset):
    """Inference dataset that encodes rows the same way TrainDataset does.

    Args:
        df: Frame with an ``input`` column and, normally, an ``anchor`` column.
        tokenizer: Callable tokenizer used to encode each row.
        max_input_length: Fixed length every encoding is padded/truncated to.

    Each item is a tuple ``(input_ids, token_type_ids, attention_mask)`` of
    ``torch.long`` tensors.
    """

    def __init__(self, df, tokenizer, max_input_length):
        self.text = df['input'].values.astype(str)
        # Training encodes (input, anchor) as a text pair, so the anchor must be
        # supplied at inference too; otherwise the model is scored on inputs it
        # was never trained on. Frames without an 'anchor' column fall back to
        # single-segment encoding.
        self.anchor = df['anchor'].values.astype(str) if 'anchor' in df else None
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text_pair = None if self.anchor is None else self.anchor[item]
        encoded = self.tokenizer(self.text[item],
                                 text_pair,
                                 max_length=self.max_input_length,
                                 padding='max_length',
                                 truncation=True)
        return (torch.as_tensor(encoded['input_ids'], dtype=torch.long),
                torch.as_tensor(encoded['token_type_ids'], dtype=torch.long),
                torch.as_tensor(encoded['attention_mask'], dtype=torch.long))

In [None]:
predictions = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('../input/debertav3small')
te_dataset = TestDataset(test_df, tokenizer, 100)
# Only pin host memory when a GPU is actually present.
te_dataloader = DataLoader(te_dataset,
                           batch_size=100,
                           shuffle=False,
                           pin_memory=torch.cuda.is_available(),
                           drop_last=False)
model = AutoModelForSequenceClassification.from_pretrained('../input/debertav3small', num_labels=1)
# NOTE(review): the checkpoint path is hard-coded. Training is configured with
# load_best_model_at_end=True and save_total_limit=1, so confirm that
# checkpoint-200 exists and is the checkpoint intended for submission.
# map_location='cpu' lets the weights load on machines without a GPU; the
# model is moved to the target device afterwards.
model.load_state_dict(torch.load('./model_test/checkpoint-200/pytorch_model.bin', map_location='cpu'))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def valid_fn(valid_loader, model, device):
    """Run ``model`` over ``valid_loader`` without gradients and stack the logits.

    Args:
        valid_loader: Iterable of ``(input_ids, token_type_ids, attention_mask)``
            tensor batches.
        model: Model with an ``eval()`` method whose call returns an object
            exposing a ``logits`` tensor.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A numpy array holding the per-batch logits concatenated along axis 0.

    Raises:
        ValueError: From ``np.concatenate`` if the loader yields no batches.
    """
    model.eval()
    preds = []
    for batch in valid_loader:
        input_ids, token_type_ids, attention_mask = [t.to(device) for t in batch]
        with torch.no_grad():
            # Pass inputs by keyword so the call does not depend on the
            # positional order of the model's forward() parameters.
            y_preds = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        preds.append(y_preds.logits.to('cpu').numpy())
    return np.concatenate(preds)

In [None]:
# Run inference on whichever device the model's parameters currently live on,
# instead of assuming 'cuda', then flatten the logits to one score per row.
outputs = valid_fn(te_dataloader, model, next(model.parameters()).device)
prediction = outputs.reshape(-1)
prediction

In [None]:
# Clamp the raw scores into [0, 1] in place, then pair them with the test ids.
np.putmask(prediction, prediction <= 0, 0)
np.putmask(prediction, prediction >= 1, 1)
submission = pd.DataFrame({'id': test_df['id'], 'score': prediction})

submission

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

## TEST Model Resource

In [None]:
# from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained('../input/debertav3small', num_labels=1)

## Zip to download

In [None]:
# import os
# import zipfile
# import datetime

# def file2zip(packagePath, zipPath):
#     '''
#   :param packagePath: path of the folder to compress
#   :param zipPath: path of the output zip archive
#   :return:
#   '''
#     zip = zipfile.ZipFile(zipPath, 'w', zipfile.ZIP_DEFLATED)
#     for path, dirNames, fileNames in os.walk(packagePath):
#         fpath = path.replace(packagePath, '')
#         for name in fileNames:
#             fullName = os.path.join(path, name)
#             name = fpath + '\\' + name
#             zip.write(fullName, name)
#     zip.close()

In [None]:
# file2zip("./model_test/checkpoint-2100", "./model_test/checkpoint-2100.zip")