In [None]:
import re
import numpy as np 
import pandas as pd 
import os
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import StratifiedKFold 
from transformers import AutoTokenizer, AutoConfig, TFAutoModel 
import gc
# Drop any Keras global state left over from a previous run of this cell
# in the same kernel before models are built.
tf.keras.backend.clear_session()
from transformers import logging
# Limit transformers' log output to error-level messages.
logging.set_verbosity_error()
# Switch for the training cell: its `if TRAIN:` block only runs when this is
# True. With False the notebook skips training and goes straight to the
# inference cell, which loads previously saved fold weights.
TRAIN = False

In [None]:
# https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train

def get_cpc_texts(scheme_dir='../input/cpc-data/CPCSchemeXML202105',
                  title_dir='../input/cpc-data/CPCTitleList202202'):
    """Build a lookup from CPC class codes to descriptive titles.

    Class codes (one capital letter followed by digits, e.g. ``A01``) are
    harvested from the file names found in ``scheme_dir``. For every code,
    the title of its section (the code's first letter) and the code's own
    title are read from that section's title list in ``title_dir`` and joined
    as ``"<section title>. <class title>"``.

    Args:
        scheme_dir: Directory whose file names contain the CPC codes.
        title_dir: Directory holding one ``cpc-section-<X>_20220201.txt`` file
            per section, with lines of the form ``<code><TAB><TAB><title>``.

    Returns:
        dict mapping each class code to its combined title string.

    Raises:
        FileNotFoundError: If a directory or a section title file is missing.
        IndexError: If a section or class code has no title line.

    Note:
        Titles are captured with a regex group. Removing the prefix with
        ``str.lstrip(pattern)`` is wrong because ``lstrip`` strips a *set of
        characters*: for code ``A01`` it would also eat the leading "A" of
        "AGRICULTURE". Model weights trained on text built from such trimmed
        titles will see slightly different input and may merit retraining.
    """

    def lookup_title(code, listing):
        # Anchor at line start and capture only what follows the two tabs,
        # so the title comes back exactly as written in the file.
        found = re.search(rf'^{re.escape(code)}\t\t(.+)$', listing, flags=re.MULTILINE)
        if found is None:
            raise IndexError(f'no title line found for CPC code {code!r}')
        return found.group(1)

    code_pattern = re.compile(r'[A-Z]\d+')
    # A set comprehension de-duplicates directly instead of concatenating
    # per-file lists with sum(..., []).
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for section in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        title_path = os.path.join(title_dir, f'cpc-section-{section}_20220201.txt')
        with open(title_path) as f:
            listing = f.read()
        section_title = lookup_title(section, listing)
        for context in contexts:
            if context[0] == section:
                results[context] = section_title + ". " + lookup_title(context, listing)
    return results

In [None]:
# Stop a fit once `val_loss` has gone one epoch without improving by at
# least 1e-5, and roll the model back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=1e-5,
    patience=1,
    restore_best_weights=True,
    verbose=1,
)

# Scale the learning rate by 0.9 whenever `val_loss` has gone one epoch
# without improving by at least 0.001.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.9,
    min_delta=0.001,
    patience=1,
    verbose=1,
)

# Tokenizer loaded from the same local directory the model backbone uses.
tokenizer = AutoTokenizer.from_pretrained('../input/deberta-l')

class CorrelationScore(tf.keras.metrics.Metric):
    """Streaming Pearson correlation between targets and predictions.

    The metric keeps running sums over every batch seen since the last reset
    and derives the correlation coefficient from them, so the reported value
    covers the whole epoch rather than only the most recent batch.

    A classification metric such as the Matthews correlation coefficient is
    not suitable here: the targets and the model output are continuous
    scores, not class indicators.
    """

    def __init__(self, name='correlation', **kwargs):
        super(CorrelationScore, self).__init__(name=name, **kwargs)
        # Sufficient statistics for Pearson's r.
        self.n_samples = self.add_weight(name='n_samples', initializer='zeros')
        self.sum_true = self.add_weight(name='sum_true', initializer='zeros')
        self.sum_pred = self.add_weight(name='sum_pred', initializer='zeros')
        self.sum_true_sq = self.add_weight(name='sum_true_sq', initializer='zeros')
        self.sum_pred_sq = self.add_weight(name='sum_pred_sq', initializer='zeros')
        self.sum_prod = self.add_weight(name='sum_prod', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch.

        ``sample_weight`` is accepted for Keras API compatibility but is not
        used; every sample counts equally.
        """
        y_true = tf.reshape(tf.cast(y_true, self.dtype), (-1,))
        y_pred = tf.reshape(tf.cast(y_pred, self.dtype), (-1,))
        self.n_samples.assign_add(tf.cast(tf.size(y_true), self.dtype))
        self.sum_true.assign_add(tf.reduce_sum(y_true))
        self.sum_pred.assign_add(tf.reduce_sum(y_pred))
        self.sum_true_sq.assign_add(tf.reduce_sum(tf.square(y_true)))
        self.sum_pred_sq.assign_add(tf.reduce_sum(tf.square(y_pred)))
        self.sum_prod.assign_add(tf.reduce_sum(y_true * y_pred))

    def reset_state(self):
        """Zero all running sums."""
        for variable in self.variables:
            variable.assign(tf.zeros_like(variable))

    def result(self):
        """Return Pearson's r, or 0 when it is undefined (no variance / no data)."""
        n = self.n_samples
        covariance = n * self.sum_prod - self.sum_true * self.sum_pred
        var_true = n * self.sum_true_sq - tf.square(self.sum_true)
        var_pred = n * self.sum_pred_sq - tf.square(self.sum_pred)
        # Rounding can push a near-zero variance slightly negative; clamp it
        # so the square root stays real.
        denominator = tf.sqrt(tf.maximum(var_true, 0.0) * tf.maximum(var_pred, 0.0))
        return tf.math.divide_no_nan(covariance, denominator)

In [None]:
def create_data(df, train=True, max_length=32):
    """Tokenize the ``text``/``target`` pairs of ``df`` with the module-level tokenizer.

    Args:
        df: DataFrame with ``text`` and ``target`` columns and, when ``train``
            is true, a ``score`` column.
        train: When true, the ``score`` values are returned as labels too.
        max_length: Length every encoded pair is padded or truncated to. It
            has to match the sequence length the model inputs were built
            with. Defaults to 32.

    Returns:
        ``(input_ids, attention_mask, token_type_ids)`` as a tuple of lists
        with one entry per row; when ``train`` is true, a
        ``(features, labels)`` pair instead.
    """
    if df.shape[0] == 0:
        # Nothing to tokenize; keep the same empty-list result a row-by-row
        # loop would give rather than handing the tokenizer an empty batch.
        encoded = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
    else:
        # One batched call replaces a per-row Python loop; each pair is still
        # padded/truncated independently, so the encodings are unchanged.
        encoded = tokenizer(
            df["text"].values.astype(str).tolist(),
            df["target"].values.astype(str).tolist(),
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )

    features = (
        list(encoded["input_ids"]),
        list(encoded["attention_mask"]),
        list(encoded["token_type_ids"]),
    )
    if train:
        return features, list(df["score"].values)
    return features

def create_model(max_length=32):
    """Build the scoring model: a pretrained backbone with a two-branch dropout head.

    The backbone loaded from ``../input/deberta-l`` receives token ids, an
    attention mask and token type ids. The first element of its output
    (presumably the per-token hidden states; confirm against the backbone's
    output type) is passed through two dropout branches (rates 0.9 and 0.2).
    Each branch is flattened and projected by its own ReLU Dense layer of
    ``hidden_size // 2`` units; the branches are averaged and mapped to a
    single sigmoid output.

    Args:
        max_length: Sequence length of the three integer inputs. Because the
            head flattens the sequence axis, weights saved for one length do
            not fit a model built with another. Defaults to 32.

    Returns:
        An uncompiled ``tf.keras.Model`` taking
        ``[input_tokens, attention_mask, token_ids]``.
    """
    input_tokens = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    token_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

    config = AutoConfig.from_pretrained('../input/deberta-l')
    backbone = TFAutoModel.from_pretrained('../input/deberta-l', config=config)

    # A Dense unit count must be an integer; true division would pass a float.
    head_units = config.hidden_size // 2

    # The creation order of the weighted layers below is deliberately left
    # untouched so weights saved from this architecture keep lining up.
    out = backbone(input_tokens, attention_mask=attention_mask, token_type_ids=token_ids)[0]
    out1 = tf.keras.layers.Dropout(0.9)(out)
    out = tf.keras.layers.Dropout(0.2)(out)
    # One Flatten per branch is enough; flattening an already flat tensor
    # again changes nothing.
    out = tf.keras.layers.Flatten()(out)
    out1 = tf.keras.layers.Flatten()(out1)
    out = tf.keras.layers.Dense(head_units, activation='relu', dtype='float32')(out)
    out1 = tf.keras.layers.Dense(head_units, activation='relu', dtype='float32')(out1)
    out = tf.keras.layers.Average()([out, out1])
    out = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(out)
    model = tf.keras.Model(inputs=[input_tokens, attention_mask, token_ids], outputs=out)
    return model

In [None]:
if TRAIN:
    # Everything in this first part is identical for every fold, so it is
    # prepared once instead of being rebuilt inside the fold loop.
    n_folds = 5
    train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
    # Stratify on the score by turning its five values into class ids.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    Fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n, (train_index, val_index) in enumerate(Fold.split(train, train['score_map'])):
        train.loc[val_index, 'fold'] = n
    train['fold'] = train['fold'].astype(int)
    cpc_texts = get_cpc_texts()
    train['context_text'] = train['context'].map(cpc_texts)
    # Both halves of the pair carry the context description after a separator.
    train['text'] = train['anchor'] + '[SEP]' + train['context_text']
    train['target'] = train['target'] + '[SEP]' + train['context_text']

    for fold in range(n_folds):
        # Release the previous fold's graph/state before building a new model.
        tf.keras.backend.clear_session()

        # Keep only the best (lowest val_loss) weights of this fold on disk.
        model_save = tf.keras.callbacks.ModelCheckpoint(
            f'deberta_{fold}.h5',
            save_best_only=True,
            save_weights_only=True,
            monitor='val_loss',
            mode='min',
            verbose=1,
        )

        valid_df = train[train.fold == fold]
        train_df = train[train.fold != fold]
        print('FOLD:',fold)

        train_data, train_labels = create_data(train_df, train=True)
        val_data, val_labels = create_data(valid_df, train=True)
        train_data_X = (np.asarray(train_data[0]), np.asarray(train_data[1]), np.asarray(train_data[2]))
        train_data_Y = np.asarray(train_labels)
        valid_data = (
            (np.asarray(val_data[0]), np.asarray(val_data[1]), np.asarray(val_data[2])),
            np.asarray(val_labels),
        )

        # The list-based intermediates are no longer needed once converted.
        del train_df, valid_df, train_data, train_labels, val_data, val_labels
        gc.collect()

        model = create_model()
        # Keras refuses to fit a model that was never compiled.
        # NOTE(review): the optimizer, learning rate and loss below are
        # assumptions - this cell previously had no compile step. Confirm
        # them against the settings used to produce the saved fold weights.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[CorrelationScore()],
        )
        history = model.fit(
            train_data_X,
            train_data_Y,
            epochs=5,
            shuffle=True,
            batch_size=31,
            validation_data=valid_data,
            callbacks=[model_save, early_stop, reduce_lr],
        )

        del model
        gc.collect()

In [None]:
# Build the same "<phrase>[SEP]<context description>" inputs used for training.
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
cpc_texts = get_cpc_texts()
test['context_text'] = test['context'].map(cpc_texts)
test['text'] = test['anchor'] + '[SEP]' + test['context_text']
test['target'] = test['target'] + '[SEP]' + test['context_text']
test_data = create_data(test, train=False)
# Convert the token lists to arrays once; they are the same for every fold.
test_inputs = (np.asarray(test_data[0]), np.asarray(test_data[1]), np.asarray(test_data[2]))

# One model instance is reused; only its weights are swapped per fold.
model = create_model()
preds = []
for i in range(5):
    model.load_weights(f'../input/cpc-deberta/deberta_{i}.h5')
    preds.append(model.predict(test_inputs))

sample = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')
# Average over the folds. The model ends in a single-unit layer, so each
# prediction has a trailing axis of size 1; flatten it to get one score per
# row before assigning the column.
# NOTE(review): this relies on sample_submission.csv listing rows in the same
# order as test.csv - confirm.
sample['score'] = np.mean(np.array(preds), axis=0).reshape(-1)
sample.to_csv('submission.csv', index=False)