In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from scipy.stats import pearsonr
from sklearn.model_selection import StratifiedKFold, GroupKFold
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))


# Config

In [None]:
# Other checkpoints, kept for reference:
# MDL_PATH = "../input/roberta-base/"
# MDL_PATH = "../input/debertav3basetf/"
MDL_PATH = "../input/debertav3largetf/"  # directory the tokenizer and the TF model are loaded from
MAX_LEN = 42  # token length every encoded pair is padded / truncated to
batch_size = 16  # batch size passed to model.fit
NUM_EPOCHS = 4  # number of epochs passed to model.fit
VAL_SPLIT = 0.2  # NOTE(review): not referenced by any visible cell - confirm before removing
NROWS = None  # row limit when reading train.csv; None reads the whole file
LR = 7e-6  # base learning rate used by the epoch scheduler
N_FOLDS = 5  # intended number of cross-validation folds
dropout_val = 0.1  # dropout rate applied to the pooled encoder output
random_seed = 234  # seed used when shuffling the training frame

# Read data

In [None]:
# Load the competition data; NROWS (None = all rows) limits the training file only.
train = pd.read_csv(
    "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv",
    nrows=NROWS,
)
print(train.shape)

test = pd.read_csv(
    "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv"
)
print(test.shape)

train.head(10)

In [None]:
# Number of distinct context codes, followed by the frequency of each score value.
print(train.context.nunique())
train.score.value_counts()

# Merge with CPC code descriptions

In [None]:
# Lookup table of CPC codes; its "code" and "title" columns are joined onto train/test.
codes = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
codes.head()

In [None]:
# Attach the CPC title of each context code. A left join keeps every row,
# so a code missing from the lookup table yields a NaN title.
code_titles = codes[["code", "title"]]
train = pd.merge(train, code_titles,
                 left_on="context", right_on="code",
                 how='left')
test = pd.merge(test, code_titles,
                left_on="context", right_on="code",
                how='left')

# Keep only the text before the first ';'. The .str accessor passes NaN
# through unchanged, whereas calling .split on a NaN title raises
# AttributeError; missing titles therefore stay visible as NaN.
train['title'] = train['title'].str.split(";").str[0]
test['title'] = test['title'].str.split(";").str[0]


In [None]:
# Number of distinct (trimmed) titles in the training data.
print(train.title.nunique())


In [None]:
# First five entries of the title frequency table; dropna=False also counts missing titles.
train['title'].value_counts(dropna=False)[:5]

In [None]:
# Last five entries of the title frequency table; dropna=False also counts missing titles.
train['title'].value_counts(dropna=False)[-5:]

In [None]:
# Shuffle all rows reproducibly. The index is not reset, so each row keeps its
# original label and label-based / positional indexing differ from here on.
train = train.sample(frac=1, random_state=random_seed)

# Tokenizer

In [None]:
# Load the tokenizer stored alongside the pretrained model in MDL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MDL_PATH)

In [None]:
# Display the tokenizer's repr for inspection.
tokenizer

In [None]:
# Sanity-check the encoding of one anchor/target pair (the row whose index label is 3).
tokenizer.encode_plus(train.anchor[3], train.target[3])

# Data creation

In [None]:
def create_data(id_, anchor, target, context, score, train=True):
    """Tokenize anchor/target/context triples into fixed-length model inputs.

    Each example is encoded as a text pair: the anchor as the first segment and
    ``target + ' [SEP] ' + context`` as the second, padded / truncated to
    MAX_LEN tokens with the module-level ``tokenizer``.

    Args:
        id_: row identifiers, one per example.
        anchor, target, context: equally long sequences of strings.
        score: labels, one per example; only read when ``train`` is true.
        train: when true, labels are collected from ``score``; otherwise the
            returned label list is empty. (Inside this function the name
            shadows the notebook's ``train`` DataFrame.)

    Returns:
        ``(features, labels)`` where ``features`` is a dict with the keys
        "input_ids", "token_type_ids", "attention_mask" and "ids", each a list
        with one entry per example, and ``labels`` is a list.
    """
    pairs = [(a, t + ' [SEP] ' + c) for a, t, c in zip(anchor, target, context)]
    tok_txt = tokenizer.batch_encode_plus(pairs,
                                          max_length=MAX_LEN,
                                          padding='max_length',
                                          truncation=True)

    input_ids = list(tok_txt['input_ids'])
    attention_mask = list(tok_txt['attention_mask'])
    if 'token_type_ids' in tok_txt:
        # One row per example. Appending the whole batch for every example
        # would make this entry quadratic in size and misaligned with the
        # other features.
        token_type_ids = list(tok_txt['token_type_ids'])
    else:
        # Some tokenizers do not emit segment ids; fall back to zeros so the
        # returned dict always has the same keys.
        token_type_ids = [[0] * len(row) for row in input_ids]

    ids = list(id_)
    labels = list(score) if train else []

    return {"input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
            "ids": ids,
            }, labels
    

In [None]:
#help(tokenizer.encode_plus)
#dir(tf.data.Dataset.from_tensor_slices)


# Add fold information

In [None]:
# Build a combined key from each row's title and the string form of its score,
# then show how often every combination occurs.
train['title_score'] = [
    title + str(score) for title, score in zip(train["title"], train["score"])
]
train['title_score'].value_counts()

In [None]:
# Assign every row to exactly one validation fold such that all rows sharing
# an anchor end up in the same fold.
#
# GroupKFold.split yields *positional* row indices. The frame's index labels
# are shuffled, so the fold ids are written into a positional array instead
# of going through label-based .loc, which would tag the wrong rows and
# break the anchor grouping.
gkf = GroupKFold(n_splits=N_FOLDS)
fold_ids = np.zeros(len(train), dtype=int)
for fold_no, (_, val_pos) in enumerate(
        gkf.split(X=train, y=train.score, groups=train.anchor)):
    fold_ids[val_pos] = fold_no
train['fold'] = fold_ids
train.fold.value_counts()

In [None]:
# Mean score per fold, to check that the folds have comparable label distributions.
train.groupby("fold")['score'].mean()

In [None]:
# Drop the helper column; it was only used for the value_counts display.
del train['title_score']

In [None]:
fld = 0  # fold held out for validation; every other fold is used for training


def _fold_inputs(frame):
    """Return the id/anchor/target/title/score columns of ``frame`` as lists,
    in the positional order create_data expects."""
    return [frame[col].tolist()
            for col in ('id', 'anchor', 'target', 'title', 'score')]


# Compute the validation mask once instead of re-filtering for every column.
is_val = train.fold == fld

train_data, train_labels = create_data(*_fold_inputs(train[~is_val]), train=True)
val_data, val_labels = create_data(*_fold_inputs(train[is_val]), train=True)


In [None]:
# Encode the test rows. There are no labels, so score is None and train=False
# (test_labels comes back empty). Columns are passed as plain lists, like the
# training folds, so create_data never depends on the frame's index labels.
test_data, test_labels = create_data(test['id'].tolist(), test['anchor'].tolist(),
                                     test['target'].tolist(), test['title'].tolist(),
                                     None, train=False)

# Simple Deberta / Roberta TF Model

In [None]:
def build_model():
    """Build and compile the regression model on top of the pretrained encoder.

    Two int32 inputs of length MAX_LEN (token ids and attention mask) are fed
    to the transformer loaded from MDL_PATH. Its last hidden state is averaged
    over the sequence axis, passed through dropout (rate ``dropout_val``) and
    projected to a single linear output.

    Returns:
        A ``tf.keras.Model`` taking ``[input_ids, attention_mask]``, compiled
        with MSE loss, an MAE metric and Adam at learning rate LR.
    """
    input__ids = tf.keras.Input(shape=(MAX_LEN, ), dtype=tf.int32)
    input__mask = tf.keras.Input(shape=(MAX_LEN, ), dtype=tf.int32)

    encoder = TFAutoModel.from_pretrained(MDL_PATH, trainable=True)
    encoded = encoder(input_ids=input__ids,
                      attention_mask=input__mask)

    # NOTE(review): no mask is passed to the pooling layer, so padded positions
    # appear to be included in the average - confirm this is intended.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(encoded.last_hidden_state)
    pooled = tf.keras.layers.Dropout(dropout_val)(pooled)

    fnl = tf.keras.layers.Dense(1)(pooled)

    model = tf.keras.Model(inputs=[input__ids, input__mask],
                           outputs=fnl)

    model.compile(
        # Start from the configured fine-tuning rate rather than Adam's much
        # larger default, so training stays sane even if no learning-rate
        # scheduler callback is attached.
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss="mse",
        metrics=["mae"],
    )
    return model

# Add useful Callbacks

## Learning Rate Scheduler

In [None]:

def scheduler(epoch):
    """Return the learning rate for ``epoch`` (0-based).

    Epoch 0 runs at 20% of LR; every later epoch runs at LR * 0.6**epoch.
    """
    if epoch == 0:
        factor = 0.2
    else:
        factor = 0.6**epoch
    return LR * factor
    
# Keras callback that applies scheduler(epoch) during training.
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Visualise the learning rate for each of the NUM_EPOCHS epochs.
plt.plot([scheduler(e) for e in range(NUM_EPOCHS)])

## Pearson correlation tracking callback

In [None]:
class PearsonCallback(tf.keras.callbacks.Callback):
    """Compute the Pearson correlation on held-out data after every epoch.

    The value is printed and stored in the epoch logs under "val_pearsonr",
    so callbacks that run after this one can monitor it.

    Args:
        val_data: ``(inputs, targets)`` pair; ``inputs`` is passed to
            ``model.predict`` and ``targets`` is a 1-D array of true scores.
    """

    def __init__(self, val_data):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.X_val, self.Y_val = val_data

    def on_epoch_end(self, epoch, logs=None):
        X_val_preds = self.model.predict(self.X_val)
        # Predictions come back with one column per output; flatten them to
        # line up with the 1-D targets.
        pearson_corr = pearsonr(X_val_preds.ravel(), self.Y_val)
        print("pearsonr_val (from log) =", pearson_corr[0])
        if logs is not None:
            logs["val_pearsonr"] = pearson_corr[0]


## Early stopping and model saving

In [None]:
# Both callbacks watch the "val_pearsonr" log entry (higher is better).

# Stop after 2 epochs without improvement and roll back to the best weights.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_pearsonr',
    mode='max',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

# Save the weights (not the full model) whenever the monitored value improves.
callback_save = tf.keras.callbacks.ModelCheckpoint(
    'roberta_patent.h5',
    monitor='val_pearsonr',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1,
)



In [None]:
# Instantiate the compiled model (loads the pretrained encoder from MDL_PATH).
model = build_model()


In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()


# Train

In [None]:
#help(model.fit)
# Validation set in the (inputs, targets) layout Keras expects: the inputs are
# the token-id and attention-mask arrays, the targets a flat score vector.
val_features = tuple(
    np.asarray(val_data[key]) for key in ('input_ids', 'attention_mask')
)
val_data_ = (val_features, np.asarray(val_labels).ravel())


In [None]:

# Fine-tune on the training folds, evaluating on the held-out fold each epoch.
# Inputs are (input_ids, attention_mask), in the order build_model declares them.
model.fit((np.asarray(train_data['input_ids']),
           np.asarray(train_data['attention_mask']),
          ),
          np.asarray(train_labels).ravel(), 
        epochs = NUM_EPOCHS,
        shuffle=True,
        # PearsonCallback writes logs["val_pearsonr"], which the early-stopping
        # and checkpoint callbacks monitor, so it must be listed before them.
        callbacks = [callback_lr,
                     PearsonCallback(val_data_),
                     callback_es,
                     callback_save,
                    ],
        batch_size = batch_size,
        validation_data= val_data_
       )


# Error analysis

In [None]:
# Predict the held-out fold; val_data_[0] holds the (input_ids, attention_mask) arrays.
val_preds = model.predict( val_data_[0] )

# The validation arrays are not referenced again after this point.
del val_data_

## Cases with highest error

In [None]:

# Attach the validation predictions to their rows. The column starts as NaN
# (a float) so it never needs a dtype upcast, and rows outside the validation
# fold are not given a fake prediction of 0.
val_mask = train.fold == fld
train['preds'] = np.nan
# model.predict returns one column per output; flatten to one value per row.
train.loc[val_mask, 'preds'] = np.asarray(val_preds).ravel()

print(pearsonr(np.asarray(train.loc[val_mask, 'score']).ravel(),
               np.asarray(train.loc[val_mask, 'preds']).ravel()))

# Absolute error per row (NaN outside the validation fold); show the worst 50.
train['diff'] = np.abs(train['score'] - train['preds'])
train[val_mask].sort_values('diff', ascending=False).head(50)

## Categories with highest error

In [None]:
# Row count and mean absolute error per title on the validation fold,
# worst 20 titles first.
(
    train[train.fold == fld]
    .groupby("title")
    .agg({"diff": ['count', 'mean']})
    .reset_index()
    .sort_values(('diff', 'mean'), ascending=False)
    .head(20)
)

In [None]:
# Row count and mean absolute error per anchor on the validation fold,
# worst 20 anchors first.
(
    train[train.fold == fld]
    .groupby("anchor")
    .agg({"diff": ['count', 'mean']})
    .reset_index()
    .sort_values(('diff', 'mean'), ascending=False)
    .head(20)
)

# Predict on test data

In [None]:
# Score the test set with the same two inputs used for training
# (token ids and attention mask; token_type_ids are not fed to the model).
test_inputs = tuple(
    np.asarray(test_data[key]) for key in ('input_ids', 'attention_mask')
)
test_preds = model.predict(test_inputs)

In [None]:
# Load the sample submission as a template for the output file.
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")


# Process scores

In [None]:
# model.predict returns one column per output; flatten to one score per row.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv - confirm.
submission['score'] = np.asarray(test_preds).ravel()
# Clamp the raw regression outputs to the [0, 1] interval in one vectorised step.
submission['score'] = submission['score'].clip(lower=0, upper=1)


In [None]:
# Histogram of the post-processed scores.
submission['score'].hist()

# Submit

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index=False)