# In [None]:
import numpy as np, pandas as pd, time, tensorflow_addons as tfa, tensorflow as tf, tensorflow.keras as keras, os
import scipy
from keras.layers import Flatten, Dense
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import pearsonr
import random

import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, TFAutoModel

transformers.logging.set_verbosity_error()

# Token sequence length used when padding/truncating tokenizer output and as
# the width of the model's input layers.
MAX_LEN = 50
# Initial learning rate handed to the Adam optimizer.
LEARNING_RATE = 5e-5
# Local directory from which both the tokenizer and the pretrained model are loaded.
MODEL_PATH = '../input/deberta-v3-large/deberta-v3-large'
    


# Data loading: competition train/test tables, the sample submission that is
# filled in and written out at the end, and the CPC table whose 'code' and
# 'title' columns are joined onto the 'context' column of train/test.
df_train = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
df_test = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
df_submission_template = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')
df_cpc = pd.read_csv("../input/cpc-codes/titles.csv")


#adaptive LR
def schedule(epoch, lr):
    """Return the learning rate to apply for ``epoch``.

    Epoch 0 runs at a quarter of the incoming rate; every later epoch scales
    the incoming rate by ``0.75 ** epoch``.

    NOTE(review): if the caller feeds back the rate returned for the previous
    epoch (as Keras' LearningRateScheduler is documented to do), the decay
    compounds from epoch to epoch -- confirm this is intended.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate supplied by the caller.

    Returns:
        The scaled learning rate.
    """
    scale = 0.25 if epoch == 0 else 0.75**epoch
    return lr * scale
        
 
        
#pearson 
def correlation_coefficient_loss(y_true, y_pred):
    """Loss equal to ``1 - r**2``, with ``r`` the Pearson correlation.

    Both inputs are flattened to 1-D and brought to the prediction dtype
    before the statistics are computed, so a ``(batch,)`` target and a
    ``(batch, 1)`` prediction are compared element-wise instead of being
    broadcast against each other.

    Args:
        y_true: Ground-truth values (tensor or array-like).
        y_pred: Predicted values with the same number of elements.

    Returns:
        A scalar tensor. When either input has zero variance the result is
        ``1`` (no correlation) rather than NaN.
    """
    y = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    x = tf.reshape(tf.cast(y_true, y.dtype), [-1])
    xm = x - keras.backend.mean(x)
    ym = y - keras.backend.mean(y)
    r_num = keras.backend.sum(tf.multiply(xm, ym))
    # Epsilon goes inside the square root so that both the value and the
    # gradient stay finite when one input is constant (0 / 0 otherwise).
    r_den = keras.backend.sqrt(
        tf.multiply(
            keras.backend.sum(keras.backend.square(xm)),
            keras.backend.sum(keras.backend.square(ym)),
        )
        + keras.backend.epsilon()
    )
    r = r_num / r_den

    # Round-off can push |r| marginally past 1; clamp before squaring.
    r = keras.backend.maximum(keras.backend.minimum(r, 1.0), -1.0)
    return 1 - keras.backend.square(r)


#pearson2
class callback_pearson(tf.keras.callbacks.Callback):
    """Keras callback that reports the validation Pearson correlation.

    After every epoch the model predicts on the stored validation inputs and
    the correlation with the stored targets is printed and written to
    ``logs["val_pearsonr"]``, the key monitored by the early-stopping and
    checkpoint callbacks configured in this file.
    """

    def __init__(self, val_data):
        """Store the validation set.

        Args:
            val_data: ``(inputs, targets)`` pair. ``inputs`` is handed to
                ``model.predict`` unchanged; ``targets`` holds one value per
                sample.
        """
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.X_val, self.Y_val = val_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute, print and log the validation Pearson correlation.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; updated in place when one
                is provided.
        """
        X_val_preds = self.model.predict(self.X_val)
        # Flatten both sides so pearsonr always receives 1-D sequences.
        pearson_corr = pearsonr(np.ravel(X_val_preds), np.ravel(self.Y_val))
        print("pearsonr_val (from log) =", pearson_corr[0])
        if logs is not None:
            logs["val_pearsonr"] = pearson_corr[0]


    
        
#tokenizer
def tokenize_data(word_list, max_len):
    """Tokenize ``[anchor, target, title]`` rows as sentence pairs.

    Each row ``w`` is encoded as the pair ``(w[0] + " " + w[2], w[1])``, i.e.
    the anchor followed by the title forms the first segment and the target
    forms the second. Uses the module-level ``tokenizer``.

    Args:
        word_list: Sequence of rows indexable as ``[anchor, target, title]``;
            the entries are concatenated as strings.
        max_len: Length every encoded example is padded or truncated to.

    Returns:
        Dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'token_type_ids'``, each an int32 NumPy array.
    """
    sentence_pairs = [(w[0] + " " + w[2], w[1]) for w in word_list]
    tokens = tokenizer(
        sentence_pairs,
        add_special_tokens=True,
        # Honour the caller's length instead of the global constant.
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="np",
        verbose=False
    )

    # minimize memory usage with datatypes
    return {
        key: tokens[key].astype('int32')
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }

 
# Tokenizer loaded from the same directory as the pretrained model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# CPC titles: drop every character that is neither a word character nor
# whitespace, then lower-case.
df_cpc['title'] = (
    df_cpc['title']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)


def _with_cpc_title(frame):
    """Left-join the CPC title onto ``frame`` via 'context' and drop both code columns."""
    cpc_titles = df_cpc[['code','title']]
    joined = frame.merge(cpc_titles, left_on='context', right_on='code', how='left')
    return joined.drop(['code', 'context'], axis=1)


def _as_model_inputs(token_output):
    """Order the tokenizer arrays as [input_ids, attention_mask, token_type_ids]."""
    return [
        token_output['input_ids'],
        token_output['attention_mask'],
        token_output['token_type_ids'],
    ]


# Training set: inputs plus the 'score' target.
df_train_merged = _with_cpc_title(df_train)
anchor_target_title = df_train_merged[["anchor", "target", "title"]].values.tolist()
token_output_train = tokenize_data(anchor_target_title, MAX_LEN)
trainX = _as_model_inputs(token_output_train)
trainY = df_train_merged['score'].values

# Test set: inputs only.
df_test_merged = _with_cpc_title(df_test)
anchor_target_title = df_test_merged[["anchor", "target", "title"]].values.tolist()
token_output_test = tokenize_data(anchor_target_title, MAX_LEN)
testX = _as_model_inputs(token_output_test)

# Lazy validation: only the first of five shuffled folds is used as hold-out.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
train_index, test_index = next(kf.split(trainY))

# Copy the held-out rows first, then remove them from the training arrays.
valX = [features[test_index] for features in trainX]
valY = trainY[test_index]
for position, features in enumerate(trainX):
    trainX[position] = np.delete(features, test_index, 0)
trainY = np.delete(trainY, test_index)

print(valX[1].shape)
# (inputs, targets) pair shared by model.fit and the Pearson callback.
val_combined = (tuple(valX), valY.ravel())




# Model setup.
# Seed the Python, NumPy and TensorFlow generators (always seed your experiments).
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(0)
Init = keras.initializers.GlorotUniform(seed=0)

# Per-epoch learning-rate adjustment driven by schedule().
callback_learningrate = tf.keras.callbacks.LearningRateScheduler(schedule)

# Stop once 'val_pearsonr' has not improved for 2 epochs and roll back to the best weights.
callback_earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_pearsonr',
    mode='max',
    patience=2,
    restore_best_weights=True,
)

# Keep only the best-scoring weights on disk, checked once per epoch.
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'tf_model.h5',
    monitor='val_pearsonr',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
)

# Three int32 inputs of width MAX_LEN, named after the tokenizer output keys.
input_ids = tf.keras.layers.Input(
    shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(
    shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
token_type_ids = tf.keras.layers.Input(
    shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")

# Pretrained encoder, loaded from MODEL_PATH.
deberta_model = TFAutoModel.from_pretrained(MODEL_PATH, trainable=True)
deberta_model_output = deberta_model(
    input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

# Mean-pool over real tokens only. Every sequence is padded to MAX_LEN, so an
# unmasked average would blend padding-position hidden states into the
# sentence vector, weighted by how short the input happens to be.
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(
    deberta_model_output.last_hidden_state,
    mask=tf.cast(attention_mask, tf.bool))
dropout = tf.keras.layers.Dropout(0.3)(avg_pool)
# Single linear unit: the model regresses the similarity score directly.
output = tf.keras.layers.Dense(1, activation="linear", name="output")(dropout)

model = tf.keras.models.Model(
    inputs=[input_ids, attention_mask, token_type_ids], outputs=output)

model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='mse')

model.summary()

# The Pearson callback is listed before early stopping and checkpointing
# because both of those monitor the 'val_pearsonr' value it adds to the logs.
training_callbacks = [
    callback_learningrate,
    callback_pearson(val_combined),
    callback_earlystop,
    callback_checkpoint,
]

# Train for at most 20 epochs in shuffled batches of 16.
history = model.fit(
    x=trainX,
    y=trainY,
    batch_size=16,
    epochs=20,
    shuffle=True,
    validation_data=val_combined,
    callbacks=training_callbacks,
)
    
    

# Predict on the test inputs, clamp the scores to [0, 1] and write the submission file.
pred = model.predict(testX)
clipped_scores = np.clip(pred, 0, 1)
df_submission_template['score'] = clipped_scores
df_submission_template.to_csv("submission.csv", index=False)









# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session