In [None]:
# --- Standard library ---
import codecs
import math
import os
import pickle

# Silence TensorFlow's native (C++) logging. This has to be set BEFORE
# TensorFlow is imported: setting it afterwards is too late to suppress the
# messages emitted while the library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# --- Third-party ---
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel, AutoTokenizer
# import bert

# List every file available under the Kaggle input directory so the paths
# used in the configuration cell can be checked against what is mounted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Configuration: paths and hyperparameters

In [None]:
# Notebook-wide configuration: Kaggle input locations and model hyperparameters.
# Directory prefix for the competition files; "train.csv" and "test.csv" are appended to it.
dataset_path = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
# Pretrained checkpoint directory; used for both the tokenizer and the base BERT model.
pt_model_dir = "/kaggle/input/bert-for-patents/bert-for-patents/"
# NOTE: despite the "_dir" suffix this is a single .h5 weights file, passed to model.load_weights.
ft_model_dir = "/kaggle/input/uspppm-bertforpatent-keras-train/usppm_bfp_v5_lstm.h5"
# Length every tokenized input is padded/truncated to; also the model input width and LSTM unit count.
max_seq_len = 80
batch_size = 32
# Learning rate given to the Adam optimizer in model.compile.
learning_rate = 2e-5

# BERT Tokenizer 

In [None]:
# Build the tokenizer from the same pretrained directory as the base model.
tokenizer = BertTokenizer.from_pretrained(pt_model_dir)
# tokenizer = AutoTokenizer.from_pretrained(pt_model_dir)
# Vocabulary id of the padding token, looked up from the tokenizer itself.
pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Print the tokenizer repr and the padding id as a quick sanity check.
print(tokenizer)
print("Padding token index : ", pad_idx)

# Helper functions: data loading, learning-rate schedule, and text encoding

In [None]:
def dataset_split(dataset, split_val):
    """Cut ``dataset`` into a leading part and a trailing part.

    Args:
        dataset: Any sized, sliceable collection (e.g. a DataFrame or a list).
        split_val: Fraction of the rows that goes into the first part.

    Returns:
        A ``(train_data, valid_data)`` tuple: the first
        ``int(len(dataset) * split_val)`` rows and the remaining rows, both in
        their original order.
    """
    cut = int(len(dataset) * split_val)
    return dataset[:cut], dataset[cut:]

def _add_special_token_columns(frame):
    """Add the ``sep_token``, ``cls_token`` and ``context_token`` columns to ``frame`` in place and return it."""
    frame['sep_token'] = '[SEP]'
    frame['cls_token'] = '[CLS]'
    # Wrap each context code in brackets, e.g. a context of "B29" becomes "[B29]".
    frame['context_token'] = '[' + frame.context + ']'
    return frame


def dataset_load(train_url, test_url, random_state=42):
    """Read the train and test CSVs and derive a shuffled train/validation split.

    Args:
        train_url: Path or URL of the training CSV; must have a ``context`` column.
        test_url: Path or URL of the test CSV; must have a ``context`` column.
        random_state: Seed for the training-set shuffle. A fixed default keeps
            the train/validation split reproducible across kernel restarts;
            pass ``None`` to get a different shuffle on every call.

    Returns:
        ``(train_data, valid_data, test_data, context_tokens)`` where the first
        two are the 90% / 10% parts of the shuffled training frame,
        ``test_data`` is the test frame with the token columns added, and
        ``context_tokens`` lists the unique bracketed context codes of the
        training data in order of first appearance (taken before shuffling).
    """
    train_data = _add_special_token_columns(pd.read_csv(train_url, sep=','))
    context_tokens = list(train_data.context_token.unique())

    # Shuffle with an explicit seed so the positional split below is repeatable.
    train_data = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train_data, valid_data = dataset_split(dataset=train_data, split_val=0.9)

    test_data = _add_special_token_columns(pd.read_csv(test_url, sep=','))

    return train_data, valid_data, test_data, context_tokens

def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a Keras ``LearningRateScheduler`` with linear warmup then exponential decay.

    Args:
        max_learn_rate: Peak learning rate, reached on the last warmup epoch.
        end_learn_rate: Learning rate the decay arrives at when
            ``epoch == total_epoch_count``.
        warmup_epoch_count: Number of epochs spent ramping up linearly.
        total_epoch_count: Epoch index at which the decay hits ``end_learn_rate``.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` (``verbose=1``) wrapping
        the per-epoch schedule.
    """

    def lr_scheduler(epoch):
        """Return the learning rate to use for ``epoch`` as a Python float."""
        if epoch < warmup_epoch_count:
            # Linear ramp: epoch 0 gets max/warmup, the last warmup epoch gets max.
            return float((max_learn_rate / warmup_epoch_count) * (epoch + 1))

        # Exponential interpolation from max_learn_rate down to end_learn_rate.
        log_ratio = math.log(end_learn_rate / max_learn_rate)
        epochs_into_decay = epoch - warmup_epoch_count + 1
        decay_span = total_epoch_count - warmup_epoch_count + 1
        return float(max_learn_rate * math.exp(log_ratio * epochs_into_decay / decay_span))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

def encode_text(text, tokenizer, max_length):
    """Tokenize a batch and return its features as int32 NumPy arrays.

    Args:
        text: Batch handed unchanged to ``tokenizer.batch_encode_plus``. Each
            item may be a pair of strings, in which case both are encoded
            together, separated by the [SEP] token.
        tokenizer: Object exposing ``batch_encode_plus``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with keys ``"input_ids"``, ``"attention_masks"`` and
        ``"token_type_ids"``, each an ``int32`` array.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    batch = tokenizer.batch_encode_plus(text, **encode_options)

    # (key in the returned dict, key in the tokenizer output); note that the
    # returned mask key is plural while the tokenizer's is singular.
    feature_keys = (
        ("input_ids", "input_ids"),
        ("attention_masks", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )
    return {
        out_key: np.array(batch[src_key], dtype="int32")
        for out_key, src_key in feature_keys
    }

# Test data load

In [None]:
# Read the competition CSVs: train/validation split, test frame, and the list
# of unique bracketed context tokens.
train_data, valid_data, test_data, context_tokens = dataset_load(
    train_url=dataset_path + "train.csv",
    test_url=dataset_path + "test.csv",
)
# Distinct score values present in the training split, in ascending order.
labels = sorted(set(train_data["score"].values))

print(len(train_data), len(valid_data), len(test_data))
print(labels)
print(context_tokens)

In [None]:
# Display the raw test frame (the cell's last expression is rendered as its output).
test_data

# Load CPC title data

In [None]:
# Read the CPC title table and keep only the code and its title.
cpc_codes = pd.read_csv("/kaggle/input/cpc-codes/titles.csv").loc[:, ["code", "title"]]

# Keep only the rows whose code is exactly three characters long (e.g. "B29").
condition = cpc_codes["code"].map(len).eq(3)
cpc_codes = cpc_codes.loc[condition].reset_index(drop=True)
cpc_codes

In [None]:
# Spot-check one lookup: the title(s) stored for CPC code "B29".
print(cpc_codes.loc[cpc_codes.code == "B29", "title"].values)

In [None]:
# Attach the CPC title to every test row by matching `context` against `code`.
# how='left' keeps every test row; validate='many_to_one' makes pandas raise a
# MergeError if `code` is not unique in cpc_codes, instead of silently
# duplicating test rows (which would misalign predictions with the submission).
test_data = test_data.merge(
    cpc_codes,
    left_on='context',
    right_on='code',
    how='left',
    validate='many_to_one',
)

In [None]:
# Normalize the text columns: lower-case everything and drop ';' from titles.
# regex=False makes the replacement an explicit literal match; the default of
# `regex` differs between pandas versions and single-character patterns
# trigger a FutureWarning on pandas 1.x.
test_data['title'] = test_data['title'].str.lower().str.replace(";", "", regex=False)
test_data['anchor'] = test_data['anchor'].str.lower()
test_data['target'] = test_data['target'].str.lower()

# First segment fed to the tokenizer: CPC title followed by the anchor phrase.
# The target phrase is supplied separately as the second segment.
test_data['text'] = test_data['title'] + " " + test_data['anchor']

# Alternative single-string layout with explicit special tokens (not used):
# test_data['text'] = test_data['cls_token'] + \
#                 test_data['context_token'] + test_data['title'] + \
#                 test_data['sep_token'] + test_data['anchor'] + \
#                 test_data['sep_token'] + test_data['target'] + \
#                 test_data['sep_token']

# Show the first row's fields as a sanity check.
print(test_data["title"][0])
print(test_data['anchor'][0])
print(test_data['target'][0])
print(test_data['text'][0])

In [None]:
# Display the test frame again, now with the merged title and the derived `text` column.
test_data

# tokenize and encode the test data

In [None]:
# Encode every [text, target] pair to fixed-length id / mask / segment arrays.
encoded_test_data = encode_text(
    text=test_data[["text", "target"]].values.tolist(),
    tokenizer=tokenizer,
    max_length=max_seq_len,
)
# Print each feature of the first encoded example for a quick visual check.
for feature_name in ("input_ids", "attention_masks", "token_type_ids"):
    print(encoded_test_data[feature_name][0])

In [None]:
# Model inputs, in the same order as the model's `inputs` list:
# token ids, attention masks, token type ids.
test_x = [
    encoded_test_data[feature]
    for feature in ("input_ids", "attention_masks", "token_type_ids")
]
print("test x shape : ", *(features.shape for features in test_x))

# Build the model and load the USPPPM weights fine-tuned from Google's BERT for Patents (BFP)

In [None]:
# Build the inference model: BERT encoder -> bidirectional LSTM -> single linear output.
# NOTE(review): the fine-tuned weights are loaded from an .h5 file in a later cell;
# presumably the layers created here must match the training notebook exactly —
# confirm before reordering or renaming anything in this cell.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    # from_pt=True: the checkpoint in pt_model_dir is read as PyTorch weights.
    base_model = TFBertModel.from_pretrained(pt_model_dir, from_pt=True)

    base_model_output = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    # Per-token hidden states from the encoder; these feed the recurrent head below.
    last_hidden_state = base_model_output.last_hidden_state
    print(last_hidden_state.shape)
    
#     cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(last_hidden_state)
#     output = tf.keras.layers.Dense(1, activation="linear")(cls_out)

#     gru = GRU(units=max_seq_len, return_sequences=False)(last_hidden_state)
    # Bidirectional LSTM head returning only its final output; note that the unit
    # count reuses max_seq_len rather than a separate hidden-size setting.
    lstm = Bidirectional(LSTM(units=max_seq_len, return_sequences=False))(last_hidden_state)
    # Single linear unit: the model outputs one unbounded regression score per example.
    output = tf.keras.layers.Dense(1, activation="linear", name="uspppm_output")(lstm)
    
#     avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
#     dropout = tf.keras.layers.Dropout(0.1, name="uspppm_dropout")(avg_pool)
#     output = tf.keras.layers.Dense(1, activation="linear", name="uspppm_output")(dropout)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # Compiled with Adam and mean squared error.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
#         optimizer = tf.keras.optimizers.Adam(),
        loss='mse'
#         loss=tf.keras.losses.BinaryCrossentropy()
    )

# Print a summary of the full network.
model.summary()

In [None]:
# Load the fine-tuned weights into the model built above (ft_model_dir points at an .h5 file).
model.load_weights(ft_model_dir)

# Prediction

In [None]:
# Score the encoded test set. The configured batch_size is passed explicitly so
# the setting in the configuration cell actually controls inference instead of
# Keras silently falling back to its own default.
pred = model.predict(test_x, batch_size=batch_size)

# Submit

In [None]:
# Start from the sample submission and overwrite its score column.
# NOTE(review): predictions are matched to rows by position, so this assumes
# sample_submission.csv lists rows in the same order as test.csv — confirm.
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

# Flatten the prediction array to one score per row (presumably shape (n, 1)
# given the single-unit output layer). float64 matches the dtype the scores
# previously ended up with, so the values written to the CSV are unchanged.
scores = np.asarray(pred, dtype="float64").reshape(-1)
if len(scores) != len(submission):
    raise ValueError(
        f"Got {len(scores)} predictions for {len(submission)} submission rows"
    )

# The linear output is unbounded; clamp to [0, 1] in one vectorized step.
submission['score'] = np.clip(scores, 0.0, 1.0)
submission.to_csv("submission.csv",index=False)
submission