In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets

# Imports

In [None]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch

from transformers import AutoModelForSequenceClassification,AutoTokenizer, AutoModel


import datasets
from datasets import load_dataset, Dataset, DatasetDict

import tensorflow as tf
from transformers import DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification

from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
# Pretrained checkpoint: local copy under ../input, plus a second identifier string
model_path = '../input/bert-for-patents/bert-for-patents'
model_name = 'anferico/bert-for-patents'

# Competition data files
train_csv_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
test_csv_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'

In [None]:
# Read the competition train and test CSV files into DataFrames
train = pd.read_csv(filepath_or_buffer=train_csv_path)
test = pd.read_csv(filepath_or_buffer=test_csv_path)

# Manage dataset

In [None]:
class Dataset:
    """Prepare the training frame for cross-validated fine-tuning.

    The frame is shuffled once at construction time. Fold indices, the
    tokenized dataset and the per-fold TensorFlow datasets are all derived
    from that single row order, so they stay aligned regardless of the order
    (or number of times) the methods below are called.

    NOTE: this class name shadows the ``Dataset`` name imported from the
    ``datasets`` package; the Hugging Face class is therefore always referenced
    here as ``datasets.Dataset``.
    """

    def __init__(self, train_dataframe: pd.DataFrame, random_state: int):
        """Store a shuffled copy of ``train_dataframe``.

        Args:
            train_dataframe: Frame with (at least) the columns read by the
                other methods: ``id``, ``anchor``, ``target``, ``context``
                and ``score``.
            random_state: Seed for the row shuffle.
        """
        self.random_state = random_state

        # Shuffle exactly once, here. ``sample`` returns a new frame, so the
        # caller's DataFrame is never modified, and every later step sees the
        # same row order (previously each Create_CV_folds call re-shuffled,
        # which could misalign fold indices with an already-tokenized dataset).
        self.train_dataframe = train_dataframe.sample(frac=1, random_state=self.random_state)

        # Positional row indices (0..n-1) of the shuffled frame
        self.idxs = np.arange(len(self.train_dataframe))

    def Create_CV_folds(self, number_folds: int):
        """Split the shuffled frame into stratified, group-aware CV folds.

        Rows are stratified on the score and grouped by ``anchor``.

        Args:
            number_folds: Number of folds to generate.

        Returns:
            A list of ``(train_indices, validation_indices)`` pairs holding
            positional indices into the shuffled frame.
        """
        self.number_folds = number_folds

        # Initialize the stratified group k-fold splitter
        cv = StratifiedGroupKFold(n_splits=self.number_folds)

        # The splitter needs integer class labels: express the score in
        # hundredths. Round before casting so floating-point error cannot
        # truncate a value into the class below.
        scores = (self.train_dataframe.score * 100).round().astype(int)

        # Generate folds
        folds = list(cv.split(self.idxs, scores, self.train_dataframe.anchor))

        return folds

    def get_tokenized_dataset(self, model_path: str):
        """Tokenize every row and return it as a Hugging Face dataset.

        Each row's text is ``context + sep + anchor + sep + target`` (``sep``
        being the tokenizer's separator token). ``score`` is renamed to
        ``label`` and the raw text columns and ``id`` are dropped.

        Args:
            model_path: Name or path handed to ``AutoTokenizer.from_pretrained``.

        Returns:
            The tokenized dataset, in the row order of the shuffled frame.
        """
        self.model_path = model_path

        # Initialize tokenizer
        self.tokz = AutoTokenizer.from_pretrained(self.model_path)

        # Tokenization function required by dataset.map()
        def tok_func(x): return self.tokz(x["inputs"])

        # Build the model input column on the instance's own (shuffled) frame
        sep = self.tokz.sep_token
        frame = self.train_dataframe
        self.train_dataframe = frame.assign(
            inputs=frame.context + sep + frame.anchor + sep + frame.target
        )

        # Initialize HF dataset object
        ds = datasets.Dataset.from_pandas(self.train_dataframe).rename_column('score', 'label')

        # Create the tokenized dataset, dropping the raw text columns
        inps = "anchor", "target", "context"
        tok_ds = ds.map(tok_func, batched=True, remove_columns=inps + ('inputs', 'id'))

        return tok_ds

    def _getfold(self, folds: list, fold_num: int, tok_ds: "datasets.Dataset"):
        """Select the train/validation rows of fold ``fold_num`` from ``tok_ds``."""
        train, val = folds[fold_num]
        return DatasetDict({"train": tok_ds.select(train), "test": tok_ds.select(val)})

    def get_tf_datasets(self, folds: list, fold_num: int, batch_size: int, tok_ds: "datasets.Dataset"):
        """Build the batched TensorFlow datasets for one fold.

        Args:
            folds: Fold index pairs as returned by ``Create_CV_folds``.
            fold_num: Index of the fold to materialize.
            batch_size: Batch size for both datasets.
            tok_ds: Tokenized dataset as returned by ``get_tokenized_dataset``.

        Returns:
            ``(tf_train_dataset, tf_validation_dataset)``; only the training
            dataset is shuffled.

        Raises:
            AttributeError: If ``get_tokenized_dataset`` has not been called
                yet, so no tokenizer is available for padding.
        """
        tokenizer = getattr(self, "tokz", None)
        if tokenizer is None:
            raise AttributeError(
                "Tokenizer not initialised: call get_tokenized_dataset() "
                "before get_tf_datasets()."
            )

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

        self._dataset = self._getfold(folds, fold_num, tok_ds)

        # NOTE(review): the dataset column is named 'label' while label_cols asks
        # for 'labels' -- presumably the collator performs that rename; confirm.
        shared_kwargs = dict(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["labels"],
            collate_fn=data_collator,
            batch_size=batch_size,
        )

        tf_train_dataset = self._dataset["train"].to_tf_dataset(shuffle=True, **shared_kwargs)
        tf_validation_dataset = self._dataset["test"].to_tf_dataset(shuffle=False, **shared_kwargs)

        return tf_train_dataset, tf_validation_dataset
    


In [None]:
# Wrap the training frame; the seed is the one used for the row shuffle
dataset = Dataset(train_dataframe=train, random_state=42)

# Build the 4 cross-validation folds (index pairs)
folds = dataset.Create_CV_folds(4)

# Tokenize the rows with the tokenizer loaded from the checkpoint path
toc_ds = dataset.get_tokenized_dataset(model_path)

# Training

In [None]:
# Training hyper-parameters
number_folds = len(folds)  # train once per fold that was actually generated
batch_size = 64
num_epochs = 5  # drives BOTH the LR schedule length and the number of fit() epochs

# Per-fold metric curves. Only the plain ``history.history`` dict is stored, so
# the fitted models are not kept alive through the History objects.
fold_histories = {}

for fold_num in range(number_folds):
    # A new model is built on every iteration: release the Keras global state
    # accumulated by the previous fold so it does not pile up across folds.
    tf.keras.backend.clear_session()

    # Create tensorflow datasets
    tf_train_dataset, tf_validation_dataset = dataset.get_tf_datasets(
        folds, fold_num=fold_num, batch_size=batch_size, tok_ds=toc_ds
    )

    # Initialize scheduler: decay the learning rate to 0 over the whole run
    num_train_steps = len(tf_train_dataset) * num_epochs
    lr_scheduler = PolynomialDecay(
        initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
    )

    # Initialize optimizer
    opt = Adam(learning_rate=lr_scheduler)

    # Setting callbacks
    callback_es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        baseline=None,
        min_delta=0.002,
        mode='min',
        verbose=1,
        restore_best_weights=True,
    )

    callback_save = tf.keras.callbacks.ModelCheckpoint(
        f'./bert-for-patents{fold_num}.h5',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='min',
        save_freq='epoch',
    )

    # Initialize and compile model (single regression output trained with MSE)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, from_pt=True)
    model.compile(optimizer=opt, loss='mse', metrics=['mse'])

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_num} ...')

    # Train for exactly the number of epochs the LR schedule was sized for.
    # (Previously fit() ran up to 10 epochs while the schedule reached a
    # learning rate of 0 after ``num_epochs``, so later epochs could not learn.)
    history = model.fit(
        tf_train_dataset,
        validation_data=tf_validation_dataset,
        epochs=num_epochs,
        callbacks=[callback_es, callback_save],
    )
    fold_histories[fold_num] = history.history
