In [None]:
import json
import os
import re
import shutil

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import minmax_scale
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Run configuration. Of the three values below, the code in this notebook reads
# BATCH_SIZE (per-device batch size in infer_single_model) and SEED (random_state
# in split_n_folds); TRAIN_EPOCHS is not referenced by the visible code.
TRAIN_EPOCHS = 5
BATCH_SIZE = 16
SEED = 222
# Directory from which 'test.csv' and 'sample_submission.csv' are read.
INPUT_PATH = '../input/us-patent-phrase-to-phrase-matching'

# CSV of CPC codes; load_cpc_dict uses its first column as key and second as title.
CPC_PATH = '../input/cpc-codes/titles.csv'
# Forwarded to infer_pipeline as `model_path`, which does not use it.
MODEL_PATH = {
    'DB': ''
}

# Assigned to `model_save_path` in the main cell and immediately overwritten there.
MODEL_SAVE_PATH = '../input/5-fold-atsc-b4p-atc-db-rb'


# Model-variant identifiers, each also the name of a sub-directory under a model root.
# infer_pipeline parses them as '<n_folds>_Fold_<layout>_<suffix>': the first field is
# converted to the fold count and the second-to-last field selects the input layout in
# preprocess_data. The main cell only runs entries containing 'DB'.
# NOTE(review): the suffixes (B4P / DB / RB) presumably name the backbone model -- confirm.
INPUT_TYPE = [
    '5_Fold_ATC_B4P', '5_Fold_ATC_DB', '5_Fold_ATC_RB',
    '5_Fold_ACT_B4P', '5_Fold_ACT_DB', '5_Fold_ACT_RB',
    '5_Fold_CAT_B4P', '5_Fold_CAT_DB', '5_Fold_CAT_RB',
    '5_Fold_ATSC_B4P', '5_Fold_ATSC_DB', '5_Fold_ATSC_RB',
    '5_Fold_SATC_B4P', '5_Fold_SATC_DB', '5_Fold_SATC_RB',
    '5_Fold_SCAT_B4P', '5_Fold_SCAT_DB', '5_Fold_SCAT_RB',
    '5_Fold_ATSCC_B4P', '5_Fold_ATSCC_DB', '5_Fold_ATSCC_RB',
    '5_Fold_SCATC_B4P', '5_Fold_SCATC_DB', '5_Fold_SCATC_RB',
]

In [None]:
def load_cpc_dict(df_cpc, clean=False):
    """
    Build a lookup from CPC code to its title text.

    Parameters
    ----------
    df_cpc : pandas.DataFrame
        The first column is used as the key (CPC code) and the second column
        as the value (title). Any further columns are ignored.
    clean : bool, default False
        When True, each title is cleaned before being stored: parenthesised
        text is removed, brace groups holding at most three characters are
        removed, and any remaining brace characters are stripped (their
        content is kept).

    Returns
    -------
    dict
        Mapping code -> title. If a code occurs more than once, the last
        occurrence wins.
    """
    codes = df_cpc.iloc[:, 0]
    titles = df_cpc.iloc[:, 1]

    if not clean:
        return dict(zip(codes, titles))

    # Raw strings: '\(' and '\{' are not valid Python string escapes, so the
    # non-raw form triggers a SyntaxWarning on recent interpreters.
    parenthesised = re.compile(r'\(.*?\)')
    short_braces = re.compile(r'\{.{0,3}\}')
    stray_braces = re.compile(r'[{}]')

    cpc_dict = {}
    for code, title in zip(codes, titles):
        title = parenthesised.sub('', title)
        title = short_braces.sub('', title)
        title = stray_braces.sub('', title)
        cpc_dict[code] = title
    return cpc_dict


def preprocess_data(df, cpc_dict, data_type):
    """
    Add the model input text to ``df`` according to the requested layout.

    Three columns are written to ``df`` in place:

    * ``section_text`` -- ``cpc_dict`` entry for the first character of ``context``
    * ``context_text`` -- ``cpc_dict`` entry for the full ``context`` code
    * ``input``        -- the layout-specific concatenation described below

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``anchor``, ``target`` and ``context``.
    cpc_dict : dict
        Mapping from CPC code (and from its one-character prefix) to title.
    data_type : str
        Either a bare layout name (e.g. ``'ATC'``) or an underscore-separated
        identifier whose third field is the layout (e.g. ``'5_Fold_ATC_DB'``).
        In the layout names A = anchor, T = target, C = context text and
        S = section text; the fields are joined with ``'[SEP]'`` except in
        ``ATSCC`` / ``SATCC`` / ``SCATC``, where the section text is attached
        to its neighbour with a single space.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.

    Raises
    ------
    ValueError
        If the layout is not one of the supported names. Previously such a
        call returned a frame without an ``input`` column and failed later.
    """
    # Each template alternates column name, separator, column name, ...
    layouts = {
        'ATC': ('anchor', '[SEP]', 'target', '[SEP]', 'context_text'),
        'ACT': ('anchor', '[SEP]', 'context_text', '[SEP]', 'target'),
        'CAT': ('context_text', '[SEP]', 'anchor', '[SEP]', 'target'),
        'ATSC': ('anchor', '[SEP]', 'target', '[SEP]', 'section_text', '[SEP]', 'context_text'),
        'SATC': ('section_text', '[SEP]', 'anchor', '[SEP]', 'target', '[SEP]', 'context_text'),
        'SCAT': ('section_text', '[SEP]', 'context_text', '[SEP]', 'anchor', '[SEP]', 'target'),
        'ATSCC': ('anchor', '[SEP]', 'target', '[SEP]', 'section_text', ' ', 'context_text'),
        'SATCC': ('section_text', ' ', 'anchor', '[SEP]', 'target', '[SEP]', 'context_text'),
        'SCATC': ('section_text', ' ', 'context_text', '[SEP]', 'anchor', '[SEP]', 'target'),
    }

    if '_' in data_type:
        fields = data_type.split('_')
        d_type = fields[2] if len(fields) > 2 else None
    else:
        d_type = data_type

    if d_type not in layouts:
        raise ValueError(
            'Unsupported input layout %r (from data_type=%r); expected one of %s'
            % (d_type, data_type, sorted(layouts))
        )

    df['section_text'] = df.context.apply(lambda x: cpc_dict[x[0]])
    df['context_text'] = df.context.apply(lambda x: cpc_dict[x])

    template = layouts[d_type]
    text = df[template[0]]
    # Left-to-right concatenation, same order as writing the sum out by hand.
    for separator, column in zip(template[1::2], template[2::2]):
        text = text + separator + df[column]
    df['input'] = text

    return df



def split_n_folds(df, n_folds):
    """
    Assign every row of ``df`` to one of ``n_folds`` stratified folds.

    Two columns are written to ``df`` in place:

    * ``score_map`` -- the ``score`` value mapped to a class id
      (0.00 -> 0, 0.25 -> 1, 0.50 -> 2, 0.75 -> 3, 1.00 -> 4)
    * ``fold``      -- integer id of the fold in which the row is a
      validation sample

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``score`` column.
    n_folds : int
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

    # StratifiedKFold uses the label information so that every fold keeps the
    # same label distribution as the full data set.
    df['score_map'] = df['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

    # split() yields *positional* indices, so the fold ids are collected in a
    # positional array instead of being written through label-based df.loc,
    # which only matched when the frame had a default RangeIndex.
    fold_ids = np.zeros(len(df), dtype=int)
    for n, (_, val_index) in enumerate(skf.split(df, df['score_map'])):
        fold_ids[val_index] = n
    df['fold'] = fold_ids
    return df

class TrainDataset(Dataset):
    """
    Dataset over the ``input`` / ``score`` columns of a frame.

    Each item is the tokenizer's output for one input string, extended with a
    ``'label'`` entry holding that row's score cast to ``float32``.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        # Inputs are forced to str; scores are kept with their original dtype.
        self.inputs = df['input'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        text = self.inputs[item]
        score = self.label[item]
        encoded = self.tokenizer(text)
        return {**encoded, 'label': score.astype(np.float32)}


class TestDataset(Dataset):
    """
    Label-free dataset over the ``input`` column of a frame.

    Each item is a plain dict copy of the tokenizer's output for one input
    string.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        # Inputs are forced to str before tokenization.
        self.inputs = df['input'].values.astype(str)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        text = self.inputs[item]
        encoded = self.tokenizer(text)
        return {**encoded}


def compute_metrics(eval_pred):
    """
    Compute the Pearson correlation between predictions and labels.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predictions
    are reshaped to a 1-D array with one entry per row before being
    correlated with the labels.

    Returns a dict with the single key ``'pearson'``.
    """
    raw_predictions, labels = eval_pred
    flat_predictions = raw_predictions.reshape(len(raw_predictions))
    correlation_matrix = np.corrcoef(flat_predictions, labels)
    # Off-diagonal element = correlation between the two series.
    return {'pearson': correlation_matrix[0][1]}


def log_content(write_file, content):
    """Append ``content`` to the file at ``write_file`` (opened in 'a' mode)."""
    with open(write_file, 'a') as log_handle:
        log_handle.write(content)

In [None]:
def infer_single_model(model_save_path, df_test, log_file=None):
    """
    Load one saved model and run prediction on ``df_test``.

    Parameters
    ----------
    model_save_path : str
        Directory from which both the model and the tokenizer are loaded;
        also passed to ``TrainingArguments`` as ``output_dir``.
    df_test : pandas.DataFrame
        Frame with an ``input`` column (consumed by ``TestDataset``).
    log_file : str, optional
        When given, every entry of the trainer's log history is appended to
        this file, one per line.

    Returns
    -------
    The object returned by ``Trainer.predict``; callers read its
    ``.predictions`` attribute.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_save_path, num_labels=1)
    tokenizer = AutoTokenizer.from_pretrained(model_save_path)

    dataset = TestDataset(df_test, tokenizer)

    # Same argument set as originally written; only per_device_eval_batch_size
    # is obviously relevant to predict(), the rest is passed through unchanged.
    argument_values = dict(
        output_dir=model_save_path,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="pearson",
        logging_strategy='epoch',
    )
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**argument_values),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    outputs = trainer.predict(dataset)

    for entry in trainer.state.log_history:
        print(entry)
        if log_file:
            log_content(log_file, str(entry) + '\n')
    return outputs

In [None]:
def infer_loop_folds(n_folds, df_test, model_saved_path, model_type):
    """
    Run inference with every fold model of one model variant.

    Parameters
    ----------
    n_folds : int
        Number of fold models to load (folds ``0 .. n_folds - 1``).
    df_test : pandas.DataFrame
        Preprocessed test frame, forwarded to ``infer_single_model``.
    model_saved_path : str
        Directory containing one sub-directory per fold.
    model_type : str
        Suffix of the fold directory names. If it contains ``'-'`` the
        directories are named ``fold-<i><model_type>``, otherwise
        ``fold_<i><model_type>``.

    Returns
    -------
    list of list of float
        One flattened prediction list per fold, in fold order.
    """
    print('length of test dataset:', len(df_test))

    # The fold-name separator depends only on model_type, so pick it once.
    fold_separator = '-' if '-' in model_type else '_'

    predictions = []
    for i in range(n_folds):
        model_path = model_saved_path + '/fold' + fold_separator + str(i) + model_type
        outputs = infer_single_model(model_path, df_test)
        predictions.append(outputs.predictions.reshape(-1).tolist())
        print('Fold', i, 'predicting finished...')
    return predictions

In [None]:
def infer_pipeline(input_path, cpc_path, input_type, model_path, model_save_path, model_type):
    """
    Load and preprocess the test data, then predict with every fold model of
    one model variant.

    Parameters
    ----------
    input_path : str
        Directory containing ``test.csv``.
    cpc_path : str
        CSV file with the CPC codes, read via ``load_cpc_dict``.
    input_type : str
        Underscore-separated variant identifier: its first field is the fold
        count and its second-to-last field the input layout. It is also the
        name of the variant's sub-directory under ``model_save_path``.
    model_path
        Not used by this function.
    model_save_path : str
        Root directory holding one sub-directory per variant.
    model_type : str
        Fold-directory suffix, forwarded to ``infer_loop_folds``.

    Returns
    -------
    list of list of float
        Per-fold predictions as returned by ``infer_loop_folds``.
    """
    df_test = pd.read_csv(os.path.join(input_path, 'test.csv'))
    cpc_dict = load_cpc_dict(pd.read_csv(cpc_path))

    type_fields = input_type.split('_')
    data_type = type_fields[-2]
    n_folds = int(type_fields[0])

    df_test_typed = preprocess_data(df_test, cpc_dict, data_type)

    variant_dir = os.path.join(model_save_path, input_type)
    return infer_loop_folds(n_folds, df_test_typed, variant_dir, model_type)

In [None]:
def infer_other_type(input_path, cpc_path, input_type, model_saved_path):
    """
    Predict with fold models stored as ``fold_<i>/best_model_end`` plus one
    additional stand-alone model.

    Parameters
    ----------
    input_path : str
        Directory containing ``test.csv``.
    cpc_path : str
        CSV file with the CPC codes, read via ``load_cpc_dict``.
    input_type : str
        Underscore-separated variant identifier: its first field is the fold
        count and its second-to-last field the input layout.
    model_saved_path : str
        Directory containing the ``fold_<i>/best_model_end`` sub-directories.

    Returns
    -------
    list of list of float
        One prediction list per fold, followed by the predictions of the
        model at ``'../input/kfold-model-atc-b4p'``.
    """
    test_file = os.path.join(input_path, 'test.csv')
    df_test = pd.read_csv(test_file)

    df_cpc = pd.read_csv(cpc_path)
    cpc_dict = load_cpc_dict(df_cpc)

    data_type = input_type.split('_')[-2]
    n_folds = int(input_type.split('_')[0])

    df_test_typed = preprocess_data(df_test, cpc_dict, data_type)
    predictions = []

    for i in range(n_folds):
        model_path = model_saved_path + '/fold_' + str(i) + '/best_model_end'

        # Pass the preprocessed frame explicitly instead of relying on
        # preprocess_data having mutated df_test in place.
        outputs = infer_single_model(model_path, df_test_typed)
        preds = outputs.predictions.reshape(-1).tolist()
        predictions.append(preds)
        print('Fold', i, 'predicting finished...')

    output_single = infer_single_model('../input/kfold-model-atc-b4p', df_test_typed)
    pred_single = output_single.predictions.reshape(-1).tolist()
    predictions.append(pred_single)
    return predictions

In [None]:
# Lookup applied to the `score` column by preprocess_train_data
# (df.score.map(score2word)) to produce `word_score`.
# NOTE(review): left empty here, so that mapping would presumably yield only
# missing values -- fill in before using the training preprocessing.
score2word = {
    
}
def preprocess_mask_on_test(sentence, model, tokenizer):
    """
    Fill the mask token of ``sentence`` with the model's top prediction.

    The sentence is tokenized, passed through ``model``, and the highest
    probability token at the first mask position is decoded. Every occurrence
    of ``tokenizer.mask_token`` in ``sentence`` is replaced with that word.

    Parameters
    ----------
    sentence : str
        Text expected to contain ``tokenizer.mask_token``.
    model : callable
        Called as ``model(**tokenized_input)``; its result must expose
        ``.logits`` indexable as (batch, position, vocabulary).
    tokenizer
        Callable tokenizer providing ``mask_token``, ``mask_token_id`` and
        ``decode``.

    Returns
    -------
    str
        The sentence with the mask filled in, or the unchanged sentence when
        it contains no mask token (previously this case failed because the
        result variable was never assigned).
    """
    tokenized_input = tokenizer(sentence, return_tensors='pt')

    # torch.where(condition) returns a tuple with one index tensor per
    # dimension; take the single element so it can be used as a plain index.
    mask_positions = torch.where(tokenized_input.input_ids[0] == tokenizer.mask_token_id)[0]
    if mask_positions.numel() == 0:
        return sentence

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**tokenized_input)
    softmax = F.softmax(output.logits, dim=-1)

    mask_word = softmax[0, mask_positions, :]        # (num_masks, vocabulary)
    top_1 = torch.topk(mask_word, 1, dim=1)[1][0]    # best token id at the first mask

    new_sentence = sentence
    for token in top_1:
        word = tokenizer.decode([int(token)])
        new_sentence = sentence.replace(tokenizer.mask_token, word)
    return new_sentence


def preprocess_train_data(df, cpc_dict, score2word, tokenizer):
    """
    Add the prompt-style training columns to ``df`` in place.

    Columns written:

    * ``word_score``   -- ``score`` mapped through ``score2word``
    * ``context_text`` -- ``context`` mapped through ``cpc_dict``
    * ``input``        -- ``"<anchor> and <target> in <context_text>
      classifications are <mask token> similar."``

    Returns the same ``df`` object.
    """
    df['word_score'] = df['score'].map(score2word)
    df['context_text'] = df['context'].map(cpc_dict)

    # Build the prompt left to right; the mask token marks the slot to predict.
    phrase_pair = df['anchor'] + ' and ' + df['target']
    with_context = phrase_pair + ' in ' + df['context_text']
    df['input'] = with_context + ' classifications are ' + tokenizer.mask_token + ' similar.'
    return df


def preprocess_test_data(df, cpc_dict, model_for_mask, tokenizer):
    """
    Add the prompt-style inference columns to ``df`` in place.

    Columns written:

    * ``context_text`` -- ``context`` mapped through ``cpc_dict``
    * ``input``        -- ``"<anchor> and <target> in <context_text>
      classifications are <mask token> similar."``

    ``model_for_mask`` is accepted but not used: the mask token is left in
    the prompt rather than being filled in by a model.

    Returns the same ``df`` object.
    """
    df['context_text'] = df['context'].map(cpc_dict)

    # Build the prompt left to right; the mask token stays in the text.
    phrase_pair = df['anchor'] + ' and ' + df['target']
    with_context = phrase_pair + ' in ' + df['context_text']
    df['input'] = with_context + ' classifications are ' + tokenizer.mask_token + ' similar.'
    return df

In [None]:
def infer_single_model_sent(df_test, model, tokenizer):
    """
    Predict on ``df_test`` with an already loaded model and tokenizer.

    The frame's ``input`` column is wrapped in a ``TestDataset`` and run
    through a ``Trainer`` built with default arguments.

    Returns the predictions flattened into a plain list.
    """
    dataset = TestDataset(df_test, tokenizer)
    predictor = Trainer(model=model, tokenizer=tokenizer)
    return predictor.predict(dataset).predictions.reshape(-1).tolist()

In [None]:
def infer_pipeline_sent(input_path, cpc_path, score2word, model_saved_path):
    """
    Predict with every prompt-style model found under ``model_saved_path``.

    Each entry of ``model_saved_path`` is treated either as a single model
    (when it contains ``pytorch_model.bin``) or as a directory of five fold
    models named ``fold-<i>_model``.

    Parameters
    ----------
    input_path : str
        Directory containing ``test.csv``.
    cpc_path : str
        CSV file with the CPC codes, read via ``load_cpc_dict``.
    score2word
        Not used by this function.
    model_saved_path : str
        Directory whose entries are the models to run.

    Returns
    -------
    list of list of float
        One prediction list per loaded model.
    """
    df_test = pd.read_csv(os.path.join(input_path, 'test.csv'))
    cpc_dict = load_cpc_dict(pd.read_csv(cpc_path))

    preds = []
    for model_name in os.listdir(model_saved_path):
        model_file = os.path.join(model_saved_path, model_name)

        # Work out which checkpoint directories this entry stands for.
        if os.path.exists(model_file + '/pytorch_model.bin'):
            checkpoint_dirs = [model_file]
        else:
            print('Model with folds...')
            checkpoint_dirs = (model_file + '/fold-' + str(i) + '_model' for i in range(5))

        for checkpoint_dir in checkpoint_dirs:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
            model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=1)
            # No mask-filling model is used; the prompt keeps its mask token.
            df_test_processed = preprocess_test_data(df_test, cpc_dict, None, tokenizer)
            preds.append(infer_single_model_sent(df_test_processed, model, tokenizer))

    return preds

In [None]:
if __name__ == '__main__':
    # Ensemble entry point: average the predictions of every available 'DB'
    # fold model across the model roots below and write submission.csv.
    df_submission = pd.read_csv(os.path.join(INPUT_PATH, 'sample_submission.csv'))

    # (model root directory, fold-directory suffix, input types to skip for that root)
    ensemble_sources = [
        ('../input/model-all-with-higher-score', '_model', ()),
        ('../input/trained-model-all', '_best_model_end', ('5_Fold_ATSCC_DB',)),
        # Prompt-style models are currently disabled:
        #     model_saved_path = '../input/model-trans-to-sent'
        #     preds = infer_pipeline_sent(INPUT_PATH, CPC_PATH, score2word, model_saved_path)
        #     predictions.extend(preds)
        ('../input/model-gkf', '-model', ()),
    ]

    predictions = []
    for model_save_path, model_type, skipped_types in ensemble_sources:
        for in_type in INPUT_TYPE:
            # Only the 'DB' variants take part in this ensemble.
            if 'DB' not in in_type or in_type in skipped_types:
                continue
            # Variants without a saved model directory are silently skipped.
            if not os.path.exists(model_save_path + '/' + in_type):
                continue
            pred_in_type = infer_pipeline(INPUT_PATH, CPC_PATH, in_type, MODEL_PATH, model_save_path, model_type)
            predictions.extend(pred_in_type)

    # Averaging an empty list would yield NaN and write a useless submission.
    if not predictions:
        raise RuntimeError('No model directories were found; nothing to average into a submission.')

    predictions_mean = np.mean(predictions, axis=0)
    df_submission['score'] = predictions_mean
    df_submission.to_csv('submission.csv', index=False)
    print('Infering finished...  Predictions size:', len(predictions))