# 1. Imports, definitions, settings & data loading

In [None]:
import gc

import pandas as pd
import numpy as np

from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          Trainer)

from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import MinMaxScaler

import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [None]:
def inference_fn(df, model_path, n_folds, invert_label=None):
    """Run every fold checkpoint of one model family over ``df``.

    Args:
        df: frame handed to ``InferDataset`` (which reads its ``text`` and
            ``target`` columns).
        model_path: directory prefix; fold ``k`` is loaded from
            ``f'{model_path}uspppm_{k}'``.
        n_folds: number of fold checkpoints to load, starting at 0.
        invert_label: forwarded to ``InferDataset`` only when truthy.

    Returns:
        A list with one ``Trainer.predict(...).predictions`` entry per fold.
    """
    # the tokenizer stored with fold 0 is reused for all folds
    tokenizer = AutoTokenizer.from_pretrained(f'{model_path}uspppm_0')

    # a falsy flag falls back to the dataset's own default
    dataset_args = (df, tokenizer, invert_label) if invert_label \
        else (df, tokenizer)
    te_dataset = InferDataset(*dataset_args)

    fold_outputs = []
    for fold in range(n_folds):
        checkpoint = f'{model_path}uspppm_{fold}'
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint, num_labels=1)
        trainer = Trainer(model, tokenizer=tokenizer)

        fold_outputs.append(trainer.predict(te_dataset).predictions)

        # drop the fold's model before the next one is loaded
        del model, trainer
        gc.collect()

    del tokenizer, te_dataset
    gc.collect()

    return fold_outputs


def upd_outputs(data, is_trim=False, is_minmax=False, is_reshape=False):
    """Post-process a block of raw model outputs.

    The enabled steps always run in this order: trim, min-max scale, flatten.

    Args:
        data: array-like of predictions. The min-max step passes it straight
            to ``MinMaxScaler.fit_transform`` (which expects 2-D input —
            see the scikit-learn documentation).
        is_trim: clamp every value into ``[0, 1]``.
        is_minmax: rescale with a ``MinMaxScaler`` fitted on ``data`` itself.
        is_reshape: flatten the result with ``reshape(-1)``.

    Returns:
        The processed data; the input object itself when no step is enabled.
    """
    if is_trim:
        # asarray keeps the result a plain ndarray, so reshape() below works
        data = np.clip(np.asarray(data), 0, 1)

    if is_minmax:
        # the scaler is only created when it is actually needed
        data = MinMaxScaler().fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data


def trim_outliers(data, q_left=None, q_right=None):
    """Replace values outside the requested quantile range with NaN.

    Args:
        data: array-like of numbers.
        q_left: lower quantile in ``[0, 1]``; values below it become NaN.
            ``None`` disables the lower bound.
        q_right: upper quantile in ``[0, 1]``; values above it become NaN.
            ``None`` disables the upper bound.

    Returns:
        ``data`` unchanged when both bounds are ``None``; otherwise an array
        from ``np.where`` with the out-of-range entries set to NaN.
    """
    # Both bounds come from the untouched input, before any masking.
    # ``is None`` checks (not truthiness) so a missing bound no longer leaves
    # the variable unbound and a quantile of exactly 0.0 is still applied.
    min_v = None if q_left is None else np.quantile(data, q=q_left)
    max_v = None if q_right is None else np.quantile(data, q=q_right)

    if min_v is not None:
        data = np.where(data >= min_v, data, np.nan)

    if max_v is not None:
        data = np.where(data <= max_v, data, np.nan)

    return data


def get_all_mean(df):
    """Average the columns that sit under each top-level column label.

    Returns a frame with one column per unique level-0 label of
    ``df.columns``, holding the row-wise mean of that label's sub-columns.
    """
    top_labels = df.columns.get_level_values(0).unique()
    return pd.DataFrame(
        {label: df[label].mean(axis=1) for label in top_labels}
    )


def show_gradient(df, n_row=None):
    """Return a styled preview of ``df`` with a row-wise colour gradient.

    The first ``n_row`` rows (5 when ``n_row`` is falsy) get an extra
    ``all_mean`` column with the row mean, then the gradient is applied
    across each row using the module-level colormap ``cm``.
    """
    n_row = n_row or 5

    preview = df.head(n_row)
    preview = preview.assign(all_mean=lambda frame: frame.mean(axis=1))

    return preview.style.background_gradient(cmap=cm, axis=1)


def get_final_scores(df):
    """Build a table of candidate ensemble scores from per-fold predictions.

    ``df`` is expected to carry the model name as the top column level (it
    is passed to ``get_all_mean``). The result starts as a copy of the
    per-model means and gains these columns:

    * ``m-mean`` / ``m-median``: mean / median of the per-model means.
    * ``a-mean`` / ``a-median``: mean / median over every column of ``df``.
    * ``c-mean`` / ``c-median``: mean / median of the leave-one-model-out
      averages.
    * one column per hard-coded weight set, named after its weights.
    """
    all_mean = get_all_mean(df)
    result = all_mean.copy()

    # statistics over the per-model means
    result['m-mean'] = all_mean.mean(axis=1)
    result['m-median'] = all_mean.median(axis=1)

    # statistics over every single prediction column
    result['a-mean'] = df.mean(axis=1)
    result['a-median'] = df.median(axis=1)

    # leave-one-out: average all columns of every model except one
    model_names = all_mean.columns
    leave_one_out = pd.concat(
        [
            df[[other for other in model_names if other != left_out]]
            .mean(axis=1)
            for left_out in model_names
        ],
        axis=1,
    )
    result['c-mean'] = leave_one_out.mean(axis=1)
    result['c-median'] = leave_one_out.median(axis=1)

    # weighted sums; weights are applied positionally to the model columns,
    # so each set below assumes exactly four models
    weight_sets = [
        [.25, .25, .25, .25],
        [.3, .3, .3, .1],
        [.2, .2, .2, .4],
    ]
    for weights in weight_sets:
        # str(.25)[1:] -> '.25', so the column reads e.g. '.3-.3-.3-.1'
        label = '-'.join(str(w)[1:] for w in weights)
        result[label] = all_mean.mul(weights).sum(axis=1)

    return result

In [None]:
# number of rows shown in the preview tables
N_ROW = 10

# display settings shared by the styled previews
pd.set_option("display.precision", 3)
cm = sns.light_palette("green", as_cmap=True)
props_param = "color:white; font-weight:bold; background-color:green;"

In [None]:
competition_dir = "../input/us-patent-phrase-to-phrase-matching/"

# both files are read from the competition folder
submission = pd.read_csv(f'{competition_dir}sample_submission.csv')
test_origin = pd.read_csv(f'{competition_dir}test.csv')

test_origin.head()

In [None]:
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Attach the title data to every test row via its context code.
# how='left' keeps test rows whose context has no entry in `titles`; the
# default inner join would drop them and shift every later prediction
# against test_origin['id']. validate= raises if a code occurs more than
# once in `titles`, which would otherwise duplicate test rows.
test = test_origin.reset_index().merge(
    titles,
    how='left',
    left_on='context',
    right_on='code',
    validate='many_to_one',
)

# restore the original row order and a clean 0..n-1 index
test.sort_values(by='index', inplace=True)
test.drop(columns='index', inplace=True)
test.reset_index(drop=True, inplace=True)

# a missing title must not turn the concatenated model input into NaN
test['title'] = test['title'].fillna('')

test.head()

# 2. Extract predictions

### [USPPPM] BERT for Patents Baseline [inference]
> https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference

|version|model|val strategy|CV|LB|
|---|---|---|---|---|
|02| BERT for Patents| hold out| 0.85 | 0.815 |
|05| BERT for Patents| 5folds| 0.853 | 0.825 |
|07| deberta-v3-large| 5folds| 0.867 | 0.833 | 
|09| deberta-v3-large| 5folds| 0.862 | 0.836 | 
|12| deberta-v3-large| 5folds| 0.862 | 0.837 | 

**Please upvote the original notebook!**

In [None]:
class InferDataset(Dataset):
    """Dataset that tokenizes (text, target) pairs on access.

    Args:
        df: frame with ``text`` and ``target`` columns.
        tokenizer: callable invoked as ``tokenizer(text, target)``; its
            result is copied into a plain dict.
        invert_label: when equal to ``True`` every item also carries a
            constant ``label`` of ``-1``.
            NOTE(review): despite the name nothing is inverted here; the
            reason for the dummy label is not visible in this file.
    """

    def __init__(self, df, tokenizer, invert_label=False):
        self.tokenizer = tokenizer
        self.invert_label = invert_label
        self.inputs = df['text'].values
        self.targets = df['target'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        encoded = dict(self.tokenizer(self.inputs[item], self.targets[item]))

        # equality test kept on purpose: only values equal to True qualify
        if self.invert_label == True:
            encoded['label'] = -1

        return encoded

In [None]:
# One entry per model family used in the ensemble:
#   name          label used for the prediction columns
#   path          input directory holding the fold checkpoints
#   folds         number of fold checkpoints to load
#   add_sep       join title and anchor with "[SEP]" instead of a space
#   invert_label  forwarded to InferDataset via inference_fn
carousel = [
    dict(
        name='ver-12',
        path='../input/uspppm-debertv3large-5folds-v2/',
        folds=5,
        add_sep=True,
        invert_label=False,
    ),
    dict(
        name='ver-7',
        path='../input/uspppm-debertv3large-5folds/',
        folds=5,
        add_sep=True,
        invert_label=False,
    ),
    dict(
        name='ver-5',
        path='../input/uspppm-bert-for-patents-baseline-5folds/',
        folds=5,
        add_sep=False,
        invert_label=True,
    ),
    dict(
        name='ver-2',
        path='../input/usppm-bert-baseline-fold0/',
        folds=1,
        add_sep=False,
        invert_label=True,
    ),
]

In [None]:
# (model name, per-fold prediction frame) for every configured model
extracted_data = []

for x in carousel:
    model_name, model_path = x.get('name'), x.get('path')
    n_folds = x.get('folds')
    is_add_sep, is_invert_label = x.get('add_sep'), x.get('invert_label')

    df = test.copy()

    # model input: title and anchor joined by "[SEP]" or a single space
    txt_sep = "[SEP]" if is_add_sep else " "
    df['text'] = df['title'] + txt_sep + df['anchor']

    print(f'\n{model_name}\n')
    display(df.head())

    prediction = inference_fn(df, model_path, n_folds, is_invert_label)

    # each fold is min-max scaled on its own and flattened, then the folds
    # become the columns of one frame
    prediction = pd.DataFrame(
        [upd_outputs(fold_pred, is_minmax=True, is_reshape=True)
         for fold_pred in prediction]
    ).T

    extracted_data.append((model_name, prediction))

    del df
    gc.collect()

# 3. Improve predictions

1. Save extracted predictions
1. Add additional folds
1. Trim outliers

In [None]:
# every stage of the predictions is stored here under its own key
data_dict = {
    'extracted': pd.concat(
        [frame for _name, frame in extracted_data],
        keys=[label for label, _frame in extracted_data],
        axis=1,
    )
}

In [None]:
# preview of the raw per-fold predictions
show_gradient(data_dict.get('extracted'), n_row=N_ROW)

### Model 'ver-2' has only one fold, so extra pseudo-folds are added to it, each built by averaging the per-model mean predictions with one of the other models left out

In [None]:
data_source = data_dict.get('extracted')

# same (name, frame) layout as extracted_data, with extra columns on 'ver-2'
increased_data = []

for name, df in extracted_data:
    if name == 'ver-2':
        # Work on a copy: adding columns to the frame stored in
        # extracted_data would leak the pseudo-folds into 'extracted'
        # whenever that cell is run again.
        df = df.copy()

        # *** Carousel ***
        # (new column name, model left out of the average)
        use_names = [
            ('-1-', 'ver-12'),
            ('-2-', 'ver-7'),
            ('-3-', 'ver-5')
        ]

        all_mean_ = get_all_mean(data_source)

        for col_name, ver_name in use_names:
            use_cols_ = [col for col in all_mean_.columns
                             if col != ver_name]

            # pseudo-fold: mean of the remaining per-model means
            df[col_name] = all_mean_[use_cols_].mean(axis=1)

    increased_data.append((name, df))

In [None]:
# same layout as 'extracted', now with the pseudo-folds on 'ver-2'
data_dict['increased'] = pd.concat(
    [frame for _name, frame in increased_data],
    keys=[label for label, _frame in increased_data],
    axis=1,
)

show_gradient(data_dict.get('increased'), n_row=N_ROW)

In [None]:
check_data = data_dict.get('increased')

# per-row quantile bounds, reused by the trimming cell
quantile_values = dict(q_left=0.05, q_right=0.95)

# highlight, row by row, the values inside the quantile range
check_data.head(N_ROW).style.highlight_quantile(
    axis=1,
    props=props_param,
    **quantile_values,
)

In [None]:
# row by row, values outside the quantile bounds become NaN
data_dict['trimmed'] = check_data.transform(
    trim_outliers,
    axis=1,
    **quantile_values,
)

show_gradient(data_dict.get('trimmed'), n_row=N_ROW)

# 4. Create & Select score

```
def get_final_scores(df):
    # *** Models mean ***
    all_mean = get_all_mean(df)

    result = all_mean.copy()
    
    # *** Using models mean ****
    result['m-mean'] = all_mean.mean(axis=1) 
    result['m-median'] = all_mean.median(axis=1) 

    # *** Using all predictions ***
    result['a-mean'] = df.mean(axis=1)
    result['a-median'] = df.median(axis=1)
    
    [...]
```

In [None]:
# candidate score tables for each stage of the predictions
scores_dict = {
    stage: get_final_scores(data_dict.get(stage))
    for stage in ('extracted', 'increased', 'trimmed')
}

In [None]:
# candidate scores from the raw predictions
show_gradient(scores_dict.get('extracted'), n_row=N_ROW)

In [None]:
# candidate scores after adding the pseudo-folds
show_gradient(scores_dict.get('increased'), n_row=N_ROW)

In [None]:
# candidate scores after trimming outliers
show_gradient(scores_dict.get('trimmed'), n_row=N_ROW)

In [None]:
# which stage to submit from: 'extracted' | 'increased' | 'trimmed'
select_data = 'increased'

quantile_values = dict(q_left=0.15, q_right=0.85)

# highlight, row by row, the scores inside the quantile range
scores_dict.get(select_data).head(N_ROW).style.highlight_quantile(
    axis=1,
    props=props_param,
    **quantile_values,
)

In [None]:
# score column (from the selected stage's table) used for the submission
select_score = 'a-mean'

final_score = (
    scores_dict
    .get(select_data)
    [select_score]
)

# 5. Create submission

In [None]:
# ids come from the untouched test file; scores are matched to them by index
submission = pd.DataFrame(
    dict(id=test_origin['id'], score=final_score)
)

submission.head(N_ROW)

In [None]:
# 0	4112d61851461f60	0.4381
# 1	09e418c93a776564	0.6738
# 2	36baf228038e314b	0.4676
# 3	1f37ead645e7f0c8	0.2384
# 4	71a5b6ad068d531f	0.0073
# 5	474c874d0c07bd21	0.4691
# 6	442c114ed5c4e3c9	0.4548
# 7	b8ae62ea5e1d8bdb	0.0066
# 8	faaddaf8fcba8a3f	0.2581
# 9	ae0262c02566d2ce	1.0000

In [None]:
# write the submission file without the frame's index column
submission.to_csv(path_or_buf='submission.csv', index=False)