In [None]:
import os
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import pandas as pd
import numpy as np

os.environ["WANDB_DISABLED"] = "true"

class CFG:
    """Static settings for the first (HF Trainer based) ensemble."""

    # Competition data directory; joined by plain string concatenation, hence the trailing slash.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # One directory per model family; fold checkpoints are addressed as `<dir>fold<k>`.
    model_path = [
        '../input/deberta-v3-5folds/',
        '../input/bert-for-patent-5fold/',
        '../input/deberta-large-v1/',
    ]

    num_fold = 5
    model_num = 3

# CPC code lookup table (joined on its `code` column).
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Join each test row with its CPC title, build the first tokenizer segment
# "<title>[SEP]<anchor>", then discard the columns that are no longer needed.
test = (
    pd.read_csv(f"{CFG.input_path}test.csv")
    .merge(titles, left_on='context', right_on='code')
    .assign(input=lambda frame: frame['title'] + '[SEP]' + frame['anchor'])
    .drop(columns=["context", "code", "class", "subclass", "group", "main_group", "anchor", "title", "section"])
)

# Weighted blend of several model families. Each family's weight is divided by
# the number of folds, so summing all entries yields the weighted average of
# the per-family fold means.
predictions = []
weights = [0.5, 0.3, 0.2]

for i in range(CFG.model_num):
    # Every fold of a family is tokenized with the tokenizer stored next to fold 0.
    tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path[i]}fold0')

    def process_test(unit):
        """Tokenize one test row as the (input, target) text pair."""
        return {
            **tokenizer(unit['input'], unit['target'])
        }

    def process_valid(unit):
        """Tokenize one labelled row and attach its score as `label`."""
        return {
            **tokenizer(unit['input'], unit['target']),
            'label': unit['score']
        }

    test_ds = datasets.Dataset.from_pandas(test)
    # Drop every source column by asking the dataset for its own column names:
    # the `__index_level_0__` helper column is only present for some pandas
    # index types, and naming a missing column in `remove_columns` is an error.
    test_ds = test_ds.map(process_test, remove_columns=test_ds.column_names)

    for fold in range(CFG.num_fold):
        model = AutoModelForSequenceClassification.from_pretrained(
            f'{CFG.model_path[i]}fold{fold}',
            num_labels=1,
        )
        trainer = Trainer(
            model,
            tokenizer=tokenizer,
        )

        outputs = trainer.predict(test_ds)
        # Split the family weight across folds using the configured fold count
        # rather than a hard-coded 5.
        prediction = outputs.predictions.reshape(-1) * (weights[i] / CFG.num_fold)
        predictions.append(prediction)

predictions = np.sum(predictions, axis=0)

# `test['id']` comes from the merged frame, so ids and scores share one row order.
submission_1 = datasets.Dataset.from_dict({
    'id': test['id'],
    'score': predictions,
})

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Convert the HF Dataset into a DataFrame, then min-max rescale the blended score column.
submission_1 = submission_1.to_pandas()
mm = MinMaxScaler()
submission_1['score'] = mm.fit_transform(
    np.array(submission_1['score']).reshape(-1, 1)  # the scaler expects a 2-D column
)

In [None]:
# Display the rescaled first-ensemble submission as the cell output.
submission_1

In [None]:
import os
import gc
import random

import numpy as np
import pandas as pd

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from dataclasses import dataclass

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

import warnings 
warnings.filterwarnings('ignore')

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    Also exports `PYTHONHASHSEED` and switches cuDNN to deterministic mode
    with benchmarking disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    
def inference_fn(test_loader, model, device, is_sigmoid=True):
    """Run `model` over every batch of `test_loader` and stack the outputs.

    The model is put in eval mode and moved to `device`. Each batch is a
    mapping whose values are moved to `device` and which is passed to the
    model as a single positional argument, with gradients disabled.

    Args:
        test_loader: iterable of batch mappings; must support `len()`.
        model: callable module taking the batch mapping.
        device: target device for model and batch tensors.
        is_sigmoid: when True, apply `.sigmoid()` to each batch output.

    Returns:
        NumPy array with all batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)

    collected = []
    progress = tqdm(test_loader, total=len(test_loader))

    for batch in progress:
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)

        with torch.no_grad():
            raw = model(batch)

        scored = raw.sigmoid() if is_sigmoid == True else raw
        collected.append(scored.to('cpu').numpy())

    return np.concatenate(collected)
    

def upd_outputs(data, is_trim=False, is_minmax=True, is_reshape=False):
    """Post-process one model's raw predictions.

    Steps are applied in this order, each only if its flag is truthy:

    1. `is_trim`: values <= 0.01 become 0 and values >= 0.99 become 1.
    2. `is_minmax`: `MinMaxScaler().fit_transform(data)` (the scaler
       requires 2-D input, e.g. shape (n_samples, 1)).
    3. `is_reshape`: flatten to 1-D.

    Args:
        data: NumPy array of predictions.
        is_trim: snap near-0 / near-1 values to exactly 0 / 1.
        is_minmax: min-max rescale the data.
        is_reshape: return a flattened array.

    Returns:
        The processed array; the input object itself when every flag is off.
    """
    if is_trim:
        data = np.where(data <= 0.01, 0, data)
        data = np.where(data >= 0.99, 1, data)

    if is_minmax:
        # The scaler is only built when it is actually used.
        data = MinMaxScaler().fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data

# --- run-wide constants -------------------------------------------------------
CUSTOM_SEED = 42
CUSTOM_BATCH = 24
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- display helpers for the styled prediction tables -------------------------
pd.set_option('display.precision', 4)
cm = sns.light_palette('green', as_cmap=True)
props_param = "color:white; font-weight:bold; background-color:green;"

# --- competition inputs -------------------------------------------------------
competition_dir = "../input/us-patent-phrase-to-phrase-matching/"

submission = pd.read_csv(f"{competition_dir}sample_submission.csv")
test_origin = pd.read_csv(f"{competition_dir}test.csv")
def prepare_input(cfg, text):
    """Tokenize `text` to exactly `cfg.max_len` tokens and tensorize the result.

    Args:
        cfg: object exposing `tokenizer` (callable) and `max_len`.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with every value replaced by a
        `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_len,
    )

    for field, values in encoded.items():
        encoded[field] = torch.tensor(values, dtype=torch.long)

    return encoded

class TestDataset(Dataset):
    """Map-style dataset yielding tokenized rows of a frame's `text` column."""

    def __init__(self, cfg, df):
        # Only the raw strings are kept; tokenization happens lazily per item.
        self.text = df['text'].values
        self.cfg = cfg

    def __getitem__(self, item):
        """Return the tensorized encoding of the `item`-th text."""
        return prepare_input(self.cfg, self.text[item])

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)
   
    
class CustomModel(nn.Module):
    """Sequence-classification backbone with a single label, built from config only.

    No pretrained weights are loaded here (`from_config`); the caller is
    expected to load a state dict afterwards.
    """

    def __init__(self, model_path):
        super().__init__()

        backbone_config = AutoConfig.from_pretrained(model_path)
        backbone_config.num_labels = 1
        self.base = AutoModelForSequenceClassification.from_config(config=backbone_config)

        # `dropout` and `cls` are not used by forward(); they are kept as
        # attributes — presumably so the module's state-dict keys match the
        # saved checkpoints (TODO confirm against the .pth files).
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(backbone_config.hidden_size, 1)

    def forward(self, inputs):
        """Unpack the batch mapping into the backbone and return its first output."""
        return self.base(**inputs)[0]
    
seed_everything(CUSTOM_SEED)


class CFG:
    """Settings for the first DeBERTa-v3-large fold set."""

    # Backbone config / tokenizer location.
    model_path='../input/deberta-v3-large/deberta-v3-large'

    # Fold indices whose checkpoints are loaded.
    trn_fold=[0, 1, 2, 3]

    # Tokenization and loading.
    max_len=130
    batch_size=CUSTOM_BATCH
    num_workers=2


# Attached after class creation so it can reference CFG.model_path.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

# CPC code -> descriptive text, and the CPC titles table.
context_mapping = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Remember the original row position in an `index` column, merge with the
# titles, then sort by that column so rows are back in their original order.
test = (
    test_origin.copy()
    .reset_index()
    .merge(titles, left_on='context', right_on='code')
    .sort_values(by='index')
    .drop(columns='index')
)

# Lower-cased model input: "<anchor>[SEP]<target>[SEP]<context text>".
test['context_text'] = test['context'].map(context_mapping)
test['text'] = (
    test['anchor'] + '[SEP]' + test['target'] + '[SEP]' + test['context_text']
).apply(str.lower)

test.head()

# Fixed-order loader over the prepared test frame (no shuffling, no dropped rows).
test_dataset = TestDataset(CFG, test)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

deberta_simple_path = "../input/us-patent-deberta-simple/microsoft_deberta-v3-large"
deberta_predicts_1 = []

for fold in CFG.trn_fold:
    fold_path = f"{deberta_simple_path}_best{fold}.pth"

    # The checkpoint is deserialized on CPU; inference_fn moves the model to DEVICE.
    model = CustomModel(CFG.model_path)
    state = torch.load(fold_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    # Raw outputs are kept (no sigmoid) for this fold set.
    prediction = inference_fn(test_dataloader, model, DEVICE, is_sigmoid=False)
    deberta_predicts_1.append(prediction)

    # Drop this fold's objects before the next model is built.
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

# One column per fold, each min-max scaled independently and flattened.
deberta_predicts_1 = pd.DataFrame(
    [upd_outputs(fold_pred, is_minmax=True, is_reshape=True) for fold_pred in deberta_predicts_1]
).T

# Row-wise ensemble statistics. Each statistic is computed over all columns
# present at that point, i.e. including the statistic columns added before it.
deberta_predicts_1['med'] = deberta_predicts_1.median(axis=1)
deberta_predicts_1['minmax'] = (
    deberta_predicts_1.max(axis=1) + deberta_predicts_1.min(axis=1)
) / 2
deberta_predicts_1['quantile'] = (
    deberta_predicts_1.quantile(0.25, axis=1) + deberta_predicts_1.quantile(0.75, axis=1)
) / 2

del test, test_dataset
gc.collect()

def prepare_input(cfg, text):
    """Tokenize `text` to exactly `cfg.max_len` tokens and tensorize the result.

    `truncation=True` is passed together with `padding="max_length"`, as in the
    other `prepare_input` definitions in this notebook. With padding alone, a
    text longer than `cfg.max_len` would produce a longer sequence than its
    batch mates and could not be stacked into one batch.

    Args:
        cfg: object exposing `tokenizer` (callable) and `max_len`.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with every value replaced by a
        `torch.long` tensor.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)

    return inputs


class TestDataset(Dataset):
    """Map-style dataset yielding tokenized rows of a frame's `text` column."""

    def __init__(self, cfg, df):
        # Keep only the raw strings; encoding is done on access.
        self.texts = df['text'].values
        self.cfg = cfg

    def __getitem__(self, item):
        """Return the tensorized encoding of the `item`-th text."""
        return prepare_input(self.cfg, self.texts[item])

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    
class CustomModel(nn.Module):
    """Transformer backbone with learned attention pooling and a linear head.

    Args:
        cfg: settings object; `fc_dropout` and `target_size` are always read,
            `model` only when no `config_path` is given or `pretrained` is True.
        config_path: path to a config object saved with `torch.save`; when
            None the config is fetched with `AutoConfig.from_pretrained`.
        pretrained: load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; only use with trusted config files.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
            
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Scores each position with a small MLP; Softmax(dim=1) normalizes the
        # scores along dim 1 (assumed to be the sequence axis — TODO confirm).
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # nn.Sequential matches none of the isinstance branches in
        # _init_weights, so this call leaves the attention layers' default
        # initialization untouched.
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        """Re-initialize a single Linear, Embedding or LayerNorm module in place.

        Linear/Embedding weights are drawn from N(0, config.initializer_range);
        biases and the padding-index embedding row are zeroed; LayerNorm is
        reset to weight 1 and bias 0. Other module types are ignored.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Return the attention-weighted sum of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        
        return feature

    def forward(self, inputs):
        """Pool the backbone output, apply dropout and project with the linear head."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        
        return output
    
seed_everything(CUSTOM_SEED)


class CFG:
    """Settings for the second DeBERTa-v3-large fold set (attention-pooling model)."""

    # Dataset directory holding the saved config, tokenizer and fold checkpoints.
    path="../input/pppm-deberta-v3-large-baseline-w-w-b-train/"
    config_path=path+'config.pth'
    model="microsoft/deberta-v3-large"

    # Head settings.
    target_size=1
    fc_dropout=0.2

    # Tokenization and loading.
    max_len=133
    batch_size=CUSTOM_BATCH
    num_workers=2

    # Fold indices whose checkpoints are loaded.
    trn_fold=[0, 1, 2, 3]


CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

# CPC code -> descriptive text shipped with this fold set.
context_mapping = torch.load(f"{CFG.path}cpc_texts.pth")

# Model input: "<anchor>[SEP]<target>[SEP]<context text>" (case preserved).
test = test_origin.assign(context_text=test_origin['context'].map(context_mapping))
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]' + test['context_text']

test.head()

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

# Checkpoint file prefix: the model name with '/' replaced by '-'.
folds_path = f"{CFG.path}{CFG.model.replace('/', '-')}"
deberta_predicts_2 = []

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_fold{fold}_best.pth"

    # The checkpoint is deserialized on CPU; inference_fn moves the model to DEVICE.
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(fold_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    # Default is_sigmoid=True: outputs are passed through a sigmoid.
    prediction = inference_fn(test_loader, model, DEVICE)
    deberta_predicts_2.append(prediction)

    # Drop this fold's objects before the next model is built.
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()
    
# One column per fold: each fold's predictions are min-max scaled and flattened.
deberta_predicts_2 = [upd_outputs(x, is_reshape=True) for x in deberta_predicts_2]
deberta_predicts_2 = pd.DataFrame(deberta_predicts_2).T

# Row-wise ensemble statistics; each one is computed over all columns present
# at that point, including the statistic columns added before it.
deberta_predicts_2['med'] = deberta_predicts_2.median(axis=1)
# Mid-range of THIS model's columns: both the max and the min must come from
# deberta_predicts_2, not from another model's frame.
deberta_predicts_2['minmax']= (deberta_predicts_2.max(axis=1) + deberta_predicts_2.min(axis=1)) /2
deberta_predicts_2['quantile'] = (deberta_predicts_2.quantile(0.25, axis=1) + deberta_predicts_2.quantile(0.75,axis=1)) /2

# Not the last expression of the cell, so this styled preview is not displayed.
deberta_predicts_2.head(10).style.background_gradient(cmap=cm, axis=1)
del test, test_dataset
gc.collect()

def prepare_input(cfg, text, target):
    """Tokenize the (`text`, `target`) pair to `cfg.max_len` tokens and tensorize it.

    Args:
        cfg: object exposing `tokenizer` (callable) and `max_len`.
        text: first segment of the pair.
        target: second segment of the pair.

    Returns:
        The tokenizer's output mapping with every value replaced by a
        `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        target,
        truncation=True,
        max_length=cfg.max_len,
        padding="max_length",
    )

    for field, values in encoded.items():
        encoded[field] = torch.tensor(values, dtype=torch.long)

    return encoded


class TestDataset(Dataset):
    """Map-style dataset yielding tokenized (`text`, `target`) pairs of a frame."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Raw strings only; the pair is encoded on access.
        self.target = df['target'].values
        self.texts = df['text'].values

    def __getitem__(self, item):
        """Return the tensorized encoding of the `item`-th (text, target) pair."""
        return prepare_input(self.cfg, self.texts[item], self.target[item])

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.texts)

    
class CustomModel(nn.Module):
    """Transformer encoder with mean pooling and one linear head shared by five dropout branches.

    All settings are read from the module-level `CFG` (`config_path`,
    `num_targets`) when the model is instantiated.
    """

    def __init__(self):
        super(CustomModel, self).__init__()
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(CFG.config_path)

        # Override selected config fields before the encoder is built.
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": hidden_dropout_prob,
                       "layer_norm_eps": layer_norm_eps,
                       "add_pooling_layer": False})
        
        # The encoder is loaded from CFG.config_path with pretrained weights
        # (from_pretrained, not from_config).
        self.transformer = AutoModel.from_pretrained(CFG.config_path, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout rates applied to the same pooled features in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, CFG.num_targets)
        
    def forward(self, inputs):
        """Return the average of the head's outputs over the five dropout branches."""
        transformer_out = self.transformer(**inputs)
        last_hidden_states = transformer_out[0]
        # Plain mean over dim 1; the attention mask is not used for weighting.
        last_hidden_states = self.dropout(torch.mean(last_hidden_states, 1))
        logits1 = self.output(self.dropout1(last_hidden_states))
        logits2 = self.output(self.dropout2(last_hidden_states))
        logits3 = self.output(self.dropout3(last_hidden_states))
        logits4 = self.output(self.dropout4(last_hidden_states))
        logits5 = self.output(self.dropout5(last_hidden_states))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        
        return logits
    
seed_everything(CUSTOM_SEED)


# None of the attributes below is annotated, so @dataclass generates no fields
# from them; they are plain class attributes read as CFG.<name>.
@dataclass(frozen=True)
class CFG:
    """Settings for the RoBERTa-large fold set."""

    # Locations and naming.
    config_path='../input/robertalarge'
    model_path='../input/phrase-matching-roberta-training-pytorch-wandb'
    model_name='roberta-large'

    # Head and fold settings.
    num_targets=1
    trn_fold=[0, 1, 2, 3, 4]

    # Tokenization and loading.
    tokenizer=AutoTokenizer.from_pretrained('../input/robertalarge')
    max_len=128
    batch_size=CUSTOM_BATCH
    num_workers=2

# Human-readable name keyed by the first letter of a context code.
context_mapping = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)

# First tokenizer segment: "<section name> <anchor>"; the target is passed
# separately as the second segment by TestDataset.
test = test_origin.assign(
    context_text=test_origin['context'].str.slice(stop=1).map(context_mapping)
)
test['text'] = test['context_text'] + ' ' + test['anchor']

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

# Checkpoint file prefix: "<model_path>/<model name with '-' replaced by '_'>".
folds_path = f"{CFG.model_path}/{CFG.model_name.replace('-','_')}"
roberta_predicts = []

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_patent_model_{fold}.pth"

    # These checkpoints are bare state dicts (loaded without a 'model' key lookup).
    model = CustomModel()
    state = torch.load(fold_path, map_location=torch.device('cpu'))
    model.load_state_dict(state)

    # Default is_sigmoid=True: outputs are passed through a sigmoid.
    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)

    # Drop this fold's objects before the next model is built.
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

# One column per fold, each min-max scaled independently and flattened.
roberta_predicts = pd.DataFrame(
    [upd_outputs(fold_pred, is_reshape=True) for fold_pred in roberta_predicts]
).T

# Row-wise ensemble statistics; each one is computed over all columns present
# at that point, including the statistic columns added before it.
roberta_predicts['med'] = roberta_predicts.median(axis=1)
roberta_predicts['minmax'] = (
    roberta_predicts.max(axis=1) + roberta_predicts.min(axis=1)
) / 2
roberta_predicts['quantile'] = (
    roberta_predicts.quantile(0.25, axis=1) + roberta_predicts.quantile(0.75, axis=1)
) / 2

del test, test_dataset
gc.collect()

In [None]:
# Side-by-side table of the three model families, with a top-level column key per family.
all_predictions = pd.concat(
    [deberta_predicts_1, deberta_predicts_2, roberta_predicts],
    axis=1,
    keys=['deberta 1', 'deberta 2', 'roberta'],
)

# Preview: first ten rows plus a row-wise mean, shaded per row.
(
    all_predictions
    .head(10)
    .assign(mean=lambda x: x.mean(axis=1))
    .style
    .background_gradient(cmap=cm, axis=1)
)

In [None]:
# Row-wise mean over each family's columns (fold columns and statistic columns alike).
all_mean = pd.DataFrame({
    'deberta 1': deberta_predicts_1.mean(axis=1),
    'deberta 2': deberta_predicts_2.mean(axis=1),
    'roberta': roberta_predicts.mean(axis=1),
})

# Three ways of combining the family means.
# NOTE(review): the weights sum to 0.99, so N11 is slightly below a true mean — confirm intended.
weights_ = [0.33, 0.33, 0.33]
final_predictions = pd.DataFrame({
    'N11': all_mean.mul(weights_).sum(axis=1),   # weighted sum
    'N22': all_mean.median(axis=1),              # median
    'N33': all_mean.mean(axis=1),                # plain mean
})

# Final score of the second ensemble: average of the three combinations.
fp = final_predictions.mean(axis=1)

In [None]:
# Second-ensemble submission: ids in original test order paired with the blended score.
submission_2 = pd.DataFrame({'id': test_origin['id'], 'score': fp})

In [None]:
# Align both ensembles by id; their score columns become `score_x` / `score_y`.
mix_scores = pd.merge(submission_1, submission_2, on='id')

In [None]:
# 60/40 blend of the first (score_x) and second (score_y) ensembles.
final = mix_scores['score_x'] * 0.6 + mix_scores['score_y'] * 0.4
submission_final_1 = pd.DataFrame({'id': mix_scores['id'], 'score': final})

In [None]:
import gc
import torch
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
import os
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np

import gc

os.environ["WANDB_DISABLED"] = "true"

class CFG:
    """Settings for the single DeBERTa-v3-large checkpoint."""

    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    # Must be a plain string: a trailing comma after the literal would turn it
    # into a 1-tuple, which is not a usable checkpoint path.
    model_path = '../input/debertav3uspppm0/deberta-v3-large-1/checkpoint-1020/'
    
    
# CPC code lookup table (joined on its `code` column).
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Join each test row with its CPC title, build the first tokenizer segment
# "<title>[SEP]<anchor>", then discard the columns that are no longer needed.
test = (
    pd.read_csv(f"{CFG.input_path}test.csv")
    .merge(titles, left_on='context', right_on='code')
    .assign(input=lambda frame: frame['title'] + '[SEP]' + frame['anchor'])
    .drop(columns=["context", "code", "class", "subclass", "group", "main_group", "anchor", "title", "section"])
)


# NOTE(review): the tokenizer is loaded from `uspppm_4` while the model weights
# come from `deberta-v3-large-1/checkpoint-1020` — confirm they belong together.
tokenizer = AutoTokenizer.from_pretrained('../input/debertav3uspppm0/uspppm_4')


def process_test(unit):
    """Tokenize one test row as the (input, target) text pair."""
    return {
        **tokenizer(unit['input'], unit['target'])
    }


def process_valid(unit):
    """Tokenize one labelled row and attach its score as `label`."""
    return {
        **tokenizer(unit['input'], unit['target']),
        'label': unit['score']
    }


test_ds = datasets.Dataset.from_pandas(test)
# Drop every source column by asking the dataset for its own column names:
# the `__index_level_0__` helper column is only present for some pandas index
# types, and naming a missing column in `remove_columns` is an error.
test_ds = test_ds.map(process_test, remove_columns=test_ds.column_names)

model = AutoModelForSequenceClassification.from_pretrained(
    '../input/debertav3uspppm0/deberta-v3-large-1/checkpoint-1020/',
    num_labels=1,
)
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)

outputs = trainer.predict(test_ds)

# Single regression output per row, flattened to 1-D.
predictions = outputs.predictions.reshape(-1)

# `test['id']` comes from the merged frame, so ids and scores share one row order.
submission_4 = datasets.Dataset.from_dict({
    'id': test['id'],
    'score': predictions,
})

In [None]:
# Convert the HF Dataset to a pandas DataFrame. Re-running this cell without
# re-running the previous one fails, because a DataFrame has no `to_pandas`.
submission_4 = submission_4.to_pandas()

In [None]:
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
import sys
sys.path.append("../input/tez-lib/")

import torch

import pandas as pd
import torch.nn as nn

from scipy import stats
from tez import Tez, TezConfig
from tez.callbacks import EarlyStopping
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

class args:
    """Settings for the BERT-for-patents Tez model."""

    # Backbone / tokenizer directory.
    model = "../input/anferico-bert-for-patents/"

    # Tokenization.
    max_len = 32

    # Batch and training hyper-parameters.
    batch_size = 64
    accumulation_steps = 1
    epochs = 5
    learning_rate = 2e-5

class PhraseDataset:
    """Indexable dataset encoding ("<context> <anchor>", target) pairs for the Tez model."""

    # (output key, tokenizer key) pairs, in the order the item dict is built.
    _FIELDS = (
        ("ids", "input_ids"),
        ("mask", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    def __init__(self, anchor, target, context, tokenizer, max_len):
        self.anchor = anchor
        self.target = target
        self.context = context
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows, taken from the anchor sequence."""
        return len(self.anchor)

    def __getitem__(self, item):
        """Return `ids`, `mask` and `token_type_ids` long tensors for row `item`."""
        first_segment = self.context[item] + " " + self.anchor[item]

        encoded_text = self.tokenizer.encode_plus(
            first_segment,
            self.target[item],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
        )

        return {
            out_key: torch.tensor(encoded_text[source_key], dtype=torch.long)
            for out_key, source_key in self._FIELDS
        }
    
class PhraseModel(nn.Module):
    """Transformer encoder with dropout and a single-output linear head on the pooled output."""

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

        config_overrides = {
            "output_hidden_states": True,
            "add_pooling_layer": True,
            "num_labels": 1,
        }
        config = AutoConfig.from_pretrained(model_name)
        config.update(config_overrides)

        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Score a batch from the encoder's `pooler_output`.

        Returns:
            A 3-tuple `(output, 0, {})`; presumably the (output, loss, metrics)
            triple the Tez wrapper expects — confirm against the Tez API.
        """
        pooled = self.transformer(ids, mask, token_type_ids).pooler_output
        score = self.output(self.dropout(pooled))
        return score, 0, {}
    
df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# Human-readable name keyed by the first letter of a context code.
context_mapping = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)

# Replace each context code by its section name (KeyError on an unknown first letter).
df['context'] = df['context'].apply(lambda code: context_mapping[code[0]])

tokenizer = AutoTokenizer.from_pretrained(args.model)
test_dataset = PhraseDataset(
    tokenizer=tokenizer,
    max_len=args.max_len,
    context=df.context.values,
    anchor=df.anchor.values,
    target=df.target.values,
)

# Wrap the network in Tez and restore the fold-0 weights.
model = Tez(PhraseModel(model_name=args.model))
model_path = "../input/uspppm-tez-models/model_f0.bin"
config = TezConfig(test_batch_size=64, device="cuda")
model.load(model_path, weights_only=True, config=config)

# Clip every predicted batch into [0, 1] in place and flatten into one list.
final_preds = []
preds_iter = model.predict(test_dataset)
for preds in preds_iter:
    preds[preds < 0] = 0
    preds[preds > 1] = 1
    final_preds.extend(preds.ravel().tolist())

# Scores are written positionally into the sample submission.
submission_5 = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
submission_5['score'] = final_preds

In [None]:
# Drop the reference to the Tez-wrapped model, then release cached CUDA memory.
del model
torch.cuda.empty_cache()

In [None]:
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
os.environ["WANDB_DISABLED"] = "true"

# Five-label classification checkpoint and the tokenizer stored in the same directory.
model = AutoModelForSequenceClassification.from_pretrained(
    '../input/patent-phrase-matching/patent_phrase/checkpoint-1026',
    num_labels=5,
    local_files_only=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    '../input/patent-phrase-matching/patent_phrase/checkpoint-1026'
)

class MyDataset(Dataset):
    """Thin map-style wrapper exposing an indexable collection as a torch Dataset."""

    def __init__(self, data):
        # Any object supporting len() and integer indexing.
        self.data = data

    def __getitem__(self, idx):
        """Return the `idx`-th stored element unchanged."""
        return self.data[idx]

    def __len__(self):
        """Number of stored elements."""
        return len(self.data)
    
def encode_row(row, test=False):
    """Tokenize one row as ("<first context letter> <anchor>", target).

    Args:
        row: mapping with `context`, `anchor`, `target` and, unless `test`
            is true, `score`.
        test: when true, no label is attached.

    Returns:
        The tokenizer output; for non-test rows it also carries `label`, the
        score's bin index over `np.linspace(0, 1, 5)` minus one.
    """
    first_segment = row['context'][0] + ' ' + row['anchor']
    ret = tokenizer(first_segment, row['target'])

    if test:
        return ret

    ret['label'] = np.digitize(row['score'], bins=np.linspace(0, 1, 5)) - 1
    return ret

test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
test_data = [encode_row(row, test=True) for _, row in test_df.iterrows()]
testset = MyDataset(test_data)

trainer = Trainer(model,tokenizer=tokenizer)

outputs = trainer.predict(testset)

# NOTE(review): `trainer` still references the model, so this `del` alone
# presumably does not free its memory — confirm.
del model
torch.cuda.empty_cache()

# Softmax over the five class logits. The row-wise maximum is subtracted before
# exponentiating so large logits cannot overflow to inf (inf/inf would be NaN);
# the resulting probabilities are mathematically unchanged.
prob = np.exp(outputs.predictions - np.max(outputs.predictions, axis=1, keepdims=True))
prob = prob / np.sum(prob, axis=1, keepdims=True)
# Expected score: probabilities weighted by the class values 0, 0.25, ..., 1.
pred = prob * np.linspace(0, 1, 5)
pred = np.sum(pred, axis=1)

submission_6 = pd.DataFrame({'id': test_df['id'], 'score': pred})

In [None]:
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
import os
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np

os.environ["WANDB_DISABLED"] = "true"

class CFG:
    """Static settings for the XLM-RoBERTa-large fold ensemble."""

    # Competition data directory; joined by plain string concatenation, hence the trailing slash.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # One directory per model family; fold checkpoints are addressed as `<dir>fold<k>`.
    model_path = [
        '../input/xlm-roberta-large-5folds/',
    ]

    num_fold = 5
    model_num = 1
    
# CPC code lookup table (joined on its `code` column).
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Join each test row with its CPC title, build the first tokenizer segment
# "<title>[SEP]<anchor>", then discard the columns that are no longer needed.
test = (
    pd.read_csv(f"{CFG.input_path}test.csv")
    .merge(titles, left_on='context', right_on='code')
    .assign(input=lambda frame: frame['title'] + '[SEP]' + frame['anchor'])
    .drop(columns=["context", "code", "class", "subclass", "group", "main_group", "anchor", "title", "section"])
)

# Blend of the fold predictions and three row-wise aggregates (median,
# mid-range, mid-hinge). Every fold prediction is pre-scaled so that the final
# sum over all entries is an average over `num_fold + 3` terms.
predictions = []
weights = [1]

for i in range(CFG.model_num):
    # Every fold of a family is tokenized with the tokenizer stored next to fold 0.
    tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path[i]}fold0')

    def process_test(unit):
        """Tokenize one test row as the (input, target) text pair."""
        return {
            **tokenizer(unit['input'], unit['target'])
        }

    def process_valid(unit):
        """Tokenize one labelled row and attach its score as `label`."""
        return {
            **tokenizer(unit['input'], unit['target']),
            'label': unit['score']
        }

    test_ds = datasets.Dataset.from_pandas(test)
    # Drop every source column by asking the dataset for its own column names:
    # the `__index_level_0__` helper column is only present for some pandas
    # index types, and naming a missing column in `remove_columns` is an error.
    test_ds = test_ds.map(process_test, remove_columns=test_ds.column_names)

    # Terms contributed per model: one per fold plus the three aggregates
    # appended below. Derived from the fold count instead of a hard-coded 8.
    n_terms = CFG.num_fold + 3

    for fold in range(CFG.num_fold):
        model = AutoModelForSequenceClassification.from_pretrained(
            f'{CFG.model_path[i]}fold{fold}',
            num_labels=1,
        )
        trainer = Trainer(
            model,
            tokenizer=tokenizer,
        )

        outputs = trainer.predict(test_ds)
        prediction = outputs.predictions.reshape(-1) * (weights[i] / n_terms)
        predictions.append(prediction)
        del model
        torch.cuda.empty_cache()

    # The last `num_fold` entries are this model's (already scaled) fold
    # predictions, so the aggregates computed from them carry the same scale.
    current_model_predictions = np.array(predictions[-CFG.num_fold:])
    predictions.append(np.median(current_model_predictions, axis=0))
    predictions.append((np.max(current_model_predictions, axis=0) + np.min(current_model_predictions, axis=0)) / 2)
    predictions.append((np.quantile(current_model_predictions, 0.75, axis=0) + np.quantile(current_model_predictions, 0.25, axis=0)) / 2)

predictions = np.sum(predictions, axis=0)

# `test['id']` comes from the merged frame, so ids and scores share one row order.
submission_xlm = datasets.Dataset.from_dict({
    'id': test['id'],
    'score': predictions,
})
submission_xlm = submission_xlm.to_pandas()

In [None]:
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# `os` is imported before its first use in this cell; importing it inside the
# `if` body would make the import conditional and place it after the calls
# that need it.
import os

INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'

# Create the output directory if it is missing; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Swap the installed transformers/tokenizers for the wheels bundled in the
# offline dataset (`--no-index`: only `--find-links` is searched). Must run
# before the `import tokenizers` / `import transformers` lines below.
# NOTE(review): earlier cells already imported transformers; presumably the
# running kernel keeps that loaded module — confirm the swap takes effect.
# The exit codes returned by os.system are not checked.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


class CFG:
    """Settings for the ELECTRA fold set."""

    # Dataset directory holding the saved config, tokenizer and checkpoints.
    path="../input/electrav1/"
    config_path=path+'config.pth'
    tokenizer = AutoTokenizer.from_pretrained('../input/electrav1/tokenizer/tokenizer/')

    # Head settings.
    target_size=1
    fc_dropout=0.2

    # Tokenization and loading.
    max_len=133
    batch_size=32
    num_workers=4

    # Reproducibility and folds.
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between `y_true` and `y_pred`."""
    correlation, _p_value = sp.stats.pearsonr(y_true, y_pred)
    return correlation


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing bare messages to the console and `<filename>.log`.

    Handlers left on the logger by an earlier call are removed and closed
    first. `getLogger(__name__)` always returns the same logger object, so
    without this every re-run of the cell would add another pair of handlers
    and each message would be emitted once per previous call.

    Args:
        filename: log file path without the `.log` extension.

    Returns:
        The configured `logging.Logger` at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Iterate over a copy: removeHandler mutates logger.handlers.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    Also exports `PYTHONHASHSEED`, switches cuDNN to deterministic mode and
    disables cuDNN benchmarking.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # This definition replaces the notebook's earlier `seed_everything`, which
    # also turns benchmarking off; keep the same settings.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

test = pd.read_csv(f"{INPUT_DIR}test.csv")
submission_electra = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")

# CPC code -> descriptive text, reused from the DeBERTa baseline dataset.
cpc_texts = torch.load("../input/pppm-deberta-v3-large-baseline-w-w-b-train/"+"cpc_texts.pth")

# Model input: "<anchor>[SEP]<target>[SEP]<context text>".
test['context_text'] = test['context'].map(cpc_texts)
test['text'] = '[SEP]'.join(['{}'] * 0) + test['anchor'] + '[SEP]' + test['target'] + '[SEP]' + test['context_text']

def prepare_input(cfg, text):
    """Tokenize `text` to exactly `cfg.max_len` tokens and tensorize the result.

    `truncation=True` is passed together with `padding="max_length"`, as in the
    earlier `prepare_input` definitions in this notebook. With padding alone, a
    text longer than `cfg.max_len` would produce a longer sequence than its
    batch mates and could not be stacked into one batch.

    Args:
        cfg: object exposing `tokenizer` (callable) and `max_len`.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with every value replaced by a
        `torch.long` tensor.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset yielding tokenized rows of a frame's `text` column."""

    def __init__(self, cfg, df):
        # Keep only the raw strings; encoding is done on access.
        self.texts = df['text'].values
        self.cfg = cfg

    def __getitem__(self, item):
        """Return the tensorized encoding of the `item`-th text."""
        return prepare_input(self.cfg, self.texts[item])

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)
    
    
class CustomModel(nn.Module):
    """Transformer backbone with a masked-mean-pooling regression head.

    ``forward`` averages the last hidden states over non-padded tokens, applies
    ``layer_norm1`` and projects with ``fc``. The ``attention``, ``linear`` and
    ``fc_dropout`` sub-modules are created here but are not used by ``forward``;
    they are kept so the module's parameter names stay unchanged
    (presumably required by the saved checkpoints -- confirm before removing).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the head layers.

        Args:
            cfg: Config object; ``fc_dropout`` and ``target_size`` are always read,
                ``model`` is read when ``config_path`` is None or ``pretrained`` is True.
            config_path: Optional path to a config object saved with ``torch.save``.
            pretrained: Load pretrained backbone weights instead of building the
                backbone from the config alone.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # _init_weights only recognises leaf layers; passing the Sequential itself
        # matched no branch and initialised nothing. apply() visits every child.
        self.attention.apply(self._init_weights)
        self.linear = nn.Linear(self.config.hidden_size, 1)

    def _init_weights(self, module):
        """Initialise a single Linear, Embedding or LayerNorm layer in place.

        Linear and Embedding weights are drawn from a normal distribution with
        standard deviation ``config.initializer_range``; biases and the padding
        embedding row are zeroed; LayerNorm is reset to weight 1 and bias 0.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return an attention-weighted sum of the last hidden states.

        Not called by ``forward``, which uses masked mean pooling instead.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the head output for a batch.

        Args:
            inputs: Mapping passed to the backbone as keyword arguments; must
                contain ``"attention_mask"``.

        Returns:
            ``fc(layer_norm1(masked mean of the last hidden states))``.
        """
        outputs = self.model(**inputs)
        last_hidden_state = outputs[0]
        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        input_mask_expanded = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        # Guard against division by zero for a fully masked row.
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        out = sum_embeddings / sum_mask

        out = self.layer_norm1(out)
        output = self.fc(out)

        return output
    
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return sigmoid-activated predictions.

    The model is switched to eval mode and moved to ``device``. Every batch is a
    mapping of tensors that is moved to ``device`` before the forward pass.

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    with torch.no_grad():
        for inputs in progress:
            on_device = {key: tensor.to(device) for key, tensor in inputs.items()}
            logits = model(on_device)
            batch_outputs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
class CFG:
    """Inference settings for the checkpoints stored under ``../input/electrav1/``."""

    # Artefact locations.
    path = "../input/electrav1/"
    config_path = f"{path}config.pth"

    # Data loading.
    num_workers = 4
    batch_size = 32
    max_len = 133

    # Model head.
    fc_dropout = 0.2
    target_size = 1

    # Reproducibility / folds.
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

    # The tokenizer is loaded once, when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained('../input/electrav1/tokenizer/tokenizer/')
    
    
# Build the loader once; it is reused for every fold checkpoint.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

# One prediction array per fold checkpoint.
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Deserialize on the CPU; inference_fn moves the model to `device`.
    state = torch.load(f'../input/electrav1/google-electra-large-discriminator_fold{fold}_best/google-electra-large-discriminator_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release the fold's model before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()

# Aggregate statistics are computed from the fold predictions only. Previously each
# statistic was appended to `predictions` before the next one was computed, so the
# quantile term also included the median and mid-range rows.
fold_stack = np.asarray(predictions)
predictions.append(np.median(fold_stack, axis=0))
predictions.append((np.max(fold_stack, axis=0) + np.min(fold_stack, axis=0)) / 2)
predictions.append((np.quantile(fold_stack, 0.75, axis=0) + np.quantile(fold_stack, 0.25, axis=0)) / 2)

# Final ELECTRA score: mean over the fold predictions and the three aggregates.
p2 = np.mean(predictions, axis=0)
submission_electra['score'] = p2

In [None]:
submission_electra

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

from sklearn.preprocessing import MinMaxScaler

class CFG_DEB_SIMPLE:
    """Settings for the plain DeBERTa-v3-large fold models."""

    # Locations of the competition data and of the base model / tokenizer files.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    model_path = '../input/deberta-v3-large/deberta-v3-large'

    # Data loading.
    batch_size = 24
    num_workers = 2
    max_input_length = 130

    # Number of fold checkpoints to run.
    num_fold = 4
    
    

class TestDataset(Dataset):
    """Inference dataset yielding ``(input_ids, token_type_ids, attention_mask)`` tuples."""

    # Order of the tokenizer fields in the returned tuple.
    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, df, tokenizer, max_input_length):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        # Cast to str so every entry handed to the tokenizer is a string.
        self.text = df['text'].values.astype(str)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        encoded = self.tokenizer(self.text[item],
                                 max_length=self.max_input_length,
                                 padding='max_length',
                                 truncation=True)
        return tuple(
            torch.as_tensor(encoded[field], dtype=torch.long)
            for field in self._FIELDS
        )
    
    
class Custom_Bert_Simple(nn.Module):
    """Wrapper around a single-label sequence-classification model.

    ``forward`` returns the first element of the wrapped model's output. The
    ``dropout`` and ``cls`` layers are created but not used in ``forward``
    (presumably kept so saved checkpoints load unchanged -- confirm before removing).
    """

    def __init__(self, model_path):
        super().__init__()

        # Build the architecture from the config only; no pretrained weights are loaded here.
        model_config = AutoConfig.from_pretrained(model_path)
        model_config.num_labels = 1
        self.base = AutoModelForSequenceClassification.from_config(config=model_config)

        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(model_config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        # ``labels`` is accepted for interface compatibility and ignored.
        return self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[0]
    

def valid_fn(valid_loader, model, device):
    """Run ``model`` in eval mode over ``valid_loader`` and return raw outputs.

    Each batch is an ``(input_ids, token_type_ids, attention_mask)`` triple; the
    model is called as ``model(input_ids, attention_mask, token_type_ids)``.

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in valid_loader:
            input_ids, token_type_ids, attention_mask = (t.to(device) for t in batch)
            batch_out = model(input_ids, attention_mask, token_type_ids)
            collected.append(batch_out.to('cpu').numpy())

    return np.concatenate(collected)


min_max_scaler = MinMaxScaler()

def upd_outputs(data, is_trim=True, is_minmax=True, is_reshape=True):
    """Post-process a block of raw model outputs.

    The enabled steps run in this order:

    1. ``is_trim``: clip every value to the closed interval [0, 1].
    2. ``is_minmax``: rescale with the module-level ``min_max_scaler``, which is
       re-fitted on ``data`` at every call.
    3. ``is_reshape``: flatten to one dimension.

    Args:
        data: Array-like of outputs (``fit_transform`` expects a 2-D input when
            ``is_minmax`` is enabled).
        is_trim: Enable step 1.
        is_minmax: Enable step 2.
        is_reshape: Enable step 3.

    Returns:
        The processed array.
    """
    if is_trim:
        data = np.clip(data, 0, 1)

    if is_minmax:
        data = min_max_scaler.fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data

# Load the test rows and join the CPC title table on the context code.
# NOTE(review): this is an inner join (pandas default), so a test row whose context
# is missing from titles.csv would be dropped here; none of the joined title
# columns are used in the rest of this cell.
test_df = pd.read_csv(f"{CFG_DEB_SIMPLE.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
test_df = test_df.merge(titles, left_on='context', right_on='code')

# Mapping applied to the `context` column below (presumably CPC code -> text; confirm).
cpc_texts = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")

# Model input: anchor, target and context text joined by literal '[SEP]', then lower-cased.
test_df['context_text'] = test_df['context'].map(cpc_texts)
test_df['text'] = test_df['anchor'] + '[SEP]' + test_df['target'] + '[SEP]'  + test_df['context_text']
test_df['text'] = test_df['text'].apply(str.lower)

# Not the last expression of the cell, so this preview is not displayed.
test_df.head()

tokenizer_deberta_v3 = AutoTokenizer.from_pretrained(CFG_DEB_SIMPLE.model_path)
# One raw prediction array per fold checkpoint.
predictions = []

te_dataset = TestDataset(test_df, tokenizer_deberta_v3, CFG_DEB_SIMPLE.max_input_length)
te_dataloader = DataLoader(te_dataset,
                              batch_size=CFG_DEB_SIMPLE.batch_size, shuffle=False,
                              num_workers=CFG_DEB_SIMPLE.num_workers,
                              pin_memory=True, drop_last=False)

# Checkpoint files are named "<prefix>_best<fold>.pth".
deberta_simple_path = "../input/us-patent-deberta-simple/microsoft_deberta-v3-large"

for fold in tqdm(range(CFG_DEB_SIMPLE.num_fold)):
    fold_path = f"{deberta_simple_path}_best{fold}.pth"

    model = Custom_Bert_Simple(CFG_DEB_SIMPLE.model_path)
    # Deserialize on the CPU (as the ELECTRA loop does) so the checkpoint tensors do
    # not occupy GPU memory next to the model that is moved there afterwards.
    state = torch.load(fold_path, map_location='cpu')
    model.load_state_dict(state['model'])
    del state
    model.to('cuda')

    prediction = valid_fn(te_dataloader, model, 'cuda')

    predictions.append(prediction)
    del model
    # Collect first so the freed tensors can actually be returned by empty_cache().
    gc.collect()
    torch.cuda.empty_cache()

# Min-max scale every fold's predictions (no clipping) and flatten them; one column per fold.
upd_predictions = [upd_outputs(x, is_trim=False) for x in predictions]
upd_predictions = pd.DataFrame(upd_predictions).T

# Row-wise aggregates are computed from the fold columns only. Previously each new
# column was included in the next statistic, so the quantile term also saw the
# 'med' and 'minmax' columns.
fold_scores = upd_predictions.copy()
upd_predictions['med'] = fold_scores.median(axis=1)
upd_predictions['minmax'] = (fold_scores.max(axis=1) + fold_scores.min(axis=1)) / 2
upd_predictions['quantile'] = (fold_scores.quantile(0.25, axis=1) + fold_scores.quantile(0.75, axis=1)) / 2

# Final score: mean over the fold columns and the three aggregate columns.
submission_upd = pd.DataFrame({
    'id': test_df['id'],
    'score': upd_predictions.mean(axis=1),
})

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
from tensorflow import keras
# import bert
import math
import os

from tensorflow.keras.layers import Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import callbacks
import tensorflow.keras.backend as K

import codecs
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

from transformers import BertTokenizer, TFBertModel, AutoTokenizer

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Locations of the competition data, the base BERT files and the fine-tuned Keras weights.
dataset_path = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
pt_model_dir = "/kaggle/input/bert-for-patents/bert-for-patents/"
ft_model_dir = "/kaggle/input/uspppm-bertforpatent-keras-train/usppm_bfp_v5_lstm.h5"
# Fixed token length of every encoded pair; also used as the LSTM unit count further down.
max_seq_len = 80
# NOTE(review): batch_size is not referenced by the code visible in this notebook section.
batch_size = 32
# Passed to the Adam optimizer when the model is compiled.
learning_rate = 2e-5

# NOTE: rebinds the module-level name `tokenizer` for the rest of the notebook.
tokenizer = BertTokenizer.from_pretrained(pt_model_dir)
# tokenizer = AutoTokenizer.from_pretrained(pt_model_dir)
# Vocabulary id of the padding token (printed for inspection only).
pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
print(tokenizer)
print("Padding token index : ", pad_idx)

def dataset_split(dataset, split_val):
    """Split ``dataset`` positionally into a leading and a trailing part.

    Args:
        dataset: Any sliceable sequence or DataFrame.
        split_val: Fraction (0..1) of items that go into the first part.

    Returns:
        ``(first_part, second_part)`` where the first part holds
        ``int(len(dataset) * split_val)`` items.
    """
    lengths = int(len(dataset) * split_val)
    train_data = dataset[:lengths]
    valid_data = dataset[lengths:]
    return train_data, valid_data


def _add_special_token_columns(frame):
    """Add the constant '[SEP]' / '[CLS]' columns and the bracketed context column in place."""
    frame['sep_token'] = '[SEP]'
    frame['cls_token'] = '[CLS]'
    frame['context_token'] = '[' + frame.context + ']'
    return frame


def dataset_load(train_url, test_url, random_state=None):
    """Read the train and test CSV files and derive a shuffled 90/10 train/validation split.

    Args:
        train_url: Path or URL of the training CSV (must contain a ``context`` column).
        test_url: Path or URL of the test CSV (must contain a ``context`` column).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the shuffle,
            and therefore the split, is reproducible. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``(train_data, valid_data, test_data, context_tokens)`` where
        ``context_tokens`` lists the unique bracketed context codes of the
        training file in order of first appearance.
    """
    train_data = _add_special_token_columns(pd.read_csv(train_url, sep=','))
    # Collected before shuffling, so the order follows the file.
    context_tokens = list(train_data.context_token.unique())
    train_data = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train_data, valid_data = dataset_split(dataset=train_data, split_val=0.9)

    test_data = _add_special_token_columns(pd.read_csv(test_url, sep=','))

    return train_data, valid_data, test_data, context_tokens

def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a Keras ``LearningRateScheduler`` with linear warm-up and exponential decay.

    For epochs below ``warmup_epoch_count`` the rate grows linearly up to
    ``max_learn_rate``; afterwards it decays exponentially towards
    ``end_learn_rate``, which is reached at ``total_epoch_count``.
    """

    def lr_scheduler(epoch):
        if epoch >= warmup_epoch_count:
            # Exponential decay phase.
            log_ratio = math.log(end_learn_rate/max_learn_rate)
            elapsed = epoch - warmup_epoch_count + 1
            span = total_epoch_count - warmup_epoch_count + 1
            return float(max_learn_rate * math.exp(log_ratio * elapsed / span))
        # Linear warm-up phase.
        return float((max_learn_rate/warmup_epoch_count) * (epoch + 1))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

def encode_text(text,
                tokenizer,
                max_length):
    """Encode a batch of texts (or text pairs) into fixed-length int32 arrays.

    Args:
        text: Batch handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Object providing ``batch_encode_plus``.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        Dict with the keys ``"input_ids"``, ``"attention_masks"`` and
        ``"token_type_ids"``, each an int32 NumPy array.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    encoded = tokenizer.batch_encode_plus(text, **encoding_options)

    # (key in the returned dict, key in the tokenizer output)
    field_map = (
        ("input_ids", "input_ids"),
        ("attention_masks", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )
    return {
        out_key: np.array(encoded[src_key], dtype="int32")
        for out_key, src_key in field_map
    }

# Load the data; only `test_data` is used for inference below, the train/valid frames
# and `labels` are printed for inspection.
train_data, valid_data, test_data, context_tokens = dataset_load(dataset_path + "train.csv", dataset_path + "test.csv")
labels = list(set(train_data["score"].values))
labels.sort()

print(len(train_data), len(valid_data), len(test_data))
print(labels)
print(context_tokens)

cpc_codes = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
cpc_codes = cpc_codes[["code", "title"]]

# Keep only the codes that are exactly three characters long.
condition = cpc_codes['code'].map(len) == 3
cpc_codes = cpc_codes[condition].reset_index(drop=True)

# Left join keeps every test row.
# NOTE(review): a context without a matching 3-character code would get a NaN title,
# and therefore a NaN `text` below -- confirm this cannot happen for the test set.
test_data = test_data.merge(cpc_codes, left_on='context', right_on='code', how='left')

# Normalise case and strip ';' from the titles.
test_data['title'] = test_data['title'].str.lower().str.replace(";","")
test_data['anchor'] = test_data['anchor'].str.lower()
test_data['target'] = test_data['target'].str.lower()

# First segment of each pair: CPC title followed by the anchor phrase.
test_data['text'] = test_data['title'] + " " + test_data['anchor']

# Each row is encoded as the pair (text, target).
encoded_test_data = encode_text(test_data[["text", "target"]].values.tolist(), tokenizer, max_seq_len)

# Order matches the model inputs defined below: ids, attention masks, token type ids.
test_x = [encoded_test_data["input_ids"], encoded_test_data["attention_masks"], encoded_test_data["token_type_ids"]]

# Build the Keras regression model: BERT encoder -> bidirectional LSTM -> single linear unit.
# NOTE(review): the fine-tuned weights are loaded into this graph afterwards, so layer
# order and names presumably must match the training notebook -- confirm before restructuring.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    # from_pt=True: the files in pt_model_dir are converted from a PyTorch checkpoint.
    base_model = TFBertModel.from_pretrained(pt_model_dir, from_pt=True)

    base_model_output = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    # Per-token hidden states feed the recurrent head below.
    last_hidden_state = base_model_output.last_hidden_state
    print(last_hidden_state.shape)
    
    # Alternative heads that were tried (CLS token, GRU, average pooling) are left
    # commented out below.
#     cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(last_hidden_state)
#     output = tf.keras.layers.Dense(1, activation="linear")(cls_out)

#     gru = GRU(units=max_seq_len, return_sequences=False)(last_hidden_state)
    # The LSTM width is tied to max_seq_len; only the final state is kept.
    lstm = Bidirectional(LSTM(units=max_seq_len, return_sequences=False))(last_hidden_state)
    output = tf.keras.layers.Dense(1, activation="linear", name="uspppm_output")(lstm)
    
#     avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
#     dropout = tf.keras.layers.Dropout(0.1, name="uspppm_dropout")(avg_pool)
#     output = tf.keras.layers.Dense(1, activation="linear", name="uspppm_output")(dropout)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # Compiled with Adam and MSE loss; in this section the model is only used for prediction.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
#         optimizer = tf.keras.optimizers.Adam(),
        loss='mse'
#         loss=tf.keras.losses.BinaryCrossentropy()
    )

# Print a summary of the whole neural network model.
model.summary()

# Restore the fine-tuned weights into the graph built above.
model.load_weights(ft_model_dir)

pred = model.predict(test_x)

submission_3 = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# The model ends in a single linear unit, so its output can leave [0, 1].
# Flatten it to one value per row and clip in one vectorized step instead of
# assigning the 2-D array and clamping row by row.
submission_3['score'] = np.clip(np.asarray(pred, dtype="float64").reshape(-1), 0.0, 1.0)

In [None]:
from functools import reduce
sb = [submission_3, submission_4, submission_5, submission_6, submission_xlm]

# Give every submission its own score column before merging. Merging frames that all
# carry a column named 'score' makes pandas emit 'score_x' / 'score_y' more than once:
# recent pandas versions raise MergeError for that, and older versions produce
# duplicate labels that are silently double-counted when selected by name.
score_frames = [
    frame[['id', 'score']].rename(columns={'score': f'score_{position}'})
    for position, frame in enumerate(sb)
]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='outer'), score_frames)

# Rescale every model's scores to [0, 1] column-wise (all columns except the leading 'id').
mm = MinMaxScaler()
df_scaled = pd.DataFrame(mm.fit_transform(df_merged[df_merged.columns[1::]]))
df_scaled['id'] = df_merged[df_merged.columns[0]]

# `original_final` holds the per-model columns only, so the aggregates below are not
# influenced by one another.
original_final = df_scaled[df_scaled.columns[:-1:]]
final = df_scaled[df_scaled.columns[:-1:]].copy()

final['med'] = original_final.median(axis=1)
final['minmax']= (original_final.max(axis=1) + original_final.min(axis=1)) / 2
final['quantile'] = (original_final.quantile(0.25, axis=1) + original_final.quantile(0.75,axis=1)) /2

In [None]:
final

In [None]:
# Row-wise mean over the per-model columns and the three aggregate columns.
# `final` is deliberately not rebound here: replacing the DataFrame with its own
# row mean made this cell fail when it was run a second time.
submission_final_2 = pd.DataFrame({
    'id': df_merged['id'],
    'score': final.mean(axis=1),
})

In [None]:
mix_scores = submission_final_1.merge(submission_final_2, on='id')

In [None]:
mix_scores 

In [None]:
final = 0.75 * mix_scores ['score_x'] + 0.25 * mix_scores['score_y']

In [None]:
submission = pd.DataFrame({
    'id': mix_scores ['id'],
    'score': final,
})

In [None]:
# Blend the plain DeBERTa scores (score_x, weight 0.3) with the ELECTRA scores
# (score_y, weight 0.7); the inner merge aligns both frames on 'id'.
# NOTE(review): the result is stored back into `submission_electra`, so running this
# cell twice blends an already blended frame. The next cell reads this name.
mix_scores = submission_upd.merge(submission_electra, on='id')
final = 0.3 * mix_scores ['score_x'] + 0.7 * mix_scores['score_y']
submission_electra = pd.DataFrame({
    'id': mix_scores ['id'],
    'score': final,
})

In [None]:
mix_scores = submission.merge(submission_electra, on='id')

In [None]:
final = 0.7 * mix_scores ['score_x'] + 0.3 * mix_scores['score_y']

In [None]:
mix_scores

In [None]:
submission = pd.DataFrame({
    'id': mix_scores ['id'],
    'score': final,
})

In [None]:
# Snap scores that are very close to the boundaries onto exactly 0 and 1
# (vectorized; values in [0.02, 0.98] and NaN are left unchanged).
submission['score'] = (
    submission['score']
    .mask(submission['score'] < 0.02, 0)
    .mask(submission['score'] > 0.98, 1)
)

# Snapping to the intermediate levels is left disabled:
#submission['score'] = submission['score'].apply(lambda x: 0.25 if x < 0.26 and x > 0.24 else x)
#submission['score'] = submission['score'].apply(lambda x: 0.5 if x < 0.51 and x > 0.49 else x)
#submission['score'] = submission['score'].apply(lambda x: 0.75 if x < 0.76 and x > 0.74 else x)

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index=False)