In [None]:
import torch
import tqdm
import math
import pandas as pd
import torch.nn as nn
from scipy import stats
from transformers import AdamW, AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup


In [None]:
import torch

# ---- Model / data settings -------------------------------------------------
# Checkpoint directory handed to the `from_pretrained` loaders.
model_n = "../input/roberta-pre/patent_pretrained"
# Token length every encoded pair is padded / truncated to.
max_len = 32
batch_size = 16
# Device index is pinned to 0 for both the CUDA and the CPU case.
device = (
    torch.device("cuda", index=0)
    if torch.cuda.is_available()
    else torch.device("cpu", index=0)
)

# ---- Optimisation settings -------------------------------------------------
# Fraction of the total optimiser steps spent on linear warm-up.
warmup_ratio = 0.06
# Decay applied to every parameter outside the no-decay group.
weight_decay = 0.01
gradient_accumulation_steps = 1
# Epoch count used when sizing the learning-rate schedule.
num_train_epochs = 2
learning_rate = 2e-5
adam_epsilon = 1e-08

In [None]:
class PhraseDataset:
    """Map-style dataset of (context + anchor, target) text pairs with a score.

    Args:
        anchor: indexable collection of anchor phrases.
        target: indexable collection of target phrases.
        context: indexable collection of context strings.
        score: indexable collection of numeric labels.
        tokenizer: object exposing `encode_plus(text, text_pair, **kwargs)`.
        max_len: length every encoded pair is padded / truncated to.
    """

    def __init__(self, anchor, target, context, score, tokenizer, max_len):
        self.anchor, self.target, self.context = anchor, target, context
        self.score = score
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One sample per anchor phrase.
        return len(self.anchor)

    def __getitem__(self, item):
        """Tokenise sample `item` and return it as a dict of tensors.

        Returns:
            dict with keys "ids" and "mask" (long tensors) and "score"
            (float tensor).
        """
        label = self.score[item]
        # First segment is the context followed by the anchor; the target is
        # passed as the second segment of the pair.
        first_segment = self.context[item] + " " + self.anchor[item]

        encoding = self.tokenizer.encode_plus(
            first_segment,
            self.target[item],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        # NOTE(review): the token lists are wrapped in a one-element sequence,
        # so "ids" and "mask" come out with shape (1, sequence_length) rather
        # than (sequence_length,). The training loop in this notebook indexes
        # the batch accordingly, so the extra leading dimension is kept here.
        return {
            "ids": torch.tensor([encoding.input_ids], dtype=torch.long),
            "mask": torch.tensor([encoding.attention_mask], dtype=torch.long),
            "score": torch.tensor(label, dtype=torch.float),
        }

In [None]:
class SimilarPhraseModel(nn.Module):
    """Pretrained transformer encoder followed by a one-unit linear head.

    Args:
        model_name: name or path passed to `AutoConfig` / `AutoModel`.
        learning_rate: stored on the instance; not used by the module itself.
    """

    def __init__(self, model_name, learning_rate):
        super().__init__()
        self.learning_rate = learning_rate
        self.model_name = model_name

        config = AutoConfig.from_pretrained(model_name)
        config.update(
            {
                "output_hidden_states": True,
                "add_pooling_layer": True,
                "num_labels": 1,
            }
        )
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def monitor_metrics(self, outputs, targets):
        """Pearson correlation between flattened predictions and labels.

        Returns:
            {"pearsonr": 0-dim float64 CPU tensor}. The value is NaN when
            fewer than two samples are given: the statistic is undefined
            there and `scipy.stats.pearsonr` raises `ValueError`, which would
            otherwise abort the run on a trailing batch of size 1.
        """
        predictions = outputs.detach().cpu().numpy().ravel()
        labels = targets.detach().cpu().numpy().ravel()
        if predictions.size < 2:
            undefined = torch.tensor(float("nan"), dtype=torch.float64, device="cpu")
            return {"pearsonr": undefined}
        pearsonr = stats.pearsonr(predictions, labels)
        return {"pearsonr": torch.tensor(pearsonr[0], device="cpu")}

    def forward(self, ids, mask, score=None):
        """Run the encoder and the regression head.

        Args:
            ids: token-id tensor forwarded to the transformer.
            mask: attention-mask tensor forwarded to the transformer.
            score: optional labels. When omitted, no loss or metric is
                computed and `(output, 0, {})` is returned, matching what
                `PhraseModelTest.forward` returns at inference time.

        Returns:
            (output, loss, metrics): raw head output, MSE loss against
            `score`, and the dict produced by `monitor_metrics`.
        """
        transformer_out = self.transformer(ids, mask)
        output = self.dropout(transformer_out.pooler_output)
        output = self.output(output)
        if score is None:
            return output, 0, {}
        # Flatten both sides explicitly so predictions and labels always line
        # up element-wise, including for a batch of one.
        loss = nn.MSELoss()(output.reshape(-1), score.reshape(-1))
        metrics = self.monitor_metrics(output, score)
        return output, loss, metrics

In [None]:
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

# Cross-validated fine-tuning: for each fold, train on the rows whose `kfold`
# differs from the fold index, validate on the rows where it matches, and save
# the resulting weights to firstmodel_<fold>.pt.

N_FOLDS = 10            # fold indices 0..9 are compared against the `kfold` column
LOADER_BATCH_SIZE = 32  # batch size this cell uses for both data loaders

# First letter of the context code -> readable section title.
context_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}


def get_inputs_dict(batch):
    """Return a new dict with every value of `batch` moved to `device`."""
    return {key: value.to(device) for key, value in batch.items()}


def epoch_pearson(predictions, targets):
    """Pearson r over all samples gathered during one pass over a loader.

    Args:
        predictions: list of per-batch CPU tensors of model outputs.
        targets: list of per-batch CPU tensors of labels.

    Returns:
        The correlation as a float, or NaN when fewer than two samples were
        collected (the statistic is undefined there).
    """
    if not predictions:
        return float("nan")
    flat_predictions = torch.cat([p.reshape(-1) for p in predictions]).numpy()
    flat_targets = torch.cat([t.reshape(-1) for t in targets]).numpy()
    if flat_predictions.size < 2:
        return float("nan")
    return float(stats.pearsonr(flat_predictions, flat_targets)[0])


# Fold-independent work is done once instead of once per fold.
df = pd.read_csv("../input/newdatawithfolds/train_folds.csv")
df.context = df.context.apply(lambda x: context_mapping[x[0]])
tokenizer = AutoTokenizer.from_pretrained(model_n)

for fold_ in range(N_FOLDS):
    train_df = df[df["kfold"] != fold_].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold_].reset_index(drop=True)

    train_dataset = PhraseDataset(
        anchor=train_df.anchor.values,
        target=train_df.target.values,
        context=train_df.context.values,
        score=train_df.score.values,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    valid_dataset = PhraseDataset(
        anchor=valid_df.anchor.values,
        target=valid_df.target.values,
        context=valid_df.context.values,
        score=valid_df.score.values,
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # A fresh model per fold so no weights leak between folds.
    model = SimilarPhraseModel(
        model_name=model_n,
        learning_rate=0.001,
    )
    model.to(device)

    # Training batches are reshuffled every epoch; validation order is fixed.
    train_loader = DataLoader(train_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=False)

    # Total optimiser steps the linear schedule is stretched over.
    # NOTE(review): the loop below steps the optimiser on every batch, so this
    # only matches while gradient_accumulation_steps == 1.
    t_total = len(train_loader) // gradient_accumulation_steps * num_train_epochs

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    warmup_steps = math.ceil(t_total * warmup_ratio)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # The schedule decays the learning rate linearly to zero at `t_total`
    # steps, i.e. after `num_train_epochs` epochs. Iterating for more epochs
    # than that would only burn compute at a learning rate of 0, so the epoch
    # count is taken from the same constant that sizes the schedule.
    for epoch in range(num_train_epochs):
        model.train()
        train_preds, train_labels = [], []
        for batch in train_loader:
            batch = get_inputs_dict(batch)
            # The dataset yields ids/mask with a singleton dim at position 1;
            # squeeze(1) drops it and is a no-op if that dim is not size 1.
            input_ids = batch["ids"].squeeze(1)
            attention_mask = batch["mask"].squeeze(1)
            labels = batch["score"]
            output, loss, _ = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            train_preds.append(output.detach().cpu())
            train_labels.append(labels.detach().cpu())
        # Computed over every training sample seen this epoch (in train mode,
        # while the weights were still changing), not just the last batch.
        train_pearson = epoch_pearson(train_preds, train_labels)
        print(f"Pearson correlation for epoch {epoch}  during training is {train_pearson}")

        model.eval()
        valid_preds, valid_labels = [], []
        with torch.no_grad():
            for batch in valid_loader:
                batch = get_inputs_dict(batch)
                input_ids = batch["ids"].squeeze(1)
                attention_mask = batch["mask"].squeeze(1)
                labels = batch["score"]
                output, _, _ = model(input_ids, attention_mask, labels)
                valid_preds.append(output.cpu())
                valid_labels.append(labels.cpu())
        # Correlation over the whole validation fold rather than its last batch.
        valid_pearson = epoch_pearson(valid_preds, valid_labels)
        print(f"Pearson correlation for epoch {epoch}  during validation is {valid_pearson}")

    torch.save(model.state_dict(), f"firstmodel_{fold_}.pt")

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from scipy import stats
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

# Settings for the inference cell.
model_n = "../input/roberta-pre/patent_pretrained"  # checkpoint directory for from_pretrained
max_len = 32      # pad / truncate length used when encoding the test pairs
batch_size = 32
epochs = 5
# `learning_rate` is intentionally NOT reassigned here. Nothing in this cell
# reads it, and overwriting the notebook-level value (2e-5) with 0.001 would
# make a later re-run of the training cell build its optimiser with a learning
# rate 50x larger than intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)



# Read the test split that predictions are produced for.
df_test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# First letter of a context code -> readable section title.
context_mapping = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)
# Only the leading character of each code is looked up; an unknown letter
# raises KeyError.
df_test["context"] = df_test["context"].map(lambda code: context_mapping[code[0]])


class PhraseTestDataset:
    """Unlabelled map-style dataset of (context + anchor, target) text pairs.

    Args:
        anchor: indexable collection of anchor phrases.
        target: indexable collection of target phrases.
        context: indexable collection of context strings.
        tokenizer: object exposing `encode_plus(text, text_pair, **kwargs)`
            whose result can be indexed by "input_ids" / "attention_mask".
        max_len: length every encoded pair is padded / truncated to.
    """

    def __init__(self, anchor, target, context, tokenizer, max_len):
        self.anchor, self.target, self.context = anchor, target, context
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One sample per anchor phrase.
        return len(self.anchor)

    def __getitem__(self, item):
        """Tokenise sample `item`; returns {"ids", "mask"} as long tensors."""
        # Context and anchor form the first segment, the target the second.
        first_segment = " ".join((self.context[item], self.anchor[item]))
        encoding = self.tokenizer.encode_plus(
            first_segment,
            self.target[item],
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
        )

        field_names = (("ids", "input_ids"), ("mask", "attention_mask"))
        return {
            out_key: torch.tensor(encoding[enc_key], dtype=torch.long)
            for out_key, enc_key in field_names
        }


class PhraseModelTest(nn.Module):
    """Label-free variant of the scoring model, used for prediction.

    Registers the same three sub-modules (`transformer`, `dropout`, `output`)
    in the same order as the training model, so its saved state dict can be
    loaded here.

    Args:
        model_name: name or path passed to `AutoConfig` / `AutoModel`.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

        config = AutoConfig.from_pretrained(model_name)
        overrides = dict(
            output_hidden_states=True,
            add_pooling_layer=True,
            num_labels=1,
        )
        config.update(overrides)

        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def forward(self, ids, mask):
        """Score a batch.

        Returns:
            (output, 0, {}): the head output plus placeholder loss / metrics,
            keeping the three-element return shape of the training model.
        """
        pooled = self.transformer(ids, mask).pooler_output
        prediction = self.output(self.dropout(pooled))
        return prediction, 0, {}


from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained(model_n)
test_dataset = PhraseTestDataset(
    anchor=df_test.anchor.values,
    target=df_test.target.values,
    context=df_test.context.values,
    tokenizer=tokenizer,
    max_len=max_len,
)

# Batched inference. Every sample is padded to `max_len`, so grouping samples
# into a batch does not change what the model sees per row; order is kept so
# predictions line up with the rows of `df_test`.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


def get_inputs_dict(batch):
    """Return a new dict with every value of `batch` moved to `device`."""
    return {key: value.to(device) for key, value in batch.items()}


def predict(model_paths):
    """Score the whole test set with one saved checkpoint.

    Args:
        model_paths: path to a state dict saved with `torch.save`.

    Returns:
        1-D numpy array with one prediction per test row, in loader order.
    """
    model_test = PhraseModelTest(model_n)
    # Load onto the CPU first so a checkpoint written on a GPU machine also
    # loads when only a CPU is available, then move to the selected device.
    model_test.load_state_dict(torch.load(model_paths, map_location="cpu"))
    model_test.to(device)
    model_test.eval()

    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            batch = get_inputs_dict(batch)
            test_output, _, _ = model_test(batch["ids"], batch["mask"])
            # Flatten per batch; the final batch may be smaller than the rest.
            test_preds.append(test_output.cpu().numpy().ravel())

    if not test_preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(test_preds)


# Ensemble: average the predictions of the per-fold checkpoints
# (firstmodel_0.pt ... firstmodel_9.pt) and write the submission file.
N_FOLDS = 10

fold_predictions = []
for fold_idx in range(N_FOLDS):
    fold_pred = predict(f"./firstmodel_{fold_idx}.pt")
    print(fold_pred)
    fold_predictions.append(fold_pred)

# Left-to-right sum divided by the number of folds: the same arithmetic as
# adding the ten arrays by hand, but it follows N_FOLDS automatically.
final_pred = sum(fold_predictions) / len(fold_predictions)

submission = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# Predictions are in test-set row order; assigned positionally to `score`.
submission["score"] = final_pred
print(submission.head())
submission.to_csv("submission.csv", index=False)