In [None]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize


In [None]:
import os
import pandas as pd
import numpy as np
import shutil
import time, gc, random, math

import torch
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split

In [None]:
# Every competition file is read from the same dataset directory, so the
# location is spelled once. Previously two reads used the absolute
# "/kaggle/input/..." path and one used the relative "../input/..." path,
# which only resolve to the same place for one particular working directory.
DATA_DIR = "/kaggle/input/us-patent-phrase-to-phrase-matching"

# Training rows; later cells read the 'anchor', 'target' and 'score' columns.
df = pd.read_csv(f"{DATA_DIR}/train.csv")
# Rows to predict; later cells read the 'id', 'anchor' and 'target' columns.
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
# Sample submission file shipped with the data.
sample = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder so the distinct score values remain available through
# `le.classes_`, but do NOT overwrite `df.score` with the encoded values.
# LabelEncoder replaces each value with an integer code in 0..n_classes-1;
# the model in this notebook is trained as a single-output regressor on
# `score`, and `convert_pred` later bins the predictions with thresholds
# between 0 and 1. Training on the integer codes would put the predictions
# on a different scale from those thresholds, so nearly every prediction
# would be binned to 1.
le = LabelEncoder()
le.fit(df.score)

In [None]:
# Directory holding the pretrained checkpoint; the same path is reused below
# when the sequence-classification model is loaded.
model_path="../input/albert-pytorch-v2/base"
# Module-level tokenizer; the dataset classes defined later in this notebook
# call it from their __getitem__ methods.
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=123)


In [None]:
# Show the first training row as a quick look at the available columns.
df.head(1)

In [None]:
class TrainDatset(Dataset):
    """Labelled (target, anchor) phrase pairs, tokenized one item at a time.

    Reads the 'target', 'anchor' and 'score' columns of the frame passed to
    the constructor. Tokenization relies on the module-level ``tokenizer``.
    """

    def __init__(self, df):
        """Cache the text columns as string arrays and the scores as-is."""
        as_text = lambda column: df[column].values.astype(str)
        self.target = as_text('target')
        self.anchor = as_text('anchor')
        self.label = df['score'].values

    def __len__(self):
        """Number of phrase pairs held by the dataset."""
        return self.target.shape[0]

    def __getitem__(self, item):
        """Return the tokenizer output for pair ``item`` plus its float label.

        The pair is encoded as (target, anchor), padded and truncated to a
        fixed length of 100.
        """
        encoded = tokenizer(
            self.target[item],
            self.anchor[item],
            max_length=100,
            padding='max_length',
            truncation=True,
        )

        sample = dict(encoded)
        sample['label'] = torch.as_tensor(self.label[item], dtype=torch.float)
        return sample

In [None]:
class TestDatset(Dataset):
    """Unlabelled (target, anchor) phrase pairs, tokenized one item at a time.

    Reads the 'target' and 'anchor' columns of the frame passed to the
    constructor. Tokenization relies on the module-level ``tokenizer``.
    """

    def __init__(self, df):
        """Cache both text columns as string arrays."""
        as_text = lambda column: df[column].values.astype(str)
        self.target = as_text('target')
        self.anchor = as_text('anchor')

    def __len__(self):
        """Number of phrase pairs held by the dataset."""
        return self.target.shape[0]

    def __getitem__(self, item):
        """Return the tokenizer output for pair ``item`` as a plain dict.

        The pair is encoded as (target, anchor), padded and truncated to a
        fixed length of 100.
        """
        encoded = tokenizer(
            self.target[item],
            self.anchor[item],
            max_length=100,
            padding='max_length',
            truncation=True,
        )

        return dict(encoded)

In [None]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reshaped to one value per row, so it must hold exactly
            ``len(predictions)`` elements.

    Returns:
        dict: ``{'pearson': r}``, where ``r`` is the off-diagonal entry of
        the 2x2 matrix returned by ``np.corrcoef``.
    """
    raw_scores, gold = eval_pred
    flat_scores = raw_scores.reshape(len(raw_scores))

    correlation_matrix = np.corrcoef(flat_scores, gold)
    return {'pearson': correlation_matrix[0][1]}

In [None]:
# Load the pretrained checkpoint with a single-output classification head.
# NOTE(review): with num_labels=1 and the float labels produced by
# TrainDatset this is presumably trained as a regression — confirm against
# the transformers documentation for the loaded architecture.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1)


In [None]:
# Key under which compute_metrics reports the validation score; also used to
# pick the best checkpoint.
metric_name = 'pearson'
# Per-device training batch size; evaluation uses twice this value.
batch_size = 128

# Disable Weights & Biases logging BEFORE the training arguments are built.
# transformers checks this variable when it decides which reporting
# integrations to attach, which happens while TrainingArguments / Trainer are
# constructed; setting it only after those objects exist is too late for that
# check to see it.
os.environ['WANDB_DISABLED'] = 'true'

args = TrainingArguments(
        'model',                              # output directory for checkpoints
        evaluation_strategy='epoch',          # evaluate once per epoch
        save_strategy='epoch',                # checkpoint once per epoch
        learning_rate = 2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size*2,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,          # reload the best checkpoint after training
        metric_for_best_model=metric_name,
        save_total_limit=1
        )

In [None]:
# Wrap the training and validation splits in the labelled dataset class,
# training split first.
train_dataset, valid_dataset = (
    TrainDatset(split) for split in (train_df, val_df)
)

In [None]:
# Assemble the Trainer: model and hyper-parameters, the two labelled
# datasets, the tokenizer, and the Pearson metric used for evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Set the WANDB_DISABLED environment variable to turn off Weights & Biases.
# NOTE(review): this cell runs after TrainingArguments and Trainer have been
# constructed — confirm the variable is still honoured at this point, or set
# it before those objects are built.
os.environ['WANDB_DISABLED'] = 'true'


In [None]:
# Run fine-tuning with the arguments configured above (10 epochs, evaluating
# and checkpointing every epoch).
trainer.train()

In [None]:
# Score the unlabelled rows with the trained model and keep the raw
# predictions as a float array.
test_dataset = TestDatset(test_df)
test_predictions = trainer.predict(test_dataset)
outputs = test_predictions.predictions.astype('float')


In [None]:
import datasets

# Pair each test id with its flattened prediction.
submission = datasets.Dataset.from_dict(
    dict(
        id=test_df['id'],
        score=outputs.flatten(),
    )
)

In [None]:
def convert_pred(num):
    """Snap a continuous prediction to the nearest allowed score bucket.

    Buckets (upper bounds inclusive):
        num <= 0.15          -> 0
        0.15 < num <= 0.35   -> 0.25
        0.35 < num <= 0.65   -> 0.5
        0.65 < num <= 0.85   -> 0.75
        otherwise            -> 1

    Negative predictions are clamped to 0. The earlier implementation
    required ``num >= 0`` for the first bucket, so a slightly negative
    regression output fell through every branch and was returned as 1.

    Args:
        num: Numeric prediction.

    Returns:
        One of 0, 0.25, 0.5, 0.75 or 1. NaN fails every comparison and
        therefore still yields 1, as before.
    """
    if num <= 0.15:
        return 0
    if num <= 0.35:
        return 0.25
    if num <= 0.65:
        return 0.5
    if num <= 0.85:
        return 0.75
    return 1

In [None]:
# Materialise the submission as a DataFrame and snap every predicted score
# to its bucket.
v = pd.DataFrame(submission)
v["score"] = v["score"].apply(convert_pred)


In [None]:
# Write the final submission file to the working directory, without the
# DataFrame index column.
v.to_csv("submission.csv",index=False)