# Inferring predictions using an ensemble of DeBERTa-v3-large and DeBERTa-v3-small models with single-class classification

Refer to the training notebook:

https://www.kaggle.com/bhavesjain/train-deberta-single-class-k-fold

In [None]:
# Import relevant modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

import os
        
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
import datasets
from transformers import TrainingArguments, Trainer
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW, AutoModel, AutoConfig, AutoTokenizer
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

## Lemmatizer and preprocessing functions

In [None]:
# Shared WordNet lemmatizer instance used by clean_text
wnl = WordNetLemmatizer()
# English stopwords held in a set; clean_text tests token membership against it
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lower the words, strip of the spaces, optionally remove stopwords and lemmatize the corpus
    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        filtered_sentence: cleaned corpus, with the lemmatized tokens joined by single spaces
    '''
    corpus = corpus.lower().strip()
    # Tokenize and tag once; the tagged tokens feed both the stopword filter and the lemmatizer
    tagged_tokens = pos_tag(word_tokenize(corpus))

    lemmas = []
    for word, tag in tagged_tokens:
        if remove_stop_words and word in stop_words:
            continue
        # First letter of the tag is passed to the lemmatizer as the WordNet POS when it is one of a/n/v.
        # NOTE(review): the default NLTK tagger appears to emit Penn Treebank tags, where adjectives
        # start with 'J', so the 'a' case may never match. Left unchanged on purpose so inference
        # preprocessing keeps matching the training notebook - confirm before changing.
        pos = tag[0].lower()
        if pos in ('a', 'n', 'v'):
            lemmas.append(wnl.lemmatize(word, pos))
        else:
            lemmas.append(wnl.lemmatize(word))

    filtered_sentence = " ".join(lemmas)
    return filtered_sentence

def cosine(a,b):
    '''
    Cosine similarity: dot product of the two vectors divided by the product of their norms
    Args:
        a,b: vectors to compare
    Returns:
        cosine similarity of a and b
    '''
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

## Loading and preprocessing the data

In [None]:
# CPC code titles: only the "code" and "title" columns are kept for the join
code_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
code_df = code_df[["code","title"]]

# Test rows, with the title looked up from each row's "context" code (left join keeps every test row)
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
test_df = test_df.merge(code_df, left_on="context", right_on="code", how="left")

In [None]:
# Cleaning the text (stopwords are kept: remove_stop_words is passed as False)
for column in ("anchor", "target", "title"):
    test_df[column] = test_df[column].apply(lambda x: clean_text(x,False))

# Concatenating the anchor, context title and target into one string per row.
# Done column-wise on the Series instead of a row-wise DataFrame.apply; the resulting strings are the same.
test_df["text"] = '[CLS] ' + test_df["anchor"] + ' [SEP] ' + test_df["title"] + ' [SEP] ' + test_df["target"]

# Only the id and the assembled text are needed from here on
test_df = test_df.drop(columns = ["anchor", "target", "context", "code", "title"])

## Loading the model and tokenizer

In [None]:
# Number of fold checkpoints each inference loop below iterates over (fold indices 0..folds-1)
folds = 5

In [None]:
# Collects one flattened prediction array per loaded fold model; averaged across models later
predictions = []

In [None]:
# Tokenizer is loaded from the fold-1 checkpoint directory and reused for every fold in this cell
test_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/deberta-5-fold/deberta-fold-1/kaggle/working/uspppm_1')

def process(unit, eval = False):
    '''
    Tokenize the "text" entry of one record with the module-level test_tokenizer
    Args:
        unit: mapping with a "text" entry (passed in by Dataset.map)
        eval: not used by this function
    Returns:
        plain dict holding the fields produced by the tokenizer
    '''
    text = unit["text"]
    encoded = test_tokenizer(text, truncation=True, padding=True)
    return {**encoded}

def InferDataset(dataframe):
    '''
    Convert a dataframe into a tokenized datasets.Dataset for inference
    Args:
        dataframe: pandas DataFrame with a "text" column
    Returns:
        data: dataset with the tokenizer outputs added and the raw "id", "text" and index columns removed
    '''
    data = datasets.Dataset.from_pandas(dataframe)
    # from_pandas only adds "__index_level_0__" when the dataframe index is not a plain RangeIndex,
    # and map() rejects remove_columns entries that are missing, so only drop the columns that exist.
    unwanted = ("id", "text", "__index_level_0__")
    removable = [name for name in unwanted if name in data.column_names]
    data = data.map(process, remove_columns=removable)
    return data

test_data = InferDataset(test_df)

# Run each fold checkpoint under deberta-5-fold over the test set and collect its outputs
for fold in range(folds):
    model_path = f"/kaggle/input/deberta-5-fold/deberta-fold-{fold}/kaggle/working/uspppm_{fold}"

    # Use the first GPU when one is available, otherwise fall back to CPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = DebertaV2ForSequenceClassification.from_pretrained(model_path, num_labels=1).to(device)
    print("Model loaded")

    trainer = Trainer(model, tokenizer=test_tokenizer)
    outputs = trainer.predict(test_data)

    # reshape(-1) flattens the model output to a 1-D array before storing it
    predictions.append(outputs.predictions.reshape(-1))

    # Drop the references to this fold's objects before the next checkpoint is loaded
    del outputs, trainer, model

In [None]:
# Rebinds test_tokenizer to the tokenizer saved with the fold-1 checkpoint of the deberta-small-5-fold models
test_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/deberta-small-5-fold/deberta-fold-1/kaggle/working/uspppm_1')

def process(unit, eval = False):
    '''
    Tokenize the "text" entry of one record with the current module-level test_tokenizer
    Args:
        unit: mapping with a "text" entry (passed in by Dataset.map)
        eval: not used by this function
    Returns:
        plain dict holding the fields produced by the tokenizer
    '''
    text = unit["text"]
    encoded = test_tokenizer(text, truncation=True, padding=True)
    return {**encoded}

def InferDataset(dataframe):
    '''
    Convert a dataframe into a tokenized datasets.Dataset for inference
    Args:
        dataframe: pandas DataFrame with a "text" column
    Returns:
        data: dataset with the tokenizer outputs added and the raw "id", "text" and index columns removed
    '''
    data = datasets.Dataset.from_pandas(dataframe)
    # from_pandas only adds "__index_level_0__" when the dataframe index is not a plain RangeIndex,
    # and map() rejects remove_columns entries that are missing, so only drop the columns that exist.
    unwanted = ("id", "text", "__index_level_0__")
    removable = [name for name in unwanted if name in data.column_names]
    data = data.map(process, remove_columns=removable)
    return data

# Re-tokenize the test set with the tokenizer loaded for the small models
test_data = InferDataset(test_df)

# Run each fold checkpoint under deberta-small-5-fold over the test set and collect its outputs
for fold in range(folds):
    model_path = f"/kaggle/input/deberta-small-5-fold/deberta-fold-{fold}/kaggle/working/uspppm_{fold}"

    # Use the first GPU when one is available, otherwise fall back to CPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = DebertaV2ForSequenceClassification.from_pretrained(model_path, num_labels=1).to(device)
    print("Model loaded")

    trainer = Trainer(model, tokenizer=test_tokenizer)
    outputs = trainer.predict(test_data)

    # reshape(-1) flattens the model output to a 1-D array before storing it
    predictions.append(outputs.predictions.reshape(-1))

    # Drop the references to this fold's objects before the next checkpoint is loaded
    del outputs, trainer, model

In [None]:
# Shallow copy of the list of per-fold prediction arrays (the arrays themselves are shared, not duplicated)
preds = predictions.copy()

In [None]:
# Rebind predictions to a fresh shallow copy of the saved per-fold list
predictions = preds.copy()

In [None]:
def lim(x):
    '''
    Clamp a value to the closed interval [0, 1]
    Args:
        x: value to clamp
    Returns:
        0 when x is below 0, 1 when x is above 1, otherwise x unchanged
    '''
    return min(max(x, 0), 1)

# def lim1(x):
#     if x<0.1:
#         return 0
#     elif x>0.9:
#         return 1
#     return x

# def lim2(x):
#     if x<0.1:
#         return 0
#     elif x>0.9:
#         return 1
#     elif x>0.2 and x<0.3:
#         return 0.25
#     elif x>0.45 and x<0.55:
#         return 0.5
#     elif x>0.7 and x<0.8:
#         return 0.75
#     return x
# def lim3(x):
#     if x<0.1:
#         return 0
#     elif x>0.9:
#         return 1
#     elif x>0.15 and x<0.35:
#         return 0.25
#     elif x>0.4 and x<0.6:
#         return 0.5
#     elif x>0.65 and x<0.85:
#         return 0.75
#     return x
# def lim4(x):
#     if x<0.01:
#         return 0
#     elif x>0.99:
#         return 1
#     elif x>0.24 and x<0.26:
#         return 0.25
#     elif x>0.49 and x<0.51:
#         return 0.5
#     elif x>0.74 and x<0.76:
#         return 0.75
#     return x

In [None]:
# Ensemble: element-wise mean over all stored fold predictions, then clamp every score with lim
predictions = list(map(lim, np.mean(preds.copy(), axis=0)))

# np.corrcoef(predictions,test_df["score"].tolist())

In [None]:
# Store the clamped ensemble predictions in the "score" column
test_df["score"] = predictions

In [None]:
# The submission keeps every remaining column except the model input text
submission_df = test_df.drop(labels=["text"], axis="columns")
# Written without the dataframe index
submission_df.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission
submission_df.head()