# Inferring predictions using the DeBERTa model with single-class classification

Refer to the training notebook:

https://www.kaggle.com/code/bhavesjain/train-deberta-single-class

In [None]:
# Import relevant modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
import datasets
from transformers import TrainingArguments, Trainer
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW, AutoModel, AutoConfig, AutoTokenizer
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

## Lemmatizer and preprocessing functions

In [None]:
# Lemmatizer instance used by clean_text
wnl = WordNetLemmatizer()
# English stopwords, stored as a set for membership tests in clean_text
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lower the words, strip the surrounding spaces,
    optionally remove stopwords, and lemmatize every remaining token
    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        filtered_sentence: cleaned corpus (lemmatized tokens joined by single spaces)
    '''
    # Tokenize once and reuse the tokens for POS tagging
    word_tokens = word_tokenize(corpus.lower().strip())

    lemmas = []
    for word, tag in pos_tag(word_tokens):
        if remove_stop_words and word in stop_words:
            continue
        # The lower-cased first letter of the tag is handed to the lemmatizer as its
        # part-of-speech hint when it is 'a', 'n' or 'v'; otherwise the lemmatizer's
        # default part of speech is used.
        # NOTE(review): if pos_tag emits Penn Treebank tags (adjectives as 'JJ'), the 'a'
        # case never matches. Left unchanged so the output stays identical to the
        # preprocessing this code has always produced - confirm before altering.
        pos = tag[0].lower()
        if pos in ('a', 'n', 'v'):
            lemmas.append(wnl.lemmatize(word, pos))
        else:
            lemmas.append(wnl.lemmatize(word))

    filtered_sentence = " ".join(lemmas)
    return filtered_sentence

def cosine(a,b):
    '''
    Cosine similarity between two vectors
    Args:
        a,b: the two vectors to compare
    Returns:
        dot product of a and b divided by the product of their norms
    '''
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

## Loading and preprocessing the data

In [None]:
# Code/title lookup table; only the two columns needed for the join are kept
code_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
code_df = code_df[["code", "title"]]

# Test rows, left-joined with the title whose code matches each row's context
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
test_df = test_df.merge(code_df, how="left", left_on="context", right_on="code")

In [None]:
# Cleaning the text (stopwords are kept: remove_stop_words=False)
for column in ("anchor", "target", "title"):
    test_df[column] = test_df[column].apply(clean_text, remove_stop_words=False)

# Concatenating the anchor, context title and target into one model input string.
# Column-wise string addition gives the same text as a row-wise apply without
# calling a Python function per row.
test_df["text"] = test_df["anchor"] + ' [SEP] ' + test_df["title"] + ' [SEP] ' + test_df["target"]
test_df = test_df.drop(columns = ["anchor", "target", "context", "code", "title"])

## Loading the model and tokenizer

In [None]:
# Directory from which both the tokenizer and the model weights are loaded
model_path = "/kaggle/input/debertabase/"

In [None]:
# Tokenizer and sequence-classification model, both read from model_path;
# the model is configured with a single label (num_labels=1)
test_tokenizer = AutoTokenizer.from_pretrained(model_path)
model = DebertaForSequenceClassification.from_pretrained(model_path, num_labels=1)

# Run on the first GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
model.to(device)
print("Model loaded")

## Tokenizing the test inputs

In [None]:
# Encode all test texts in one call; padding=True pads every sequence to the longest
# one so the ids form a single rectangular tensor.
# NOTE: only 'input_ids' is kept here; any other field returned by the tokenizer is dropped.
X = test_tokenizer.batch_encode_plus(
    test_df["text"].tolist(),
    truncation=True,
    return_tensors="pt",
    padding=True,
)['input_ids']
# X is already a tensor (return_tensors="pt"), so convert its dtype directly instead of
# re-wrapping it with torch.tensor(), which warns about copy-constructing from a tensor
test_inputs = X.to(torch.int)

## Generating predictions

In [None]:
batch_size = 32
y_pred = []

# The inputs are padded to a common length, so padded positions must be masked out;
# otherwise the model attends to padding tokens as if they were real text.
# The mask is rebuilt from the ids: every position not equal to the pad id is real.
pad_token_id = test_tokenizer.pad_token_id

model.eval()  # inference mode for layers such as dropout
with torch.no_grad():  # no gradients are needed for prediction, so skip building the graph
    for start in range(0, len(test_inputs), batch_size):
        batch = test_inputs[start:start + batch_size].to(device)
        attention_mask = None if pad_token_id is None else (batch != pad_token_id).long()
        outputs = model(batch, attention_mask=attention_mask)[0].to('cpu').numpy()
        # Keep the first (index 0) value of each output row as that sample's score
        y_pred.extend(row[0] for row in outputs)

In [None]:
# Store the predictions as the "score" column; y_pred was filled batch by batch in the
# order of test_inputs, so it lines up with the rows of test_df
test_df["score"] = y_pred

In [None]:
# Drop the concatenated model-input text so it is not written to the submission
submission_df = test_df.drop(columns=["text"])
# Write the remaining columns to submission.csv without the DataFrame index
submission_df.to_csv("submission.csv",index=False)

In [None]:
# Bare expression: shows the submission frame as the notebook cell's output
submission_df