In this notebook we load the model trained in the notebook

https://www.kaggle.com/bhavesjain/train-w-context-deberta/ and run it on the actual test data to produce the submission file.

In [None]:
# Import relevant modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

import torch
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW
# from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

In [None]:
# Module-level helpers shared by clean_text: the English stopword set and the lemmatizer
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

In [None]:
def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lowercase it, strip surrounding whitespace,
    optionally remove stopwords, and lemmatize every remaining token
    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        filtered_sentence: cleaned corpus, with the lemmatized tokens joined by single spaces
    '''
    corpus = corpus.lower().strip()
    # Tokenize and POS-tag once; both the stopword and the non-stopword path share the result.
    tagged_tokens = pos_tag(word_tokenize(corpus))
    if remove_stop_words:
        tagged_tokens = [(token, tag) for token, tag in tagged_tokens if token not in stop_words]
    # The lowercased first letter of the tag is passed to the lemmatizer as its POS
    # argument only when it is 'a', 'n' or 'v'; otherwise the lemmatizer's default is used.
    # NOTE(review): if pos_tag emits Penn Treebank tags, adjectives are tagged JJ*, so the
    # 'a' case would never fire - confirm against the training preprocessing before changing.
    filtered_sentence = " ".join(
        wnl.lemmatize(token, tag[0].lower()) if tag[0].lower() in ('a', 'n', 'v') else wnl.lemmatize(token)
        for token, tag in tagged_tokens
    )
    return filtered_sentence

def cosine(a,b):
    '''
    Compute the cosine similarity between two vectors
    Args:
        a,b: the two vectors to compare
    Returns:
        the dot product of a and b divided by the product of their norms
    '''
    dot_product = np.dot(a,b)
    magnitude = norm(a)*norm(b)
    return dot_product/magnitude

In [None]:
# Load the CPC titles table (keeping only the code and title columns) and the test set
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
code_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv").loc[:, ["code", "title"]]

In [None]:
# Preview the first rows of the test data
test_df.head()

In [None]:
# Preview the first rows of the code/title lookup table
code_df.head()

In [None]:
# Attach the title of each row's context code; the left join keeps every test row
test_df = test_df.merge(code_df, how="left", left_on="context", right_on="code")
test_df.head()

In [None]:
# Clean the anchor, target and title columns in turn (stopwords are kept)
for column in ("anchor", "target", "title"):
    test_df[column] = test_df[column].apply(lambda text: clean_text(text, False))

In [None]:
# Build the model input as "<anchor> [SEP] <title> [SEP] <target>".
# Column-wise string concatenation produces the same text as a row-wise apply
# without invoking a Python lambda for every row.
test_df["text"] = test_df["anchor"] + ' [SEP] ' + test_df["title"] + ' [SEP] ' + test_df["target"]

In [None]:
# Directory of the saved checkpoint and the number of classes the classifier predicts
num_labels = 5
path = "/kaggle/input/debertav201/deberta-v2-01"
# Restore the tokenizer and the sequence-classification model from that directory
tokenizer = DebertaTokenizer.from_pretrained(path)
model = DebertaForSequenceClassification.from_pretrained(
    path,
    num_labels=num_labels,
)


In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
model.to(device)
# Put the model in evaluation mode before running inference
model.eval()
print("Model loaded")

In [None]:
# Tokenize every input text in one call, padded to the longest sequence and not truncated,
# and keep only the token ids.
# NOTE(review): the attention mask returned by the tokenizer is discarded here, so the
# later model call receives token ids only - confirm this matches how the model was trained.
X = tokenizer.batch_encode_plus(test_df["text"].tolist(), truncation=False,return_tensors="pt",padding=True)['input_ids']
# X is already a tensor (return_tensors="pt"); convert its dtype directly instead of
# re-wrapping it with torch.tensor(), which copy-constructs and emits a UserWarning.
test_inputs = X.to(dtype=torch.int)

In [None]:
batch_size = 64
y_pred = []

# Inference only: disable autograd so no computation graph is built for each batch.
with torch.no_grad():
    for start in range(0, len(test_df), batch_size):
        batch = test_inputs[start:start+batch_size].to(device)
        # Take the first element of the model output (presumably the per-class logits)
        # and move it to the CPU as a NumPy array.
        outputs = model(batch)[0].to('cpu').numpy()
        # The predicted class of each row is the index of its largest value.
        y_pred.extend(np.argmax(outputs,axis=1))

In [None]:
# Convert each predicted class index into a score by multiplying it by 0.25
test_df["score"] = [label*0.25 for label in y_pred]

In [None]:
# Drop the input and helper columns; whatever remains forms the submission table
submission_df = test_df.drop(
    columns=["anchor", "target", "context", "code", "title", "text"],
)

In [None]:
# Write the submission table to submission.csv without the DataFrame index
submission_df.to_csv("submission.csv",index=False)