# Train: DeBERTa with context (multi-class)

In this notebook we fine-tune a pre-trained DeBERTa model for a sequence classification problem.

In this notebook we also take the context into consideration.

In [None]:
# Import relevant modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the datasets attached to the notebook are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW
# from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

In [None]:
# Single lemmatizer instance shared by every clean_text call
wnl = WordNetLemmatizer()
# English stop words kept in a set so membership checks in clean_text are cheap
stop_words = set(stopwords.words('english'))

In [None]:
# pos_tag emits Penn Treebank tags, which are matched here on their first letter:
# J* = adjective, N* = noun, V* = verb. The values are the POS codes that
# WordNetLemmatizer.lemmatize expects.
_PENN_TO_WORDNET_POS = {'j': 'a', 'n': 'n', 'v': 'v'}

def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lower the words, strip of the spaces, optionally remove stopwords and lemmatize the corpus
    Each token is lemmatized with the part of speech found by the tagger when it is an
    adjective, noun or verb; every other token uses the lemmatizer's default part of speech.
    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        the cleaned corpus as a single string of space-separated lemmas
    '''
    corpus = corpus.lower().strip()
    # Tokenize and tag once; tagging happens before stop-word removal so the
    # tagger still sees the full sentence.
    tagged_tokens = pos_tag(word_tokenize(corpus))
    lemmas = []
    for token, tag in tagged_tokens:
        if remove_stop_words and token in stop_words:
            continue
        wordnet_pos = _PENN_TO_WORDNET_POS.get(tag[:1].lower())
        if wordnet_pos is None:
            lemmas.append(wnl.lemmatize(token))
        else:
            lemmas.append(wnl.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

def cosine(a,b):
    '''
    Function to calculate cosine similarity of two vectors
    Args:
        a,b: vectors to calculate cosine between
    Returns:
        cosine similarity of the given vectors; 0.0 when either vector has zero length
    '''
    denominator = norm(a)*norm(b)
    if denominator == 0:
        # The angle is undefined for a zero-length vector; return 0.0 instead of
        # letting 0/0 produce NaN together with a RuntimeWarning.
        return 0.0
    return np.dot(a,b)/denominator

In [None]:
# Lookup table read from the cpc-codes dataset; only the "code" and "title" columns are kept
code_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")[["code","title"]]
# Training split of the us-patent-phrase-to-phrase-matching dataset
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the code/title lookup table
code_df.head()

In [None]:
# Left-join the lookup table onto the training rows, matching each row's
# "context" against "code"; every training row is kept.
train_df = train_df.merge(code_df, how="left", left_on="context", right_on="code")
train_df.head()

In [None]:
# Cleaning the text: each of the three text columns goes through clean_text
# with stop-word removal switched off.
for text_column in ("anchor", "target", "title"):
    train_df[text_column] = train_df[text_column].apply(clean_text, remove_stop_words=False)

In [None]:
# Concatenating the anchor, target and context
# Element-wise string concatenation on whole columns, in the order
# anchor, title, target; this avoids a Python-level call per row.
train_df["text"] = train_df["anchor"] + ' [SEP] ' + train_df["title"] + ' [SEP] ' + train_df["target"]

In [None]:
# Number of output classes handed to the classification model; the targets
# are later built as int(score * 4).
num_labels = 5

In [None]:
# Loading the pretrained DeBERTa model
# Tokenizer and classifier are both created from the same checkpoint name.
checkpoint = "microsoft/deberta-base"
tokenizer = DebertaTokenizer.from_pretrained(checkpoint)
model = DebertaForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Move the model parameters to the selected device
model.to(device)
print("Model loaded")

In [None]:
# Tokenize all texts in one call, padded and without truncation, returned as
# PyTorch tensors; only the token ids are kept from the tokenizer output.
encoding = tokenizer.batch_encode_plus(
    train_df["text"].tolist(),
    truncation=False,
    return_tensors="pt",
    padding=True,
)
X = encoding['input_ids']
# Multiply each score by 4 and truncate it to an integer class label
Y = [int(score*4) for score in train_df["score"].tolist()]
# Hold out 10% of the rows for validation, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [None]:
# Batch size used by both the training and the validation DataLoader
batch_size = 64

In [None]:
# Token ids: torch.as_tensor casts without re-wrapping, so no copy-construct
# warning is raised when the split already holds tensors. The ids are stored
# as int64, the integer type the model API documents for input ids.
train_inputs = torch.as_tensor(X_train, dtype=torch.long)
validation_inputs = torch.as_tensor(X_test, dtype=torch.long)
# Labels are class indices, so they are stored as integers rather than floats.
train_labels = torch.tensor(Y_train, dtype=torch.long)
validation_labels = torch.tensor(Y_test, dtype=torch.long)
print(train_labels.shape,X_train.shape)
print(validation_labels.shape,X_test.shape)

In [None]:
# Pair each row of input ids with its label so they are batched together
train_data = TensorDataset(train_inputs, train_labels )
validation_data = TensorDataset(validation_inputs, validation_labels)

In [None]:
# Both samplers are sequential, so batches are served in dataset order on
# every pass (no reshuffling between epochs).
train_sampler = SequentialSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_loader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# PyTorch's own AdamW replaces the deprecated transformers implementation.
# weight_decay is set to 0.0 explicitly because the transformers version
# defaulted to no weight decay while torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8,
                  weight_decay = 0.0
                )

In [None]:
# Training the model
# One pass per epoch over train_loader, followed by a validation pass that
# reports the Pearson correlation between predicted and true scores.
loss_values = []
epochs = 1

train_len = len(train_loader)
eval_len = len(validation_loader)

for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_loader):
        if step % 50 == 0 and not step == 0:
            # total_loss sums one loss value per batch, so the running mean
            # divides by the number of batches seen so far. This matches how
            # avg_train_loss is computed after the loop.
            print(f"Step {step} loss: ",total_loss/step)
        input_ids = batch[0].to(device)
        # Targets are class indices; cast so they are integers whatever
        # dtype the dataset stored them in.
        labels = batch[1].to(device).long()
        # The inputs were padded to a common length; mask the padding
        # positions so they are ignored by attention.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the first output is the loss
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / train_len
    loss_values.append(avg_train_loss)
    print(f"Train loss {avg_train_loss}")

    model.eval()

    # Predicting the classes for validation data
    y_pred = []
    y_true = []
    # No gradients are needed for inference; disabling them avoids building
    # the autograd graph for every validation batch.
    with torch.no_grad():
        for val_ids, val_labels in validation_loader:
            val_ids = val_ids.to(device)
            val_mask = (val_ids != tokenizer.pad_token_id).long()
            # Without labels, the first output is the logits
            logits = model(val_ids, attention_mask=val_mask)[0]
            y_pred.extend(logits.argmax(dim=1).cpu().tolist())
            y_true.extend(val_labels.tolist())

    # Calculating correlation on the validation data
    # Class indices are divided by 4 to get back to the score scale.
    y1 = np.array(y_true)/4
    y2 = np.array(y_pred)/4
    corr,_ = pearsonr(y1,y2)

    print("Validation Score: {0:.2f}".format(corr))


This score is not that good

However, it is an improvement over the previous score of 0.46, which was obtained without using the context.