In [None]:
# Mount Google Drive at /content/drive; the model directory used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast

In [None]:
# Training splits: one JSON file per language, read in the order listed.
train_bengali, train_english, train_hindi, train_codemix = (
    pd.read_json(json_path)
    for json_path in (
        "<path-to-training-data-bengali>",
        "<path-to-training-data-english>",
        "<path-to-training-data-hindi>",
        "<path-to-training-data-codemix>",
    )
)

# Validation splits.
# NOTE(review): `val_hindi` is read from the "codemix" validation path - confirm
# that this is the intended file.
val_english, val_hindi = (
    pd.read_json(json_path)
    for json_path in (
        "<path-to-validation-data-english>",
        "<path-to-validation-data-codemix>",
    )
)

In [None]:
# Stack the four language-specific training frames, then shuffle the rows with a
# fixed seed so the order is reproducible; the index is renumbered after each step.
train_df = (
    pd.concat([train_bengali, train_english, train_hindi, train_codemix], axis=0)
    .reset_index(drop=True)
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)
train_df

In [None]:
# Unicode code-point ranges, written so they can be spliced directly into a regex
# character class (see text_cleaning / claim_preprocessing).
# U+0980-U+09FF is the Bengali block.
bengali_chars = r"\u0980-\u09FF"
# U+0900-U+097F is the Devanagari block (used for Hindi).
hindi_chars = r"\u0900-\u097F"

In [None]:
def shorten_link(text_tokens):
    """Return a copy of `text_tokens` with every URL-like token collapsed to "http".

    A token counts as a link when it starts with "http"; all other tokens are
    passed through unchanged, so the output has the same length as the input.
    """
    return ["http" if token.startswith("http") else token for token in text_tokens]

In [None]:
def link_preprocess(i, text):
    """Flatten newlines in `text` and collapse every link token to "http".

    The text is split on whitespace, links are shortened with `shorten_link`,
    and the tokens are re-joined with single spaces. `i` is only used to label
    the diagnostic printed if the token count changes during shortening.
    """
    words = re.sub("\n", " ", text).split()
    shortened = shorten_link(words)
    if len(shortened) != len(words):
        print(f"Error in {i}")
    return " ".join(shortened)

In [None]:
def text_preprocess(df):
    """Run `link_preprocess` over the "text" column of `df`.

    Missing values are treated as the empty string and everything else is
    converted with `str` first. Returns a plain list with one processed string
    per row, in row order.
    """
    texts = df["text"]
    return [
        link_preprocess(pos, str(texts.iloc[pos]) if pd.notna(texts.iloc[pos]) else "")
        for pos in range(len(df))
    ]

In [None]:
# Add the link-normalised text as a new column on the training frame and on both
# validation frames.
for split_df in (train_df, val_hindi, val_english):
    split_df["preprocessed_text"] = text_preprocess(split_df)

In [None]:
# Preview the first training rows, now including `preprocessed_text`.
train_df.head()

In [None]:
# Preview the first rows of the Hindi validation frame, now including `preprocessed_text`.
val_hindi.head()

In [None]:
# Preview the first rows of the English validation frame, now including `preprocessed_text`.
val_english.head()

In [None]:
def text_cleaning(text):
    """Replace every disallowed character in `text` with a single space.

    Allowed characters are those in the Bengali and Devanagari ranges, regex word
    characters (`\\w`), whitespace, `#`, `@` and the apostrophe. The string length
    is preserved because each removed character becomes exactly one space.
    """
    disallowed = rf"[^{bengali_chars}{hindi_chars}\w\s#@']"
    return re.sub(disallowed, " ", text)

In [None]:
# Character-level cleaning of every training row's link-normalised text.
clean_text = [text_cleaning(raw) for raw in train_df["preprocessed_text"]]

train_df["clean_text"] = clean_text

In [None]:
# Display the training frame with the new `clean_text` column.
train_df

In [None]:
# Same cleaning as the training data, applied to the Hindi validation frame.
clean_text_val_hi = [text_cleaning(raw) for raw in val_hindi["preprocessed_text"]]

val_hindi["clean_text"] = clean_text_val_hi
val_hindi.head()

In [None]:
# Same cleaning as the training data, applied to the English validation frame.
clean_text_val_en = [text_cleaning(raw) for raw in val_english["preprocessed_text"]]

val_english["clean_text"] = clean_text_val_en
val_english.head()

In [None]:
# Rows without an annotation get a real empty *list*, not the string "[]".
# With the string, downstream code iterated over its two characters and produced
# two whitespace-only "claims" per unannotated row.
# (Series.fillna cannot take a list as the fill value, hence the comprehension.)
train_df["claims"] = [
    [] if is_missing else claims
    for claims, is_missing in zip(train_df["claims"], train_df["claims"].isna())
]

In [None]:
# Sanity check: list any rows whose `claims` value is still missing.
train_df.loc[train_df["claims"].isna()]

In [None]:
def claim_preprocessing(claims):
    """Clean each claim string with the same rules used for the post text.

    Newlines become spaces, then every character outside the allowed set
    (Bengali range, Devanagari range, regex word characters, whitespace, `#`,
    `@`, apostrophe) is replaced by a space. This keeps claim tokens comparable
    with the tokens of the cleaned text.

    Args:
        claims: iterable of claim strings. ``None`` or a float NaN (a row with
            no annotation) is treated as "no claims".

    Returns:
        list of cleaned claim strings in input order; empty when there are none.
    """
    # A missing cell arrives as None or NaN; iterating it would raise TypeError.
    # `claims != claims` is true only for NaN.
    if claims is None or (isinstance(claims, float) and claims != claims):
        return []

    disallowed = rf"[^{bengali_chars}{hindi_chars}\w\s#@']"
    return [re.sub(disallowed, " ", re.sub("\n", " ", text)) for text in claims]

In [None]:
# Clean the claim list of every training row and store it in a new column.
processed_claim = []
for i, claim in enumerate(train_df["claims"]):
    processed_claim.append(claim_preprocessing(claim))

train_df["preprocessed_claims"] = processed_claim

In [None]:
# Leftover inspection cell: when cells run in order, this shows the final value of
# the loop index `i` from the claim-preprocessing loop in the previous cell.
i

In [None]:
def claim_preprocessing_val(df):
  """Return the cleaned claim list for every row of `df["claims"]`, in row order."""
  return [claim_preprocessing(row_claims) for row_claims in df["claims"]]

# Store the cleaned claims on both validation frames (English first, then Hindi).
for split_df in (val_english, val_hindi):
    split_df["preprocessed_claims"] = claim_preprocessing_val(split_df)

In [None]:
# Whitespace-tokenise the cleaned training text: one list of tokens per row.
text_tokens = [cleaned.strip().split() for cleaned in train_df["clean_text"]]

train_df["text_tokens"] = text_tokens

In [None]:
def text_to_tokens(df):
  """Whitespace-tokenise `df["clean_text"]`; returns one token list per row, in row order."""
  return [cleaned.strip().split() for cleaned in df["clean_text"]]

# Tokenise each validation frame and attach the result as `text_tokens`.
text_tokens_val_hi = text_to_tokens(val_hindi)
val_hindi["text_tokens"] = text_tokens_val_hi

text_tokens_val_en = text_to_tokens(val_english)
val_english["text_tokens"] = text_tokens_val_en

In [None]:
# Preview the training frame with the cleaned claims and `text_tokens` columns added.
train_df.head()

In [None]:
# Preview the Hindi validation frame with the cleaned claims and `text_tokens` columns added.
val_hindi.head()

In [None]:
def BIO_tagging(text_tokens, spans):
    """Build a BIO label sequence for `text_tokens` from the claim strings in `spans`.

    Each claim is split on whitespace and located in `text_tokens` as an exact,
    contiguous token match. Only the first occurrence of each claim is tagged:
    its first token gets 1 and the remaining tokens get 2. Every other position
    stays 0. Claims that are empty after splitting, or that do not occur in the
    text, leave the labels untouched.

    Returns:
        list of ints (0, 1 or 2) with the same length as `text_tokens`.
    """
    label = [0] * len(text_tokens)

    for claim in spans:
        claim_tokens = claim.split()
        width = len(claim_tokens)
        if width == 0:
            continue

        for start in range(len(text_tokens)):
            stop = start + width
            if text_tokens[start:stop] == claim_tokens:
                label[start] = 1
                label[start + 1:stop] = [2] * (width - 1)
                break  # tag the first occurrence only

    return label

In [None]:
# BIO-tag every training row from its tokens and its cleaned claim strings.
claim_label = [
    BIO_tagging(tokens, claim_spans)
    for tokens, claim_spans in zip(train_df["text_tokens"], train_df["preprocessed_claims"])
]

train_df["claim_label"] = claim_label
train_df

In [None]:
def calim_label_tagging(df):
  """BIO-tag every row of `df` from its `text_tokens` and `preprocessed_claims` columns.

  Returns one label list per row, in row order.
  """
  return [
      BIO_tagging(tokens, claim_spans)
      for tokens, claim_spans in zip(df["text_tokens"], df["preprocessed_claims"])
  ]

# Tag each validation frame and attach the result as `claim_label`.
claim_label_val_hi = calim_label_tagging(val_hindi)
val_hindi["claim_label"] = claim_label_val_hi

claim_label_val_en = calim_label_tagging(val_english)
val_english["claim_label"] = claim_label_val_en

In [None]:
# Preview the Hindi validation frame with the new `claim_label` column.
val_hindi.head()

In [None]:
# Preview the English validation frame with the new `claim_label` column.
val_english.head()

In [None]:
# Tag names indexed by label id. BIO_tagging emits 0 for untagged tokens, 1 for
# the first token of a claim and 2 for the remaining claim tokens.
label_list = ["O", "B-Claim", "I-Claim"]

In [None]:
# Checkpoint name used to load both the tokenizer and the token-classification model.
model_name = "microsoft/mdeberta-v3-base" # you can use other models also ...

In [None]:
# Tokenizer matching `model_name`; used for training-data encoding and again at
# prediction time.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_and_align_labels(text_tokens, claim_label):
    """Encode pre-split words and expand word-level labels to sub-token level.

    `text_tokens` is tokenised as already-split words, truncated to at most 512
    sub-tokens. Each sub-token receives the label of the word it came from;
    positions with no source word (word id ``None``) receive -100.

    Returns:
        the tokenizer output with an extra "label" entry holding the aligned labels.
    """
    encoding = tokenizer(text_tokens, truncation=True, is_split_into_words=True, max_length=512)

    encoding["label"] = [
        -100 if word_idx is None else claim_label[word_idx]
        for word_idx in encoding.word_ids()
    ]
    return encoding

In [None]:
from tqdm.auto import tqdm
# Encode every training row and collect the model inputs and the aligned labels.
train_input_ids = []
train_attention_masks = []
train_labels = []
# iterrows() is a generator with no length, so pass `total` explicitly; otherwise
# tqdm can only show a running count instead of a progress bar.
for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    tokenized_inputs = tokenize_and_align_labels(row["text_tokens"], row["claim_label"])
    # Append directly rather than via a cell-level variable named `input`, which
    # would overwrite the builtin for the rest of the session.
    train_input_ids.append(tokenized_inputs["input_ids"])
    train_attention_masks.append(tokenized_inputs["attention_mask"])
    train_labels.append(tokenized_inputs["label"])

In [None]:
# Sanity check: the number of encoded examples should equal the number of training rows.
len(train_df), len(train_input_ids)

In [None]:
def tokenize_and_align_labels_val(df):
  """Encode every row of `df` with `tokenize_and_align_labels`.

  Reads the `text_tokens` and `claim_label` columns of each row.

  Returns:
      (input_ids, attention_masks, labels): three lists with one entry per row,
      in row order.
  """
  val_input_ids = []
  val_attention_masks = []
  val_labels = []
  # iterrows() has no length, so give tqdm the total to get a real progress bar.
  for _, row in tqdm(df.iterrows(), total=len(df)):
      tokenized_inputs = tokenize_and_align_labels(row["text_tokens"], row["claim_label"])
      # No local named `input`: that would shadow the builtin.
      val_input_ids.append(tokenized_inputs["input_ids"])
      val_attention_masks.append(tokenized_inputs["attention_mask"])
      val_labels.append(tokenized_inputs["label"])
  return val_input_ids, val_attention_masks, val_labels

In [None]:
# Encode the English validation frame; the displayed pair should show equal counts.
val_input_ids_en, val_attention_masks_en, val_labels_en = tokenize_and_align_labels_val(val_english)
len(val_english), len(val_input_ids_en)

In [None]:
# Encode the Hindi validation frame; the displayed pair should show equal counts.
val_input_ids_hi, val_attention_masks_hi, val_labels_hi = tokenize_and_align_labels_val(val_hindi)
len(val_hindi), len(val_input_ids_hi)

In [None]:
# Column-oriented containers for each split; the aligned labels are stored under
# the key "labels".
dict_train = dict(
    input_ids=train_input_ids,
    attention_mask=train_attention_masks,
    labels=train_labels,
)
dict_val_en = dict(
    input_ids=val_input_ids_en,
    attention_mask=val_attention_masks_en,
    labels=val_labels_en,
)
dict_val_hi = dict(
    input_ids=val_input_ids_hi,
    attention_mask=val_attention_masks_hi,
    labels=val_labels_hi,
)

In [None]:
# One row per encoded example, with columns input_ids / attention_mask / labels.
data_train = pd.DataFrame(dict_train)
data_val_en = pd.DataFrame(dict_val_en)
data_val_hi = pd.DataFrame(dict_val_hi)

In [None]:
# Single validation set: English rows followed by Hindi rows, renumbered from 0.
data_val = pd.concat([data_val_en, data_val_hi], ignore_index=True)

In [None]:
# Write both encoded splits to the working directory, one JSON object per example.
data_train.to_json('data_train.json', orient = 'records')
data_val.to_json('data_val.json', orient = 'records')

In [None]:
# Read the two files written by the previous cell back in with the json module.
# NOTE(review): this rebinds `data_train` / `data_val` from DataFrames to whatever
# json.load returns (a list of per-example dicts, given orient='records' above),
# so re-running the earlier `.to_json` cell after this one will fail.
import json
with open("data_train.json", 'r') as file:
  data_train = json.load(file)

with open("data_val.json", 'r') as file:
  data_val = json.load(file)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Two-way mapping between label ids and tag names, derived from `label_list`.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Despite the variable name, this loads whichever checkpoint `model_name` points to.
muril_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below to assemble
# batches from the variable-length encoded examples.
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
def flatten(list_of_lists):
  """Concatenate the inner sequences of `list_of_lists` into one flat list, in order."""
  flat = []
  for inner in list_of_lists:
    flat.extend(inner)
  return flat

In [None]:
from sklearn.metrics import f1_score, accuracy_score, jaccard_score, precision_score, recall_score

def compute_metrics(logits_and_labels):
    """Token-level metrics for the Trainer's evaluation loop.

    Args:
        logits_and_labels: pair ``(logits, labels)``. Predicted class ids are the
            argmax over the last axis of ``logits``.

    Positions whose gold label is -100 are dropped from both the gold and the
    predicted sequences before scoring.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'jaccard';
        all but accuracy are macro-averaged.
    """
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # Keep only positions with a real gold label, flattened across all sequences.
    labels_flat = [gold for label_row in labels for gold in label_row if gold != -100]
    preds_flat = [
        pred
        for pred_row, label_row in zip(preds, labels)
        for pred, gold in zip(pred_row, label_row)
        if gold != -100
    ]

    macro = {"average": "macro"}
    return {
        'accuracy': accuracy_score(labels_flat, preds_flat),
        'precision': precision_score(labels_flat, preds_flat, **macro),
        'recall': recall_score(labels_flat, preds_flat, **macro),
        'f1': f1_score(labels_flat, preds_flat, **macro),
        'jaccard': jaccard_score(labels_flat, preds_flat, **macro),
    }

In [None]:
# Training configuration: evaluate once per epoch, write no intermediate
# checkpoints (the final model is saved explicitly after training), batches of 8
# for both training and evaluation, 3 epochs.
args = TrainingArguments(
    "/content/drive/MyDrive/Claim Span/Models/DeBERTa-multilingual-BIO",
    eval_strategy = "epoch",
    save_strategy="no",
    learning_rate= 3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Wire the model, the reloaded train/validation records, the collator and the
# metric function into a Trainer.
# `compute_metrics` here is the one-argument (logits, labels) version defined
# above; a later cell re-defines that name with a different signature.
trainer = Trainer(
    muril_model,
    args,
    train_dataset=data_train,
    eval_dataset=data_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the configuration above.
trainer.train()

In [None]:
# Score the fine-tuned model on the combined validation set (the same data that
# was passed as `eval_dataset`).
trainer.evaluate(data_val)

In [None]:
# Persist the fine-tuned model to Drive, at the same path that was given to TrainingArguments.
trainer.save_model('/content/drive/MyDrive/Claim Span/Models/DeBERTa-multilingual-BIO')

## Evaluation

In [None]:
import torch

In [None]:
# Reload the saved checkpoint from Drive; `prediction` below runs inference with this `model`.
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Claim Span/Models/DeBERTa-multilingual-BIO")

In [None]:
def prediction(text_tokens):
    """Predict a label id for every sub-token of one pre-split sentence.

    Args:
        text_tokens: list of words; tokenised as already-split input.

    Returns:
        (label_ids, word_ids): the argmax class id per sub-token for the first
        (only) sequence in the batch, and the tokenizer's word id per sub-token.
    """
    encoded = tokenizer(text_tokens, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**dict(encoded.items())).logits
        best_ids = torch.argmax(logits, dim=-1)

    return best_ids[0].tolist(), encoded.word_ids()

In [None]:
# Converting subword predictions to word predictions using word ids

def subword_to_word_predictions(subword_predictions, word_ids):
    """Collapse sub-token predictions to one label per word.

    For each sequence, the second-to-last word id is taken as the index of the
    last word, which fixes the output length (last index + 1). A word's label
    is the last non-zero (1 or 2) prediction among its sub-tokens; a 0
    prediction never overwrites an earlier 1 or 2. Sub-tokens without a word id
    are ignored. If the second-to-last word id is ``None`` the sequence yields
    an empty list.

    Returns:
        list of per-word label lists, one per input sequence.
    """
    word_level = []
    for ids, sub_preds in zip(word_ids, subword_predictions):
        last_word = ids[-2]
        if last_word is None:
            word_level.append([])
            continue

        aligned = [0] * (last_word + 1)
        for word_idx, pred in zip(ids, sub_preds):
            if word_idx is not None and pred in (1, 2):
                aligned[word_idx] = pred
        word_level.append(aligned)

    return word_level

In [None]:
def prediction_and_alignment(df):
  """Predict word-level labels for every row of `df` and align them with the gold labels.

  Each row's `text_tokens` is run through `prediction`, the sub-token outputs are
  collapsed to word level with `subword_to_word_predictions`, and any prediction
  shorter than its gold sequence is right-padded with 0 so the two can be compared
  position by position.

  Side effect: writes the aligned predictions to `df["predicted_labels"]`.

  Returns:
      (predictions, ground): per-row predicted label lists and the per-row
      `claim_label` lists.
  """
  predicted_labels = []
  word_ids = []
  for i in range(len(df)):
    p_labels, w_ids = prediction(df["text_tokens"].iloc[i])
    if len(p_labels) != len(w_ids):
      print(i) # Error in prediction
      # Keep a placeholder for the failed row. Dropping it would shift every
      # later prediction against `ground` and make the row counts disagree.
      # Two None word ids collapse to an empty prediction, which is then
      # zero-padded to the gold length below.
      p_labels, w_ids = [0, 0], [None, None]
    predicted_labels.append(p_labels)
    word_ids.append(w_ids)

  predictions = subword_to_word_predictions(predicted_labels, word_ids)

  ground = df["claim_label"].tolist()

  # A prediction can be shorter than the gold sequence (e.g. after truncation).
  for i, gold in enumerate(ground):
    missing = len(gold) - len(predictions[i])
    if missing > 0:
      predictions[i] = predictions[i] + [0] * missing

  df["predicted_labels"] = predictions

  return predictions, ground

In [None]:
# Only the English and Hindi validation frames are loaded in this notebook.
# `val_bengali` and `val_codemix` are never defined, so calling the helper on them
# raised NameError on a fresh kernel. To score those languages, load their
# validation splits, run them through the same preprocessing cells, and add the
# corresponding calls here.
p_eng, g_eng = prediction_and_alignment(val_english)
p_hi, g_hi = prediction_and_alignment(val_hindi)

In [None]:
def compute_metrics(ground, preds):
    """Word-level metrics from per-row gold and predicted label lists.

    Both inputs are flattened across rows before scoring. Note that this
    re-uses the name of the one-argument `compute_metrics` defined earlier for
    the Trainer, so from this cell on the name refers to this version.

    Returns:
        tuple ``(accuracy, precision, recall, f1, jaccard)``; all but accuracy
        are macro-averaged.
    """
    labels_flat = flatten(ground)
    preds_flat = flatten(preds)

    macro_scorers = (precision_score, recall_score, f1_score, jaccard_score)
    macro_scores = [
        scorer(labels_flat, preds_flat, average='macro') for scorer in macro_scorers
    ]

    return (accuracy_score(labels_flat, preds_flat), *macro_scores)

In [None]:
# English validation scores.
# Recall is bound to `rec`, not `re`: at cell level `re` would overwrite the
# imported regex module and break any later re-run of the cleaning helpers.
acc, pre, rec, f1, jac = compute_metrics(g_eng, p_eng)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")

In [None]:
# Hindi validation scores.
# Recall is bound to `rec`, not `re`: at cell level `re` would overwrite the
# imported regex module and break any later re-run of the cleaning helpers.
acc, pre, rec, f1, jac = compute_metrics(g_hi, p_hi)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")