<a href="https://colab.research.google.com/github/rohitdutta2510/Claim-Span-identification-using-LLMs/blob/main/TC_xlm_roberta_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gc
import os
from random import randrange
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, TensorDataset, dataloader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, get_scheduler
import re
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
import argparse
import json
import os
import sys
from tqdm import tqdm, trange
import random
import gc
import csv
import torch
import numpy as np

In [None]:
class QAModel:
    """Claim-span identification framed as extractive question answering.

    Wraps a Hugging Face ``AutoModelForQuestionAnswering`` together with its
    tokenizer and provides dataset construction, training with best-checkpoint
    selection, and evaluation.

    ``params`` must expose ``model_name``, ``device``, ``batch_size``, ``LR``,
    ``num_epochs`` and ``save_dir``.
    """

    def __init__(self, params):
        self.tokenizer = AutoTokenizer.from_pretrained(params.model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(params.model_name).to(params.device)
        self.params = params

        # Pick a random, not-yet-used sub-directory of save_dir for the
        # temporary best checkpoint.  randrange needs integer bounds: float
        # arguments are deprecated and rejected by recent Python versions.
        name = str(randrange(10**8, 10**9))
        while os.path.exists(os.path.join(params.save_dir, name)):
            name = str(randrange(10**8, 10**9))

        self.save_path = os.path.join(params.save_dir, name)

    def getTrainDataset(self, data, max_len = 500):
        """Build the training DataLoader: one example per (tweet, claim span).

        Each item of ``data`` is a mapping with ``'text_tokens'`` (list of
        words) and ``'claims'`` (spans with word-level ``'start'``/``'end'``,
        where ``'end'`` is treated as exclusive).  Tweets without claims yield
        a single example pointing at position 0 (the CLS token).  Spans whose
        boundaries fall outside the (possibly truncated) tweet are dropped.

        Returns a DataLoader yielding
        ``(input_ids, attention_mask, start_positions, end_positions)``.
        """
        tokenizer = self.tokenizer

        tokenized_tweet, attention_mask, start_token, end_token = [], [], [], []

        for dat in data:
            temp = [tokenizer.cls_token] + dat['text_tokens'] + [tokenizer.sep_token]
            tweet, idx_map = tokenize_and_map(temp, tokenizer, max_len)

            if not dat['claims']:
                tokenized_tweet.append(tweet)
                attention_mask.append(np.ones_like(tweet))
                start_token.append(0)
                end_token.append(0)
                continue

            for span in dat['claims']:
                # Resolve both boundaries before touching any list so a
                # truncated span can never leave the lists out of sync.
                try:
                    start = idx_map[span['start']]
                    end = idx_map[span['end']] - 1
                except IndexError:
                    continue

                start_token.append(start)
                end_token.append(end)
                tokenized_tweet.append(tweet)
                attention_mask.append(np.ones_like(tweet))

        tweets = pad_sequence([torch.LongTensor(x) for x in tokenized_tweet], batch_first = True, padding_value = tokenizer.pad_token_id)
        attn = pad_sequence([torch.LongTensor(x) for x in attention_mask], batch_first = True, padding_value = 0)
        start_positions = torch.tensor(start_token)
        end_positions = torch.tensor(end_token)

        dataset = TensorDataset(tweets, attn, start_positions, end_positions)
        return DataLoader(dataset, batch_size=self.params.batch_size)

    def getTestDataset(self, data, max_len = 500):
        """Build the evaluation DataLoader: one example per tweet.

        The gold claims are encoded as a 0/1 label per sub-word token.  Spans
        that do not fit in the truncated tweet are skipped.

        Returns a DataLoader yielding ``(input_ids, attention_mask, labels)``.
        """
        tokenizer = self.tokenizer

        tokenized_tweet, attention_mask, seqlabs = [], [], []

        for dat in data:
            temp = [tokenizer.cls_token] + dat['text_tokens'] + [tokenizer.sep_token]
            tweet, idx_map = tokenize_and_map(temp, tokenizer, max_len)

            tokenized_tweet.append(tweet)
            attention_mask.append(np.ones_like(tweet))

            seq = np.zeros(len(tweet))
            for span in dat['claims']:
                try:
                    seq[idx_map[span['start']] : idx_map[span['end']]] = 1
                except IndexError:
                    # Span lies beyond the truncation point; leave it unlabeled.
                    pass

            seqlabs.append(seq)

        tokenized_tweet = pad_sequence([torch.LongTensor(x) for x in tokenized_tweet], batch_first = True, padding_value = tokenizer.pad_token_id)
        attention_mask = pad_sequence([torch.LongTensor(x) for x in attention_mask], batch_first = True, padding_value = 0)
        seqlabs = pad_sequence([torch.LongTensor(x) for x in seqlabs], batch_first = True, padding_value = 0)

        dataset = TensorDataset(tokenized_tweet, attention_mask, seqlabs)
        return DataLoader(dataset, batch_size=self.params.batch_size)

    @staticmethod
    def logit2sequence(start_logit, end_logit, n_best=50, max_answer_length=100, thresh = 0.85):
        """Convert start/end logits of one example into a 0/1 token sequence.

        The ``n_best`` highest start and end positions are paired; pairs with
        ``end < start`` or longer than ``max_answer_length`` are discarded.
        The best-scoring pair is always selected, plus every pair that does
        not overlap it and scores at least ``thresh`` times the best score.

        Returns an int array of ``len(start_logit)`` with 1 inside the
        selected spans.  If no valid pair exists the array is all zeros.
        """
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

        answers = [
            {"logit_score": start_logit[start_index] + end_logit[end_index],
             "start": start_index,
             "end": end_index}
            for start_index in start_indexes
            for end_index in end_indexes
            if start_index <= end_index and end_index - start_index + 1 <= max_answer_length
        ]

        sequence = np.zeros(len(start_logit), dtype=int)
        if not answers:
            # No admissible (start, end) pair: predict "no claim" instead of
            # crashing on max() of an empty list.
            return sequence

        best = max(answers, key=lambda x: x["logit_score"])
        best_answers = [best]

        # Additional spans must be disjoint from the best one and score
        # close enough to it.
        for ans in answers:
            disjoint = ans['end'] < best['start'] or best['end'] < ans['start']
            if disjoint and ans['logit_score'] >= thresh * best['logit_score']:
                best_answers.append(ans)

        for ans in best_answers:
            sequence[ans['start']:ans['end'] + 1] = 1

        return sequence

    def eval(self, dataloader, print_scores= True):
        """Evaluate on a DataLoader produced by ``getTestDataset``.

        Returns ``(scores, answers)`` where ``scores`` is the tuple returned by
        ``custom_metrics`` and ``answers`` holds the decoded predicted text of
        every example.
        """
        model = self.model
        device = self.params.device

        answers = []
        ground, preds = [], []

        model.eval()
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch[0].to(device)
                attn = batch[1].to(device)

                outputs = model(input_ids, attention_mask = attn)
                start_logits = outputs.start_logits.detach().cpu().numpy()
                end_logits = outputs.end_logits.detach().cpu().numpy()

                # Post-processing is done on host-side numpy copies.
                token_ids = input_ids.detach().cpu().numpy()
                masks = attn.detach().cpu().numpy().astype(bool)
                labels = batch[2].detach().cpu().numpy()

                for t, at, ps, pe, gr in zip(token_ids, masks, start_logits, end_logits, labels):
                    seq = self.logit2sequence(ps, pe)

                    ans = t[at][seq[at].astype(bool)]
                    answers.append(self.tokenizer.decode(ans.tolist()))

                    # Drop the first and last attended positions (CLS / SEP).
                    preds.append(seq[at][1:-1])
                    ground.append(gr[at][1:-1])

        scores = custom_metrics(ground, preds)

        if print_scores:
            spre, srec, sf1, mtpre, mtrec, mtf1 = scores
            print("Span pre, rec, f1: %0.5f %0.5f %0.5f"%(spre, srec, sf1))
            print("MTok pre, rec, f1: %0.5f %0.5f %0.5f"%(mtpre, mtrec, mtf1))
        return scores, answers

    def test(self, test_data):
        """Evaluate on raw test data; returns ``(np.array(scores), answers)``."""
        test_dataloader = self.getTestDataset(test_data)
        print("Test set performance:")
        scores, answer = self.eval(test_dataloader)

        return np.array(scores), answer

    def train(self, train_data, val_data):
        """Fine-tune the model and keep the best checkpoint on validation.

        After every epoch the model is evaluated on ``val_data``; whenever the
        last validation score improves, the weights are saved to
        ``self.save_path``.  At the end the best weights are reloaded and the
        temporary checkpoint directory is removed.
        """
        import shutil

        model = self.model
        device = self.params.device

        train_dataloader = self.getTrainDataset(train_data)
        val_dataloader = self.getTestDataset(val_data)

        optimizer = AdamW(model.parameters(), lr=self.params.LR)
        scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=self.params.num_epochs * len(train_dataloader))

        max_score = -1
        saved = False

        for epoch in trange(self.params.num_epochs, desc="Epoch", ncols = 100):
            model.train()
            total_loss = 0

            for batch in train_dataloader:
                input_ids, attn, start_positions, end_positions = [t.to(device) for t in batch]

                gc.collect()
                torch.cuda.empty_cache()

                outputs = model(input_ids, attention_mask= attn, start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss
                loss.backward()

                total_loss += loss.item()

                optimizer.step()
                scheduler.step()

                optimizer.zero_grad()

            # Mean loss per batch (not per sample of the last batch).
            total_loss /= max(len(train_dataloader), 1)
            scores, answers = self.eval(val_dataloader, print_scores=False)
            print(f"\nEpoch {epoch+1}: Train Loss: {total_loss}, Val F1: {scores[2]}")

            # NOTE: the log line above reports scores[2], while checkpoint
            # selection uses the last score of the tuple.
            if max_score < scores[-1]:
                print("Saving model at epoch %d at:"%(epoch + 1), self.save_path)
                model.save_pretrained(self.save_path)
                max_score = scores[-1]
                saved = True

        # Reload the best checkpoint; if none was written (e.g. zero epochs)
        # keep the in-memory weights.
        if saved:
            self.model = AutoModelForQuestionAnswering.from_pretrained(self.save_path).to(device)

        gc.collect()
        torch.cuda.empty_cache()

        # Remove the temporary checkpoint without going through a shell.
        shutil.rmtree(self.save_path, ignore_errors=True)

In [None]:
class TCModel:
    """Claim-span identification framed as per-token binary classification.

    Wraps a Hugging Face ``AutoModelForTokenClassification`` together with its
    tokenizer and provides dataset construction, training with best-checkpoint
    selection, and evaluation.

    ``params`` must expose ``model_name``, ``device``, ``batch_size``, ``LR``,
    ``num_epochs`` and ``save_dir``.
    """

    def __init__(self, params):
        self.tokenizer = AutoTokenizer.from_pretrained(params.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(params.model_name).to(params.device)
        self.params = params

        # Pick a random, not-yet-used sub-directory of save_dir for the
        # temporary best checkpoint.  randrange needs integer bounds: float
        # arguments are deprecated and rejected by recent Python versions.
        name = str(randrange(10**8, 10**9))
        while os.path.exists(os.path.join(params.save_dir, name)):
            name = str(randrange(10**8, 10**9))

        self.save_path = os.path.join(params.save_dir, name)

    def getDataset(self, data, max_len = 500):
        """Build a DataLoader with one example per tweet.

        Each item of ``data`` is a mapping with ``'text_tokens'`` (list of
        words) and ``'claims'`` (spans with word-level ``'start'``/``'end'``,
        where ``'end'`` is treated as exclusive).  Claims are encoded as a 0/1
        label per sub-word token; spans that do not fit in the truncated tweet
        are skipped.  Padded positions carry label 0 and attention mask 0.

        Returns a DataLoader yielding ``(input_ids, attention_mask, labels)``.
        """
        tokenizer = self.tokenizer

        tokenized_tweet, attention_mask, seqlabs = [], [], []

        for dat in data:
            temp = [tokenizer.cls_token] + dat['text_tokens'] + [tokenizer.sep_token]
            tweet, idx_map = tokenize_and_map(temp, tokenizer, max_len)

            tokenized_tweet.append(tweet)
            attention_mask.append(np.ones_like(tweet))

            seq = np.zeros(len(tweet))
            for span in dat['claims']:
                try:
                    seq[idx_map[span['start']] : idx_map[span['end']]] = 1
                except IndexError:
                    # Span lies beyond the truncation point; leave it unlabeled.
                    pass

            seqlabs.append(seq)

        tokenized_tweet = pad_sequence([torch.LongTensor(x) for x in tokenized_tweet], batch_first = True, padding_value = tokenizer.pad_token_id)
        attention_mask = pad_sequence([torch.LongTensor(x) for x in attention_mask], batch_first = True, padding_value = 0)
        seqlabs = pad_sequence([torch.LongTensor(x) for x in seqlabs], batch_first = True, padding_value = 0)

        dataset = TensorDataset(tokenized_tweet, attention_mask, seqlabs)
        return DataLoader(dataset, batch_size=self.params.batch_size)

    def eval(self, dataloader, print_scores=True):
        """Evaluate on a DataLoader produced by ``getDataset``.

        Returns ``(scores, answers)`` where ``scores`` is the tuple returned by
        ``custom_metrics`` and ``answers`` holds the decoded predicted text of
        every example.
        """
        model = self.model
        device = self.params.device

        answers = []
        ground, preds = [], []

        model.eval()
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch[0].to(device)
                attn = batch[1].to(device)

                outputs = model(input_ids, attention_mask = attn)
                seqpred = outputs.logits.argmax(dim = -1).detach().cpu().numpy()

                # Post-processing is done on host-side numpy copies.
                token_ids = input_ids.detach().cpu().numpy()
                masks = attn.detach().cpu().numpy().astype(bool)
                labels = batch[2].detach().cpu().numpy()

                for t, at, seq, gr in zip(token_ids, masks, seqpred, labels):
                    ans = t[at][seq[at].astype(bool)]
                    answers.append(self.tokenizer.decode(ans.tolist()))

                    # Drop the first and last attended positions (CLS / SEP).
                    preds.append(seq[at][1:-1])
                    ground.append(gr[at][1:-1])

        scores = custom_metrics(ground, preds)

        if print_scores:
            spre, srec, sf1, mtpre, mtrec, mtf1 = scores
            print("Span pre, rec, f1: %0.5f %0.5f %0.5f"%(spre, srec, sf1))
            print("MTok pre, rec, f1: %0.5f %0.5f %0.5f"%(mtpre, mtrec, mtf1))
            print()

        return scores, answers

    def test(self, test_data):
        """Evaluate on raw test data; returns ``(np.array(scores), answers)``."""
        test_dataloader = self.getDataset(test_data)
        print("Test set performance:")
        scores, answer = self.eval(test_dataloader)

        return np.array(scores), answer

    def train(self, train_data, val_data):
        """Fine-tune the model and keep the best checkpoint on validation.

        After every epoch the model is evaluated on ``val_data``; whenever the
        last validation score improves, the weights are saved to
        ``self.save_path``.  At the end the best weights are reloaded and the
        temporary checkpoint directory is removed.
        """
        import shutil

        model = self.model
        device = self.params.device

        train_dataloader = self.getDataset(train_data)
        val_dataloader = self.getDataset(val_data)

        optimizer = AdamW(model.parameters(), lr=self.params.LR)
        scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=self.params.num_epochs * len(train_dataloader))

        max_score = -1
        saved = False

        for epoch in trange(self.params.num_epochs, desc="Epoch", ncols = 100):
            model.train()
            total_loss = 0

            for batch in train_dataloader:
                input_ids, attn, seqlabs = [t.to(device) for t in batch]

                # Padded positions are stored with label 0; mark them with the
                # loss ignore index so padding does not count as "no claim".
                seqlabs = seqlabs.masked_fill(attn == 0, -100)

                gc.collect()
                torch.cuda.empty_cache()

                outputs = model(input_ids, attention_mask= attn, labels=seqlabs)
                loss = outputs.loss
                loss.backward()

                total_loss += loss.item()

                optimizer.step()
                scheduler.step()

                optimizer.zero_grad()

            # Mean loss per batch (not per sample of the last batch).
            total_loss /= max(len(train_dataloader), 1)
            scores, answers = self.eval(val_dataloader, print_scores=False)
            print(f"\nEpoch {epoch+1}: Train Loss: {total_loss}, Val F1: {scores[2]}")

            # NOTE: the log line above reports scores[2], while checkpoint
            # selection uses the last score of the tuple.
            if max_score < scores[-1]:
                print("Saving model at epoch %d at:"%(epoch + 1), self.save_path)
                model.save_pretrained(self.save_path)
                max_score = scores[-1]
                saved = True

        # Reload the best checkpoint; if none was written (e.g. zero epochs)
        # keep the in-memory weights.
        if saved:
            self.model = AutoModelForTokenClassification.from_pretrained(self.save_path).to(device)

        gc.collect()
        torch.cuda.empty_cache()

        # Remove the temporary checkpoint without going through a shell.
        shutil.rmtree(self.save_path, ignore_errors=True)


In [None]:
def custom_metrics(ground, preds):
    """Score predicted 0/1 token sequences against gold sequences.

    ``ground`` and ``preds`` are parallel lists of arrays, one array of 0/1
    labels per example.  Two metric families are computed per example and then
    averaged over examples:

    * span metrics: precision / recall / F1 of the positive tokens, each
      defined as 0 when its denominator is 0;
    * macro token metrics: sklearn precision / recall / F1 with
      ``average='macro'`` (averaged with ``nanmean``).

    Returns ``(span_pre, span_rec, span_f1, mtok_pre, mtok_rec, mtok_f1)``.
    """
    pairs = list(zip(ground, preds))

    def _macro_per_example(metric):
        return [metric(gold, pred, average='macro', zero_division=0) for gold, pred in pairs]

    tok_f1 = _macro_per_example(f1_score)
    tok_pre = _macro_per_example(precision_score)
    tok_rec = _macro_per_example(recall_score)

    per_example_p, per_example_r, per_example_f = [], [], []
    for gold, pred in pairs:
        assert len(gold) == len(pred)
        gold_mask = np.array(gold).astype(bool)
        pred_mask = np.array(pred).astype(bool)

        overlap = np.sum(gold_mask * pred_mask)
        n_pred = np.sum(pred_mask)
        n_gold = np.sum(gold_mask)

        if n_pred != 0:
            precision = overlap / n_pred
        else:
            precision = 0

        if n_gold != 0:
            recall = overlap / n_gold
        else:
            recall = 0

        if (precision + recall) != 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        per_example_p.append(precision)
        per_example_r.append(recall)
        per_example_f.append(f1)

    return (
        np.mean(per_example_p),
        np.mean(per_example_r),
        np.mean(per_example_f),
        np.nanmean(tok_pre),
        np.nanmean(tok_rec),
        np.nanmean(tok_f1),
    )




def tokenize_and_map(text_words, tokenizer, max_len):
    """Tokenize a word list and record where each word starts.

    Args:
        text_words: list of words whose first element is the tokenizer's CLS
            token and whose last element is its SEP token.
        tokenizer: object exposing ``cls_token``, ``sep_token``,
            ``cls_token_id``, ``sep_token_id`` and
            ``encode(text, add_special_tokens=False)``.
        max_len: words are added only while the running token position does
            not exceed this value; the first word that would exceed it and all
            following words are dropped.  The SEP id is appended afterwards.

    Returns:
        ``(tokenized_text, idx_map)``: the token ids (CLS, sub-words, SEP) and
        a list where entry ``j`` is the position in ``tokenized_text`` of the
        first sub-word of word ``j``; the final entry is the SEP position.
    """
    assert text_words[0] == tokenizer.cls_token
    assert text_words[-1] == tokenizer.sep_token

    # Raw string: "\S" in a normal string literal is an invalid escape.
    url_pattern = re.compile(r"https://\S+")

    tokenized_text = [tokenizer.cls_token_id]
    idx_map = []
    next_pos = 1  # position 0 is the CLS token

    for word in text_words[1:-1]:
        word = url_pattern.sub("HTTPURL", word)

        # The leading space makes the tokenizer encode the word as
        # word-initial rather than as a continuation.
        encoded_words = tokenizer.encode(' ' + word, add_special_tokens = False)
        if next_pos + len(encoded_words) > max_len:
            break

        tokenized_text.extend(encoded_words)
        idx_map.append(next_pos)
        next_pos += len(encoded_words)

    tokenized_text.append(tokenizer.sep_token_id)
    idx_map.append(next_pos)

    return tokenized_text, idx_map

In [None]:
# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%% PARAMS


# parser = argparse.ArgumentParser()
# parser.add_argument('--model_type', type=str, choices=['qa', 'tc'], default='tc')
# parser.add_argument('--model_name', type=str, default='xlm-roberta-base')
# parser.add_argument('--device', type=str, default='cuda:0')
# parser.add_argument('--batch_size', type=int, default=32)
# parser.add_argument('--LR', type=float, default=2e-5)
# parser.add_argument('--num_epochs', type=int, default=1)
# parser.add_argument('--num_exp', type=int, default=1)
# parser.add_argument('--save_dir', type=str, default='models/')
# args = parser.parse_args()

class Args:
    """Notebook replacement for the commented-out argparse configuration.

    All settings are class attributes; an instance is passed to the model
    classes as ``params``.
    """
    # 'tc' selects TCModel; any other value selects QAModel.
    model_type = 'tc'
    # Hugging Face model identifier passed to from_pretrained.
    model_name = 'xlm-roberta-base'
    # Use the first GPU when CUDA is available, otherwise fall back to CPU.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    batch_size = 32
    # Learning rate handed to AdamW.
    LR = 2e-5
    num_epochs = 5
    # Number of independent train/test repetitions in repeat_experiment.
    num_exp = 1
    save_dir = '/content/drive/MyDrive/MTP/Models/'  # Adjust save directory as needed

args = Args()

# Select the model wrapper that matches the configured task formulation.
MODEL_CLASS = TCModel if args.model_type == 'tc' else QAModel

# Checkpoints and the score CSV are written below save_dir, so create it
# up front when it is missing.
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir)


#%%

def read_data(trainfiles:list, valfiles:list, testfiles:list):
    """Load the train / validation / test splits from JSON files.

    Args:
        trainfiles: paths whose JSON lists are concatenated and shuffled.
        valfiles: paths whose JSON lists are concatenated (order preserved).
        testfiles: paths loaded separately; each file stays its own test set.

    Returns:
        ``(train_data, val_data, test_data)`` where ``train_data`` and
        ``val_data`` are flat lists of examples and ``test_data`` is a list
        with one entry (the loaded JSON content) per test file.
    """
    def _load(path):
        # The data is multilingual; do not rely on the platform's default
        # text encoding.
        with open(path, encoding='utf-8') as fp:
            return json.load(fp)

    # Re-seed from system entropy so every call produces a fresh order.
    random.seed()

    train_data = []
    for fn in trainfiles:
        train_data.extend(_load(fn))

    # A single shuffle already yields a uniformly random permutation.
    random.shuffle(train_data)

    val_data = []
    for fn in valfiles:
        val_data.extend(_load(fn))

    test_data = [_load(fn) for fn in testfiles]

    return train_data, val_data, test_data



def repeat_experiment(traindata, valdata, testdata, out_names, save_out_path = None):
    """Train and test a fresh model ``args.num_exp`` times and log the scores.

    Args:
        traindata: training examples passed to ``model.train``.
        valdata: validation examples passed to ``model.train``.
        testdata: list of test sets; each one is evaluated after every run.
        out_names: one label per test set, used in the printed summary and as
            the first column of the score CSV.
        save_out_path: optional path; when given, the decoded answers of the
            last evaluated test set are written there as JSON after each run
            (later runs overwrite earlier ones).

    Returns:
        The decoded answers of the last test set from the last run, or
        ``None`` when nothing was evaluated.

    Side effects:
        Appends one line per test set (mean and standard deviation of the six
        scores over all runs) to ``<save_dir>/<model_type>_scores.csv``.
    """
    assert len(out_names) == len(testdata)

    all_scores = [[] for _ in testdata]
    answers = None

    for IDX in range(args.num_exp):
        gc.collect()
        torch.cuda.empty_cache()

        print("INDEX:", IDX + 1)
        print("#" * 50)

        model = MODEL_CLASS(args)
        model.train(traindata, valdata)

        for i, tset in enumerate(testdata):
            scores, answers = model.test(tset)
            all_scores[i].append(scores)

        # Save the answers but keep going: returning here would skip the
        # remaining runs and the score summary below.
        if save_out_path and answers is not None:
            with open(save_out_path, 'w', encoding='utf-8') as fp:
                json.dump(answers, fp, indent = 4)

    with open(os.path.join(args.save_dir, args.model_type + "_scores.csv"), 'a') as fo:
        for i, name in enumerate(out_names):
            if not all_scores[i]:
                # No run was executed (num_exp < 1); nothing to summarize.
                continue

            mean_scores = np.mean(all_scores[i], axis = 0)
            std_scores = np.std(all_scores[i], axis = 0)

            # Means are reported as percentages, deviations on the raw scale.
            mean_scores *= 100
            mean_scores = np.round(mean_scores, 2).tolist()
            std_scores = np.round(std_scores, 3).tolist()

            print("#" * 50)
            print("\n Mean scores and STD on %s:"%(name))
            print(mean_scores)
            print(std_scores)

            print("%s, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f"%(name, *mean_scores, *std_scores), file = fo)

    return answers



In [None]:
# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%% MAIN
if __name__ == '__main__':
    # Dataset roots on the mounted Google Drive.
    XCLAIMDIR = '/content/drive/MyDrive/MTP/Dataset/xclaim'
    KCLAIMDIR = '/content/drive/MyDrive/MTP/Dataset/kgpclaim'

    # Languages to run; each one is trained and tested on its own splits.
    languages = ['en']

    for lang in languages:
        # Split files are named "<split>-<lang>.json" inside KCLAIMDIR.
        tr, va, te = (
            os.path.join(KCLAIMDIR, f"{split}-{lang}.json")
            for split in ("train", "dev", "test")
        )

        data = read_data([tr], [va], [te])
        answersk = repeat_experiment(*data, [f"kc_{args.model_name}_{lang}_{lang}"])


INDEX: 1
##################################################


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  name = str(randrange(1e8, 1e9))
Epoch:   0%|                                                                  | 0/5 [00:00<?, ?it/s]


Epoch 1: Train Loss: 2.5931649401783945, Val F1: 0.6085809366031709
Saving model at epoch 1 at: /content/drive/MyDrive/MTP/Models/106875441


Epoch:  20%|███████████▍                                             | 1/5 [04:05<16:21, 245.44s/it]


Epoch 2: Train Loss: 1.8918029507001242, Val F1: 0.6114516111079438
Saving model at epoch 2 at: /content/drive/MyDrive/MTP/Models/106875441


Epoch:  60%|██████████████████████████████████▏                      | 3/5 [12:11<08:05, 242.67s/it]


Epoch 3: Train Loss: 1.710935039818287, Val F1: 0.6184996354536936

Epoch 4: Train Loss: 1.552525794506073, Val F1: 0.602438392551399
Saving model at epoch 4 at: /content/drive/MyDrive/MTP/Models/106875441


Epoch:  80%|█████████████████████████████████████████████▌           | 4/5 [16:14<04:02, 242.43s/it]


Epoch 5: Train Loss: 1.4333021894097329, Val F1: 0.5889507202492706
Saving model at epoch 5 at: /content/drive/MyDrive/MTP/Models/106875441


Epoch: 100%|█████████████████████████████████████████████████████████| 5/5 [20:25<00:00, 245.09s/it]


Test set performance:


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Span pre, rec, f1: 0.61337 0.62956 0.59105
MTok pre, rec, f1: 0.77933 0.78017 0.74752

##################################################

 Mean scores and STD on kc_xlm-roberta-base_en_en:
[61.34, 62.96, 59.11, 77.93, 78.02, 74.75]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
