## FlanT5 Task 2

In [1]:
import nltk
import torch
import numpy as np
from huggingface_hub import HfFolder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score, precision_score,recall_score,f1_score
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict

In [3]:
import os

# Restrict the CUDA devices visible to this process to GPU index 1.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [5]:
# Lookup tables between class names and class indices; both are handed to the
# model configuration further down.
label2id = {"0": 0, "1": 1}
id2label = {index: name for name, index in label2id.items()}

def format_data_T2(row, flag=-1):
    """Serialise one dataset row as a JSON prompt/completion record.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the
            ``old_cleaned_content`` and ``Ground-Truth`` fields.
        flag: When equal to 1 the record carries the gold answer ("Yes" for
            a ``Ground-Truth`` of 1, "No" otherwise). For any other value
            the completion is an empty string.

    Returns:
        A JSON string with the keys ``prompt`` and ``completion``.
    """
    instruction = """Analyze the given text based on its contextual understanding to determine whether any factual updates (e.g., date changes, numerical updates, score modifications, or status changes) are likely to occur in the future. 
    Return a response indicating "Yes" if an update is predicted and "No" otherwise.\n"""
    question = f"Text: {row['old_cleaned_content']}\nAnswer:"
    # The label is read even when it is not emitted, so a row without a
    # 'Ground-Truth' field fails regardless of ``flag``.
    gold_answer = "Yes" if row['Ground-Truth'] == 1 else "No"

    record = {
        "prompt": instruction + question,
        "completion": gold_answer if flag == 1 else '',
    }
    return json.dumps(record)

def load_dataset(
    model_type: str = "",
    *,
    train_path: str = "/mnt/Data/rishav_2311mc12/Revision/data/train.csv",
    val_path: str = "/mnt/Data/rishav_2311mc12/Revision/data/val.csv",
) -> "DatasetDict":
    """Load the train/validation CSV files as a tokenizer-ready DatasetDict.

    Each CSV is read with ISO-8859-1 encoding. Every row is turned into a
    JSON prompt string stored in a new ``text`` column, the two raw content
    columns are dropped and ``Ground-Truth`` is renamed to ``labels``.

    Args:
        model_type: Only ``"AutoModelForSequenceClassification"`` is
            supported.
        train_path: CSV file used for the ``train`` split.
        val_path: CSV file used for the ``test`` split.

    Returns:
        A ``DatasetDict`` with the splits ``train`` and ``test``.

    Raises:
        ValueError: If ``model_type`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on the return statement.
    """
    if model_type != "AutoModelForSequenceClassification":
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'AutoModelForSequenceClassification'"
        )

    def _prepare_split(csv_path):
        """Read one CSV file and convert it into a Dataset."""
        frame = pd.read_csv(csv_path, encoding='ISO-8859-1')
        # Apply the prompt formatter to each row and store the result in a
        # new column 'text' (flag=0: the completion is left empty).
        frame["text"] = frame.apply(lambda row: format_data_T2(row, flag=0), axis=1)
        frame = frame.drop(columns=['old_cleaned_content', 'new_cleaned_content'])
        frame = frame.rename(columns={'Ground-Truth': 'labels'})
        return Dataset.from_pandas(frame)

    return DatasetDict({
        'train': _prepare_split(train_path),
        'test': _prepare_split(val_path),
    })

MODEL_ID = "google/flan-t5-large"
REPOSITORY_ID = "rishavranaut/flanT5_Task2"

# Build a sequence-classification model on top of the pretrained checkpoint,
# sized for the labels declared in label2id.
config = AutoConfig.from_pretrained(
    MODEL_ID, num_labels=len(label2id), id2label=id2label, label2id=label2id
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, config=config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Use the GPU when one is available, otherwise stay on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

training_args = TrainingArguments(
    num_train_epochs=5,
    output_dir=REPOSITORY_ID,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=2500,
    report_to="tensorboard",
    evaluation_strategy='steps',
    # Made explicit: when unset, eval_steps falls back to logging_steps.
    eval_steps=2500,
    save_strategy='steps',
    save_steps=2500,
    load_best_model_at_end=False,
    save_total_limit=2,
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=REPOSITORY_ID,
    # SECURITY: an access token used to be hard-coded on this line. Treat that
    # token as leaked and revoke it. The token is now read from the HF_TOKEN
    # environment variable; when it is unset, None lets the library fall back
    # to the credentials cached by `huggingface-cli login`.
    hub_token=os.environ.get("HF_TOKEN"),
    # NOTE(review): per the transformers documentation this field is not
    # consumed by Trainer itself. To actually resume, pass
    # resume_from_checkpoint to trainer.train().
    resume_from_checkpoint=True,
)


def tokenize_function(examples) -> dict:
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``text`` column.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly 512 tokens.
    """
    batch_texts = list(examples['text'])
    return tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

def compute_metrics(eval_pred) -> dict:
    """Score the predictions of one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``. ``logits`` may be a tuple,
            in which case its first element is used.

    Returns:
        Dict with the keys ``Accuracy``, ``Precision``, ``Recall`` and
        ``F1 Score``; the last three use binary averaging.
    """
    logits, labels = eval_pred
    # The model may also return hidden_states or attentions; logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)

    metrics = {'Accuracy': accuracy_score(labels, predicted)}
    binary_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, predicted, average='binary')

    # Release cached GPU memory after each evaluation pass.
    torch.cuda.empty_cache()
    return metrics



def train() -> None:
    """Fine-tune the classifier, publish it to the Hub and print the final metrics.

    Loads and tokenizes the dataset, trains with the module-level
    ``training_args``, saves the tokenizer into ``REPOSITORY_ID``, creates a
    model card, pushes to the Hugging Face Hub and finally prints the result
    of an evaluation on the ``test`` split.
    """
    raw_splits = load_dataset("AutoModelForSequenceClassification")
    encoded_splits = raw_splits.map(tokenize_function, batched=True)

    nltk.download("punkt")

    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": encoded_splits["train"],
        "eval_dataset": encoded_splits["test"],
        "compute_metrics": compute_metrics,
    }
    trainer = Trainer(**trainer_kwargs)

    # Train.
    trainer.train()

    # Save, publish, then evaluate.
    tokenizer.save_pretrained(REPOSITORY_ID)
    trainer.create_model_card()
    trainer.push_to_hub()
    final_metrics = trainer.evaluate()
    print(final_metrics)


if __name__ == "__main__":
    train()

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5945 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/Data/rishav_2311mc12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 score
2500,1.0303,0.827047,0.747059,0.808824,0.647059,0.718954
5000,0.9652,0.686063,0.711765,0.924528,0.461176,0.615385
7500,0.9232,0.944154,0.747059,0.801724,0.656471,0.721863
10000,0.8457,0.931072,0.747059,0.786885,0.677647,0.728192
12500,0.7519,1.088727,0.768235,0.806452,0.705882,0.752823
15000,0.6462,1.178025,0.770588,0.8125,0.703529,0.754098
17500,0.642,1.243401,0.771765,0.795396,0.731765,0.762255
20000,0.4436,1.302643,0.76,0.793103,0.703529,0.745636
22500,0.3762,1.60512,0.765882,0.767773,0.762353,0.765053
25000,0.2798,1.901078,0.767059,0.768322,0.764706,0.766509


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

{'eval_loss': 1.8542985916137695, 'eval_Accuracy': 0.7752941176470588, 'eval_Precision': 0.7896039603960396, 'eval_Recall': 0.7505882352941177, 'eval_F1 Score': 0.7696019300361882, 'eval_runtime': 93.1322, 'eval_samples_per_second': 9.127, 'eval_steps_per_second': 9.127, 'epoch': 5.0}


## FlanT5 Task 1

In [1]:
import nltk
import torch
import numpy as np
from huggingface_hub import HfFolder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score, precision_score,recall_score,f1_score
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict

In [2]:
import os

# Restrict the CUDA devices visible to this process to GPU index 0.
os.environ.update(CUDA_VISIBLE_DEVICES="0")


In [None]:
# Lookup tables between class names and class indices; both are handed to the
# model configuration further down.
label2id = {"0": 0, "1": 1}
id2label = {index: name for name, index in label2id.items()}

def format_data_T2(row, flag=-1):
    """Serialise one old/new sentence pair as a JSON prompt/completion record.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the
            ``old_cleaned_content``, ``new_cleaned_content`` and
            ``Ground-Truth`` fields.
        flag: When equal to 1 the record carries the gold answer ("Yes" for
            a ``Ground-Truth`` of 1, "No" otherwise). For any other value
            the completion is an empty string.

    Returns:
        A JSON string with the keys ``prompt`` and ``completion``.
    """
    # NOTE(review): the instruction literal is wrapped in single quotes and
    # runs straight into "Old sentence:" with no separator. It is kept
    # byte-for-byte because it is part of the model input.
    instruction = """'Determine whether a text passage has been updated by identifying changes in date, numbers, scores, statuses, or other relevant information between two given sentences. Provide a binary answer (Yes/No) indicating if the new sentence represents an update to the old sentence.'"""
    question = f"Old sentence: {row['old_cleaned_content']}\n[SEP]\nNew sentence: {row['new_cleaned_content']}\nAnswer:"
    # The label is read even when it is not emitted, so a row without a
    # 'Ground-Truth' field fails regardless of ``flag``.
    gold_answer = "Yes" if row['Ground-Truth'] == 1 else "No"

    record = {
        "prompt": instruction + question,
        "completion": gold_answer if flag == 1 else '',
    }
    return json.dumps(record)

def load_dataset(
    model_type: str = "",
    *,
    train_path: str = "train.csv",
    val_path: str = "val.csv",
) -> "DatasetDict":
    """Load the train/validation CSV files as a tokenizer-ready DatasetDict.

    Each CSV is read with ISO-8859-1 encoding. Every row is turned into a
    JSON prompt string stored in a new ``text`` column, the two raw content
    columns are dropped and ``Ground-Truth`` is renamed to ``labels``.

    Args:
        model_type: Only ``"AutoModelForSequenceClassification"`` is
            supported.
        train_path: CSV file used for the ``train`` split.
        val_path: CSV file used for the ``test`` split.

    Returns:
        A ``DatasetDict`` with the splits ``train`` and ``test``.

    Raises:
        ValueError: If ``model_type`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on the return statement.
    """
    if model_type != "AutoModelForSequenceClassification":
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'AutoModelForSequenceClassification'"
        )

    def _prepare_split(csv_path):
        """Read one CSV file and convert it into a Dataset."""
        frame = pd.read_csv(csv_path, encoding='ISO-8859-1')
        # Apply the prompt formatter to each row and store the result in a
        # new column 'text' (flag=0: the completion is left empty).
        frame["text"] = frame.apply(lambda row: format_data_T2(row, flag=0), axis=1)
        frame = frame.drop(columns=['old_cleaned_content', 'new_cleaned_content'])
        frame = frame.rename(columns={'Ground-Truth': 'labels'})
        return Dataset.from_pandas(frame)

    return DatasetDict({
        'train': _prepare_split(train_path),
        'test': _prepare_split(val_path),
    })

MODEL_ID = "google/flan-t5-large"
REPOSITORY_ID = "rishavranaut/flanT5_Task1"

# Build a sequence-classification model on top of the pretrained checkpoint,
# sized for the labels declared in label2id.
config = AutoConfig.from_pretrained(
    MODEL_ID, num_labels=len(label2id), id2label=id2label, label2id=label2id
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, config=config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Use the GPU when one is available, otherwise stay on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

training_args = TrainingArguments(
    num_train_epochs=5,
    output_dir=REPOSITORY_ID,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=2500,
    report_to="tensorboard",
    evaluation_strategy='steps',
    # Made explicit: when unset, eval_steps falls back to logging_steps.
    eval_steps=2500,
    save_strategy='steps',
    save_steps=2500,
    load_best_model_at_end=False,
    save_total_limit=2,
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=REPOSITORY_ID,
    # SECURITY: an access token used to be hard-coded on this line. Treat that
    # token as leaked and revoke it. The token is now read from the HF_TOKEN
    # environment variable; when it is unset, None lets the library fall back
    # to the credentials cached by `huggingface-cli login`.
    hub_token=os.environ.get("HF_TOKEN"),
    # NOTE(review): per the transformers documentation this field is not
    # consumed by Trainer itself. To actually resume, pass
    # resume_from_checkpoint to trainer.train().
    resume_from_checkpoint=True,
)


def tokenize_function(examples) -> dict:
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``text`` column.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly 512 tokens.
    """
    batch_texts = list(examples['text'])
    return tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

def compute_metrics(eval_pred) -> dict:
    """Score the predictions of one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``. ``logits`` may be a tuple,
            in which case its first element is used.

    Returns:
        Dict with the keys ``Accuracy``, ``Precision``, ``Recall`` and
        ``F1 Score``; the last three use binary averaging.
    """
    logits, labels = eval_pred
    # The model may also return hidden_states or attentions; logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)

    metrics = {'Accuracy': accuracy_score(labels, predicted)}
    binary_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, predicted, average='binary')

    # Release cached GPU memory after each evaluation pass.
    torch.cuda.empty_cache()
    return metrics



def train() -> None:
    """Fine-tune the classifier, publish it to the Hub and print the final metrics.

    Loads and tokenizes the dataset, trains with the module-level
    ``training_args``, saves the tokenizer into ``REPOSITORY_ID``, creates a
    model card, pushes to the Hugging Face Hub and finally prints the result
    of an evaluation on the ``test`` split.
    """
    raw_splits = load_dataset("AutoModelForSequenceClassification")
    encoded_splits = raw_splits.map(tokenize_function, batched=True)

    nltk.download("punkt")

    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": encoded_splits["train"],
        "eval_dataset": encoded_splits["test"],
        "compute_metrics": compute_metrics,
    }
    trainer = Trainer(**trainer_kwargs)

    # Train.
    trainer.train()

    # Save, publish, then evaluate.
    tokenizer.save_pretrained(REPOSITORY_ID)
    trainer.create_model_card()
    trainer.push_to_hub()
    final_metrics = trainer.evaluate()
    print(final_metrics)


if __name__ == "__main__":
    train()

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5945 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/Data/rishav_2311mc12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
