[Source Link](https://github.com/VanekPetr/flan-t5-text-classifier/tree/main/classifier/AutoModelForSeq2SeqLM)

[HF upload link](https://huggingface.co/steve1989/FlanT5_financial_Sentiment_finetuned/tree/main/)

[FlanT5 Theory](https://www.datacamp.com/tutorial/flan-t5-tutorial)

[Evaluate](https://medium.com/nerd-for-tech/fine-tuning-pretrained-bert-for-sentiment-classification-using-transformers-in-python-931ed142e37)

[Dataset - FPB](https://huggingface.co/datasets/financial_phrasebank)

**Applications of Flan-T5**

chat and dialogue summarization,

text classification, and

Fast Healthcare Interoperability Resources (FHIR).

# Install Libraries

In [1]:
!pip install datasets
# !pip install datasets
!pip install evaluate
# !pip install accelerate
# !pip install peft
# !pip install -q -U bitsandbytes scipy einops accelerate trl
# !pip install -i https://pypi.org/simple/bitsandbytes

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

# Import Libraries

In [3]:
from typing import List, Tuple

import evaluate
import nltk
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfFolder
from nltk.tokenize import sent_tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback, IntervalStrategy
)
import keras
# from classifier.data_loader import load_dataset
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score
import datasets
import torch
# from transformers import BitsAndBytesConfig
# import bitsandbytes as bnb
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftConfig

# Base Model ID

In [4]:
MODEL_ID = "google/flan-t5-base"
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-FPB-finetuned"

# Load dataset
# dataset = load_dataset()


# LOAD DATASET

In [5]:
dataset = load_dataset("financial_phrasebank",'sentences_allagree')
# df = dataset['train'].to_pandas()
# df = df.dropna(subset=['sentence', 'label']) ## drop missing values
# df_train, df_test, = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=42)
# # df_train, df_val = train_test_split(df_train, stratify=df_train['label'],test_size=0.1, random_state=42)
# print(df_train.shape, df_test.shape)
# print(len(df_train['sentence']))

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [None]:
df_train = datasets.Dataset.from_pandas(df_train)
df_test = datasets.Dataset.from_pandas(df_test)
type(df_test)

# LOAD TOKENIZER AND TOKENIZE

In [None]:
# Load tokenizer of FLAN-t5
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([df_train, df_test]).map(
    lambda x: tokenizer(x["sentence"], truncation=True),
    batched=True,
    remove_columns=["sentence", "label"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([df_train, df_test]).map(
    lambda x: tokenizer(x["sentence"], truncation=True),
    batched=True,
    remove_columns=["sentence", "label"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Max source length: 172


Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Max target length: 172


# Load model from the hub

In [None]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID,device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,)

model.config.use_cache = False

# Prepare and preprocess the model for PeFT training

In [None]:
# from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v", "k", "o"], # you have to know the target modules, it varies from model to model
    lora_dropout=0.05,
    bias="none",
    # task_type="CAUSAL_LM"
    task_type="SEQ_2_SEQ_LM"
)

# SEQ_CLS = "SEQ_CLS"
#     SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
#     CAUSAL_LM = "CAUSAL_LM"
#     TOKEN_CLS = "TOKEN_CLS"
#     QUESTION_ANS = "QUESTION_ANS"
#     FEATURE_EXTRACTION = "FEATURE_EXTRACTION"


model = get_peft_model(model, config) # Wrap the base model with get_peft_model() to get a trainable PeftModel

# Trainable parameters

In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 1769472 | total: 249347328 | Percentage: 0.7096%


**Error:pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 599 but got length 1500**

[Solution](https://github.com/huggingface/datasets/issues/1817#issuecomment-774066254)

# DATA PREPROCESSING AND TOKENIZATION

In [None]:
# Load tokenizer of FLAN-t5
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([df_train, df_test]).map(
    lambda x: tokenizer(x["sentence"], truncation=True),
    batched=True,
    remove_columns=["sentence", "label"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([df_train, df_test]).map(
    lambda x: tokenizer(x["sentence"], truncation=True),
    batched=True,
    remove_columns=["sentence", "label"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


In [None]:
def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Preprocess the dataset."""

    # add prefix to the input for t5
    inputs = [item for item in sample["sentence"]]

    # tokenize inputs
    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )
    print(len(model_inputs["input_ids"]))
    print("labels len ",len(sample["label"]))
    # Tokenize targets with the `text_target` keyword argument

    inputs_labels = [str(item) for item in sample["label"]]
    labels = tokenizer(
        text_target=inputs_labels,
        max_length=max_target_length,
        padding=padding,
        truncation=True,
        # return_overflowing_tokens=True
    )
    print("after tokenize  ",len(labels["input_ids"]), len(labels["attention_mask"]))
    # sample_map = labels.pop("overflow_to_sample_mapping")
    # for key, values in sample.items():
    #     labels[key] = [values[i] for i in sample_map]
    # return labels

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    # if padding == "max_length":
    #     labels["input_ids"] = [
    #         (label if label != tokenizer.pad_token_id else -100)
    #         for label in labels["input_ids"]
    #     ]

    if padding == "max_length":
        labels["input_ids"] = [
            (label if label != tokenizer.pad_token_id else -100)
            for label in labels["input_ids"]
        ]

    # concatenated_examples = {k: sum(sample[k], []) for k in sample.keys()}
    print("After padding  ",len(labels["input_ids"]))
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# def preprocess_function(sample, padding: str = "max_length"):
#     # Encode the sentence and label
#     sentence = tokenizer(sample["sentence"], max_length=max_source_length, padding=padding,truncation=True,)
#     labels = tokenizer(
#         text_target=sample["label"],max_length=max_target_length,
#         padding=padding,
#         truncation=True,
#     )

#     # Return the encoded inputs
#     return {"input_ids": sentence["input_ids"], "labels": labels["input_ids"]}

def postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    """helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

# CALL PREPROCESSING FUNCTION

In [None]:
tokenized_dataset = dataset.map(
        preprocess_function, batched=True, remove_columns=["sentence","label"]
    )
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
# print(len(tokenized_dataset["train"]["labels"]))
# print(tokenized_dataset)
# type(tokenized_dataset["train"])

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


# TOKENIZE TEST SET

In [None]:
tokenized_testset = df_test.map(preprocess_function, batched=True, remove_columns=["sentence","label"])
tokenized_testset

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

227
labels len  227
after tokenize   227 227
After padding   227


Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 227
})

# SET TRAINING ARGUMENTS

In [None]:
# Metric
metric = evaluate.load("f1")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, average="macro"
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# def compute_metrics(p):
#     pred, labels = p
#     pred = np.argmax(pred, axis=1)
#     accuracy = accuracy_score(y_true=labels, y_pred=pred)
#     recall = recall_score(y_true=labels, y_pred=pred)
#     precision = precision_score(y_true=labels, y_pred=pred)
#     f1 = f1_score(y_true=labels, y_pred=pred)
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=10,
    logging_dir=f"{REPOSITORY_ID}/logs",  # logging & evaluation strategies
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="tensorboard",
    # push_to_hub=True,
    # hub_strategy="every_save",
    # hub_model_id=REPOSITORY_ID,
    # hub_token=HfFolder.get_token(),
)

In [None]:
# # NOT NEEDED
# # Define training args
# training_args = Seq2SeqTrainingArguments(
#     output_dir=REPOSITORY_ID,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     fp16=False,  # Overflows with fp16
#     learning_rate=3e-4,
#     num_train_epochs=10,
#     logging_dir=f"{REPOSITORY_ID}/logs",  # logging & evaluation strategies
#     logging_strategy="epoch",
#     evaluation_strategy=IntervalStrategy.STEPS, #'no','epochs'
#     eval_steps = 50, # Evaluation and Save happens every 50 steps
#     save_strategy=IntervalStrategy.STEPS,
#     save_total_limit=2,
#     metric_for_best_model = 'f1',
#     load_best_model_at_end=True,
#     report_to="tensorboard",
#     # push_to_hub=True,
#     # hub_strategy="every_save",
#     # hub_model_id=REPOSITORY_ID,
#     # hub_token=HfFolder.get_token(),
# )

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Train the model

In [None]:
# callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)
# nltk.download("punkt")

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8,
    )

In [None]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_testset,
        compute_metrics=compute_metrics,
        # callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

    )

# TRAIN
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,F1,Gen Len
1,4.7839,0.023209,25.4314,2.0
2,0.0294,0.014425,25.4314,2.0
3,0.0205,0.011862,25.4314,2.0
4,0.0166,0.010382,41.0942,2.0
5,0.0145,0.009099,50.8957,2.004405
6,0.0131,0.008588,71.0816,2.088106
7,0.0122,0.007582,82.3202,2.132159
8,0.011,0.00704,84.9366,2.167401
9,0.0105,0.006402,88.6798,2.167401
10,0.0101,0.006366,90.9702,2.162996




TrainOutput(global_step=2830, training_loss=0.49219215935616106, metrics={'train_runtime': 1210.4411, 'train_samples_per_second': 18.704, 'train_steps_per_second': 2.338, 'total_flos': 5371428348887040.0, 'train_loss': 0.49219215935616106, 'epoch': 10.0})

# Save the adapter and Merge it with the base model(for saving within Colab)

In [None]:
#save to google drive under folder FlanT5_Sentiment_analysis_FPB
trainer.save_model("FlanT5_Sentiment_analysis_FPB")
adapter_model = model

print("Lora Adapter saved")

Lora Adapter saved


One thing to keep in mind is that you can’t merge the 8 bit/4 bit base model with Lora (as of right now) so you have to reload the model with full precision.

In [None]:
#To merge base model with fine tuned model
repo_id = "google/flan-t5-base"
use_ram_optimized_load=False

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    repo_id,
    device_map='auto',
    trust_remote_code=True,
)

In [None]:

base_model.config.use_cache = False

# Load Lora adapter
model = PeftModel.from_pretrained(
    base_model,
    "FlanT5_Sentiment_analysis_FPB",
    )



In [None]:
merged_model = model.merge_and_unload()

In [None]:
trainer.evaluate()

# EVALUATE

# Before Fine tuning

In [None]:
MODEL_ID = "google/flan-t5-base"

In [None]:
before_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
before_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
torch.cuda.is_available()

False

In [None]:
evaluate_on_test(before_model,before_tokenizer)

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

                                                                                                                                                                                                                                                                                           precision    recall  f1-score   support

                                                                                                                                                                                                                  - Operating profit rose by 26.9 % to EUR 105.8 million ( 83.4 ) million       0.00      0.00      0.00       0.0
                                                                                                                                                                   - Sales during the period were 31.6 million EUR 36.6 million EUR 1-6/2007 and profit before taxes was 0.2 1.3 million.       0.00      0.00      0.00       0.0
                              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# After fine tuning

In [None]:
!pip install tqdm



In [None]:
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

In [None]:
dataset = load_dataset("financial_phrasebank",'sentences_allagree')
df = dataset['train'].to_pandas()
df = df.dropna(subset=['sentence', 'label']) ## drop missing values
df_train, df_test, = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=42)

df_train = datasets.Dataset.from_pandas(df_train)
df_test = datasets.Dataset.from_pandas(df_test)

In [None]:
df_test

Dataset({
    features: ['sentence', 'label', '__index_level_0__'],
    num_rows: 227
})

In [None]:
# Load model and tokenizer from the hub
after_tokenizer = AutoTokenizer.from_pretrained("steve1989/FlanT5_financial_Sentiment_finetuned")
after_model = AutoModelForSeq2SeqLM.from_pretrained("steve1989/FlanT5_financial_Sentiment_finetuned")
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
max_source_length = 172
def classify(texts_to_classify: str,model,tokenizer):
    """Classify a batch of texts using the model."""
    inputs = tokenizer(
        texts_to_classify,
        padding="max_length",
        truncation=True,
        max_length=max_source_length,
        return_tensors="pt",
    )
    # inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )

    predictions = [
        tokenizer.decode(output, skip_special_tokens=True) for output in outputs
    ]
    return predictions

In [None]:
def evaluate_on_test(model,tokenizer):
    """Evaluate the model on the test dataset."""
    predictions_list, labels_list = [], []

    batch_size = 16  # Adjust batch size based GPU capacity
    num_batches = len(df_test) // batch_size + (
        0 if len(df_test) % batch_size == 0 else 1
    )
    progress_bar = tqdm(total=num_batches, desc="Evaluating")

    for i in range(0, len(df_test), batch_size):
        batch_texts = df_test["sentence"][i : i + batch_size]
        batch_labels = df_test["label"][i : i + batch_size]

        batch_predictions = classify(batch_texts,model,tokenizer)

        predictions_list.extend(batch_predictions)
        labels_list.extend([str(label) for label in batch_labels])

        progress_bar.update(1)

    progress_bar.close()
    report = classification_report(labels_list, predictions_list)
    print(report)

In [None]:
evaluate_on_test(after_model,after_tokenizer)

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.79      1.00      0.88        30
           1       0.94      0.98      0.96       140
           2       0.95      0.74      0.83        57

    accuracy                           0.92       227
   macro avg       0.90      0.91      0.89       227
weighted avg       0.93      0.92      0.92       227



# SAVE THE MODEL

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
 # SAVE
# tokenizer.save_pretrained(REPOSITORY_ID)
trainer.create_model_card()
merged_model.push_to_hub("steve1989/FlanT5_financial_Sentiment_finetuned")
tokenizer.push_to_hub("steve1989/FlanT5_financial_Sentiment_finetuned")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/steve1989/FlanT5_financial_Sentiment_finetuned/commit/e94f24bb1377d54f2c1ae6e81da9b8c65045a950', commit_message='Upload tokenizer', commit_description='', oid='e94f24bb1377d54f2c1ae6e81da9b8c65045a950', pr_url=None, pr_revision=None, pr_num=None)

# Start TensorBoard within the notebook

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

# [Make predictions on TEST DATA](https://medium.com/nerd-for-tech/fine-tuning-pretrained-bert-for-sentiment-classification-using-transformers-in-python-931ed142e37)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import datasets
from typing import List, Tuple
from datasets import Dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("steve1989/FlanT5_financial_Sentiment_finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("steve1989/FlanT5_financial_Sentiment_finetuned")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
dataset = load_dataset("financial_phrasebank",'sentences_allagree')
df = dataset['train'].to_pandas()
df = df.dropna(subset=['sentence', 'label']) ## drop missing values
df_train, df_test, = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=42)

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [None]:
df_test = datasets.Dataset.from_pandas(df_test)

In [None]:
#Not Working for seq2seq LM
# preds = model.predict(test_dataset=tokenized_testset)

In [None]:
torch.cuda.is_available()

False

In [None]:
def classify(texts_to_classify: str):
    """Classify a batch of texts using the model."""
    inputs = tokenizer(
        texts_to_classify,
        padding="max_length",
        truncation=True,
        max_length=max_source_length,
        return_tensors="pt",
    )
    # inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )

    predictions = [
        tokenizer.decode(output, skip_special_tokens=True) for output in outputs
    ]
    return predictions

In [None]:
classify("The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported.")

['0']

In [None]:
classify("With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .")

['1']

In [None]:
classify("According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .")

['1']

# REFT Fine tuning

In [None]:
import torch, transformers, pyreft

In [None]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )


In [None]:
from datasets import load_dataset

dataset = load_dataset("steve1989/financial_news_headlines")

In [None]:
MODEL_ID = "google/flan-t5-base"
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-FPB-finetuned"

# Load dataset
# dataset = load_dataset()


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID,device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,)

model.config.use_cache = False

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Load tokenizer of FLAN-t5
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
prompt_no_input_template=[]
for i in ["hello","world","steve"]:
  prompt_no_input_template.append(f"""<s>[INST] <<SYS>>
What is the sentiment of the input. Choose either positive or negative or neutral. Input:{i} Answer:
<</SYS>>

%s [/INST]
""".format(i))

In [None]:
for batch in dataset:
  prompt_no_input_template=[]
  for example in dataset[batch]:
    i=example['Headlines']
    prompt_no_input_template.append(f"""<s>[INST] <<SYS>>
What is the sentiment of the input. Choose either positive or negative or neutral. Input:{i} Answer:
<</SYS>>

%s [/INST]
""".format(i))
  dataset[batch] = dataset[batch].add_column('prompt',prompt_no_input_template)


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 13031
    })
    validation: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 2793
    })
    test: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 2793
    })
})

In [None]:
dataset['train'] = dataset['train'].remove_columns(['labels'])
dataset['test'] = dataset['test'].remove_columns(['labels'])

In [None]:
train_split = dataset['train'].train_test_split(test_size=0.2,seed=42)
test_split = dataset['train'].train_test_split(test_size=0.2,seed=42)

In [None]:
train_split

DatasetDict({
    train: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 10424
    })
    test: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 2607
    })
})

In [None]:
train_split_subset = train_split['test'].train_test_split(test_size=0.2,seed=42)
train_split_subset

DatasetDict({
    train: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 2085
    })
    test: Dataset({
        features: ['Headlines', 'sentiment_label', '__index_level_0__', 'prompt'],
        num_rows: 522
    })
})

In [None]:
import torch, transformers, pyreft

In [None]:
MODEL_ID = "google/flan-t5-base"
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-FPB-finetuned"

# Load dataset
# dataset = load_dataset()


In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID,device_map='auto',
#     quantization_config=bnb_config,
#     trust_remote_code=True,)

# model.config.use_cache = False
model_id = "internlm/internlm2-7b"
internlm7b_model = transformers.AutoModelForCausalLM.from_pretrained(model_id,quantization_config=bnb_config,
                                                        device_map="auto", trust_remote_code=True,
                                                        low_cpu_mem_usage=True,)

internlm7b_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

configuration_internlm2.py:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/internlm/internlm2-7b:
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internlm2.py:   0%|          | 0.00/60.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/internlm/internlm2-7b:
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.51G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

tokenization_internlm2_fast.py:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.81k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/internlm/internlm2-7b:
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/internlm/internlm2-7b:
- tokenization_internlm2_fast.py
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

In [None]:
# get reft model
reft_config = pyreft.ReftConfig(representations={
    "layer": 15, "component": "block_output",
    # alternatively, you can specify as string component access,
    # "component": "model.layers[0].output",
    "low_rank_dimension": 4,
    "intervention": pyreft.LoreftIntervention(embed_dim=model.config.hidden_size,
    low_rank_dimension=4)})
reft_model = pyreft.get_reft_model(model, reft_config)
reft_model.set_device("cuda")
reft_model.print_trainable_parameters()

AttributeError: 'InternLM2ForCausalLM' object has no attribute ''

In [None]:
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path, torch_dtype=torch.bfloat16, device_map="auto")

# get tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048,
    padding_side="right", use_fast=False)
tokenizer.pad_token = tokenizer.unk_token

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:
dataset_split = dataset.train_test_split(test_size=0.2,seed=42)

In [None]:
data_module = pyreft.make_last_position_supervised_data_module(
    tokenizer, model, [prompt_no_input_template % e["prompt"] for e in train_split_subset['test']],
    [e["sentiment_label"] for e in train_split_subset['test']])