# Introduction

This notebook focuses on the fine-tuning of the Mistralv0.2 7B model for the task of sentiment analysis.

We leverage a specific Kaggle dataset, which comprises various texts labeled with their corresponding sentiments.

Based on this [huggingface blog post](https://huggingface.co/blog/lmassaron/fine-tuning-llms-on-kaggle-notebooks) and [kaggle notebook](https://www.kaggle.com/code/lucamassaron/fine-tune-mistral-v0-2-for-sentiment-analysis) using this [kaggle sentiment-analysis dataset](https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news).

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
%env HF_HUB_ENABLE_HF_TRANSFER=True

env: HF_HUB_ENABLE_HF_TRANSFER=True


In [7]:
!pip install -q -U peft transformers datasets bitsandbytes trl accelerate wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

# Prepare Dataset

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"killianryan","key":"42d9a1571e2cf8e24d81a5324b36640e"}'}

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
!ls -lha kaggle.json

# Install the Kaggle library
!pip install -q kaggle

# Create a Kaggle directory and move the kaggle.json file there
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Change the permission of the file
!chmod 600 ~/.kaggle/kaggle.json

import kaggle

!kaggle datasets download -d ankurzing/sentiment-analysis-for-financial-news

!ls -lha /content

!unzip sentiment-analysis-for-financial-news.zip

-rw-r--r-- 1 root root 67 Mar 13 14:25 kaggle.json
Downloading sentiment-analysis-for-financial-news.zip to /content
  0% 0.00/903k [00:00<?, ?B/s]
100% 903k/903k [00:00<00:00, 147MB/s]
total 924K
drwxr-xr-x 1 root root 4.0K Mar 13 14:25 .
drwxr-xr-x 1 root root 4.0K Mar 13 14:23 ..
drwxr-xr-x 4 root root 4.0K Mar 11 13:24 .config
-rw-r--r-- 1 root root   67 Mar 13 14:25 kaggle.json
drwxr-xr-x 1 root root 4.0K Mar 11 13:25 sample_data
-rw-r--r-- 1 root root 904K May 27  2020 sentiment-analysis-for-financial-news.zip
Archive:  sentiment-analysis-for-financial-news.zip
  inflating: FinancialPhraseBank/License.txt  
  inflating: FinancialPhraseBank/README.txt  
  inflating: FinancialPhraseBank/Sentences_50Agree.txt  
  inflating: FinancialPhraseBank/Sentences_66Agree.txt  
  inflating: FinancialPhraseBank/Sentences_75Agree.txt  
  inflating: FinancialPhraseBank/Sentences_AllAgree.txt  
  inflating: all-data.csv            


In [9]:
from datasets import Dataset

In [10]:
filename = "all-data.csv"

df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment],
                                    train_size=300,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

# def generate_prompt(data_point):
#     return f"""
#             Analyze the sentiment of the news headline enclosed in square brackets,
#             determine if it is positive, neutral, or negative, and return the answer as
#             the corresponding sentiment label "positive" or "neutral" or "negative".

#             [{data_point["text"]}] = {data_point["sentiment"]}
#             """.strip()

# def generate_test_prompt(data_point):
#     return f"""
#             Analyze the sentiment of the news headline enclosed in square brackets,
#             determine if it is positive, neutral, or negative, and return the answer as
#             the corresponding sentiment label "positive" or "neutral" or "negative".

#             [{data_point["text"]}] = """.strip()

#  prompts to be used by Mistral v0.2
def generate_prompt(data_point):
    return f"""
            [INST]Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative"[/INST]

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            [INST]Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative"[/INST]

            [{data_point["text"]}] = """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [11]:
X_train.head()

Unnamed: 0,text
0,[INST]Analyze the sentiment of the news headli...
1,[INST]Analyze the sentiment of the news headli...
2,[INST]Analyze the sentiment of the news headli...
3,[INST]Analyze the sentiment of the news headli...
4,[INST]Analyze the sentiment of the news headli...


# Evaluation code

In [12]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [29]:
def predict(test, model, tokenizer):

        # Set pad_token_id to eos_token_id if pad_token is not defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 1,
                        temperature = 0.1,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

# from tqdm.auto import tqdm

# def predict(test, model, tokenizer):
#     y_pred = []
#     pipe = pipeline(task="text-generation",
#                     model=model,
#                     tokenizer=tokenizer,
#                     max_new_tokens=1,
#                     temperature=0.1,
#                    )
#     for prompt in tqdm(test["text"]):
#         result = pipe(prompt)
#         answer = result[0]['generated_text'].split("=")[-1].strip()
#         if "positive" in answer:
#             y_pred.append("positive")
#         elif "negative" in answer:
#             y_pred.append("negative")
#         elif "neutral" in answer:
#             y_pred.append("neutral")
#         else:
#             y_pred.append("none")
#     return y_pred

In [14]:


import torch
import torch.nn as nn
import transformers
from datasets import Dataset
import bitsandbytes as bnb
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda


In [16]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# tokenizer = AutoTokenizer.from_pretrained(model_name,
#                                           trust_remote_code=True,
#                                          )
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          padding_side="left",
                                          add_eos_token=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

# Evaluate Performance of foundation model

In [20]:
y_pred = predict(test, model, tokenizer)

  0%|          | 0/900 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [21]:
evaluate(y_true, y_pred)

Accuracy: 0.498
Accuracy for label 0: 0.493
Accuracy for label 1: 0.980
Accuracy for label 2: 0.020

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.49      0.65       300
           1       0.40      0.98      0.57       300
           2       1.00      0.02      0.04       300

    accuracy                           0.50       900
   macro avg       0.79      0.50      0.42       900
weighted avg       0.79      0.50      0.42       900


Confusion Matrix:
[[148 152   0]
 [  6 294   0]
 [  0 294   6]]


# Fine-tune

In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
import wandb
wandb.login()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
import os
os.environ["WANDB_PROJECT"] = "mistralv0.2-7b-sentiment-analysis"


In [25]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    push_to_hub=True,  # Enable saving to Hugging Face Hub
    hub_model_id="RubyDiamond/mistralv0.2-7b-sentiment-analysis",  # Replace with your HF username and desired model name
    hub_strategy="end",
    report_to="wandb",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
# Train model
trainer.train()



[34m[1mwandb[0m: Currently logged in as: [33mkillian_ryan[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
0,0.8368,0.816684


TrainOutput(global_step=112, training_loss=1.0872302651405334, metrics={'train_runtime': 571.6184, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.196, 'total_flos': 4122621677002752.0, 'train_loss': 1.0872302651405334, 'epoch': 1.0})

In [27]:
# Save trained model
trainer.model.save_pretrained("trained-model")

In [30]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/900 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Accuracy: 0.811
Accuracy for label 0: 0.977
Accuracy for label 1: 0.560
Accuracy for label 2: 0.897

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       300
           1       0.87      0.56      0.68       300
           2       0.70      0.90      0.79       300

    accuracy                           0.81       900
   macro avg       0.83      0.81      0.80       900
weighted avg       0.83      0.81      0.80       900


Confusion Matrix:
[[293   3   4]
 [ 23 168 109]
 [  9  22 269]]


In [31]:
!pip install huggingface_hub


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [32]:
import os

os.environ['LC_ALL'] = 'C.UTF-8'
os.environ['LANG'] = 'C.UTF-8'


In [34]:
!export LC_ALL=C.UTF-8
!export LANG=C.UTF-8


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [35]:
import subprocess

# Use subprocess to call pip install
subprocess.run(["pip", "install", "huggingface_hub"], check=True)



CompletedProcess(args=['pip', 'install', 'huggingface_hub'], returncode=0)

In [36]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
!cd trained-model && git init && git lfs track "*.bin" && git add . && git commit -m "Add model" && git remote add origin https://huggingface.co/RubyDiamond/mistralv0.2-7b-sentiment-analysis && git push -u origin master


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [38]:
import subprocess

# Define your command as a multi-line string
cmd = """
cd trained-model && \
git init && \
git lfs track "*.bin" && \
git add . && \
git commit -m "Add model" && \
git remote add origin https://huggingface.co/RubyDiamond/mistralv0.2-7b-sentiment-analysis && \
git push -u origin master
"""

# Execute the command
process = subprocess.run(cmd, shell=True, check=True, text=True, executable='/bin/bash')


CalledProcessError: Command '
cd trained-model && git init && git lfs track "*.bin" && git add . && git commit -m "Add model" && git remote add origin https://huggingface.co/RubyDiamond/mistralv0.2-7b-sentiment-analysis && git push -u origin master
' returned non-zero exit status 128.

In [39]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [40]:
model_directory = '/content/trained-model'
drive_path = '/content/drive/My Drive/Colab Notebooks/mistralv0.2-7b-sentiment-analysis'
!cp -r "$model_directory" "$drive_path"


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [41]:
# Form the command as a list of arguments
cmd = ['cp', '-r', model_directory, drive_path]

# Execute the command
subprocess.run(cmd, check=True)


CompletedProcess(args=['cp', '-r', '/content/trained-model', '/content/drive/My Drive/Colab Notebooks/mistralv0.2-7b-sentiment-analysis'], returncode=0)