In [None]:
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install bitsandbytes

In [None]:
!pip install datasets

In [None]:
!pip install peft
!pip install evaluate
!pip install wandb

In [4]:
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

import tensorflow as tf
import datasets
from datasets import Dataset

import os
from google.colab import drive
import functools

In [5]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

In [6]:
# Mount Google Drive and switch into the folder that holds the tweet data.
drive.mount('/content/drive', force_remount=True)
# Use the absolute path under the mount point: a relative chdir only works on
# the first execution, because a re-run would resolve it against the
# already-changed working directory and fail.
os.chdir("/content/drive/My Drive/tweet-data/")


# Read all training data (HDF5 key 'train')
train_big_df = pd.read_hdf('train-data.h5', 'train')

# Read all testing data (HDF5 key 'test')
test_big_df = pd.read_hdf('test-data.h5', 'test')

Mounted at /content/drive


In [7]:
# Upper bound on tokens per example; passed as `max_length` when tokenizing.
MAX_LEN = 512
# Hugging Face Hub id used to load both the tokenizer and the model.
mistral_checkpoint = 'mistralai/Mistral-7B-v0.1'

In [8]:
# Cast the label column of both frames to integer class ids.
for frame in (train_big_df, test_big_df):
    frame['labels'] = frame['labels'].astype(int)

In [9]:
# Wrap each DataFrame as a HuggingFace Dataset, then bundle both in a DatasetDict.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in ((train_big_df, "train"), (test_big_df, "test"))
)
dataset = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})

In [10]:
# Carve a validation set out of the training data (80% train / 20% validation,
# fixed seed so the split is reproducible).
data = dataset['train'].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part "test"; store it under "val" instead
# and attach the real test set from the original dataset under "test".
held_out = data.pop("test")
data.update({'val': held_out, 'test': dataset['test']})

In [11]:
# Per-class loss weights: n_samples / (2 * class_count), so the rarer class
# gets the larger weight. The label counts are computed once and reused
# instead of converting the dataset to pandas for every term.
train_label_counts = data['train'].to_pandas().labels.value_counts()
n_train = len(data['train'])
pos_weights = n_train / (2 * train_label_counts[1])
neg_weights = n_train / (2 * train_label_counts[0])
pos_weights, neg_weights

(1.4830363680742007, 0.7543140906269398)

In [12]:
# Longest training text, measured in characters and in whitespace-separated
# words. The text column is materialised once and shared by both statistics.
train_text = data['train'].to_pandas()['text']
# Number of characters
max_char = train_text.str.len().max()
# Number of words
max_words = train_text.str.split().str.len().max()

print(f"Max characters: {max_char}")
print(f"Max words: {max_words}")

Max characters: 283
Max words: 56


In [13]:
# Show the first raw training example to check the available columns.
data['train'][0]

{'tweet_id': 903420789361344513,
 'text': 'please see message below we r look 4 donations please bring them to zysa this weekend our friends need help',
 'labels': 0,
 '__index_level_0__': 24268}

In [14]:
# Mistral 7B tokenizer, tokenized datasets and padding collator
from transformers import AutoTokenizer, DataCollatorWithPadding

# Raw columns dropped from the datasets once the text has been tokenized
col_to_delete = ['tweet_id','__index_level_0__','text']

mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_checkpoint, add_prefix_space=True)
# Reuse the end-of-sequence token as the padding token
mistral_tokenizer.pad_token_id = mistral_tokenizer.eos_token_id
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token


def mistral_preprocessing_function(examples):
    """Tokenize a batch of examples.

    Reads the 'text' field of `examples` and returns the tokenizer output,
    truncated to at most MAX_LEN tokens. No padding is applied here.
    """
    texts = examples['text']
    return mistral_tokenizer(texts, truncation=True, max_length=MAX_LEN)


# Tokenize every split, drop the raw columns, and rename "labels" -> "label"
mistral_tokenized_datasets = (
    data
    .map(mistral_preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("labels", "label")
)
# Return torch tensors when indexing the datasets
mistral_tokenized_datasets.set_format("torch")

# Collator that pads each batch to the longest example in that batch
mistral_data_collator = DataCollatorWithPadding(tokenizer=mistral_tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Map:   0%|          | 0/24304 [00:00<?, ? examples/s]

Map:   0%|          | 0/6077 [00:00<?, ? examples/s]

Map:   0%|          | 0/8606 [00:00<?, ? examples/s]

In [15]:
# Show the first tokenized training example (label, input_ids, attention_mask).
mistral_tokenized_datasets['train'][0]

{'label': tensor(0),
 'input_ids': tensor([    1,  4665,  1032,  2928,  3624,   478,   408,   913, 28705, 28781,
           949,   697,  4665,  2968,   706,   298,   686,   846, 28708,   456,
          9071,   813,  3282,   927,  1316]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1])}

In [None]:
# Quantization config (4-bit loading through bitsandbytes).
# NOTE(review): the visible model-loading cell does not pass this object to
# `from_pretrained`, so it appears to be unused — confirm whether that is intended.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # NF4: information-theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # double quantization: quantize the already-quantized weights again
    bnb_4bit_compute_dtype = torch.bfloat16 # run computations in bfloat16
)

In [16]:
import torch
from transformers import AutoModelForSequenceClassification

# Load the Mistral checkpoint with a 2-label sequence-classification head,
# letting `device_map="auto"` decide where the weights are placed.
mistral_model = AutoModelForSequenceClassification.from_pretrained(
    mistral_checkpoint,
    device_map="auto",
    num_labels=2,
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Use EOS as the padding id in the model config, matching the tokenizer setup
# (which also pads with its EOS token).
mistral_model.config.pad_token_id = mistral_model.config.eos_token_id

In [18]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA setup for sequence classification: rank-2 adapters attached to the
# "q_proj" and "v_proj" modules only, no bias training.
mistral_peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=2,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the base model with the adapters and report how many parameters train
mistral_model = get_peft_model(mistral_model, mistral_peft_config)
mistral_model.print_trainable_parameters()

trainable params: 860,160 || all params: 7,111,528,448 || trainable%: 0.012095290151611583


In [19]:
import functools

import evaluate
import numpy as np


@functools.lru_cache(maxsize=None)
def _load_classification_metrics():
    """Load the four HF `evaluate` metrics once and reuse them on later calls."""
    return {
        "precision": evaluate.load("precision"),
        "recall": evaluate.load("recall"),
        "f1": evaluate.load("f1"),
        "accuracy": evaluate.load("accuracy"),
    }


def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: tuple of (logits, labels) as handed over by the Trainer.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy";
        the Trainer expects metric names mapped to their scores.
    """
    # Metrics are cached so they are not re-loaded on every evaluation.
    metrics = _load_classification_metrics()

    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis
    predictions = np.argmax(logits, axis=-1)

    scores = {
        name: metric.compute(predictions=predictions, references=labels)[name]
        for name, metric in metrics.items()
    }
    return {
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1"],
        'accuracy': scores["accuracy"],
    }

In [20]:
from transformers import Trainer

class WeightedCELossTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The weights come from the notebook-level `neg_weights` (class 0) and
    `pos_weights` (class 1) values.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on `inputs` and return the weighted cross-entropy loss.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: when True, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: accepted because recent Trainer versions pass
                it to `compute_loss`; it is not used here.
        """
        labels = inputs.pop("labels")
        # Get model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the class weights on the logits' own device and dtype so the
        # loss works wherever the model output ends up.
        class_weights = torch.tensor(
            [neg_weights, pos_weights], device=logits.device, dtype=logits.dtype
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [21]:
from transformers import TrainingArguments, Trainer

# Make sure the (PEFT-wrapped) model sits on the GPU before training
mistral_model = mistral_model.cuda()

# Training hyper-parameters
lr = 1e-4
batch_size = 8
num_epochs = 5

training_args = TrainingArguments(
    output_dir="mistral-lora-token-classification",
    learning_rate=lr,
    # NOTE(review): with the plain "constant" scheduler the warmup_ratio below
    # appears to have no effect ("constant_with_warmup" would use it) — confirm intent.
    lr_scheduler_type= "constant",
    warmup_ratio= 0.1,
    max_grad_norm= 0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
)


# Trainer with the class-weighted loss, validating on the "val" split
mistral_trainer = WeightedCELossTrainer(
    model=mistral_model,
    args=training_args,
    train_dataset=mistral_tokenized_datasets['train'],
    eval_dataset=mistral_tokenized_datasets["val"],
    data_collator=mistral_data_collator,
    compute_metrics=compute_metrics
)


In [22]:
# Fine-tune the model; per `training_args`, evaluation and checkpointing
# happen at the end of every epoch and progress is reported to wandb.
mistral_trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.9744,0.823349,0.644258,0.767398,0.700457,0.773408
2,0.7704,0.727251,0.684388,0.773117,0.726052,0.798585
3,0.6851,0.742827,0.720791,0.746902,0.733614,0.812737
4,0.6186,0.720037,0.699232,0.78122,0.737956,0.808458
5,0.5857,0.821761,0.749083,0.681602,0.713751,0.811256


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



TrainOutput(global_step=15190, training_loss=0.7590269032240384, metrics={'train_runtime': 1975.2504, 'train_samples_per_second': 61.521, 'train_steps_per_second': 7.69, 'total_flos': 2.2144576661028864e+17, 'train_loss': 0.7590269032240384, 'epoch': 5.0})

In [23]:
# Score the trained model on the trainer's eval_dataset (the "val" split).
mistral_trainer.evaluate()

{'eval_loss': 0.7200365662574768,
 'eval_precision': 0.6992320819112628,
 'eval_recall': 0.7812202097235462,
 'eval_f1-score': 0.7379558757316523,
 'eval_accuracy': 0.8084581207832813,
 'eval_runtime': 73.6441,
 'eval_samples_per_second': 82.518,
 'eval_steps_per_second': 10.32,
 'epoch': 5.0}