## **1. Install and import libraries**


In [None]:
# Adapted from Trinh Xuan Khai - https://huggingface.co/trinhxuankhai

In [1]:
!pip install datasets evaluate accelerate
!pip install causal-conv1d>=1.1.0
!pip install mamba-ssm



In [None]:

!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link



In [2]:
import os
import random
import json
import torch
import torch.nn as nn
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer, TrainingArguments

  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  from .autonotebook import tqdm as notebook_tqdm


Log in to the Hugging Face Hub so the trained model can be pushed.

In [None]:
# Interactive Hugging Face Hub login (renders a widget in the notebook); the
# credentials are needed by the later `trainer.push_to_hub(...)` cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## **2. Download dataset**


In [3]:
# Load the IMDb movie-review dataset from the Hugging Face Hub.
imdb = load_dataset("imdb")

## **3. Explore the dataset (EDA)**


There are two fields in this dataset:

* text: the movie review text.
* label: a value that is either 0 for a negative review or 1 for a positive review.

In [4]:
# Show the first test example to inspect the `text` / `label` fields.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

## **4. Build Custom Mamba Model for Text Classification**


In [5]:
@dataclass
class MambaConfig:
    """Hyper-parameters describing a Mamba backbone.

    The field names and defaults are passed straight to the model
    constructor; the two helper methods expose the values as a plain dict
    or as a JSON document.
    """

    d_model: int = 2560
    n_layer: int = 64
    vocab_size: int = 50277
    # A fresh dict per instance, so instances never share SSM settings.
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    pad_vocab_size_multiple: int = 8

    def to_dict(self):
        """Return the configuration as a (deep-copied) plain dictionary."""
        return asdict(self)

    def to_json_string(self):
        """Return the configuration serialized as a JSON string."""
        as_mapping = self.to_dict()
        return json.dumps(as_mapping)

In [6]:
class MambaClassificationHead(nn.Module):
    """Single linear layer mapping pooled features to class logits."""

    def __init__(self, d_model, num_classes, **kwargs):
        """Build the projection.

        Args:
            d_model: size of the incoming feature vector.
            num_classes: number of output logits.
            **kwargs: forwarded unchanged to ``nn.Linear`` (e.g. ``bias``,
                ``device``, ``dtype``).
        """
        super().__init__()
        # The attribute name is part of the state-dict key, keep it stable.
        self.classification_head = nn.Linear(d_model, num_classes, **kwargs)

    def forward(self, hidden_states):
        """Project ``hidden_states`` to logits over the classes."""
        logits = self.classification_head(hidden_states)
        return logits

In [7]:
class MambaTextClassification(MambaLMHeadModel):
    """Mamba backbone with a mean-pooled linear head for binary classification.

    The language-model head inherited from ``MambaLMHeadModel`` is removed and
    replaced by a ``MambaClassificationHead`` with two output classes.
    """

    # Output containers are created once instead of on every forward pass.
    # The field order (loss first, then logits) is kept from the original.
    _LogitsOutput = namedtuple("ClassificationOutput", ["logits"])
    _LossLogitsOutput = namedtuple("ClassificationOutput", ["loss", "logits"])

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
    ) -> None:
        """Build the backbone via the parent class, then swap the LM head.

        Args:
            config: model hyper-parameters; ``config.d_model`` sizes the head.
            initializer_cfg: forwarded to ``MambaLMHeadModel``.
            device: device for the parameters (``None`` = framework default).
            dtype: dtype for the parameters (``None`` = framework default).
        """
        super().__init__(config, initializer_cfg, device, dtype)

        # Create the head on the same device / dtype as the backbone so the
        # model is usable without an extra ``.to(...)`` when these are given.
        self.classification_head = MambaClassificationHead(
            d_model=config.d_model,
            num_classes=2,
            device=device,
            dtype=dtype,
        )

        # The language-model head is not used for classification.
        del self.lm_head

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute class logits (and the loss when ``labels`` is given).

        Args:
            input_ids: token ids, assumed to be of shape (batch, seq_len).
            attention_mask: optional mask, assumed to be (batch, seq_len) with
                non-zero entries for real tokens. When provided, padded
                positions are excluded from the mean pooling; when ``None``
                every position is averaged.
            labels: optional class indices used to compute cross-entropy.

        Returns:
            A namedtuple ``(logits,)`` when ``labels`` is ``None``, otherwise
            ``(loss, logits)``.
        """
        # Assumed to be (batch, seq_len, d_model): it is averaged over dim 1
        # and then fed to a Linear layer of input size d_model.
        hidden_states = self.backbone(input_ids)

        if attention_mask is None:
            pooled = hidden_states.mean(dim=1)
        else:
            mask = attention_mask.to(
                device=hidden_states.device, dtype=hidden_states.dtype
            ).unsqueeze(-1)
            # clamp avoids a division by zero for an all-padding row.
            pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        logits = self.classification_head(pooled)

        if labels is None:
            return self._LogitsOutput(logits=logits)

        loss = nn.CrossEntropyLoss()(logits, labels)
        return self._LossLogitsOutput(loss=loss, logits=logits)

    def predict(self, text, tokenizer, id2label=None):
        """Classify a single text.

        Args:
            text: the raw input string.
            tokenizer: callable returning a mapping with an ``input_ids`` entry.
            id2label: optional mapping from class index to a label.

        Returns:
            ``id2label[index]`` when ``id2label`` is given, else the predicted
            class index.
        """
        # Run on whatever device the model lives on instead of assuming 'cuda'.
        device = next(self.parameters()).device
        input_ids = torch.tensor(tokenizer(text)['input_ids'], device=device)[None]
        with torch.no_grad():
            logits = self.forward(input_ids).logits[0]
            # Cast to float32 first: NumPy cannot represent bfloat16 tensors.
            label = np.argmax(logits.float().cpu().numpy())

        if id2label is not None:
            return id2label[label]
        return label

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        """Instantiate the model and load pretrained backbone weights.

        Args:
            pretrained_model_name: identifier passed to ``load_config_hf`` and
                ``load_state_dict_hf``.
            device: device for the model and the loaded weights.
            dtype: dtype for the model and the loaded weights.
            **kwargs: forwarded to the class constructor.

        Returns:
            The model with checkpoint weights applied. Parameters missing from
            the checkpoint keep their fresh initialization and are printed.
        """
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)

        model = cls(config, device=device, dtype=dtype, **kwargs)

        model_state_dict = load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
        # strict=False: the checkpoint has no classification-head weights and
        # may carry keys (such as the removed LM head) that this model lacks.
        model.load_state_dict(model_state_dict, strict=False)
        print("Newly initialized embedding:", set(model.state_dict().keys()) - set(model_state_dict.keys()))
        return model

In [11]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer

In [20]:
model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
model.to("cuda")

tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

tokenizer.pad_token_id = tokenizer.eos_token_id

## **5. Preprocess dataset**


In [21]:

def preprocess_function(examples):
    """Tokenize a batch of examples and drop the attention mask.

    Args:
        examples: a batch mapping whose ``"text"`` entry holds the raw reviews.

    Returns:
        The tokenizer output without its ``attention_mask`` entry.
    """
    # Truncation is requested without an explicit max_length.
    encoded = tokenizer(examples["text"], truncation=True)
    # Only the token ids are kept; raises KeyError if the mask is absent.
    del encoded['attention_mask']
    return encoded

In [22]:

# Tokenize every split; batched=True passes batches of examples to the function.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 25000/25000 [00:16<00:00, 1548.26 examples/s]
Map: 100%|██████████| 25000/25000 [00:15<00:00, 1653.21 examples/s]
Map: 100%|██████████| 50000/50000 [00:35<00:00, 1421.78 examples/s]


In [23]:

# Seed Python's RNG so the sampled evaluation subset is reproducible.
random.seed(42)


train_dataset = tokenized_imdb["train"]
test_dataset = tokenized_imdb["test"]


# Use a random 1% of the test split as the evaluation set given to the
# trainer; the full test split is used for the final prediction run.
total_samples = len(test_dataset)
eval_samples = int(0.01 * total_samples)
eval_indices = random.sample(range(total_samples), eval_samples)
eval_dataset = test_dataset.select(eval_indices)

## **6. Evaluation metric**


In [24]:

# Accuracy metric from the `evaluate` library, used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn raw model scores into an accuracy report.

    Args:
        eval_pred: a pair ``(scores, labels)``; ``scores`` is indexed as
            (example, class) because the arg-max is taken along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids.
    """
    scores, references = eval_pred
    # Highest-scoring class per example.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## **7. Training**


In [33]:

class MambaTrainer(Trainer):
    """Trainer that feeds only ``input_ids``/``labels`` to the model and saves
    checkpoints as a raw state dict plus tokenizer and JSON config."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a batch and return its loss.

        The parameter order follows ``Trainer.compute_loss`` so the base class
        can call it with or without ``num_items_in_batch`` (the evaluation path
        passes only ``return_outputs=True``).

        Args:
            model: the model being trained / evaluated.
            inputs: batch mapping containing ``input_ids`` and ``labels``.
            return_outputs: when true, also return the model outputs.
            num_items_in_batch: accepted for API compatibility and ignored;
                the model's loss is already averaged over the batch
                (``nn.CrossEntropyLoss`` default reduction), so dividing by
                this value again would shrink the loss and its gradients.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # Read without popping so the caller's batch dict is left intact.
        input_ids = inputs["input_ids"]
        labels = inputs["labels"]

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

    def save_model(self, output_dir = None, _internal_call = False):
        """Write model weights, tokenizer files and config to ``output_dir``.

        Args:
            output_dir: target directory; defaults to ``self.args.output_dir``.
            _internal_call: accepted for compatibility with the base class
                signature; not used here.
        """
        if output_dir is None:
            output_dir = self.args.output_dir

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        torch.save(self.model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))

        # Newer transformers releases expose the tokenizer as
        # `processing_class`; older ones only have `tokenizer`.
        tokenizer = getattr(self, "processing_class", None)
        if tokenizer is None:
            tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is not None:
            tokenizer.save_pretrained(output_dir)

        with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
            json.dump(self.model.config.to_dict(), f)

In [34]:
# Hyper-parameters for the fine-tuning run.
# NOTE(review): newer transformers releases name the evaluation schedule
# argument `eval_strategy`; confirm `evaluation_strategy` is accepted by the
# installed version.
training_args = TrainingArguments(
    output_dir="mamba_text_classification",  # checkpoints are written here
    learning_rate=5e-5,  # peak learning rate
    per_device_train_batch_size=4,  # training batch size per device
    per_device_eval_batch_size=16,  # evaluation batch size per device
    num_train_epochs=1,  # a single pass over the training split
    warmup_ratio=0.01,  # fraction of steps used for learning-rate warm-up
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
    report_to="none",  # no external experiment tracker
    evaluation_strategy="steps",  # evaluate on a step schedule
    eval_steps=0.1,  # float below 1: documented as a ratio of total training steps
    save_strategy="steps",  # checkpoint on the same step schedule
    save_steps=0.1,  # same ratio as eval_steps
    logging_strategy="steps",  # log on a step schedule
    logging_steps=1,  # log every step
    push_to_hub=False,  # no automatic upload; a later cell calls trainer.push_to_hub explicitly
    load_best_model_at_end=True,  # reload the best checkpoint after training
)



In [35]:

# Wire model, data, tokenizer, arguments and metric into the custom trainer.
# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` - confirm against the installed version.
trainer = MambaTrainer(
    model=model, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,  # the 1% sample of the test split built above
    tokenizer=tokenizer, 
    args=training_args,
    compute_metrics=compute_metrics 
    )

  trainer = MambaTrainer(


In [None]:
# Start fine-tuning using the schedule configured in `training_args`.
trainer.train()

In [None]:
# Push the trained model to the Hugging Face Hub.
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/trinhxuankhai/mamba_text_classification/commit/bce1a8bedec382d325b06404f9bbb0994927ed6c', commit_message='Training complete', commit_description='', oid='bce1a8bedec382d325b06404f9bbb0994927ed6c', pr_url=None, pr_revision=None, pr_num=None)

## **8. Run Testing**


In [None]:
# Run prediction over the full test split (not just the 1% evaluation sample).
outputs = trainer.predict(test_dataset)

In [None]:
# Report the metrics computed for the prediction run above.
print(outputs.metrics)

{'test_loss': 0.2068171352148056, 'test_accuracy': 0.94172, 'test_runtime': 1336.8157, 'test_samples_per_second': 18.701, 'test_steps_per_second': 1.169}
