In [27]:
from datasets import load_dataset, DatasetDict, Dataset

In [28]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [29]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

In [30]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [31]:
model_checkpoint = 'LiyaT3/sentiment-analysis-imdb-distilbert'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
for name, param in model.named_parameters():
    param.requires_grad = False
# num=2
# model.layers[-1].trainable= True
# model.layers[-1].units= num

In [32]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [33]:
for name, param in model.named_parameters():
     if name.startswith("classifier"): # choose whatever you like here
        param.requires_grad = True

In [34]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_k

In [35]:
from datasets import load_dataset

dataset = load_dataset("sg247/binary-classification")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})


In [36]:
import pandas as pd

df = pd.DataFrame(dataset['train'])
df.dropna(inplace = True)


df['label'] = df['label'].astype(int)
print(df)


                                                  tweet  label
0     Want to say a huge thanks to @WarriorAssaultS ...      1
1     @jaynehh_ you just need a job and get a letter...      1
2             @knhillrocks HA yes, make it quick tho :D      1
3     @shartyboy Thanks for texting me back :)) I'm ...      1
4     Laying out a greetings card range for print to...      1
...                                                 ...    ...
7999  "@aula_jr: @Jude_Mugabi I'm watching it. 2-0 M...      0
8000                                  why so sudden :((      0
8001        @21oclock :((( bout to instant transmission      0
8002  @ButtCupboard my feet were cold!!!! and there ...      0
8003  @Kimwoobin89__ oppa............./hug ur arm/ w...      0

[8000 rows x 2 columns]


In [37]:
d = pd.DataFrame(dataset['test'])
d.dropna(inplace = True)
d['label'] = d['label'].astype(int)


In [38]:
from datasets import Dataset, DatasetDict
train_dict = {
    'tweet': df['tweet'].tolist(),
    'label': df['label'].tolist()
}
train_dataset = Dataset.from_dict(train_dict)
dataset_dict = DatasetDict({'train': train_dataset})
print(dataset_dict['train']['label'])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [39]:
from datasets import Dataset, DatasetDict
test_dict = {
    'tweet': d['tweet'].tolist(),
    'label': d['label'].tolist()
}
test_dataset = Dataset.from_dict(train_dict)
dataset_dict = DatasetDict({'test': train_dataset})

# print(dataset_dict['train']['label'][2])
print(dataset_dict['test']['label'][2])
print(dataset_dict)

1
DatasetDict({
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8000
    })
})


In [40]:
test_dict = {
    'tweet': d['tweet'].tolist(),
    'label': d['label'].tolist()
}

train_dict = {
    'tweet': df['tweet'].tolist(),
    'label': df['label'].tolist()
}



import datasets
train_dataset = datasets.Dataset.from_dict(train_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)
my_dataset_dict = datasets.DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})
print(my_dataset_dict)
print(my_dataset_dict['test']['label'][2])
print(my_dataset_dict['train']['label'][2])

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})
1
1


In [41]:
!pip install -q -U bitsandbytes


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [43]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [44]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    tweets = examples["tweet"]
    preprocess_t= [tweet if isinstance(tweet , str) else str(tweet) for tweet in tweets]
    return tokenizer(preprocess_t , truncation= True , padding= "max_length")
    

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

tokenized_dataset = my_dataset_dict.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 8000/8000 [00:01<00:00, 5166.40 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 5134.19 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [45]:
print(bnb_config)

BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}



In [46]:
!pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 

TOKENIZERS_PARALLELISM=(true | false)




In [47]:
pip install -i https://pypi.org/simple/ bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.


In [54]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id,quantization_config=bnb_config,

)

ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [53]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# training 


peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin','k_lin']
                       )

peft_config

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

TypeError: get_peft_model() got an unexpected keyword argument 'bits_and_bytes_config'