## Import Necessary libraries

In [2]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.1/88.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.9 MB/s[0m eta [36m0

## Data Preparation

In [3]:
!unzip -u "/content/drive/MyDrive/LLama2 FineTune/departments.zip" -d "/content/drive/MyDrive/LLama2 FineTune"
!unzip -u "/content/drive/MyDrive/LLama2 FineTune/products.zip" -d "/content/drive/MyDrive/LLama2 FineTune"

Archive:  /content/drive/MyDrive/LLama2 FineTune/departments.zip
Archive:  /content/drive/MyDrive/LLama2 FineTune/products.zip


In [4]:
import pandas as pd
df_product = pd.read_csv("/content/drive/MyDrive/LLama2 FineTune/products.csv")
df_dept = pd.read_csv('/content/drive/MyDrive/LLama2 FineTune//departments.csv')

In [5]:
df_product.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
df_dept.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [7]:
df_joined = pd.merge(df_product, df_dept, on = ['department_id'])
df_joined['text'] = df_joined.apply(lambda row: row['product_name'] + " ->: " + row['department'], axis = 1)
df_final=df_joined[['product_name','department','text']]
df_final.head()

Unnamed: 0,product_name,department,text
0,Chocolate Sandwich Cookies,snacks,Chocolate Sandwich Cookies ->: snacks
1,Mint Chocolate Flavored Syrup,snacks,Mint Chocolate Flavored Syrup ->: snacks
2,Salted Caramel Lean Protein & Fiber Bar,snacks,Salted Caramel Lean Protein & Fiber Bar ->: sn...
3,Nacho Cheese White Bean Chips,snacks,Nacho Cheese White Bean Chips ->: snacks
4,Organic Sourdough Einkorn Crackers Rosemary,snacks,Organic Sourdough Einkorn Crackers Rosemary ->...


In [8]:
# train test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_final, test_size=0.2, random_state=42)

## Convert dataset into dict

In [9]:
from datasets import Dataset,DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

## Loading the Llama2 model and Tokenizer

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded" # Shared model used

# This saves more memory at no additional performance - from our empirical observations,
# this enables fine-tuning llama-13b model on an NVIDIA-T4 16GB with a sequence length of 1024,
# batch size of 1 and gradient accumulation steps of 4.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00014.bin:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00014.bin:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

### Let's check what the base model predicts before finetuning. :)

In [12]:
# !pip install xformers

In [13]:
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [14]:
sequences = pipeline(
   ["“Free & Clear Stage 4 Overnight Diapers” ->:","Bread Rolls ->:","French Milled Oval Almond Gourmande Soap ->:"],
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq[0]['generated_text']}")



Result: “Free & Clear Stage 4 Overnight Diapers” ->: http://bit. sierp, 2018. The new formula was released at CES in 2012. It’s a little more expensive than traditional paper diapers, but the convenience is worth it.
Diaper Genie: The Diaper Genie is a container that fits underneath the toilet and holds up to 25 disposable diapers. 08.12.2010 · This feature is not available right now. The Diaper Genie 3.0 is an automatic disposable diaper pail and is the most advanced version of the popular Diaper Genie 2.0.
It’s the perfect solution for storing and collecting disposable diapers in a compact, convenient and safe way. It can hold up to six months supply. The 3.0 version was released at CES in
Result: Bread Rolls ->:
 февруари 25 2019 07:44
I think what you typed was very reasonable. But, consider this, what if you typed a catchier post title? I mean, I don’t wish to tell you how to run your blog, however what if you added a headline to maybe grab people’s attention? I mean Cake Bread Ro

Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.

In [15]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"]
)

## Loading the trainer

Here we will use the SFTTrainer from TRL library that gives a wrapper around transformers Trainer to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [16]:
from transformers import TrainingArguments
training_arguments = TrainingArguments(
    output_dir = "./LLama2 FineTune",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    optim = "paged_adamw_32bit",
    save_steps = 10,
    logging_steps = 1,
    learning_rate = 2e-4,
    fp16=True,
    max_grad_norm = 0.3,
    max_steps = 120,
    warmup_ratio = 0.03,
    group_by_length=True,
    lr_scheduler_type = "constant",
)

In [17]:
# Pass everything to trainer
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    # train_dataset=data['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    # dataset_text_field="prediction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/39750 [00:00<?, ? examples/s]



In [18]:
# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

## Train the model

In [19]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.0291
2,3.0896
3,4.3592
4,6.2712
5,4.2501
6,3.0921
7,4.5168
8,3.9676
9,4.4688
10,5.3532


TrainOutput(global_step=120, training_loss=2.7125535984834035, metrics={'train_runtime': 1632.3115, 'train_samples_per_second': 1.176, 'train_steps_per_second': 0.074, 'total_flos': 627303342243840.0, 'train_loss': 2.7125535984834035, 'epoch': 0.05})

In [20]:
lst_test_data = list(test_df['text'])
len(lst_test_data)

9938

In [21]:
# Take only 25 samples to check it out
sample_size = 25
lst_test_data_short = lst_test_data[:sample_size]

In [22]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.bfloat16,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

sequences = pipeline(
    lst_test_data_short,
    max_length=100,  #200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for ix,seq in enumerate(sequences):
    print(ix,seq[0]['generated_text'])



0 Free & Clear Stage 4 Overnight Diapers ->: babies toddlers kids baby toiletries diapers & pants free clear stage 4 overnight diapers ->: personal care personal care products disposable feminine items pantiliners & pantyliners ->: personal care personal care products disposable feminine items pantiliners personal care products free & clear stage 4 overnight diapers ->: babies todd
1 Beef pot roast with roasted potatoes, carrots, sweet onions, green beans, and a rich gravy Beef Pot Roast ->: frozen foods pasta and meat ->: meat ->: entrees ->: meat and fish ->: poultry and meats ->: pork ->: pork & bacon ->: pork loin ->: meats ->: pork meatloaf ->: meats ->: meat products ->
2 Coffee Liquer ->: alcohol spirits coffee liqueur ->: beverages non-alcoholic beverages coffee beverages ->: coffee coffee beverages coffee beverages ->: coffee liqueuers coffee beverages ->: beverages alcoholic beverages: liqueuers & cordials: liqueurs ->: beverages beverages liqueuers & cordials liqueuers: coff

In [23]:
def correct_answer(ans):
  return (ans.split("->:")[1]).strip()

answers = []
for ix,seq in enumerate(sequences):
    # print(ix,seq[0]['generated_text'])
    answers.append(correct_answer(seq[0]['generated_text']))

answers

['babies toddlers kids baby toiletries diapers & pants free clear stage 4 overnight diapers',
 'frozen foods pasta and meat',
 'alcohol spirits coffee liqueur',
 'bakery',
 'personal care beauty personal care soap bar',
 'household: cleaning',
 'deli meat & cheese',
 'beverages: soft drinks & waters: beverages',
 'dairy eggs & dairy products eggs butter',
 'canned goods soup',
 'dairy eggs cheese eggs & dairy products dairy products',
 'snacks candy bars',
 'dry goods pasta',
 'personal care',
 'beverages',
 'international pantry international pantry: sauces',
 'breakfast',
 'dairy eggs eggs',
 'meat seafood',
 'household cleaning pantry personal care laundry detergent',
 'frozen foods',
 'dairy eggs & cheese eggs & dairy eggs & beans & nuts frozen dairy organic frozen dairy organic soymilk',
 'breakfast',
 'personal care toiletries personal care skin care facial care morning burst facial scrub cleaners',
 'meat seafood pasta']

In [24]:
df_evaluate = test_df.iloc[:sample_size][['product_name','department']]

df_evaluate = df_evaluate.reset_index(drop=True)

df_evaluate['department_predicted'] = answers

df_evaluate

Unnamed: 0,product_name,department,department_predicted
0,Free & Clear Stage 4 Overnight Diapers,babies,babies toddlers kids baby toiletries diapers &...
1,"Beef pot roast with roasted potatoes, carrots,...",frozen,frozen foods pasta and meat
2,Coffee Liquer,alcohol,alcohol spirits coffee liqueur
3,Bread Rolls,bakery,bakery
4,French Milled Oval Almond Gourmande Soap,personal care,personal care beauty personal care soap bar
5,Dust Pan,household,household: cleaning
6,Roasted Pine Nut Hommus,deli,deli meat & cheese
7,Cranberry Raspberry Juice Cocktail,beverages,beverages: soft drinks & waters: beverages
8,Sweet Cream Butter Salted,dairy eggs,dairy eggs & dairy products eggs butter
9,Traditional Chicken Barley Soup,canned goods,canned goods soup
