## Installing Necessary Libraries

In [1]:
import torch


print(f"Pytorch version: {torch.__version__}")
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS avialable? {torch.backends.mps.is_available()}")

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device {device}")


Pytorch version: 2.0.1
Is MPS (Metal Performance Shader) built? True
Is MPS avialable? True
Using device mps


# Dataset details
Instacart Data can be downloaded from [here](https://www.kaggle.com/competitions/instacart-market-basket-analysis/data). We just need product & department csv files

In [2]:
import pandas as pd
df_product = pd.read_csv("instacart_data/departments.csv")
df_dept = pd.read_csv('instacart_data/products.csv')

In [3]:
df_joined = pd.merge(df_product, df_dept, on = ['department_id'])
df_joined['text'] = df_joined.apply(lambda row: row['product_name'] + " ->: " + row['department'], axis = 1)
df_joined

Unnamed: 0,department_id,department,product_id,product_name,aisle_id,text
0,1,frozen,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,Smart Ones Classic Favorites Mini Rigatoni Wit...
1,1,frozen,8,Cut Russet Potatoes Steam N' Mash,116,Cut Russet Potatoes Steam N' Mash ->: frozen
2,1,frozen,12,Chocolate Fudge Layer Cake,119,Chocolate Fudge Layer Cake ->: frozen
3,1,frozen,18,Pizza for One Suprema Frozen Pizza,79,Pizza for One Suprema Frozen Pizza ->: frozen
4,1,frozen,30,"Three Cheese Ziti, Marinara with Meatballs",38,"Three Cheese Ziti, Marinara with Meatballs ->:..."
...,...,...,...,...,...,...
49683,21,missing,49553,Organic Plain Skyr,100,Organic Plain Skyr ->: missing
49684,21,missing,49575,Chocolate Vanilla Malt Custard,100,Chocolate Vanilla Malt Custard ->: missing
49685,21,missing,49641,"8\"" Pecan Pie",100,"8\"" Pecan Pie ->: missing"
49686,21,missing,49664,Lemon Cayenne Drinking Vinegar,100,Lemon Cayenne Drinking Vinegar ->: missing


In [4]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_joined, test_size=0.2, random_state=42)

In [5]:
train_df.head(10)

Unnamed: 0,department_id,department,product_id,product_name,aisle_id,text
7361,4,produce,38057,Spicy Organic Microgreens,83,Spicy Organic Microgreens ->: produce
32534,15,canned goods,32461,Spaghetti Cool Shapes Star Wars,59,Spaghetti Cool Shapes Star Wars ->: canned goods
1457,1,frozen,17540,Burnt Sugar Vanilla Ice Cream,37,Burnt Sugar Vanilla Ice Cream ->: frozen
5201,3,bakery,22335,Sliced Italian Bread,112,Sliced Italian Bread ->: bakery
38539,17,household,29431,Air Effects Value Pack Hawaiian Aloha,101,Air Effects Value Pack Hawaiian Aloha ->: hous...
18377,11,personal care,9014,Mint Mouthwash,20,Mint Mouthwash ->: personal care
28627,13,pantry,37330,Pure Cane Washed Raw Sugar,17,Pure Cane Washed Raw Sugar ->: pantry
30283,14,breakfast,11892,Mixed Berry BelVita Bites,48,Mixed Berry BelVita Bites ->: breakfast
32030,15,canned goods,20544,Peeled Whole Tomatoes,81,Peeled Whole Tomatoes ->: canned goods
28655,13,pantry,37534,Unprocessed Wheat Bran,17,Unprocessed Wheat Bran ->: pantry


In [6]:
test_df.head(10)

Unnamed: 0,department_id,department,product_id,product_name,aisle_id,text
33626,16,dairy eggs,5570,Coffee Rich Original Non-Dairy Creamer,53,Coffee Rich Original Non-Dairy Creamer ->: dai...
18192,11,personal care,7582,Cold Snap,11,Cold Snap ->: personal care
47099,19,snacks,49614,Sandies Pecan Shortbread Cookies,61,Sandies Pecan Shortbread Cookies ->: snacks
48183,20,deli,39968,Miso Soup,1,Miso Soup ->: deli
22197,11,personal care,37799,Body Envy Volumizing Shampoo,22,Body Envy Volumizing Shampoo ->: personal care
31573,15,canned goods,9984,Bean Salad,81,Bean Salad ->: canned goods
45362,19,snacks,35795,Mexican Restaurant Style Corn Tortilla Chips,107,Mexican Restaurant Style Corn Tortilla Chips -...
14131,7,beverages,47666,Grapefruit No Sugar Added 100% Juice,98,Grapefruit No Sugar Added 100% Juice ->: bever...
26903,13,pantry,21534,"Salsa, Diablo, Hot",51,"Salsa, Diablo, Hot ->: pantry"
39417,17,household,43688,Toilet Bowl Cleaner with Lime & Rust Remover,114,Toilet Bowl Cleaner with Lime & Rust Remover -...


In [7]:
from datasets import Dataset,DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

  from .autonotebook import tqdm as notebook_tqdm


## Loading the model

In [None]:
import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

model_name_mac_optimized = "TheBloke/Llama-2-7B-Chat-GGUF"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )

model = AutoModelForCausalLM.from_pretrained(
    "TinyPixel/Llama-2-7B-bf16-sharded",
    hf=True
)

# llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-GGML")
# llm

model

  from .autonotebook import tqdm as notebook_tqdm


Let's also load the tokenizer below

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json: 100%|█████████████████████████████████████████████████████████| 676/676 [00:00<00:00, 23.8kB/s]
Downloading tokenizer.model: 100%|███████████████████████████████████████████████████████████████| 500k/500k [00:00<00:00, 8.93MB/s]
Downloading (…)/main/tokenizer.json: 100%|█████████████████████████████████████████████████████| 1.84M/1.84M [00:00<00:00, 3.82MB/s]
Downloading (…)cial_tokens_map.json: 100%|█████████████████████████████████████████████████████████| 411/411 [00:00<00:00, 83.5kB/s]


**Let's check what the base model predicts before finetuning. :)**

In [10]:
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


sequences = pipeline(
   ["“Free & Clear Stage 4 Overnight Diapers” ->:","Bread Rolls ->:","French Milled Oval Almond Gourmande Soap ->:"],
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq[0]['generated_text']}")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


KeyboardInterrupt: 

Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.

In [None]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"]
)

## Loading the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [None]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 120
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

Then finally pass everthing to the trainer

In [None]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    # train_dataset=data['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    # dataset_text_field="prediction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/39750 [00:00<?, ? examples/s]

We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

In [None]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

## Train the model

Now let's train the model! Simply call `trainer.train()`

In [None]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.0291
2,3.0896
3,4.3592
4,6.2712
5,4.2501
6,3.0927
7,4.5173
8,3.9694
9,4.4714
10,5.3573


TrainOutput(global_step=120, training_loss=2.6939144556721053, metrics={'train_runtime': 410.6557, 'train_samples_per_second': 4.675, 'train_steps_per_second': 0.292, 'total_flos': 627303342243840.0, 'train_loss': 2.6939144556721053, 'epoch': 0.05})

In [None]:
lst_test_data = list(test_df['text'])

In [None]:
len(lst_test_data)

9938

In [None]:
sample_size = 25
lst_test_data_short = lst_test_data[:sample_size]

In [None]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.bfloat16,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

sequences = pipeline(
    lst_test_data_short,
    max_length=100,  #200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for ix,seq in enumerate(sequences):
    print(ix,seq[0]['generated_text'])

0 Free & Clear Stage 4 Overnight Diapers ->: babies: diapers & accessories diapers ->: baby needs diapers & pads free clear diapers ->: baby needs diapers free & clear stage 4 overnight diapers: diapers ->: baby needs diapers diapers & accessories ->: baby needs diapers free & clear stage 4 overnight diapers: diapers ->: personal care baby care ->: diapers & accessories dia
1 Beef pot roast with roasted potatoes, carrots, sweet onions, green beans, and a rich gravy Beef Pot Roast ->: frozen goods ->: frozen meals ->: meat, chicken & seafood ->: beef & beef products ->: entree ->: entree ->: entree ->: entrees: beef ->: beef entree ->: meat ->: beef ->: meat, chicken
2 Coffee Liquer ->: alcoholic beverages spirits ->: beverages beer & hard cider ->: beer coffee liqueur ->: beverages liquor ->: beverages beer & hard cider coffee liquor ->: beverages beer coffee liqueur original ->: beverages liquor ->: beverages beer & hard cider coffee liqueur original alcoholic beverages spirits drinks

In [None]:
def correct_answer(ans):
  return (ans.split("->:")[1]).strip()

answers = []
for ix,seq in enumerate(sequences):
    # print(ix,seq[0]['generated_text'])
    answers.append(correct_answer(seq[0]['generated_text']))

answers

['babies: diapers & accessories diapers',
 'frozen goods',
 'alcoholic beverages spirits',
 'bakery',
 'personal care soap & bath',
 'household: pantry: cleaning: broom & mop: broom & broom brushes: hand broom',
 'deli meats & cheese',
 'beverages alcohol juice cranberry raspberry juice cocktail',
 'dairy eggs butter & cheese eggs',
 'canned goods',
 'dairy eggs & dairy eggs',
 'snacks chocolate & candy',
 'dry goods pasta, rice, beans and grains pasta',
 'personal care toiletries personal care toiletries liquid hand wash & dish wash',
 'beverages',
 'international foods dry goods spices & seasonings spices five spices',
 'breakfast cereals',
 'dairy eggs & cheese frozen dairy',
 'meat seafood frozen',
 'household essentials candles & accessories scented votive wax tablets',
 'frozen food',
 'dairy eggs dairy egg products soy products',
 'breakfast foods',
 'personal care',
 'meat seafood fish seafood']

In [None]:
df_evaluate = test_df.iloc[:sample_size][['product_name','department']]

df_evaluate = df_evaluate.reset_index(drop=True)

df_evaluate['department_predicted'] = answers

df_evaluate

Unnamed: 0,product_name,department,department_predicted
0,Free & Clear Stage 4 Overnight Diapers,babies,babies: diapers & accessories diapers
1,"Beef pot roast with roasted potatoes, carrots,...",frozen,frozen goods
2,Coffee Liquer,alcohol,alcoholic beverages spirits
3,Bread Rolls,bakery,bakery
4,French Milled Oval Almond Gourmande Soap,personal care,personal care soap & bath
5,Dust Pan,household,household: pantry: cleaning: broom & mop: broo...
6,Roasted Pine Nut Hommus,deli,deli meats & cheese
7,Cranberry Raspberry Juice Cocktail,beverages,beverages alcohol juice cranberry raspberry ju...
8,Sweet Cream Butter Salted,dairy eggs,dairy eggs butter & cheese eggs
9,Traditional Chicken Barley Soup,canned goods,canned goods
