In [None]:
# Dependency installation (each `!` line is run as a shell command, not Python).
# !pip install -q -U bitsandbytes # only needed for quantization
# transformers is installed from the GitHub repository rather than a pinned release,
# so the installed version can differ between runs.
!pip install -q -U git+https://github.com/huggingface/transformers.git
# flash-attn is required by the model-loading cell (attn_implementation="flash_attention_2").
!pip install flash-attn --no-build-isolation


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=256040057 sha256=f25da18657a87fc83dc1bfb8b7751b82246e9db355510226b674fd437c34b5fb
  Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.8.3


In [None]:
# Mount GDrive - will prompt authentication
from google.colab import drive
drive.mount('/content/drive')

## Get HF access token (kept in a file on Drive so it is not hardcoded in the notebook)
with open("/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/hftoken.txt") as f:
    HF_TOKEN = f.read().strip()  # drop surrounding whitespace / trailing newline


# Hugging Face model id, passed to both the tokenizer and the model loader
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

Mounted at /content/drive


# Load Test Data i.e. Prompts to Generate DPO samples from

In [None]:
# Read the prompt records from the JSON-lines file on Drive
from datasets import load_dataset

TEST_DATA_FOLDER = '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/'
TEST_DATA_FILE = 'dpo_dataset'
# The folder and file stem are reused later to build the output file names
test_data = load_dataset(
    'json',
    data_files=f"{TEST_DATA_FOLDER}{TEST_DATA_FILE}.jsonl",
    split='train',
)

In [None]:
# Sanity check: report how many records were loaded
print (f"Number of test samples: {len(test_data)}")


Number of test samples: 1182


In [None]:
# Peek at the first record to see which fields are available
print(test_data[0])


{'title': 'Feeling Overwhelmed with Potty Training', 'body': "My 1-year-old is showing interest in the potty, but every time I try to sit him down, he just stands up and runs away giggling. I’ve tried making it fun with songs and games, but nothing seems to work. I'm feeling a bit overwhelmed and wondering if I'm starting too soon. Any advice on how to make this less stressful?", 'topic': 'Potty training struggles - Ages 1-2'}


# Formatting to Get Two Types of Prompts (Standard and Empathy)
There are two types of prompts:
1. Standard parenting prompt
2. Empathy eliciting parenting prompt

In [None]:
# Word limit that is interpolated into both prompt templates ({MAX_WORDS} placeholder)
MAX_WORDS = 250
# Upper bound on generated tokens; passed as max_new_tokens when generating
MAX_OUTPUT_TOKENS = 600

# Baseline prompt: role description plus a length limit.
# {MAX_WORDS}, {title} and {body} are filled in with str.format by the prompt builders.
STANDARD_PROMPT_TEMPLATE = """<s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.
Keep your answers under {MAX_WORDS} words and focused on the user’s specific question.

Question Title: {title}
Question Body: {body}
[/INST]"""

# Variant that additionally asks for clear, structured, practical, actionable,
# comprehensive and empathetic advice. Same placeholders as the standard template.
EMPATHY_PROMPT_TEMPLATE = """<s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.
Give clear, structured, practical, actionable, comprehensive, and empathetic advice.
Keep your response under {MAX_WORDS} words and focused on the user’s question.

Question Title: {title}
Question Body: {body}
[/INST]"""

def build_standard_prompt(example):
  """Attach the standard prompt to a record.

  Fills STANDARD_PROMPT_TEMPLATE with the whitespace-stripped 'title' and
  'body' of `example` plus MAX_WORDS, stores the result under
  'standard_prompt' on the same record, and returns that record.
  """
  stripped_fields = {key: example[key].strip() for key in ('title', 'body')}
  example['standard_prompt'] = STANDARD_PROMPT_TEMPLATE.format(
      MAX_WORDS=MAX_WORDS,
      **stripped_fields,
  )
  return example


def build_empathy_prompt(example):
  """Attach the empathy-eliciting prompt to a record.

  Fills EMPATHY_PROMPT_TEMPLATE with the whitespace-stripped 'title' and
  'body' of `example` plus MAX_WORDS, stores the result under
  'empathy_prompt' on the same record, and returns that record.
  """
  stripped_fields = {key: example[key].strip() for key in ('title', 'body')}
  example['empathy_prompt'] = EMPATHY_PROMPT_TEMPLATE.format(
      MAX_WORDS=MAX_WORDS,
      **stripped_fields,
  )
  return example


In [None]:
# Add the 'standard_prompt' and 'empathy_prompt' columns by running each builder over every record
test_data = test_data.map(build_standard_prompt)
test_data = test_data.map(build_empathy_prompt)


Map:   0%|          | 0/1182 [00:00<?, ? examples/s]

Map:   0%|          | 0/1182 [00:00<?, ? examples/s]

In [None]:
# Display the first record again to confirm the prompt columns were added
test_data[0]

{'title': 'Feeling Overwhelmed with Potty Training',
 'body': "My 1-year-old is showing interest in the potty, but every time I try to sit him down, he just stands up and runs away giggling. I’ve tried making it fun with songs and games, but nothing seems to work. I'm feeling a bit overwhelmed and wondering if I'm starting too soon. Any advice on how to make this less stressful?",
 'topic': 'Potty training struggles - Ages 1-2',
 'standard_prompt': "<s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.\nKeep your answers under 250 words and focused on the user’s specific question.\n\nQuestion Title: Feeling Overwhelmed with Potty Training\nQuestion Body: My 1-year-old is showing interest in the potty, but every time I try to sit him down, he just stands up and runs away giggling. I’ve tried making it fun with songs and games, but nothing seems to work. I'm feeling a bit overwhelmed and wondering

# Tokenizing the dataset
We want to do batch inference, so we need to pad and truncate the prompts.

We don't want to truncate too much, though. So let's first estimate the maximum token count from the current dataset.

In [None]:
# Tokenizer setup: pad on the left so that the end of every prompt lines up within a batch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
    padding_side="left",
    # the tokenizer is told not to add BOS/EOS tokens itself
    add_bos_token=False,
    add_eos_token=False,
)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Get max_length by tokenizing and count max tokens in test data without padding
import numpy as np
def get_max_tokens(test_data, prompt_lbl):
  """Return the token count of the longest prompt in one column.

  The prompts are tokenized without padding or truncation so the counts
  reflect their real lengths. The previous implementation returned the
  *mean* length, which under-estimated the budget and let longer prompts be
  truncated later on.

  Args:
    test_data: mapping-like dataset; `test_data[prompt_lbl]` must yield the
      prompt strings.
    prompt_lbl: name of the column holding the prompts.

  Returns:
    int: number of tokens in the longest prompt, or 0 if the column is empty.
  """
  prompts = list(test_data[prompt_lbl])
  if not prompts:
    # nothing to tokenize; avoid calling max() on an empty sequence
    return 0
  encoded = tokenizer(
      prompts,
      return_tensors=None,
      padding=False,
      truncation=False,
  )
  return max(len(ids) for ids in encoded['input_ids'])

# Token-length estimate for each prompt type, as returned by get_max_tokens
n_tokens_standard = get_max_tokens(test_data, 'standard_prompt')
n_tokens_empathy = get_max_tokens(test_data, 'empathy_prompt')
# Shared budget: the larger of the two estimates; used as the base for max_length when padding/truncating
n_tokens_max = max(n_tokens_standard, n_tokens_empathy)

print (f"SET Max Tokens: {n_tokens_max}")
print (f"Standard Prompt Max Tokens: {n_tokens_standard}")
print (f"Empathy Prompt Max Tokens: {n_tokens_empathy}")


SET Max Tokens: 156
Standard Prompt Max Tokens: 138
Empathy Prompt Max Tokens: 156


In [None]:
# Tokenize prompts with padding and truncation
def tokenize_with_padding(test_data, prompt_lbl):
  """Tokenize one prompt column into padded PyTorch tensors.

  Padding and truncation are both enabled; sequences are capped at
  `n_tokens_max + 50` tokens.

  Args:
    test_data: mapping-like dataset; `test_data[prompt_lbl]` yields the prompts.
    prompt_lbl: name of the column holding the prompts.

  Returns:
    Whatever the tokenizer returns for the batch (requested as "pt" tensors).
  """
  prompts = list(test_data[prompt_lbl])
  # some buffer added as future test sets can have more tokens too
  length_cap = n_tokens_max + 50
  return tokenizer(
      prompts,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=length_cap,
  )



In [None]:
# Tokenize each prompt column into its own padded batch
tokenized_standard = tokenize_with_padding(test_data, 'standard_prompt')
tokenized_empathy = tokenize_with_padding(test_data, 'empathy_prompt')


In [None]:
# Visual check of the padding.
## Mistral / Llama have no dedicated padding token, so we left pad with </s>;
## padding with <s> would confuse the model because <s> carries semantic meaning.
## Special tokens are kept in the decoded text so the padding is visible.
for tokenized_batch in (tokenized_standard, tokenized_empathy):
  decoded = tokenizer.batch_decode(tokenized_batch["input_ids"], skip_special_tokens=False)
  print(decoded[0])


</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.
Keep your answers under 250 words and focused on the user’s specific question.

Question Title: Feeling Overwhelmed with Potty Training
Question Body: My 1-year-old is showing interest in the potty, but every time I try to sit him down, he just stands up and runs away giggling. I’ve tried making it fun with songs and games, but nothing seems to work. I'm feeling a bit overwhelmed and wondering if I'm starting too soon. Any advice on how to make this less stressful?
[/INST]
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges wit

# Loading Mistral Instruct v0.3
### Note on Quantization vs No Quantization:
On A100, we don't need quantization and without quantization, inference might actually be faster. If VRAM usage becomes a problem, we can consider quantization later.

For inference we still need to process the dataset in chunks; otherwise we will run out of GPU memory (OOM).

In [None]:
# LOAD THE BASE MODEL WITHOUT QUANTIZATION (should work and even be faster for A100)
import torch
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig # for quantization
from transformers import AutoModelForCausalLM

# Allow TF32 for float32 matrix multiplications
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

## version without quantization - running out of memory when dealing with large dataset
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2", # FA2 is fastest on A100
    token=HF_TOKEN # login to hugging face
)

# For inference always turn on eval mode to avoid dropout etc.
model.eval()

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

In [None]:
import time

## do inference on tokenized prompts
def get_inferences(tokenized_data):
  """Generate completions for one batch of tokenized prompts.

  Args:
    tokenized_data: mapping of tokenizer outputs; every value is moved to the
      model's device and forwarded to `model.generate` as a keyword argument.

  Returns:
    list of decoded sequences with special tokens kept (skip_special_tokens=False).
  """
  generation_kwargs = dict(
      max_new_tokens=MAX_OUTPUT_TOKENS,
      # greedy decoding (most likely next token) instead of sampling with
      # temperature/top_p, to get more deterministic responses
      do_sample=False,
      # stop at EOS rather than always generating until max_new_tokens
      eos_token_id=tokenizer.eos_token_id,
      # required for batch inference
      pad_token_id=tokenizer.pad_token_id,
      # enable the generation cache
      use_cache=True,
  )
  with torch.inference_mode(): # better than torch.no_grad()
    on_device = {}
    for name, tensor in tokenized_data.items():
      on_device[name] = tensor.to(model.device)
    generated = model.generate(**on_device, **generation_kwargs)
    return tokenizer.batch_decode(generated, skip_special_tokens=False)

def get_cleaned_answer(text):
  """Extract the model's answer from a decoded prompt+completion string.

  Everything up to and including the first "[/INST]" marker is treated as the
  prompt and dropped. Splitting only on the first marker keeps the whole
  answer even if the generated text itself contains "[/INST]" again.

  Args:
    text: decoded sequence that still contains the prompt and special tokens.

  Returns:
    str: the answer with every '</s>' removed and surrounding whitespace stripped.

  Raises:
    ValueError: if `text` contains no "[/INST]" marker, so the answer cannot
      be separated from the prompt.
  """
  _prompt, marker, answer = text.partition("[/INST]")
  if not marker:
    raise ValueError(
        "No '[/INST]' marker found in decoded text; cannot separate the answer from the prompt"
    )
  return answer.replace('</s>', '').strip()

def batch_inferences(tokenized_data, chunk_size=100): # to save memory
  """Run `get_inferences` over the tokenized prompts in fixed-size chunks.

  Args:
    tokenized_data: mapping of tokenizer outputs; must contain "input_ids" and
      every value must support slicing along the first dimension.
    chunk_size: number of prompts per generation call; adjust based on your
      GPU memory. Defaults to 100.

  Returns:
    list with the decoded outputs of all chunks, in input order.

  Raises:
    ValueError: if `chunk_size` is smaller than 1.
  """
  if chunk_size < 1:
    raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
  all_outputs = []
  stt = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
  n = len(tokenized_data["input_ids"])
  for i in range(0, n, chunk_size):
    print (f"Running for chunk starting at {i}")
    # slice every tensor identically so the inputs stay aligned
    batch = {k: v[i:i+chunk_size] for k, v in tokenized_data.items()}
    all_outputs.extend(get_inferences(batch))
  ttt = time.perf_counter() - stt
  print(f"Time taken for {n} samples: {ttt/60:.2f} min ({ttt:.2f} secs)")
  return all_outputs


In [None]:
# Run inference on standard prompt
# (returns one decoded string per prompt; the prompt text is still included at this point)
output_standard = batch_inferences(tokenized_standard)


Running for chunk starting at 0
Running for chunk starting at 100
Running for chunk starting at 200
Running for chunk starting at 300
Running for chunk starting at 400
Running for chunk starting at 500
Running for chunk starting at 600
Running for chunk starting at 700
Running for chunk starting at 800
Running for chunk starting at 900
Running for chunk starting at 1000
Running for chunk starting at 1100
Time taken for 1182 samples: 6.89 min (413.60 secs)


In [None]:
# Clean output and add to test_data
# get_cleaned_answer drops the prompt part and the '</s>' tokens from each decoded string
output_standard_clean = [get_cleaned_answer(x) for x in output_standard]
test_data = test_data.add_column('standard_output', output_standard_clean)
# Interim save so the standard-prompt answers are on Drive before the second inference pass
test_data.to_json(TEST_DATA_FOLDER+TEST_DATA_FILE+'_output_interim.jsonl', orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

3949370

In [None]:
# Run inference on empathy prompt
# (returns one decoded string per prompt; the prompt text is still included at this point)
output_empathy = batch_inferences(tokenized_empathy)


Running for chunk starting at 0
Running for chunk starting at 100
Running for chunk starting at 200
Running for chunk starting at 300
Running for chunk starting at 400
Running for chunk starting at 500
Running for chunk starting at 600
Running for chunk starting at 700
Running for chunk starting at 800
Running for chunk starting at 900
Running for chunk starting at 1000
Running for chunk starting at 1100
Time taken for 1182 samples: 6.93 min (415.84 secs)


In [None]:
# Clean output and add to test_data
# get_cleaned_answer drops the prompt part and the '</s>' tokens from each decoded string
output_empathy_clean = [get_cleaned_answer(x) for x in output_empathy]
test_data = test_data.add_column('empathy_output', output_empathy_clean)
# Final save: test_data now carries both the 'standard_output' and 'empathy_output' columns
test_data.to_json(TEST_DATA_FOLDER+TEST_DATA_FILE+'_output.jsonl', orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

6003157

In [None]:
# ## Once everything runs, terminate session to save compute units
# NOTE: run this only after the output files above have been written to Drive.
from google.colab import runtime
runtime.unassign()
