In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
import transformers

In [4]:
# !ls /assets/models/

In [5]:
# Change this to the model you need to use. Use the above to see the list of models.
model_name_or_path = "/assets/models/mistralai-mistral-instruct-7b-v0.3"

In [6]:
# %pip install protobuf
# %pip install sentencepiece

In [7]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
print("tokenizer loaded")

# LLaMa's tokenizer does not have a valid PAD token, so we need to initialize this as so
# tokenizer.pad_token = tokenizer.eos_token

# For decoder-only models, just to be safe, also do:
tokenizer.padding_side = "left"

tokenizer loaded


In [8]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # By default, map different parts of the model to available GPU(s).
    device_map="auto",
    # Loading the model in full precision can use a lot of
    # of memory, so we quantize it using reduced precision types.
    torch_dtype='bfloat16'
)

# Best practices
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

### Generation

More information on using the transformers library and its components can be found here: https://huggingface.co/docs/transformers/llm_tutorial

Specifically, for text generation, the following can be useful:
- https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
- https://huggingface.co/blog/how-to-generate

In [9]:
TASK_PROMPT = '''Generate 10 English sentences each for the relations: 
grandmother, grandfather, uncle, aunt, brother-in-law, sister-in-law, cousin, nephew, niece. 
For each relation, generate 10 sentences using the following topics: games, deep talks, questions, exclamations, and other forms of speeches. 
Ensure the sentences include different forms of possessive pronouns (e.g., my, their, his, her) for each of the 9 relations. Avoid using short/colloquial terms for the relations.
Only provide the response as a Python list of strings for all categories. No need to segregate them in any manner.
Sample output: 
<<<[
"My grandmother and I play chess together.", 
...
"Their grandfather and I have deep talks.", 
...
"Due to the heartache, my brother-in-law was not present in the function.",
...
"For the party, my uncle will pick and drop you.",
...
"I have the best sister-in-law in the world!",
...
"I am very grateful to my aunt to take care of my mother when I was not available",
...
... 
] (total 90 sentences: for each relation -> 10 sentences)>>>'''

def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.inference_mode():
        outputs = model.generate(
            **inputs.to(model.device),
            temperature=0.7,
            do_sample=True,
            num_return_sequences=2,
            num_beams=5,
            max_new_tokens=2000
        )

        outputs = tokenizer.batch_decode(
            outputs, skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )
    return outputs



In [10]:
# for output in outputs:
#     print("o/p::", output)
    # print('-' * 50)
    # print()
# save the outputs to a file named : generated_outputs.txt
# open in append mode

def save_outputs(outputs):
    with open('generated_outputs.txt', 'a') as f:
        for output in outputs:
            f.write(output + '\n')
            # print(output)






In [11]:
def extract_sentences(outputs):
    response = outputs[0]
    sentences = response[response.find(TASK_PROMPT) + len(TASK_PROMPT):]
    sentences = sentences[sentences.find('[') + 1:sentences.find(']')]  
    sentences = sentences.split(',\n')
    sentences = [sentence.strip() for sentence in sentences]
    sentences = [sentence[1:-1] for sentence in sentences]
    # print(sentences, type(sentences))
    return sentences

# extract_sentences(outputs)

In [12]:
# save the sentences to a file named : generated_sentences.txt
def save_sentences(sentences):
    with open('generated_sentences.txt', 'a+') as f:
        for sentence in sentences:
            f.write(sentence + '\n')

## Prompt template:


In [None]:
prompts=[]
# techniques = examples.keys()
topics=["occupation", " religion", " sport", " politics", "  health", " finance", " education", " farming", " entertainment", "  news", "  daily conversation", "  weather", " technology", " conflicts", " controversials", 
        " international", "  UN", " travel", " tourism", " shopping", " baby care", " valentines", "  soldiers", " prisioners", " soul actions", " nature", " pollution", " bio hazards", " elders", " family", 
        "  social studies", " maths", " literature", "  physics", " chemistry", "  biology", " Indian history ", " civics", " geography", "  computer", " physical education", "  arts and craft", "  food", 
        " clothes", " water shortage", " road blockage", " traffic", " arriving late", " bargaining", " toys", " games", " deep talks", "declarative", "interrogative", "imperative",  "exclamatory", "safe work space", 
        "medical checkups", "self-reliant", "future India"]

In [15]:
topics = [ "legal", "governance",  
  "STEM" ,   "business", "sports",  "culture", "Alarm",
    "Audio",
    "Calendar / Events",  # Added space for better readability
    "Communication",
    "DateTime",
    "Email",
    "Finance",  # Likely, but limited information available
    "General",
    "Home Automation",  # Likely, but limited information available
    "Location",
    "Music",
    "News",
    "Reminders",
    "Social Media",  # Likely, but limited information available
    "Travel",
    "Weather",
    "BookRestaurant",
    "BookFlight",
    "BookHotel",
    "GetDirections",
    "GetPlaceDetails",
    "GetWeather",
    "SearchForInformation",
    "GetNews",
    "PlayMusic",
    "ControlVolume",
    "SetAlarm",
    "CancelAlarm",
    "SetReminder",
    "MakeCall",
    "SendMessage",
    "SendEmail",
    "CheckEmail",
    "GetTime",
    "GetDate",
    "Greeting",
    "Farewell",
    "AskHowAreYou",
    "ThankYou",
    "TellAJoke",
    "TurnOnLights",  # and other home automation actions
    "Social media interactions",  # posting, liking, etc.
    "Financial transactions"  ]
len(topics)

49

In [16]:

# for technique in techniques:
    # pick every 10 topics from topics in a loop
for i in range(0, len(topics)):
    # print(topics[i:i+10])
    # print("Generate 100 sentences in "+ lang +" with action doer as Male or Female. Differentiate between male and female sentences using the "+ technique +" technique. You can use the topics like "+ ", ".join(topics[i:i+10]) +".Return as a python dictionary.")
    # prompts.append("Generate 10 sentences in "+ lang +" with action doer as Male or Female. Differentiate between male and female sentences using the "+ technique +" technique. You can use the topics like "+ ", ".join(topics[i:i+10]) +".Return as a python dictionary.")
    prompt = '''Generate 10 English sentences each for the relations: 
grandmother, grandfather, uncle, aunt, brother-in-law, sister-in-law, cousin, nephew, niece. 
For each relation, generate 10 sentences using the following topic: ''' + topics[i]+''' 
Ensure the sentences include different forms of possessive pronouns (e.g., my, their, his, her) for each of the 9 relations. Avoid using short/colloquial terms for the relations.
Only provide the response as a Python list of strings for all categories. No need to segregate them in any manner.
Sample output: 
<<<[
"My grandmother and I play chess together.", 
...
"Their grandfather and I have deep talks.", 
...
"Due to the heartache, my brother-in-law was not present in the function.",
...
"For the party, my uncle will pick and drop you.",
...
"I have the best sister-in-law in the world!",
...
"I am very grateful to my aunt to take care of my mother when I was not available",
...
... 
] (total 90 sentences: for each relation -> 10 sentences)>>>'''  
    prompts.append(prompt)

    outputs= generate_response(prompt=prompt)
    print("outputs done for topic: ", topics[i], i)
    save_outputs(outputs)
    sentences= extract_sentences(outputs)
    print("sentences extracted for topic:", topics[i], i)
    save_sentences(sentences)

        

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  legal 0
sentences extracted for topic: legal 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  governance 1
sentences extracted for topic: governance 1


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  STEM 2
sentences extracted for topic: STEM 2


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  business 3
sentences extracted for topic: business 3


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  sports 4
sentences extracted for topic: sports 4


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  culture 5
sentences extracted for topic: culture 5


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Alarm 6
sentences extracted for topic: Alarm 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Audio 7
sentences extracted for topic: Audio 7


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Calendar / Events 8
sentences extracted for topic: Calendar / Events 8


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Communication 9
sentences extracted for topic: Communication 9


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  DateTime 10
sentences extracted for topic: DateTime 10


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Email 11
sentences extracted for topic: Email 11


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Finance 12
sentences extracted for topic: Finance 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  General 13
sentences extracted for topic: General 13


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Home Automation 14
sentences extracted for topic: Home Automation 14


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Location 15
sentences extracted for topic: Location 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Music 16
sentences extracted for topic: Music 16


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  News 17
sentences extracted for topic: News 17


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Reminders 18
sentences extracted for topic: Reminders 18


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Social Media 19
sentences extracted for topic: Social Media 19


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Travel 20
sentences extracted for topic: Travel 20


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Weather 21
sentences extracted for topic: Weather 21


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  BookRestaurant 22
sentences extracted for topic: BookRestaurant 22


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  BookFlight 23
sentences extracted for topic: BookFlight 23


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  BookHotel 24
sentences extracted for topic: BookHotel 24


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetDirections 25
sentences extracted for topic: GetDirections 25


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetPlaceDetails 26
sentences extracted for topic: GetPlaceDetails 26


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetWeather 27
sentences extracted for topic: GetWeather 27


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  SearchForInformation 28
sentences extracted for topic: SearchForInformation 28


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetNews 29
sentences extracted for topic: GetNews 29


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  PlayMusic 30
sentences extracted for topic: PlayMusic 30


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  ControlVolume 31
sentences extracted for topic: ControlVolume 31


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  SetAlarm 32
sentences extracted for topic: SetAlarm 32


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  CancelAlarm 33
sentences extracted for topic: CancelAlarm 33


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  SetReminder 34
sentences extracted for topic: SetReminder 34


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  MakeCall 35
sentences extracted for topic: MakeCall 35


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  SendMessage 36
sentences extracted for topic: SendMessage 36


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  SendEmail 37
sentences extracted for topic: SendEmail 37


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  CheckEmail 38
sentences extracted for topic: CheckEmail 38


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetTime 39
sentences extracted for topic: GetTime 39


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  GetDate 40
sentences extracted for topic: GetDate 40


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Greeting 41
sentences extracted for topic: Greeting 41


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Farewell 42
sentences extracted for topic: Farewell 42


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  AskHowAreYou 43
sentences extracted for topic: AskHowAreYou 43


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  ThankYou 44
sentences extracted for topic: ThankYou 44


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  TellAJoke 45
sentences extracted for topic: TellAJoke 45


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  TurnOnLights 46
sentences extracted for topic: TurnOnLights 46


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


outputs done for topic:  Social media interactions 47
sentences extracted for topic: Social media interactions 47
outputs done for topic:  Financial transactions 48
sentences extracted for topic: Financial transactions 48


For RLHF'd models (LLaMa-3.1 Instruct, etc.), an additional prompt formatting step is needed to ensure that the model is able to generate the desired output. The template is applied using `tokenizer.apply_chat_template` function, and basically adds formatting tokens to your prompt. Use it only with instruction-fine-tuned models.

In [None]:
# # %pip install jinja2>=3.1.0

# TASK_PROMPT = "Please answer my question. What is the capital of India?"
# TASK_CONVERSATION = [
#     # System Prompt: This is optional, and not all models support this.
#     # But use it if you need explicit instructions to be followed.
#     dict(role='system', content='You are a helpful assistant.'),
#     # Your message (as if on the web interface) goes here.
#     # Past history can be added to this conversation too.
#     dict(role='user', content=TASK_PROMPT)
# ]

# # Format the conversation to a text prompt, using apply chat template.
# conversation_prompt = tokenizer.apply_chat_template(
#     TASK_CONVERSATION,
#     tokenize=False,
#     # Needed to allow the model to start its reply instead of completing yours.
#     add_generation_prompt=True
# )
# # We skip special tokens because the template already adds them. This is an overlooked thing, so be careful.
# inputs = tokenizer(conversation_prompt, return_tensors="pt", add_special_tokens=False)

# # Generation process is the same as before.
# with torch.inference_mode():
#     outputs = model.generate(
#         **inputs.to(model.device),
#         temperature=0.2,
#         do_sample=True,
#         num_return_sequences=2,
#         num_beams=2,
#         max_new_tokens=10
#     )

#     outputs = tokenizer.batch_decode(
#         outputs, skip_special_tokens=True,
#         clean_up_tokenization_spaces=True
#     )

#     for output in outputs:
#         print(output)
#         print('-' * 50)
#         print()