In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from datasets import load_dataset
import jsonlines
from pprint import pprint


# Load instruction tuned dataset

In [9]:
instruction_tuned_datset = load_dataset('tatsu-lab/alpaca', split='train', streaming=True)

In [15]:
n = 3
top_n = instruction_tuned_datset.take(n)
for i in top_n:
    pprint(i, sort_dicts=False)

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that '
         'appropriately completes the request.\n'
         '\n'
         '### Instruction:\n'
         'Give three tips for staying healthy.\n'
         '\n'
         '### Response:\n'
         '1.Eat a balanced diet and make sure to include plenty of fruits and '
         'vegetables. \n'
         '2. Exercise regularly to keep your body active and strong. \n'
         '3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?',
 'input': '',
 'output': 'The three primary colors are red, blue, and yellow.',
 'text': 'Below 

In [17]:
instruction_tuned_datset.dataset_size

44095043

In [18]:
instruction_tuned_datset.features

{'instruction': Value(dtype='string', id=None),
 'input': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [19]:
prompt_template_with_input = """Below is an instruction that describes a task, paired with an input that provides furthur context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

prompt_template_without_input = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [22]:
processed_data = []
for i in top_n:
    if not i['input']:
        processed_prompt = prompt_template_without_input.format(instruction=i['instruction'])
    else:
        processed_prompt = prompt_template_with_input.format(instruction=i['instruction'], input=i['input'])
    processed_data.append({'input':processed_prompt, 'output':i['output']})

In [25]:
pprint(processed_data[0], sort_dicts=False)

{'input': 'Below is an instruction that describes a task. Write a response '
          'that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Give three tips for staying healthy.\n'
          '\n'
          '### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


In [27]:
dataset_hf = load_dataset('lamini/alpaca')
print(dataset_hf)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 52002
    })
})


# Compare non-instruction-tuned vs. instruction-tuned models

In [None]:
import lamini
lamini.api_key = "get you api key here -  https://app.lamini.ai/account"

llm = lamini.Lamini("meta-llama/Meta-Llama-3-8B-Instruct")
print(llm.generate("How are you?"))

In [34]:
non_instruct_model = lamini.Lamini('meta-llama/Llama-2-7b-hf')
non_instruct_output = non_instruct_model.generate('Tell me how to train my dog to sit')
pprint(non_instruct_output)

('.\n'
 'Tell me how to train my dog to sit. I have a 10 month old puppy and I want '
 'to train him to sit. I have tried the treat method and the verbal command '
 "method. I have tried both and he just doesn't seem to get it. I have tried "
 'to get him to sit by putting my hand on his back and pushing him down. I '
 'have tried to get him to sit by putting my hand on his back and pushing him '
 'down. I have tried to get him to sit by putting my hand on his back and '
 'pushing him down. I have tried to get him to sit by putting my hand on his '
 'back and pushing him down. I have tried to get him to sit by putting my hand '
 'on his back and pushing him down. I have tried to get him to sit by putting '
 'my hand on his back and pushing him down. I have tried to get him to sit by '
 'putting my hand on his back and pushing him down. I have tried to get him to '
 'sit by putting my hand on his back and pushing him down. I have tried to get '
 'him to sit by putting my hand on his bac

In [35]:
instruct_model = lamini.Lamini('meta-llama/Llama-2-7b-chat-hf')
instruct_output = instruct_model.generate('Tell me how to train my dog to sit')
pprint(instruct_output)

(' on command.\n'
 'Training a dog to sit on command is a basic obedience command that can be '
 "achieved with patience, consistency, and positive reinforcement. Here's a "
 'step-by-step guide on how to train your dog to sit on command:\n'
 '\n'
 '1. Choose a quiet and distraction-free area: Find a quiet area with minimal '
 'distractions where your dog can focus on you.\n'
 "2. Have treats ready: Choose your dog's favorite treats and have them ready "
 'to use as rewards.\n'
 '3. Stand in front of your dog: Stand in front of your dog and hold a treat '
 'close to their nose.\n'
 '4. Move the treat up and back: Slowly move the treat up and back, towards '
 'your dog\'s tail, while saying "sit" in a calm and clear voice.\n'
 '5. Dog will sit: As you move the treat, your dog will naturally sit down to '
 'follow the treat. The moment their bottom touches the ground, say "good sit" '
 'and give them the treat.\n'
 '6. Repeat the process: Repeat steps 3-5 several times, so your dog start

# smaller models

In [36]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m')
model = AutoModelForCausalLM.from_pretrained('EleutherAI/pythia-70m')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    #Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length = max_input_tokens
    )

    # Generate
    device = model.device
    generate_tokens_with_prompt = model.generate(
        input_ids = input_ids.to(device),
        max_length = max_output_tokens
    )

    # Decode
    generate_text_with_prompt = tokenizer.batch_decode(generate_tokens_with_prompt, skip_special_tokens=True)
    generate_text_answer = generate_text_with_prompt[0][len(text):]
    return generate_text_answer

In [41]:
finetuning_dataset = load_dataset('lamini/lamini_docs')
print(finetuning_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [42]:
test_sample = finetuning_dataset['test'][0]
print(test_sample)
print(inference(test_sample['question'], model, tokenizer))

{'question': 'Can Lamini generate technical documentation or user manuals for software projects?', 'answer': 'Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.', 'input_ids': [5804, 418, 4988, 74, 6635, 7681, 10097, 390, 2608, 11595, 84, 323, 3694, 6493, 32, 4374, 13, 418, 4988, 74, 476, 6635, 7681, 10097, 285, 2608, 11595, 84, 323, 3694, 6493, 15, 733, 4648, 3626, 3448, 5978, 5609, 281, 2794, 2590, 285, 44003, 10097, 326, 310, 3477, 281, 2096, 323, 1097, 7681, 285, 1327, 14, 48746, 4212, 15, 831, 476, 5321, 12259, 247, 1534, 2408, 273, 673, 285, 3434, 275, 6153, 10097, 13, 6941, 731, 281, 2770, 327, 643, 7794, 273, 616, 6493, 15], 'attention

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




I have a question about the following:

How do I get the correct documentation to work?

A:

I think you need to use the following code:

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following


# Compare to finetuned small model

In [43]:
instruction_model= AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")



In [45]:
print(inference(test_sample['question'], instruction_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Yes, Lamini can generate technical documentation or user manuals for software projects. This can be achieved by providing a prompt for a specific technical question or question to the LLM Engine, or by providing a prompt for a specific technical question or question. Additionally, Lamini can be trained on specific technical questions or questions to help users understand the process and provide feedback to the LLM Engine. Additionally, Lamini
