In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
model_dir = Path("C:\mydata\pretrained\meta-llama\Llama-2-7b-chat-hf")

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)

  pd.set_option("use_inf_as_na", True)


In [4]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
print(f"pad_token_id={tokenizer.pad_token_id}")

LlamaTokenizerFast(name_or_path='C:\mydata\pretrained\meta-llama\Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', 'attention_mask']
pad_token_id=None


In [6]:
%%time
# device_map="auto" requires `pip install accelerate`
model = AutoModelForCausalLM.from_pretrained(str(model_dir), device_map="auto")
#model = AutoModelForCausalLM.from_pretrained(str(model_dir), torch_dtype=torch.float16)
#model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: total: 16.4 s
Wall time: 26.5 s


In [7]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [8]:
prompt = """<s>[INST] <<SYS>>You are a 12 year-old student who must answer the following QUESTION related to the ARTICLE.
Your answer must be in your own words and use complete sentences.
QUESTION: Summarize the pros and cons of pushing back school start times. Which side of the argument does the author support with more evidence? Cite evidence from the text.
ARTICLE: As your phone’s alarm blares, you slowly open one eye. How can it be morning already? It seems like it was only a few hours ago that you finished your homework.

You close your eyes for just five more minutes, but then your mom bangs on the door: “Time to get up or you’ll miss the bus!”

If this sounds familiar, you’re not alone. Many teens struggle to get enough sleep. But when schools switched to remote learning in 2020 because of the pandemic, classes often began later. That, combined with many students not having to commute to school, allowed teens to get some extra shut-eye.

Now, as in-person classes resume, a number of schools are going back to their earlier start times. But some kids are trying to change that. For example, in Cherry Hill, New Jersey, a group of students have formed an organization called Cherry Hill Students for Later School Start Times. It wants local middle and high schools to push back the start of the school day to 8:30 a.m. (Currently, middle schools in Cherry Hill start at 8 a.m., and high schools start at 7:30.) 

In fact, schools in several states have switched to later start times in recent years. Research shows that inadequate sleep can negatively affect students’ health and grades. 

But not everyone thinks kids should be sleeping in. Opponents say starting classes later is expensive for school districts and limits students’ time for after-school activities.

Do we need a wake-up call when it comes to school start times? Keep reading, then decide what you think.

Let Kids Sleep In

Starting the school day too early can deprive teens of much-needed sleep, experts say. According to the Sleep Foundation, nearly 60 percent of middle school students and more than 70 percent of high school students in the U.S. aren’t getting the recommended 8.5 to 9.5 hours of sleep a night. 

Studies show that well-rested teens are more likely to get good grades and less likely to be in car accidents or suffer from depression.

What’s more, experts say, teens are biologically wired to fall asleep later at night. As a result, some students may get as few as five hours of sleep before they need to be up for their first class.

As Aiden Rood, the co-founder of Cherry Hill Students for Later School Start Times, explains: “Kids are safer and healthier in general when they start later, and they do better in school.”

You Snooze, You Lose?

Many school districts, however, say delaying the start of the school day would create a number of challenges. 

For starters, districts would have to reschedule school buses—and perhaps pay for more of them. There’s also the concern that later school start times could leave some kids walking home in the dark. 

Shifting school hours could also create child care problems for some parents. In addition, opponents say, later school start times would leave teens with fewer hours for after-school activities and homework. 

That’s what Erin Isherwood, a parent in Chico, California, is worried about. The start time at her kids’ high school was pushed back 45 minutes this year.

“My son has an exercise class that he absolutely loves at a gym” that was right after school last year, she explains. “[Now] he can’t go.”
<</SYS>>
[/INST]
"""

In [9]:
%%time
inputs = tokenizer(prompt, return_tensors="pt").to(device)
gids = model.generate(
    inputs.input_ids,
    max_new_tokens=512,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=2,
)
outputs = tokenizer.batch_decode(gids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for i in range(len(outputs)):
    out = outputs[i][len(prompt):].strip()
    print(f"=====  ANSWER {i+1}  =====\n{out}\n\n")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 15.99 GiB of which 0 bytes is free. Of the allocated memory 14.61 GiB is allocated by PyTorch, and 559.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)