In [1]:
import gc
import os
import pathlib
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM
import torch
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Any
import scml
from scml import pandasx as pdx
# Wall-clock timer for the whole notebook run; it is stopped and reported in the final cell.
tim = scml.Timer()
tim.start()

# Turn off tokenizer-level parallelism via the environment before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile cut points (not referenced in the cells visible here).
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Pandas behaviour/display options, applied in the same order as before.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases -- confirm
# against the installed pandas version before upgrading.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()
# Seed RNGs through the project helper for reproducibility.
scml.seed_everything()

In [2]:
# Choose the compute device once, in priority order:
# Apple MPS if available, otherwise the first CUDA GPU, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda", 0)
else:
    device = torch.device("cpu")

In [3]:
%%time
# Load the Llama-2 tokenizer and register a dedicated "<pad>" token.
# NOTE(review): this path has a "huggingface/" prefix while the model cell loads
# "meta-llama/Llama-2-7b-hf" -- confirm both resolve to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("huggingface/meta-llama/Llama-2-7b-hf")
tokenizer.add_special_tokens(dict(pad_token="<pad>"))

# Show the tokenizer configuration, the id assigned to the pad token, and the
# tensors the tokenizer produces as model inputs.
print(
    repr(tokenizer),
    f"pad_token_id={tokenizer.pad_token_id}",
    f"model_input_names={tokenizer.model_input_names}",
    sep="\n",
)

LlamaTokenizerFast(name_or_path='huggingface/meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
pad_token_id=32000
model_input_names=['input_ids', 'attention_mask']
CPU times: user 29.4 ms, sys: 4.96 ms, total: 34.3 ms
Wall time: 35.1 ms


In [4]:
%%time
# Load the causal-LM weights in half precision (fp16) for inference and place
# them on the selected device.
# NOTE(review): the tokenizer cell loads from "huggingface/meta-llama/Llama-2-7b-hf";
# confirm this identifier points at the same checkpoint.
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
).to(device)

# Tell the model which id is padding, then resize the embedding matrix to the
# tokenizer's current vocabulary size (which includes the added pad token).
model.config.pad_token_id = tokenizer.pad_token_id
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Print the module tree as a quick sanity check of the architecture.
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [5]:


# Smoke test: run one prompt through the model and display the decoded text.
prompt = "Hey, are you conscious? Can you talk to me?"
# The tokenizer returns both input_ids and attention_mask; move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate
# Pass the whole encoding rather than only input_ids, so the attention mask
# produced by the tokenizer reaches generate(). Without it, padded positions
# would not be masked if this cell were reused with padded or batched prompts.
# max_length is left at 256 (it bounds prompt plus generated tokens).
# NOTE(review): the saved output repeats one sentence until the length limit;
# presumably this is default greedy decoding -- consider sampling or a
# repetition penalty if more varied text is wanted.
generate_ids = model.generate(**inputs, max_length=256)
# Decode the first (only) sequence; the bare last expression is the cell's displayed output.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'Hey, are you conscious? Can you talk to me?\nI’m conscious, but I can’t talk.\nI’m conscious, but I can’t talk. I’m conscious, but I can’t talk.\nI’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk. I’m conscious, but I can’t talk.'

In [6]:
# Stop the notebook-wide timer started in the first cell and report the elapsed time.
tim.stop()
elapsed_text = str(tim.elapsed)
print(f"Total time taken {elapsed_text}")

Total time taken 0:00:35.554114
