In [1]:
# Uncomment to install the dependencies; accelerate and bitsandbytes allow
# efficient inference with the option load_in_8bit=True. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
# %pip install -Uqq transformers accelerate bitsandbytes

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Candidate checkpoints that can be selected by index for loading.
model_names = [
    'EleutherAI/pythia-2.8B-deduped',
    'bigscience/bloom-3b',
    'cerebras/Cerebras-GPT-2.7B',  # bigger, may overflow RAM
]

In [4]:
# Select the checkpoint to load; index 2 is the last entry of `model_names`.
model_name = model_names[2]

In [5]:
# Progress message shown before the tokenizer is created.
print("Making string tokenizer...")

Making string tokenizer...


In [6]:
# Build the tokenizer that belongs to the selected checkpoint. Its files
# (config, vocabulary, merges) are fetched if they are not available locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json: 100%|██████████| 361/361 [00:00<00:00, 2.64MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.62MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 764kB/s]


In [7]:
# Progress message shown before the (slow) weight download / load.
print(f"Loading model `{model_name}`...")

Loading model `cerebras/Cerebras-GPT-2.7B`...


In [8]:
# Load the causal language model with every memory-saving option switched on,
# so that the checkpoint has a chance of fitting on modest hardware.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision to reduce memory
    load_in_8bit=True,  # 8-bit weights to reduce memory further (needs bitsandbytes)
    low_cpu_mem_usage=True,  # try to keep RAM usage low while loading
    device_map="auto",  # place parameters on gpu, cpu or disk automatically
    offload_folder="offload",  # directory that receives offloaded weights
    offload_state_dict=True,  # allow offloading onto disk if needed
)

pytorch_model.bin: 100%|██████████| 10.7G/10.7G [01:45<00:00, 102MB/s] 


In [9]:
# Progress message shown once the weights are in memory.
print("Finished loading model")

Finished loading model


In [10]:
import time

In [21]:
# Seed the random number generators, then choose the prompt to complete.
set_seed(1)
prompt = 'The following is Python code for plotting the sine function from 0 to 2pi:'
# Alternative prompts (uncomment exactly one to use it instead):
# prompt = '''La ciudad más grande de América del Norte es:'''
# prompt = '''What is a good question for a course in numerical methods for chemical engineers?'''
# prompt = '''What is a number that can be represented exactly in decimal scientific notation, but not in binary floating point?'''

In [22]:
# Echo the prompt so the generated text below can be matched to its input.
print(f'Performing model inference with prompt `{prompt}`')

Performing model inference with prompt `The following is Python code for plotting the sine function from 0 to 2pi:`


In [23]:
# Header line naming the checkpoint that produced the output.
print(f"Output {model_name}:")

Output cerebras/Cerebras-GPT-2.7B:


In [24]:
# Record the wall-clock start; the elapsed time is reported after decoding,
# so it covers tokenization, generation and decoding together.
start_time = time.time()

In [25]:
# Tokenize the prompt into PyTorch tensors and move them to the device the
# model reports. The model is loaded with device_map="auto", so its weights
# are not guaranteed to sit on the default "cuda" device; following
# `model.device` avoids a hard-coded device that fails on CPU-only machines.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [26]:
# Generate up to 100 new tokens after the prompt. The padding token is set
# explicitly to the end-of-sequence token: this is the same fallback that
# `generate` would otherwise pick on its own while emitting a warning.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [27]:
# Convert the generated token ids back into text; the result is indexed per
# generated sequence in the next cell.
outputs = tokenizer.batch_decode(generated_ids)

In [28]:
# Show the text of the first sequence (a single prompt was submitted); the
# printed text starts with the prompt itself, followed by the continuation.
print(outputs[0])

The following is Python code for plotting the sine function from 0 to 2pi:
import matplotlib.pyplot as plt
import numpy as np

def sine(x):
    return np.sin(x)

x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x)

plt.plot(x, y)
plt.show()

The plot looks like this:

The problem is that the sine function is


In [29]:
# Report the seconds elapsed since `start_time` was recorded.
print(f'Inference time: {time.time() - start_time:.2f}s')

Inference time: 9.89s
