In [1]:
%load_ext autotime

time: 114 µs (started: 2024-03-16 12:03:26 -03:00)


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
device = "cuda" # the device to load the model onto

model_id = './Qwen'
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage= True,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
import textwrap
textwrap.wrap(response, width=80)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


"A large language model is a type of artificial intelligence (AI) model that has been trained on an extensive corpus of text data, typically billions or trillions of words. These models are designed to have a massive number of parameters, allowing them to learn complex patterns and relationships within language. They are often based on deep neural networks, specifically transformer architectures like those found in the GPT (Generative Pre-trained Transformer) family.\n\nLarge language models can perform a variety of natural language processing tasks, such as text generation, language translation, sentiment analysis, question answering, and even creative writing. They use their understanding of context and language structure to generate responses or complete tasks by predicting the most likely sequence of words given an input prompt or context.\n\nDue to their size and training, these models are capable of producing highly adaptable and contextually relevant outputs, but they also requi

In [1]:
from langchain_community.llms import CTransformers
model_id = '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/qwen1_5-14b-chat-q5_0.gguf'
config = {
    'gpu_layers': 10000,
    #'model_type': "mistral",
    'max_new_tokens' : 2048,
    'context_length' : 4096,
    'temperature' : 0.3,
    }
#'/NousResearch/gguf/nous-hermes-2-solar-10.7b.Q5_K_M.gguf'
llm = CTransformers(
    #model = '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/NousResearch/gguf/nous-hermes-2-solar-10.7b.Q5_K_M.gguf',#"mistral-7b-instruct-v0.1.Q4_K_S.gguf",
    model = model_id,
    config = config,
    model_type = 'llama',
)
print(llm.invoke('AI is going to'))

RuntimeError: Failed to create LLM 'llama' from '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/qwen1_5-14b-chat-q5_0.gguf'.

In [1]:
from ctransformers import AutoModelForCausalLM
model_id = 'qwen1_5-14b-chat-q5_0.gguf'

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained(model_id, model_file="codellama-7b-instruct.q4_K_M.gguf", model_type="llama", gpu_layers=50)

print(llm("AI is going to"))


RuntimeError: Failed to create LLM 'llama' from 'qwen1_5-14b-chat-q5_0.gguf'.

In [2]:
from llama_cpp import Llama
model_id = '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/qwen1_5-14b-chat-q5_0.gguf'

llm = Llama(
    model_path= model_id,
    # n_gpu_layers=-1, # Uncomment to use GPU acceleration
    # seed=1337, # Uncomment to set a specific seed
    # n_ctx=2048, # Uncomment to increase the context window
)
output = llm(
      "Q: Name the planets in the solar system? A: ", # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output['choices'][0]['text'])

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 2 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 1: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6, VMM: yes
llama_model_loader: loaded meta data with 21 key-value pairs and 483 tensors from /home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/qwen1_5-14b-chat-q5_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.name str              = Qwen1.5-14B-Chat-AWQ-fp16
llama_model_loader: - kv   2:                          qwen2.block_count u32              = 40
llama_model_loader: - kv   3:                       qwen2.context_length u32              = 32768
llama_model_loader

Q: Name the planets in the solar system? A: 1. Mercury 2. Venus 3. Earth 4. Mars 5. Jupiter 6. Saturn 7. Uranus 8. Neptune


In [5]:
print(output['choices'][0]['text'])

Q: Name the planets in the solar system? A: 1. Mercury 2. Venus 3. Earth 4. Mars 5. Jupiter 6. Saturn 7. Uranus 8. Neptune


In [6]:
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(["%s==%s" % (i.key, i.version)
   for i in installed_packages])
print(installed_packages_list)

['accelerate==0.28.0', 'aiohttp==3.9.3', 'aiosignal==1.3.1', 'annotated-types==0.6.0', 'anyio==4.3.0', 'asttokens==2.4.1', 'attrs==23.2.0', 'auto-gptq==0.7.1', 'bitsandbytes==0.43.0', 'brotli==1.0.9', 'certifi==2024.2.2', 'charset-normalizer==2.0.4', 'coloredlogs==15.0.1', 'comm==0.2.2', 'ctransformers==0.2.27', 'dataclasses-json==0.6.4', 'datasets==2.18.0', 'debugpy==1.6.7', 'decorator==5.1.1', 'dill==0.3.8', 'exceptiongroup==1.2.0', 'executing==2.0.1', 'filelock==3.13.1', 'frozenlist==1.4.1', 'fsspec==2024.2.0', 'gekko==1.0.7', 'gmpy2==2.1.2', 'greenlet==3.0.3', 'huggingface-hub==0.21.4', 'humanfriendly==10.0', 'idna==3.4', 'importlib-metadata==7.0.2', 'ipykernel==6.29.3', 'ipython==8.22.2', 'jedi==0.19.1', 'jinja2==3.1.3', 'jsonpatch==1.33', 'jsonpointer==2.4', 'jupyter-client==8.6.1', 'jupyter-core==5.7.2', 'langchain-community==0.0.28', 'langchain-core==0.1.32', 'langchain-text-splitters==0.0.1', 'langchain==0.1.12', 'langsmith==0.1.27', 'markupsafe==2.1.3', 'marshmallow==3.21.1',

In [5]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

model_id = '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/qwen1_5-14b-chat-q5_0.gguf'

template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = -1  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 2048  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path= model_id,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    temperature=0.3,
    max_tokens=2048,
    #context_length = 2048,
    top_p=1,
    verbose=False,  # Verbose is required to pass to the callback manager
)

# llm_chain = LLMChain(prompt=prompt, llm=llm)
# question = "How can i make an agent using langchain?"
# llm_chain.run(question)
question = """
Question: Give me a short introduction to large language model
"""
response= llm.invoke(question)

# response = llm.invoke({'question': 'Give me a short introduction to large language model'})
import textwrap
textwrap.wrap(response, width=80)

Answer:

Title: "Colbert vs. Oliver: The Intellectual Rap-off"

Introduction:
In a cultural phenomenon that combines wit, intelligence, and the undeniable talent of two late-night TV hosts, Stephen Colbert and John Oliver face off in an epic rap battle.

Verse 1 - Stephen Colbert:
I'm the king of Late Night, 
With my red-faced jokes, they're never flat.
I've got a bone to pick with you, 
John, your news is fine, but where's your fun?

Verse 2 - John Oliver:
You may be the king of Late Night's jest,
But I've got a different kind of message to address.
My news may be serious, but it's not without humor,
Stephen, let's give our audiences a battle they'll never forget.

Conclusion:
As the rap battle concludes, it's clear that Stephen Colbert and John Oliver are not only masters of their respective domains but also formidable opponents when it comes to intellectual rap-offs.























































































































['Answer:  Title: "Colbert vs. Oliver: The Intellectual Rap-off"  Introduction: In',
 'a cultural phenomenon that combines wit, intelligence, and the undeniable talent',
 'of two late-night TV hosts, Stephen Colbert and John Oliver face off in an epic',
 "rap battle.  Verse 1 - Stephen Colbert: I'm the king of Late Night,  With my",
 "red-faced jokes, they're never flat. I've got a bone to pick with you,  John,",
 "your news is fine, but where's your fun?  Verse 2 - John Oliver: You may be the",
 "king of Late Night's jest, But I've got a different kind of message to address.",
 "My news may be serious, but it's not without humor, Stephen, let's give our",
 "audiences a battle they'll never forget.  Conclusion: As the rap battle",
 "concludes, it's clear that Stephen Colbert and John Oliver are not only masters",
 'of their respective domains but also formidable opponents when it comes to',
 'intellectual rap-offs.']

In [2]:
llm.invoke('Give me a short introduction to large language model')

''