In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Fix PyTorch's global RNG seed for this session.
torch.manual_seed(0)

# The auto-selecting variant is kept commented out; the device is pinned to CPU below.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'cpu'
print("Device is", device)

# Loading model
# Only the tokenizer is loaded here: the model load on the last line is commented out,
# so this cell does not bind `model`.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [None]:
# Display whether PyTorch reports the MPS backend as available.
torch.backends.mps.is_available()

In [None]:
# Display whether PyTorch reports CUDA as available.
torch.cuda.is_available()

In [None]:
# Report the model's memory footprint in GB, i.e. bytes divided by 1024**3.
# NOTE(review): the `model = ...` load in the first cell is commented out, so `model`
# must be bound by some other cell before this one runs.
print(f"Model size: {model.get_memory_footprint() / (1024**3):,} GB")

In [None]:
# Local import: BitsAndBytesConfig is not imported by any cell above this one.
from transformers import BitsAndBytesConfig

# Load the checkpoint in 8-bit, letting modules that are dispatched to the CPU stay in fp32.
# The settings go through `quantization_config`; the documented BitsAndBytesConfig field
# for the offload switch is `llm_int8_enable_fp32_cpu_offload`.
int8_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
model_int8 = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=int8_config,
)
print(f"Model size: {model_int8.get_memory_footprint():,} bytes")

In [None]:

def text_inference(model_list, text):
    """Run `text` through every model in `model_list` and collect the results.

    Each model is wrapped in a fresh ``ModelInference`` constructed with
    ``device='gpu'``, and that wrapper's ``inference`` method is called
    with ``text``.

    Returns:
        A list holding one inference result per model, in the same order
        as ``model_list``.
    """
    return [
        ModelInference(current_model, device='gpu').inference(text)  # Assuming default to GPU
        for current_model in model_list
    ]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from huggingface_hub import login

model_id = "TheBloke/zephyr-7B-alpha-AWQ"

# NOTE(review): float32 is requested for an AWQ checkpoint — confirm this dtype is intended.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)
# generate() is given an explicit pad token id below; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Single-turn conversation in the chat-template message format.
messages = [
    {"role": "user", "content": "heyyyy"}
]
# add_generation_prompt=True makes the rendered prompt end with the assistant-turn header,
# so generation starts a reply rather than continuing the user turn.
encoded_string = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Put the prompt ids on the same device as the model.
model_inputs = encoded_string.to(model.device)

generated_ids = model.generate(model_inputs, pad_token_id=tokenizer.pad_token_id, max_new_tokens=1000, do_sample=True)
# The decoded text contains the prompt as well as the generated continuation.
decoded = tokenizer.batch_decode(generated_ids)
print("Model: ", decoded[0])

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "TheBloke/Llama-2-7b-Chat-AWQ"

# Load the quantized model and the matching tokenizer
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=False,
)

# The user prompt is embedded in an [INST] / <<SYS>> wrapper together with a system message
prompt = "Tell me about AI"
prompt_template=f'''[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{prompt}[/INST]

'''

print("\n\n*** Generate:")

# Tokenize the wrapped prompt and move the ids to the GPU
encoded_prompt = tokenizer(prompt_template, return_tensors='pt')
tokens = encoded_prompt.input_ids.cuda()

# Sampling settings forwarded to generate()
sampling_settings = dict(
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)
generation_output = model.generate(tokens, **sampling_settings)

# Decode and show the first (only) returned sequence
first_sequence = generation_output[0]
print("Output: ", tokenizer.decode(first_sequence))


In [None]:
from awq import AutoAWQForCausalLM
# Hub id of the AWQ checkpoint; the chat-loop cell further down reuses `model_id` to load the tokenizer.
model_id = "TheBloke/Llama-2-7B-Chat-AWQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32,device_map='auto')
# Load the quantized checkpoint through AutoAWQ and rebind the notebook-wide `model`.
model = AutoAWQForCausalLM.from_quantized(model_id, fuse_layers=True,
                                    trust_remote_code=False, safetensors=True)

# model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map='auto')
# print(f"Model Size: {model.get_memory_footprint() / (1024**3):,} GB")

In [None]:
from awq import AutoAWQForCausalLM
# NOTE(review): this cell repeats the preceding cell's load verbatim; running both calls
# from_quantized twice for the same checkpoint — consider keeping only one of them.
model_id = "TheBloke/Llama-2-7B-Chat-AWQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32,device_map='auto')
# Load the quantized checkpoint through AutoAWQ and rebind the notebook-wide `model`.
model = AutoAWQForCausalLM.from_quantized(model_id, fuse_layers=True,
                                    trust_remote_code=False, safetensors=True)

# model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map='auto')
# print(f"Model Size: {model.get_memory_footprint() / (1024**3):,} GB")

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from huggingface_hub import login


# Load the tokenizer once, before the loop: it does not depend on the user's input,
# so reloading it on every turn only repeats the same work.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# generate() is given an explicit pad token id below; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Simple REPL: each turn is sent to the model on its own (no history is carried over);
# an empty or whitespace-only input ends the loop.
while True:

    # User input
    user_input = input("You: ")
    if not user_input.strip():
        print("Exiting...")
        break

    messages = [
        {"role": "user", "content": user_input}
    ]
    # add_generation_prompt=True asks the template to end the prompt with the assistant-turn
    # header when the template defines one.
    encoded_string = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # NOTE(review): the device is hard-coded to CUDA; this cell requires a GPU.
    model_inputs = encoded_string.to('cuda')

    generated_ids = model.generate(model_inputs, pad_token_id=tokenizer.pad_token_id, max_new_tokens=1000, do_sample=True)
    # The decoded text contains the prompt as well as the generated continuation.
    decoded = tokenizer.batch_decode(generated_ids)
    print("Model: ", decoded[0])

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
# # To use a different branch, change revision
# # For example: revision="gptq-4bit-64g-actorder_True"
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
#                                              device_map="auto",
#                                              trust_remote_code=False,
#                                              revision="main")

# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# prompt = "Tell me about AI"
# prompt_template=f'''[INST] <<SYS>>
# You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
# <</SYS>>
# {prompt}[/INST]

# '''

# print("\n\n*** Generate:")

# input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
# print(tokenizer.decode(output[0]))

# # Inference can also be done using transformers' pipeline

# print("*** Pipeline:")
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=512,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.95,
#     top_k=40,
#     repetition_penalty=1.1
# )

# print(pipe(prompt_template)[0]['generated_text'])


In [None]:
# Hard-coded sample transcript using [INST] ... [/INST] turn markers and </s> terminators.
# It is not referenced by any other cell visible in this notebook.
text = """ [INST] asdas [/INST]  [INST] asdas [/INST] It seems like you have typed an irrelevant word "asdas" in your message. Could you please provide more context or clarify your question so I can assist you better?</s></s> [INST] Hahaah [/INST] I apologize if my previous response was unhelpful. I\'m here to help answer questions to the best of my ability. If you have a question related to programming, technology, mathematics, or any other field, please feel free to ask and I will do my best to provide an accurate and helpful response.\n\nRegarding the text "asdas" you have typed, it appears to be meaningless. If you meant something else, could you please clarify?\n\nBest regards,\nAI Assistant.</s>"""

In [None]:
# Hard-coded fixture: a 4-tuple whose first element is a raw transcript string
# ([INST] ... [/INST] markers, <s> / </s> tokens) and whose remaining elements are None.
output = ('<s> [INST] asdas [/INST]<s>  [INST] asdas [/INST] It seems like you have typed an irrelevant word "asdas" in your message. Could you please provide more context or clarify your question so I can assist you better?</s></s> [INST] Hahaah [/INST]<s>  [INST] asdas [/INST]<s>   [INST] asdas [/INST] It seems like you have typed an irrelevant word "asdas" in your message. Could you please provide more context or clarify your question so I can assist you better?</s></s>  [INST] Hahaah [/INST] I apologize if my previous response was unhelpful. I\'m here to help answer questions to the best of my ability. If you have a question related to programming, technology, mathematics, or any other field, please feel free to ask and I will do my best to provide an accurate and helpful response.\n\nRegarding the text "asdas" you have typed, it appears to be meaningless. If you meant something else, could you please clarify?\n\nBest regards,\nAI Assistant.</s></s> [INST] asxnisdcnwi [/INST] It seems like you have typed an unintelligible phrase "asxnisdcnwi". Could you please provide more context or clarify your question so I can assist you better? If you meant to ask a specific question related to programming, technology, mathematics, or any other field, feel free to ask.\n\nBest regards,\nAI Assistant.</s>', None, None, None)
# Accumulator filled by the relabelling cell that follows (name kept as spelled; other cells reference it).
transformedutput = []

In [None]:
# Rewrite every transcript in `output`, replacing the [INST] / [/INST] markers with
# alternating ASSISTANT / USER labels. The results are collected in a fresh local list,
# so re-running this cell no longer fails by calling .append on the tuple that a previous
# run stored in `transformedutput`.
relabelled_outputs = []

for chatbot_output in output:
    # None entries are passed through unchanged so positions stay aligned with `output`.
    if chatbot_output is None:
        relabelled_outputs.append(chatbot_output)
        continue

    new_output = ''
    # Toggles between 0 and 1 for each piece: 0 -> label as ASSISTANT, 1 -> label as USER.
    # The text before the first [INST] is therefore labelled ASSISTANT.
    counter = 0

    for prompt in chatbot_output.split('[INST]'):
        for prompt_output in prompt.split('[/INST]'):
            if counter == 0:
                new_output += 'ASSISTANT' + prompt_output + '\n\n'
                counter = 1
            else:
                new_output += 'USER' + prompt_output + '\n'
                counter = 0

    relabelled_outputs.append(new_output)

# Publish the result under the notebook-wide name as an immutable tuple.
transformedutput = tuple(relabelled_outputs)

In [None]:
# Display the relabelled transcripts as a tuple (works whether `transformedutput` is a list or a tuple).
tuple(transformedutput)

In [None]:
# Print the raw, untransformed `output` tuple.
print(output)

In [None]:
# list() accepts at most one (iterable) argument; passing the four values separately raises
# TypeError. They are wrapped in a single tuple so the cell displays them as a 4-element list.
list(('<s> [INST] asdas [/INST]<s>  [INST] asdas [/INST] It seems like you have typed an irrelevant word "asdas" in your message. Could you please provide more context or clarify your question so I can assist you better?</s></s> [INST] Hahaah [/INST]<s>  [INST] asdas [/INST]<s>   [INST] asdas [/INST] It seems like you have typed an irrelevant word "asdas" in your message. Could you please provide more context or clarify your question so I can assist you better?</s></s>  [INST] Hahaah [/INST] I apologize if my previous response was unhelpful. I\'m here to help answer questions to the best of my ability. If you have a question related to programming, technology, mathematics, or any other field, please feel free to ask and I will do my best to provide an accurate and helpful response.\n\nRegarding the text "asdas" you have typed, it appears to be meaningless. If you meant something else, could you please clarify?\n\nBest regards,\nAI Assistant.</s></s> [INST] asxnisdcnwi [/INST] It seems like you have typed an unintelligible phrase "asxnisdcnwi". Could you please provide more context or clarify your question so I can assist you better? If you meant to ask a specific question related to programming, technology, mathematics, or any other field, feel free to ask.\n\nBest regards,\nAI Assistant.</s>', None, None, None))
