In [1]:
import torch

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)
model_path = "/home/ptummal3/Downloads/clip/llama-3.1-transformers-8b-instruct-v2"

tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForCausalLM.from_pretrained(model_path)

model.to(device)
with open("prompt.txt", "r", encoding='utf-8') as file:
    prompt = file.read()
# print(prompt)
inputs = tokenizer(prompt, return_tensors="pt", max_length=4096)
outputs = model.generate(**inputs, max_length=1024)


response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.56s/it]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.cuda.amp import autocast

# Set the environment variable for expandable memory segments to avoid fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["FLASH_ATTENTION"] = "0"

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Define the model path
model_path = "/home/ptummal3/Downloads/clip/llama-3.1-transformers-8b-instruct-v2"

# Load the tokenizer and model with memory-efficient options
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.to(device)
model.config.use_cache = False
model.gradient_checkpointing_enable()  # Enable gradient checkpointing for memory savings

# Read the entire text from prompt.txt file
with open("prompt.txt", "r", encoding='utf-8') as file:
    prompt = file.read().strip()  # Remove any leading/trailing whitespace

# Tokenize the input text and move tensors to the same device as the model
inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move input tensors to GPU



# Generate the response with the model, using mixed precision and disabling gradients for inference
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model.generate(**inputs, max_length=1024)

# Decode the generated output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

# Clear GPU cache after generation to free memory
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:  50%|█████     | 2/4 [00:48<00:48, 24.20s/it]


KeyboardInterrupt: 

In [3]:
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import json

def main(input_question):
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Set seed for reproducibility
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Load the tokenizer
    print("Loading tokenizer...")
    model_path = "/home/ptummal3/Downloads/clip/llama-3.1-transformers-8b-instruct-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, padding_side='left')
    tokenizer.pad_token = tokenizer.eos_token

    # Add special tokens and resize embeddings
    special_tokens_dict = {'additional_special_tokens': ['<s>', '</s>', '[INST]', '[/INST]', '<<SYS>>', '<</SYS>>']}
    tokenizer.add_special_tokens(special_tokens_dict)

    # Define system message
    system_message = ""

    # Load the base model with 8-bit precision
    print("Loading model...")
    bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map='auto',
        token=True
    ).to('cuda:0')

    # Resize token embeddings in case new tokens were added
    model.resize_token_embeddings(len(tokenizer))

    # Load the fine-tuned LoRA model
    # print("Loading fine-tuned model...")
    # peft_model_path = model_path
    # model = PeftModel.from_pretrained(base_model, peft_model_path)
    # model.eval()

    # Move the model to the appropriate device (GPU or CPU)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Set the padding token ID
    model.config.pad_token_id = tokenizer.pad_token_id

    # Function to format the input question
    def format_question(question):
        return f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{question} [/INST]"

    # Prepare and format the single prompt
    prompt = format_question(input_question)

    # Tokenize prompt
    print("Tokenizing input...")
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=4096).to(device)

    inputs = inputs.to(device)

    # Generate output
    print("Generating output...")
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=512,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_answer = output_text[len(prompt):].strip()

    print(f"\nGenerated Answer: {generated_answer}")

if __name__ == '__main__':
    # with open("prompt.txt", "r", encoding='utf-8') as file:
    #     prompt = file.read().strip() 
    prompt = 'What is the value if we multiply 3 with 5?'
    main(prompt)


Loading tokenizer...
Loading model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.93s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You shouldn't move a model that is dispatched using accelerate hooks.
You shouldn't move a model that is dispatched using accelerate hooks.


Tokenizing input...
Generating output...




RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

In [6]:
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import json
import re

def main(input_question):
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Set seed for reproducibility
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Load the tokenizer
    print("Loading tokenizer...")
    model_path = "/home/ptummal3/Downloads/clip/llama-3.1-transformers-8b-instruct-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, padding_side='left')
    tokenizer.pad_token = tokenizer.eos_token

    # Add special tokens and resize embeddings
    special_tokens_dict = {'additional_special_tokens': ['<s>', '</s>', '[INST]', '[/INST]', '<<SYS>>', '<</SYS>>']}
    tokenizer.add_special_tokens(special_tokens_dict)

    # Define system message
    system_message = ""

    # Load the base model with 8-bit precision
    print("Loading model...")
    # bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        # quantization_config=bnb_config,
        device_map=None  # Remove or set to None to load the model on a single GPU
    ).to(device)

    # Resize token embeddings in case new tokens were added
    model.resize_token_embeddings(len(tokenizer))

    # Set the padding token ID
    model.config.pad_token_id = tokenizer.pad_token_id

    # Function to format the input question
    def format_question(question):
        return f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{question} [/INST]"

    # Prepare and format the single prompt
    prompt = format_question(input_question)

    # Tokenize prompt
    print("Tokenizing input...")
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=4096).to(device)

    # Generate output
    print("Generating output...")
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=512,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # with open("output.txt", "w", encoding='utf-8') as file:
    #     file.write(output_text)
    # print(output_text)
    # json_pattern = r'\[\s*\{(?:.|\n)?\}\s\]'

    # json_match = re.search(json_pattern, output_text)
    # if json_match:
    #     json_content = json_match.group(0)

    #     print(f"\nGenerated Answer: {json_content}")
    # else:
    #     print('No JSON content')
    generated_answer = output_text[len(prompt)-1:].strip()
    
    print(f"\nGenerated Answer: {generated_answer}")
    del model
    torch.cuda.empty_cache()

if __name__ == '__main__':
    # prompt = 'What is the value if we multiply 3 with 5?'
    with open("prompt.txt", "r", encoding='utf-8') as file:
        prompt = file.read().strip() 
    main(prompt)



Loading tokenizer...
Loading model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.28s/it]


Tokenizing input...
Generating output...




 



Instructions:
You are provided with:
A target image description (text).
Image features from a user-provided image.
Your tasks are:
Decompose the target image description into three components:
a0 (Text phrases to eliminate): Identify phrases that correspond to elements whose texture or pattern we want to capture from the provided image features, but we do not want these elements to appear directly in the final image.
a1 (Secondary text phrases): Identify phrases that correspond to secondary elements that should appear in the final image.
a2 (Main role): Identify the primary subject or main element that should appear in the final image.
Assign tasks to models according to the following architecture:
Task 1:
task: "clip_model"
action: "encode_image"
input_variables: ["Image features from user input"]
output_variable: "Vθ"
Task 2:
task: "clip_model"
action: "encode_text"
input_phrases: [Phrases in a0]
output_variable: "Tθ"
Task 3:
task: "compute"
action: "subtract"
input_variables: [