In [None]:
!pip install transformers accelerate gradio torch


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr


In [None]:
# Choose the compute device once: CUDA when a GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook: a saved or shared .ipynb
# would leak it. Read it from the environment first and fall back to an
# interactive prompt that does not echo the value.
hf_token = (
    os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_TOKEN")
    or getpass("Hugging Face access token: ")
).strip()
if not hf_token:
    # Fail here with a clear message instead of later during the model download.
    raise ValueError("A Hugging Face access token is required to download the model.")

login(token=hf_token)

# Keep the variable populated for any later code that reads it.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

In [None]:
model_name = "meta-llama/Meta-Llama-3.1-8B"

# Load the tokenizer.
# `token=True` reuses the credential stored by `huggingface_hub.login`; it
# replaces the deprecated `use_auth_token` argument.
# Padding and truncation are both applied on the left side of the sequence.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=True,
    padding_side="left",
    truncation_side="left",
)





In [None]:
# Load the pre-trained model in half precision.
# `token=True` reuses the credential stored by `huggingface_hub.login`; it
# replaces the deprecated `use_auth_token` argument.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

In [None]:
# Sanity-check that the tokenizer and model created by the loading cells are usable.
try:
    # Touching both names raises NameError when a loading cell failed or was
    # skipped, so the except branch below can actually fire.
    probe = tokenizer("Hello", return_tensors="pt")
    if probe["input_ids"].numel() == 0:
        raise ValueError("tokenizer returned no tokens for a non-empty string")
    _ = model.config
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")


In [15]:
def generate_response(prompt, history=None):
    """Generate the assistant's next reply for a plain-text chat transcript.

    Args:
        prompt: The user's newest message.
        history: List of ``(user_input, bot_response)`` pairs from earlier
            turns. When omitted, a fresh list is created for this call. A list
            that is passed in is appended to in place.

    Returns:
        A ``(bot_response, history)`` tuple: the reply text and the history
        list extended with the ``(prompt, bot_response)`` pair.
    """
    # A literal `[]` default would be created once and shared by every call,
    # leaking turns between unrelated conversations.
    if history is None:
        history = []

    # Flatten the earlier turns and the new message into one transcript that
    # ends right where the assistant is expected to continue.
    past_turns = "".join(
        f"User: {user_input}\nAssistant: {bot_response}\n"
        for user_input, bot_response in history
    )
    conversation = f"{past_turns}User: {prompt}\nAssistant:"

    # Tokenize the transcript and remember how many tokens belong to it.
    inputs = tokenizer(conversation, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Sample a continuation; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(conversation) is unreliable because decoding does not always
    # reproduce the prompt character-for-character.
    generated_ids = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model may keep writing further turns; keep only the text before the
    # next "User:" marker.
    bot_response = generated_text.split("User:")[0].strip()

    history.append((prompt, bot_response))
    return bot_response, history


In [None]:
import time

def test_model_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    end_time = time.time()
    response_time = end_time - start_time
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Response time: {response_time:.2f} seconds")
    print(f"Output: {output_text}")

# Smoke-test the loaded model with a short prompt; the call prints the
# generation latency and the decoded output.
test_model_response("Hello, how are you?")


In [None]:
import gradio as gr

# Build the chat UI: a chatbot display, a state object holding the conversation
# history, a one-line textbox for input, and a button that clears the chat.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])  # Conversation history, starts as an empty list

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message and press Enter",
            lines=1,
            container=False
        )

    def respond(user_input, history):
        """Handle a submitted message.

        Passes the message and the stored history to ``generate_response`` and
        returns three values matching the ``[txt, chatbot, state]`` outputs
        wired up below.
        """
        bot_response, history = generate_response(user_input, history)
        # Return an empty string to clear the input textbox,
        # the updated conversation history to the chatbot component,
        # and the updated history to the state
        return "", history, history

    # Submitting the textbox calls `respond` with the textbox value and the
    # stored history; its three return values update the textbox, the chatbot,
    # and the state, in that order.
    txt.submit(respond, [txt, state], [txt, chatbot, state])

    # Optional: Add a button to clear the conversation
    def clear_conversation():
        """Return two empty lists: one for the chatbot, one for the state."""
        return [], []

    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(clear_conversation, inputs=None, outputs=[chatbot, state])

# NOTE(review): share=True requests a publicly reachable Gradio share link —
# confirm that exposing the model this way is intended.
demo.launch(share=True)
