# Step 1: Install Required Libraries

In [1]:
!pip install --upgrade transformers bitsandbytes accelerate optimum




# Step 2: Load the Model and Tokenizer

In [2]:
!pip install optimum
!pip install auto-gptq
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the model name
model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the GPTQ model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,  # Ensure model runs in float16
    trust_remote_code=True
)

print("Model loaded successfully!")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
  @custom_fwd
  @custom_bwd


Model loaded successfully!


# Step 3: Implement Conversation Memory

In [3]:
MAX_HISTORY_TOKENS = 2048  # Limit conversation history to prevent memory overflow

def trim_history(history):
    """
    Trims conversation history to avoid excessive GPU memory usage.
    """
    tokens = tokenizer(history, return_tensors="pt").input_ids
    if tokens.shape[1] > MAX_HISTORY_TOKENS:
        tokens = tokens[:, -MAX_HISTORY_TOKENS:]  # Keep only recent tokens
    return tokenizer.decode(tokens[0], skip_special_tokens=True)


# Step 4: Define the Chatbot Function (With Memory)

In [4]:
def generate_response(conversation_history, user_input, max_length=1024, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from the GPTQ model while keeping conversation history.

    Args:
        conversation_history (str): Previous user-bot exchanges.
        user_input (str): Current user question.
        max_length (int): Maximum response length.
        temperature (float): Randomness control.
        top_k (int): Limits token selection to top K most probable words.
        top_p (float): Controls nucleus sampling.

    Returns:
        str: Model's generated response.
    """

    # Update history with new user input
    conversation_history += f"\nUser: {user_input}\nBot:"

    # Trim conversation history to avoid excessive memory use
    conversation_history = trim_history(conversation_history)

    # Tokenize input (GPTQ requires input_ids in long format)
    input_ids = tokenizer(conversation_history, return_tensors="pt").input_ids.to(model.device).to(torch.long)

    # Generate response
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p
        )

    # Decode output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Append bot response to history
    conversation_history += f" {response}\n"

    return response, conversation_history


# Step 5: Implement Live Chat Loop

In [6]:
import gc

# Initialize conversation history
conversation_history = "Bot: Hello! How can I assist you today?\n"

while True:
    try:
        user_input = input("You: ")  # Get live user input
        if user_input.lower() in ["exit", "quit", "bye"]:
            print("Bot: Goodbye! Have a great day. 😊")
            break  # Exit chat

        # Get bot response
        response, conversation_history = generate_response(conversation_history, user_input)
        print(f"Bot: {response}")

        # Clear GPU memory to avoid crashes
        torch.cuda.empty_cache()
        gc.collect()

    except KeyboardInterrupt:
        print("\nBot: Chat ended. Goodbye! 😊")
        break


You: What are the benefits of exercise?
Bot: Bot: Hello! How can I assist you today?

User: What are the benefits of exercise?
Bot: Exercise has numerous benefits for the body and mind! It can help improve cardiovascular health, increase strength and flexibility, boost mood, and even reduce stress. Regular exercise can also improve sleep quality and increase cognitive function. Additionally, it can help with weight management and reduce the risk of chronic diseases like diabetes and certain types of cancer. Overall, exercise is an important part of a healthy lifestyle and can have a significant impact on overall health and well-being. Would you like to know more about specific types of exercise or how to get started with a workout routine?
You: Explain how does it prevent cancer
Bot: Bot: Hello! How can I assist you today?

User: What are the benefits of exercise?
Bot: Bot: Hello! How can I assist you today?

User: What are the benefits of exercise?
Bot: Exercise has numerous benefits 