In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import os

def save_model(model, tokenizer, save_directory):
    """Save the model and tokenizer to the specified directory.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Target directory; created (including parents) if missing.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is already there.
    os.makedirs(save_directory, exist_ok=True)
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Model and tokenizer saved to {save_directory}")

def load_model(model_name, token, save_directory=None):
    """Load the LLaMA model and tokenizer from Hugging Face or from a local directory.

    Args:
        model_name: Hugging Face Hub model identifier, used when no local copy exists.
        token: Hugging Face access token forwarded to the Hub download calls.
        save_directory: Optional local directory. If it exists, the model is
            loaded from it; otherwise the downloaded model is saved there.

    Returns:
        A ``text-generation`` pipeline wrapping the loaded model and tokenizer.
    """
    # Shared by both branches so a locally reloaded model gets the same fp16
    # dtype and automatic device placement as a freshly downloaded one.
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    # NOTE: directory existence is the only check made here; a directory left
    # behind by an interrupted save will still be selected and fail to load.
    if save_directory and os.path.exists(save_directory):
        print(f"Loading model and tokenizer from {save_directory}...")
        tokenizer = AutoTokenizer.from_pretrained(save_directory)
        model = AutoModelForCausalLM.from_pretrained(save_directory, **load_kwargs)
    else:
        print(f"Loading model and tokenizer from Hugging Face {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=token,
            **load_kwargs
        )
        # Optionally save it for future use
        if save_directory:
            save_model(model, tokenizer, save_directory)

    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def extract_data_from_json(json_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_path: Path to the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so non-ASCII text is read the same way on every OS.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def split_text_into_chunks(text, chunk_size=2000):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    The final slice may be shorter than ``chunk_size``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def find_relevant_chunk(chunks, query, encoder):
    """Find the most relevant chunk based on query similarity.

    Args:
        chunks: Sequence of texts to search.
        query: The question text.
        encoder: Object whose ``encode(..., convert_to_tensor=True)`` returns
            torch tensors. Assumes a 1-D vector for a single string and a
            2-D (n_chunks, dim) matrix for a list - TODO confirm for encoders
            other than SentenceTransformer.

    Returns:
        The element of ``chunks`` with the highest cosine similarity to ``query``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one item")

    query_vector = encoder.encode(query, convert_to_tensor=True).cpu().numpy()
    chunk_vectors = encoder.encode(chunks, convert_to_tensor=True).cpu().numpy()

    # Cosine similarity: divide the dot products by the vector norms so a
    # chunk is not preferred merely because its embedding has a larger norm.
    # For already-normalized embeddings this equals the plain dot product.
    norms = np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(query_vector)
    safe_norms = np.where(norms == 0, 1.0, norms)  # avoid division by zero
    similarities = np.dot(chunk_vectors, query_vector) / safe_norms
    return chunks[int(np.argmax(similarities))]

def generate_response(prompt, context, model_pipeline):
    """Generate a response from the model using the prompt and context.

    Args:
        prompt: The user's question.
        context: Supporting text placed before the question.
        model_pipeline: Callable text-generation pipeline.

    Returns:
        The generated answer text, without the prompt that was sent to the model.
    """
    input_text = f"Context: {context}\n\nQuestion: {prompt}\nAnswer:"
    # return_full_text=False asks the pipeline for the completion only;
    # otherwise "generated_text" starts with the whole context and question.
    response = model_pipeline(
        input_text,
        max_new_tokens=200,
        do_sample=True,
        return_full_text=False,
    )
    return response[0]["generated_text"].strip()

if __name__ == "__main__":
    # Read the Hugging Face access token from the environment instead of
    # embedding it in the notebook. A token that was committed in source must
    # be treated as leaked and revoked. With HF_TOKEN unset this is None.
    huggingface_token = os.environ.get("HF_TOKEN")
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    save_directory = "./llama_model"  # Local directory to save the model
    json_path = "yahoo_news.json"  # Replace with your JSON file path

    # Report CUDA availability. `device` is informational only in this block:
    # it is not passed to load_model.
    if torch.cuda.is_available():
        print("CUDA is available. Using GPU.")
        device = "cuda"
    else:
        print("CUDA is not available. Using CPU.")
        device = "cpu"

    print("Loading LLaMA model...")
    # load_model already calls save_model after a fresh download when
    # save_directory is given, so the model is not re-saved later in this
    # block: rewriting the full checkpoint again is slow, and an interrupted
    # write leaves a directory that cannot be loaded.
    llama_pipeline = load_model(model_name, huggingface_token, save_directory)

    print("Loading JSON data...")
    data = extract_data_from_json(json_path)

    print("Loading Sentence Transformer for encoding...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    print("Ready for questions.")
    while True:
        user_question = input("Ask a question (or type 'exit' to quit): ")
        if user_question.strip().lower() == "exit":
            break
        if not user_question.strip():
            # Nothing to answer; prompt again instead of running generation.
            continue

        print("Finding relevant chunk...")
        relevant_chunk = find_relevant_chunk(data, user_question, encoder)
        print("Generating response...")
        response = generate_response(user_question, relevant_chunk, llama_pipeline)
        print(f"Answer: {response}")

CUDA is available. Using GPU.
Loading LLaMA model...
Loading model and tokenizer from Hugging Face meta-llama/Llama-2-7b-chat-hf...


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [3]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import os
import ipywidgets as widgets
from IPython.display import display

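# Model persistence helpers (save_model, load_model).
# NOTE: extract_data_from_json, find_relevant_chunk and generate_response are used
# below but not defined in this cell; they must already exist from an earlier cell.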

def save_model(model, tokenizer, save_directory):
    """Save the model and tokenizer to the specified directory.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Destination directory, created with parents if absent.
    """
    # One call replaces the exists()/makedirs() pair and cannot fail if the
    # directory appears between the check and the creation.
    os.makedirs(save_directory, exist_ok=True)
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Model and tokenizer saved to {save_directory}")

def load_model(model_name, token, save_directory=None):
    """Load the LLaMA model and tokenizer from Hugging Face or from a local directory.

    Args:
        model_name: Hugging Face Hub model identifier, used when there is no local copy.
        token: Hugging Face access token passed to the Hub download calls.
        save_directory: Optional local directory. When it exists it is the
            load source; otherwise it receives a copy of the downloaded model.

    Returns:
        A ``text-generation`` pipeline built from the loaded model and tokenizer.
    """
    # Same dtype and device placement for both sources, so reloading a saved
    # copy behaves like the original download.
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    if save_directory and os.path.exists(save_directory):
        print(f"Loading model and tokenizer from {save_directory}...")
        tokenizer = AutoTokenizer.from_pretrained(save_directory)
        model = AutoModelForCausalLM.from_pretrained(save_directory, **load_kwargs)
    else:
        print(f"Loading model and tokenizer from Hugging Face {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=token,
            **load_kwargs
        )
        # Optionally save it for future use
        if save_directory:
            save_model(model, tokenizer, save_directory)

    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_response_ui(user_question, model_pipeline, encoder, data):
    """Answer a chat question by retrieving context from ``data`` and generating from it.

    Progress messages are printed before the retrieval and generation steps.
    """
    print("Finding relevant chunk...")
    context = find_relevant_chunk(data, user_question, encoder)

    print("Generating response...")
    return generate_response(user_question, context, model_pipeline)

# Load the model and other resources
# The Hugging Face access token is read from the environment rather than being
# hardcoded in the notebook. A token that was committed in source must be
# treated as leaked and revoked. With HF_TOKEN unset this is None.
huggingface_token = os.environ.get("HF_TOKEN")
model_name = "meta-llama/Llama-2-7b-chat-hf"
save_directory = "./llama_model"
json_path = "yahoo_news.json"

# Record which device is available (not passed to load_model below)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load LLaMA model. save_directory is not passed, so load_model takes its
# Hugging Face download branch and does not write a local copy.
llama_pipeline = load_model(model_name, huggingface_token)

# Load data (JSON) for the chatbot
data = extract_data_from_json(json_path)

# Load Sentence Transformer for encoding
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Chat UI widgets: an output area for the conversation, a multi-line
# question box, and a button that submits the question.
chatbox = widgets.Output()
user_input = widgets.Textarea(
    description='Your Question:',
    placeholder='Type your question here...',
    value='',
    layout=widgets.Layout(height='80px', width='100%'),
    disabled=False,
)
send_button = widgets.Button(description="Send")

# Function to handle sending the message
def on_button_click(b):
    """Handle a click on the Send button.

    Reads the question from ``user_input``; blank input is ignored. Otherwise
    the question and the generated reply are printed into ``chatbox`` and the
    input field is emptied. ``b`` (the clicked button) is not used.
    """
    question = user_input.value
    if not question.strip():
        return

    # Everything printed inside this context is captured by the Output widget.
    with chatbox:
        print(f"You: {question}")
        answer = generate_response_ui(question, llama_pipeline, encoder, data)
        print(f"Bot: {answer}")

    # Reset the text area so the next question starts from an empty field.
    user_input.value = ""

# Register on_button_click as the Send button's click handler
send_button.on_click(on_button_click)

# Render the widgets in order: conversation output, question box, Send button
display(chatbox, user_input, send_button)


Loading model and tokenizer from Hugging Face meta-llama/Llama-2-7b-chat-hf...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



KeyboardInterrupt: 

In [4]:
%pip install gradio
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import os
import gradio as gr

# Helper functions: model persistence, data loading, retrieval and generation

def save_model(model, tokenizer, save_directory):
    """Save the model and tokenizer to the specified directory.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Output directory; missing parents are created.
    """
    # exist_ok=True makes creation idempotent and removes the race between a
    # separate existence check and the makedirs call.
    os.makedirs(save_directory, exist_ok=True)
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Model and tokenizer saved to {save_directory}")

def load_model(model_name, token, save_directory=None):
    """Load the LLaMA model and tokenizer from Hugging Face or from a local directory.

    Args:
        model_name: Hugging Face Hub model identifier for the download path.
        token: Hugging Face access token passed to the Hub download calls.
        save_directory: Optional local directory. An existing directory is
            loaded from; a missing one is populated after the download.

    Returns:
        A ``text-generation`` pipeline for the loaded model and tokenizer.
    """
    # Applied to both branches so the local reload also gets fp16 weights and
    # automatic device placement.
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    if save_directory and os.path.exists(save_directory):
        print(f"Loading model and tokenizer from {save_directory}...")
        tokenizer = AutoTokenizer.from_pretrained(save_directory)
        model = AutoModelForCausalLM.from_pretrained(save_directory, **load_kwargs)
    else:
        print(f"Loading model and tokenizer from Hugging Face {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=token,
            **load_kwargs
        )
        # Optionally save it for future use
        if save_directory:
            save_model(model, tokenizer, save_directory)

    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def extract_data_from_json(json_path):
    """Load and parse the JSON file at ``json_path``.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default
    # text encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def split_text_into_chunks(text, chunk_size=2000):
    """Return ``text`` as a list of back-to-back pieces, each up to ``chunk_size`` long."""
    offsets = range(0, len(text), chunk_size)
    return list(map(lambda begin: text[begin:begin + chunk_size], offsets))

def find_relevant_chunk(chunks, query, encoder):
    """Find the most relevant chunk based on query similarity.

    Args:
        chunks: Sequence of texts to search.
        query: The question text.
        encoder: Object whose ``encode(..., convert_to_tensor=True)`` returns
            torch tensors. Assumes one embedding row per chunk, i.e. a
            (n_chunks, dim) matrix - TODO confirm for encoders other than
            SentenceTransformer.

    Returns:
        The element of ``chunks`` whose embedding has the largest dot product
        with the query embedding.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one item")

    query_vector = encoder.encode(query, convert_to_tensor=True).cpu()
    chunk_vectors = encoder.encode(chunks, convert_to_tensor=True).cpu()
    # Flatten the query to 1-D and take a matrix-vector product. Calling `.T`
    # on a 1-D tensor is deprecated in PyTorch; both operands are already on
    # the CPU, so no further `.cpu()` is needed.
    similarities = torch.matmul(chunk_vectors, query_vector.reshape(-1))
    return chunks[torch.argmax(similarities).item()]

def generate_response(prompt, context, model_pipeline):
    """Generate a response from the model using the prompt and context.

    Args:
        prompt: The user's question.
        context: Supporting text inserted ahead of the question.
        model_pipeline: Callable text-generation pipeline.

    Returns:
        Only the newly generated answer text, with surrounding whitespace removed.
    """
    input_text = f"Context: {context}\n\nQuestion: {prompt}\nAnswer:"
    # Without return_full_text=False the pipeline echoes the prompt, so the UI
    # would show the whole context and question ahead of the answer.
    response = model_pipeline(
        input_text,
        max_new_tokens=200,
        do_sample=True,
        return_full_text=False,
    )
    return response[0]["generated_text"].strip()

def generate_response_ui(user_question, model_pipeline, encoder, data):
    """Pick the entry of ``data`` most relevant to the question and generate an answer from it."""
    return generate_response(
        user_question,
        find_relevant_chunk(data, user_question, encoder),
        model_pipeline,
    )

# Load the model and other resources
# Take the Hugging Face access token from the environment instead of embedding
# it in the notebook. A token that was committed in source must be treated as
# leaked and revoked. With HF_TOKEN unset this is None.
huggingface_token = os.environ.get("HF_TOKEN")
model_name = "meta-llama/Llama-2-7b-chat-hf"
save_directory = "./llama_model"
json_path = "yahoo_news.json"

# Record which device is available (not passed to load_model below)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load LLaMA model. save_directory is not passed, so load_model takes its
# Hugging Face download branch and does not write a local copy.
llama_pipeline = load_model(model_name, huggingface_token)

# Load data (replace with your own data loading method)
data = extract_data_from_json(json_path)

# Load Sentence Transformer for encoding
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Define the chatbot UI using Gradio
def chatbot_interface(user_question):
    """Gradio callback: answer ``user_question`` using the loaded pipeline, encoder and data.

    Returns:
        The generated response, or an empty string when the question is
        missing or only whitespace.
    """
    # Skip retrieval and generation for blank input; there is nothing to answer.
    if not user_question or not user_question.strip():
        return ""
    return generate_response_ui(user_question, llama_pipeline, encoder, data)

# Create Gradio interface
# live=True is deliberately not set: in live mode Gradio re-runs `fn` whenever
# the input changes, and every call here runs a full LLaMA generation. With the
# default, the function runs only when the user submits the question.
iface = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Your Question", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Bot's Response"),
    title="Chatbot with LLaMA",
    description="Ask any question and get a response based on the relevant context."
)

# Launch the interface
iface.launch()


Loading model and tokenizer from Hugging Face meta-llama/Llama-2-7b-chat-hf...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://caa0fa6b2f74871573.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import torch
import json
import os
import numpy as np

def save_model(model, tokenizer, save_directory):
    """Save the model and tokenizer to the specified directory.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Directory to write into; created with parents if needed.
    """
    # A single idempotent call instead of exists() followed by makedirs(),
    # which could fail if the directory is created in between.
    os.makedirs(save_directory, exist_ok=True)
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Model and tokenizer saved to {save_directory}")

def load_model(model_directory, token=None):
    """Load the pretrained model and tokenizer from a local directory.

    Args:
        model_directory: Directory containing the saved model and tokenizer.
        token: Accepted for call compatibility; not used by this function.

    Returns:
        A ``text-generation`` pipeline for the loaded model and tokenizer.

    Raises:
        FileNotFoundError: If ``model_directory`` is not an existing directory.
    """
    # isdir (not exists) so that a plain file at this path is rejected here
    # with a clear message rather than failing inside from_pretrained.
    if not os.path.isdir(model_directory):
        raise FileNotFoundError(f"The specified model directory {model_directory} does not exist.")

    print(f"Loading model and tokenizer from {model_directory}...")
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = AutoModelForCausalLM.from_pretrained(
        model_directory,
        torch_dtype=torch.float16,
        device_map="auto"  # Automatically places the model on available GPUs
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def extract_data_from_json(json_path):
    """Parse the JSON document stored at ``json_path`` and return the result.

    Returns:
        The Python object ``json.load`` builds from the file.
    """
    # Read as UTF-8 explicitly; the platform default encoding can mis-decode
    # non-ASCII text.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def split_text_into_chunks(text, chunk_size=2000):
    """Break ``text`` into sequential segments no longer than ``chunk_size``."""
    total = len(text)
    segment_starts = range(0, total, chunk_size)
    return [text[lo:lo + chunk_size] for lo in segment_starts]

def find_relevant_chunk(chunks, query, encoder):
    """Find the most relevant chunk based on query similarity.

    Args:
        chunks: Sequence of texts to search.
        query: The question text.
        encoder: Object whose ``encode(..., convert_to_tensor=True)`` returns
            torch tensors. Assumes a 1-D vector for a single string and a
            2-D (n_chunks, dim) matrix for a list - TODO confirm for encoders
            other than SentenceTransformer.

    Returns:
        The element of ``chunks`` with the highest cosine similarity to ``query``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one item")

    query_vector = encoder.encode(query, convert_to_tensor=True).cpu().numpy()
    chunk_vectors = encoder.encode(chunks, convert_to_tensor=True).cpu().numpy()

    # Score by cosine similarity so embedding magnitude does not bias the
    # ranking; identical to the raw dot product for unit-length embeddings.
    norms = np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(query_vector)
    safe_norms = np.where(norms == 0, 1.0, norms)  # avoid division by zero
    similarities = np.dot(chunk_vectors, query_vector) / safe_norms
    return chunks[int(np.argmax(similarities))]

def generate_response(prompt, context, model_pipeline):
    """Generate a response from the model using the prompt and context.

    Args:
        prompt: The user's question.
        context: Supporting text placed ahead of the question in the model input.
        model_pipeline: Callable text-generation pipeline.

    Returns:
        The generated completion only (prompt excluded), stripped of
        leading and trailing whitespace.
    """
    input_text = f"Context: {context}\n\nQuestion: {prompt}\nAnswer:"
    # Request the completion alone; by default the pipeline's
    # "generated_text" repeats the full input before the answer.
    response = model_pipeline(
        input_text,
        max_new_tokens=200,
        do_sample=True,
        return_full_text=False,
    )
    return response[0]["generated_text"].strip()

if __name__ == "__main__":
    # Inputs: the locally saved model and the JSON file used as context
    model_directory = "./llama_model"
    json_path = "yahoo_news.json"

    # Pick the device label first, then report it
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        print("CUDA is available. Using GPU.")
    else:
        print("CUDA is not available. Using CPU.")

    # Bring up the three resources the question loop needs
    print("Loading pretrained model...")
    llama_pipeline = load_model(model_directory)

    print("Loading JSON data...")
    data = extract_data_from_json(json_path)

    print("Loading Sentence Transformer for encoding...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    print("Ready for questions.")
    # Interactive loop: runs until the user types 'exit' (case-insensitive)
    keep_asking = True
    while keep_asking:
        user_question = input("Ask a question (or type 'exit' to quit): ")
        keep_asking = user_question.lower() != "exit"
        if keep_asking:
            print("Finding relevant chunk...")
            relevant_chunk = find_relevant_chunk(data, user_question, encoder)
            print("Generating response...")
            response = generate_response(user_question, relevant_chunk, llama_pipeline)
            print(f"Answer: {response}")


CUDA is available. Using GPU.
Loading pretrained model...
Loading model and tokenizer from ./llama_model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

SafetensorError: Error while deserializing header: MetadataIncompleteBuffer