# Legal Assistant Chatbot - Fixed Version
A Qwen2-0.5B base model with a LoRA adapter (`rzeraat/qwen-0.5b-law-v1`) fine-tuned for legal Q&A.

In [2]:
!pip install -q transformers peft accelerate huggingface_hub bitsandbytes

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# --- Configuration -----------------------------------------------------------
base_id    = "Qwen/Qwen2-0.5B"           # base checkpoint on the Hugging Face Hub
adapter_id = "rzeraat/qwen-0.5b-law-v1"  # LoRA adapter applied on top of base_id
QUANT_4BIT = False                       # True -> load the base weights in 4-bit (needs CUDA)

# Load tokenizer
# Prefer the tokenizer files from the adapter repo; fall back to the base
# model's tokenizer if that fails, and say so instead of hiding the error.
try:
    tok = AutoTokenizer.from_pretrained(adapter_id, use_fast=True)
except Exception as err:
    print(f"⚠️ Tokenizer not loadable from {adapter_id} ({err!r}); using {base_id} instead.")
    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)

# generate() needs a pad token id; reuse EOS when the tokenizer defines none.
if tok.pad_token_id is None and tok.eos_token_id is not None:
    tok.pad_token = tok.eos_token

# Load model
print("Loading model...")
has_cuda = torch.cuda.is_available()
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if has_cuda else torch.float32

# NOTE: recent transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; the old name is kept so older releases still work.
load_kwargs = {
    "torch_dtype": dtype,
    "device_map": "auto" if has_cuda else None,
}

# Honour QUANT_4BIT (previously defined but never used).
if QUANT_4BIT:
    if has_cuda:
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=dtype,
        )
    else:
        print("⚠️ QUANT_4BIT is set but CUDA is unavailable; loading unquantized weights.")

model = AutoModelForCausalLM.from_pretrained(base_id, **load_kwargs)

# Apply LoRA
model = PeftModel.from_pretrained(model, adapter_id)
model.eval()  # inference only

print("✅ Loaded:", base_id, "+", adapter_id)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/976 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/327 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/35.2M [00:00<?, ?B/s]

✅ Loaded: Qwen/Qwen2-0.5B + rzeraat/qwen-0.5b-law-v1


In [None]:
import textwrap

question = "Explain consideration in contract law in 3 bullet points."

# Use the SAME format as training (Instruction/Response format)
prompt = f"""### Instruction:
{question}

### Response:"""

# Put the inputs on whatever device the model lives on (GPU or CPU).
inputs = tok(prompt, return_tensors="pt").to(model.device)
prompt_len = inputs["input_ids"].shape[1]

# Explicit None check: `a or b` would wrongly discard a pad token id of 0.
pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id

with torch.inference_mode():
    out = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=pad_id,
        eos_token_id=tok.eos_token_id,
    )

# Full decoded text (prompt + completion), kept for inspection.
response = tok.decode(out[0], skip_special_tokens=True)

# Decode only the newly generated tokens. Splitting the full text on
# "### Response:" would truncate the answer whenever the model itself emits
# that marker, so slice off the prompt by token count instead.
answer = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
print(answer)

In [None]:
# @title Live CLI - FIXED with correct prompt format + STREAMING
import sys, torch
from transformers import TextStreamer

# Persona text for the assistant.
# NOTE(review): not interpolated into any prompt in the visible cells — confirm whether it is still needed.
SYSTEM_PROMPT = "You are a helpful legal assistant."

# Startup banner: usage hint, then the quit command followed by a blank line.
print(
    "Live CLI ready. Type your message and press Enter.",
    "Command: /exit (quit)\n",
    sep="\n",
)

def generate_answer(question):
    """Stream the model's answer to ``question`` on stdout and return it.

    The question is wrapped in the Instruction/Response prompt format, then
    sampled from the module-level ``model`` using the module-level tokenizer
    ``tok``. Tokens are printed as they are produced via ``TextStreamer``.

    Args:
        question: The user's question as plain text.

    Returns:
        The generated completion (prompt excluded), stripped of surrounding
        whitespace. Callers that only want the streamed output can ignore it.
    """
    # Build prompt in training format (Instruction/Response)
    prompt = f"""### Instruction:
{question}

### Response:"""

    # Put the inputs on whatever device the model lives on (GPU or CPU).
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Streamer prints generated text to stdout, skipping the prompt and
    # special tokens.
    streamer = TextStreamer(tok, skip_prompt=True, skip_special_tokens=True)

    # Explicit None check: `a or b` would wrongly discard a pad token id of 0.
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=pad_id,
            eos_token_id=tok.eos_token_id,
            streamer=streamer,  # Enable streaming output
        )

    # Return only the newly generated tokens so callers can capture the answer.
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

# Interactive read-eval-print loop: read a line, stream the model's answer,
# repeat until the user quits.
while True:
    try:
        user_msg = input("you> ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input, or the kernel was interrupted while waiting at the
        # prompt: leave the loop quietly instead of surfacing a traceback.
        break

    # Ignore empty lines.
    if not user_msg:
        continue

    # Quit commands are matched case-insensitively.
    cmd = user_msg.lower()
    if cmd in ("/exit", "exit", "quit"):
        print("bye!")
        break

    # generate_answer streams the reply to stdout right after this prefix.
    print("assistant> ", end="", flush=True)
    generate_answer(user_msg)
    print("\n")

In [None]:
# Install Gradio for web UI
!pip install -q gradio

print("✅ Gradio installed!")

In [None]:
import gradio as gr
import torch
from transformers import TextIteratorStreamer
from threading import Thread

# Define generation function for Gradio
def chat_with_model(message, history, temperature, max_tokens, top_p):
    """Generate a streamed response for the Gradio chat UI.

    Generation runs on a background thread using the module-level ``model``
    and tokenizer ``tok``; this generator yields the response accumulated so
    far each time new text arrives.

    Args:
        message: The user's latest message.
        history: Prior conversation as supplied by the chat UI. Accepted for
            the callback signature but not used: each prompt contains only
            the current message.
        temperature: Sampling temperature.
        max_tokens: Maximum number of new tokens to generate.
        top_p: Nucleus-sampling threshold.

    Yields:
        The partial response text generated so far.
    """
    # Build prompt in training format (Instruction/Response)
    prompt = f"""### Instruction:
{message}

### Response:"""

    # Tokenize and put the inputs on whatever device the model lives on.
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    # Setup streamer for real-time output
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

    # Explicit None check: `a or b` would wrongly discard a pad token id of 0.
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id

    # Generation parameters. UI values are coerced to the numeric types
    # generate() expects, since max_new_tokens must be an integer.
    generation_kwargs = {
        **inputs,
        "max_new_tokens": int(max_tokens),
        "temperature": float(temperature),
        "top_p": float(top_p),
        "do_sample": True,
        "pad_token_id": pad_id,
        "eos_token_id": tok.eos_token_id,
        "streamer": streamer,
    }

    # Run generation in a separate thread so this generator can consume the
    # streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output
    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    thread.join()

# Create Gradio interface with ChatInterface
import inspect

# `retry_btn`, `undo_btn` and `clear_btn` were removed from ChatInterface in
# Gradio 5, and passing them there raises TypeError. Only pass the ones the
# installed version accepts, so the cell runs on both old and new releases.
_button_labels = {
    "retry_btn": "🔄 Retry",
    "undo_btn": "↩️ Undo",
    "clear_btn": "🗑️ Clear",
}
_chat_params = inspect.signature(gr.ChatInterface.__init__).parameters
_button_kwargs = {
    name: label for name, label in _button_labels.items() if name in _chat_params
}

demo = gr.ChatInterface(
    fn=chat_with_model,
    title="🏛️ UK Legal Assistant - Qwen 0.5B LoRA",
    description="""
    **Fine-tuned Qwen2-0.5B model for UK legal Q&A**
    
    This model provides:
    - 📚 Topic identification and difficulty assessment
    - 🧠 Step-by-step legal reasoning
    - ✅ Comprehensive answers based on UK law
    - 📖 Relevant case citations and statutory references
    
    *Model: `rzeraat/qwen-0.5b-law-v1` (LoRA adapter on Qwen/Qwen2-0.5B)*
    """,
    examples=[
        "What are the duties of company directors under UK law?",
        "Explain consideration in contract law in 3 bullet points",
        "What is the difference between wrongful and unfair dismissal?",
        "How does the doctrine of privity apply to contracts?",
        "What are the requirements for a valid marriage in the UK?",
        "Explain the rule against perpetuities",
    ],
    # Extra controls, passed to chat_with_model after (message, history) in
    # this order: temperature, max_tokens, top_p.
    additional_inputs=[
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Higher = more creative, Lower = more focused"
        ),
        gr.Slider(
            minimum=128,
            maximum=512,
            value=300,
            step=64,
            label="Max Tokens",
            info="Maximum length of generated response"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top P",
            info="Nucleus sampling threshold"
        ),
    ],
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
    ),
    **_button_kwargs,
)

# Launch the interface
print("\n" + "="*80)
print("🚀 LAUNCHING GRADIO WEB INTERFACE")
print("="*80)
print("📊 The interface will be available at:")
print("   - Local: http://127.0.0.1:7860")
print("   - Public: Will be shown below if share=True")
print("\n⚠️  Note: In Colab/Kaggle, a public URL will be generated automatically")
print("   The public URL is valid for 72 hours and can be shared with anyone!")
print("="*80 + "\n")

# Shut down servers started by earlier runs of this cell. The port is fixed
# below, so a still-running previous server would otherwise make a re-run
# fail because port 7860 is already in use.
gr.close_all()

# Launch with share=True for public URL (useful in Colab/Kaggle)
demo.launch(
    share=True,            # Request a public share link
    server_name="0.0.0.0", # Listen on all interfaces
    server_port=7860,      # Default Gradio port
    show_error=True,       # Show errors in UI
)