<a href="https://colab.research.google.com/github/suleiman-odeh/NLP_Project_Team16/blob/main/Llama_3/zero_shot_direct_Llama_3_8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install -U bitsandbytes accelerate transformers huggingface_hub

Collecting transformers
  Using cached transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Using cached transformers-4.57.6-py3-none-any.whl (12.0 MB)
[0mInstalling collected packages: transformers
[0mSuccessfully installed transformers


In [14]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
from tqdm import tqdm
import re
import json
from sklearn.metrics import classification_report, accuracy_score

# login(token="your_hf_token_here")
login(token="")

# Load Llama 3 8B model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"Loading {model_id} in 4-bit...")
DATA_FILE = "QEvasion_cleaned.jsonl"
def load_data(filepath):
    print(f"Loading data from {filepath}...")
    df = pd.read_json(filepath, lines=True)
    test_df = df[df['split_type'] == 'test'].copy()
    print(f"Test Set Size: {len(test_df)}")
    return test_df

def load_model():
    print(f"Loading {model_id} (Direct)...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return model, tokenizer

print("Llama 3 8B loaded successfully!")

Loading meta-llama/Meta-Llama-3-8B-Instruct in 4-bit...
Llama 3 8B loaded successfully!


In [15]:
def get_prompt(question, answer):
# PROMPT FROM PAPER (Zero-Shot Clarity)
    return f"""Based on a segment of the interview in
which the interviewer poses a series of questions, classify the
type of response provided by the interviewee for the following
question using the following taxonomy:
1. Clear Reply - The information requested is explicitly stated (in the requested form)
2. Clear Non-Reply - The information requested is not given
at all due to ignorance, need for clarification or declining to
answer
3. Ambiguous - The information requested is given in
an incomplete way e.g. the answer is too general, partial,
implicit, dodging or deflection.
You are required to respond with a single term corresponding to the Taxonomy code and only.
### Part of the interview ###
Answer: {answer}
### Question ###
Question: {question}
Taxonomy code:"""

In [16]:
def predict(model, tokenizer, prompt):
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=10, # Tokens needed
            temperature=0.1,
            do_sample=False,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

In [17]:
def run_direct():
    df = load_data(DATA_FILE)
    model, tokenizer = load_model()

    predictions = []
    true_labels = []

    print("\nStarting Direct Inference (Paper Prompt)...")

    # Counter to control debug prints
    debug_count = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = get_prompt(row['question'], row['cleaned_answer'])
        raw_output = predict(model, tokenizer, prompt)

        # DEBUGGING: Print the first 5 outputs to see what the model is saying
        if debug_count < 5:
            print(f"\n[DEBUG Sample {debug_count+1}]")
            print(f"Model Output: '{raw_output}'")
            debug_count += 1


        # PARSING LOGIC
        # 1. Clean the output
        clean_output = raw_output.strip().lower()

        predicted_label = -1

        # 2. Check for Digits first (1, 2, or 3)
        # We use a regex that looks for the number alone OR surrounded by non-digits
        # This handles "1", "1.", "**1**", "Code: 1", etc.
        digit_match = re.search(r'\b([1-3])\b', clean_output)

        if digit_match:
            val = int(digit_match.group(1))
            if val == 1: predicted_label = 0   # 1. Clear Reply -> Dataset 0
            elif val == 2: predicted_label = 2 # 2. Clear Non-Reply -> Dataset 2
            elif val == 3: predicted_label = 1 # 3. Ambiguous -> Dataset 1

        # 3. Fallback: Check for Text Labels if no digit found
        # (Models sometimes ignore instructions and output the text class)
        else:
            if "clear non-reply" in clean_output or "clear non reply" in clean_output:
                predicted_label = 2
            elif "clear reply" in clean_output:
                predicted_label = 0
            elif "ambiguous" in clean_output or "ambivalent" in clean_output:
                predicted_label = 1

        predictions.append(predicted_label)
        true_labels.append(row['clarity_id'])

    # REPORTING
    print("\n" + "="*60)
    print(f"DIRECT RESULTS (Meta-Llama-3-8B-Instruct) | Samples: {len(true_labels)}")
    print("="*60)

    # Filter valid predictions
    valid_indices = [i for i, p in enumerate(predictions) if p != -1]

    # Check if we have ANY valid predictions before running report
    if len(valid_indices) == 0:
        print("ERROR: No valid predictions could be parsed. Check the Debug outputs above.")
        print("The model output format likely contradicts the parsing logic completely.")
        return # Stop here to prevent the ValueError crash

    if len(valid_indices) < len(predictions):
        print(f"Warning: {len(predictions) - len(valid_indices)} parsing failures (mapped to -1).")

    filtered_true = [true_labels[i] for i in valid_indices]
    filtered_pred = [predictions[i] for i in valid_indices]

    print(classification_report(
        filtered_true,
        filtered_pred,
        target_names=["Clear (0)", "Ambivalent (1)", "Non-Reply (2)"],
        zero_division=0
    ))

In [18]:
if __name__ == "__main__":
    run_direct()

Loading data from QEvasion_cleaned.jsonl...
Test Set Size: 308
Loading meta-llama/Meta-Llama-3-8B-Instruct (Direct)...


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]


Starting Direct Inference (Paper Prompt)...


  0%|          | 0/308 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 1/308 [00:03<16:49,  3.29s/it]


[DEBUG Sample 1]
Model Output: 'Ambiguous'


  1%|          | 2/308 [00:05<13:58,  2.74s/it]


[DEBUG Sample 2]
Model Output: 'Ambiguous'


  1%|          | 3/308 [00:09<16:18,  3.21s/it]


[DEBUG Sample 3]
Model Output: 'Ambiguous'


  1%|▏         | 4/308 [00:13<17:20,  3.42s/it]


[DEBUG Sample 4]
Model Output: 'Ambiguous'


  2%|▏         | 5/308 [00:15<15:22,  3.05s/it]


[DEBUG Sample 5]
Model Output: 'Ambiguous'


100%|██████████| 308/308 [20:38<00:00,  4.02s/it]


DIRECT RESULTS (Meta-Llama-3-8B-Instruct) | Samples: 308
                precision    recall  f1-score   support

     Clear (0)       0.54      0.18      0.27        79
Ambivalent (1)       0.69      0.94      0.79       206
 Non-Reply (2)       0.00      0.00      0.00        23

      accuracy                           0.67       308
     macro avg       0.41      0.37      0.35       308
  weighted avg       0.60      0.67      0.60       308




