<a href="https://colab.research.google.com/github/suleiman-odeh/NLP_Project_Team16/blob/main/Qwen2.5_7B/zero_shot_indirect_Qwen2_5_7B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import re
import json
import os


In [None]:

# Path to the cleaned QEvasion dataset, relative to the notebook's working directory.
DATA_FILE = "../QEvasion_cleaned.jsonl"

# Fail fast: every later cell needs `df`, so a missing or unreadable file must stop
# the run here instead of surfacing later as an unrelated NameError.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"ERROR: {DATA_FILE} not found. Check if it is in {os.getcwd()}")

try:
    # JSON Lines input (one record per line); rows where every field is missing are dropped.
    df = pd.read_json(DATA_FILE, lines=True).dropna(how='all')
except ValueError as e:
    # pandas raises ValueError for empty or malformed JSON; keep the original hint,
    # but propagate the failure with the underlying cause attached.
    raise ValueError(f"ValueError: {e}. The file might be empty or improperly formatted.") from e

print(f"Data loaded successfully. Total rows: {len(df)}")

Data loaded successfully. Total rows: 3756


In [None]:
# 1. Install/Update everything in one go
# !pip install -q -U bitsandbytes accelerate transformers

In [None]:
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"


def _select_compute_dtype():
    """Pick the 4-bit compute dtype for the GPU that is currently visible.

    bfloat16 is used on devices with CUDA compute capability 8.0 or higher;
    float16 is used otherwise (including when no CUDA device is available).
    """
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        return torch.bfloat16
    return torch.float16


def load_model_colab():
    """Load MODEL_ID with 4-bit NF4 quantization and return ``(model, tokenizer)``.

    Returns:
        tuple: the quantized causal-LM (placed via ``device_map="auto"``) and
        its tokenizer, both loaded from ``MODEL_ID``.
    """
    print(f"Loading {MODEL_ID} in 4-bit for Google Colab...")

    # 4-bit configuration for NVIDIA GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        # Chosen per device rather than hard-coded, so pre-Ampere GPUs get float16.
        bnb_4bit_compute_dtype=_select_compute_dtype()
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # NOTE(review): trust_remote_code=True lets repository-supplied Python run during
    # loading; confirm this checkpoint actually needs it before keeping the flag.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return model, tokenizer

# Initialize variables
model, tokenizer = load_model_colab()

Loading Qwen/Qwen2.5-7B-Instruct in 4-bit for Google Colab...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [6]:
# Evasion label (9 classes) -> clarity ID (3 classes).
# Clarity IDs: 0 = Clear Reply, 1 = Ambivalent, 2 = Clear Non-Reply.
# Built group by group so each clarity class lists its member labels once;
# the resulting key order matches the order in which the labels are written below.
EVASION_TO_CLARITY_ID = {
    evasion_label: clarity_id
    for clarity_id, evasion_labels in (
        (0, ('Explicit',)),
        (1, ('Implicit', 'Dodging', 'Deflection', 'General', 'Partial/half-answer')),
        (2, ('Declining to answer', 'Claims ignorance', 'Clarification')),
    )
    for evasion_label in evasion_labels
}

def create_qwen_indirect_prompt(question, answer):
    """Build the ChatML prompt used for indirect (evasion-level) classification.

    Args:
        question: interviewer question text; it appears twice in the user turn.
        answer: interviewee answer text.

    Returns:
        str: a prompt with a system turn (task description and the 9-item taxonomy),
        a user turn (the question/answer pair followed by the question again), and
        an opened assistant turn pre-filled with ``"Taxonomy code: "``.
    """
    # Task description and taxonomy shown to the model in the system turn.
    system_instruction = """Based on a segment of the interview in which the interviewer poses a series of questions,
classify the type of response provided by the interviewee for the following question using the following taxonomy
and then provide a chain of thought explanation for your decision:

<Taxonomy>
1. Explicit: The information requested is explicitly stated (in the requested form).
2. Implicit: The information requested is given, but without being explicitly stated (not in the expected form).
3. General: The information provided is too general/lacks the requested specificity.
4. Partial/half-answer: Offers only a specific component of the requested information.
5. Dodging: Ignoring the question altogether.
6. Deflection: Starts on topic but shifts the focus and makes a different point than what is asked.
7. Declining to answer: Acknowledges the question but directly or indirectly refusing to answer at the moment.
8. Claims ignorance: The answerer claims/admits not to know the answer themselves.
9. Clarification: Does not provide the requested information and asks for clarification.
You are required to respond with a single term corresponding to the Taxonomy code and only."""

    # ChatML layout, one list entry per output line; joined with newlines below.
    chatml_lines = [
        "<|im_start|>system",
        f"{system_instruction}<|im_end|>",
        "<|im_start|>user",
        "### Part of the interview ###",
        f'Question: "{question}"',
        f'Answer: "{answer}"',
        "",
        "### Question ###",
        f'"{question}"<|im_end|>',
        "<|im_start|>assistant",
        # Assistant turn is left open so generation continues right after this prefix.
        "Taxonomy code: ",
    ]
    return "\n".join(chatml_lines)

In [None]:
def predict_indirect(model, tokenizer, question, answer):
    """Generate the model's raw classification text for one question/answer pair.

    Args:
        model: causal LM exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        question: interviewer question text.
        answer: interviewee answer text.

    Returns:
        str: the newly generated text only (prompt tokens removed), with special
        tokens skipped and surrounding whitespace stripped.
    """
    prompt = create_qwen_indirect_prompt(question, answer)
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_len = model_inputs.input_ids.shape[1]

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=150,  # leaves room for the explanation the prompt asks for
            do_sample=False,     # greedy decoding
            # Sampling-only flags have no effect under greedy decoding; the original
            # temperature=0.1 only produced an "invalid generation flags" warning.
            # Unset them explicitly so inherited sampling defaults don't trigger it either.
            temperature=None,
            top_p=None,
            top_k=None,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = generated_ids[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def parse_evasion(raw_text):
    """Return the evasion label named first in ``raw_text``, or ``"Error"``.

    Matching is a case-insensitive substring search over the keys of
    ``EVASION_TO_CLARITY_ID``. When several labels occur, the one that appears
    earliest in the text wins; picking by dict order instead would turn
    "Implicit ... not explicitly stated" into ``Explicit``.

    Args:
        raw_text: model output text.

    Returns:
        str: a key of ``EVASION_TO_CLARITY_ID`` with its original casing, or
        ``"Error"`` when no label is found.
    """
    clean_text = raw_text.lower()
    best_label, best_pos = "Error", len(clean_text)
    for label in EVASION_TO_CLARITY_ID:
        pos = clean_text.find(label.lower())
        if pos != -1 and pos < best_pos:
            best_label, best_pos = label, pos
    return best_label

In [8]:
# Mapping and Parsing Setup

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import re

# Evasion ID -> label name (9-class task).
# Entries are listed in the order of the taxonomy shown in the prompt (items 1..9).
ID_TO_LABEL = dict([
    (5, 'Explicit'),
    (7, 'Implicit'),
    (6, 'General'),
    (8, 'Partial/half-answer'),
    (4, 'Dodging'),
    (3, 'Deflection'),
    (2, 'Declining to answer'),
    (0, 'Claims ignorance'),
    (1, 'Clarification'),
])

# Collapse a predicted evasion ID (9-class) into its clarity ID (3-class, Task 1)
def get_clarity_id(evasion_id):
    """Map an evasion ID to a clarity ID.

    Returns 0 (Clear Reply) for ID 5, 1 (Ambivalent) for IDs 7, 4, 3, 6, 8,
    2 (Clear Non-Reply) for IDs 2, 0, 1, and -1 for anything else.
    """
    clarity_groups = (
        (0, (5,)),               # Explicit -> Clear Reply
        (1, (7, 4, 3, 6, 8)),    # Ambivalent
        (2, (2, 0, 1)),          # Clear Non-Reply
    )
    for clarity_id, member_ids in clarity_groups:
        if evasion_id in member_ids:
            return clarity_id
    return -1

def parse_qwen_output(raw_output):
    """Convert the model's raw text into a dataset evasion ID.

    Resolution order:
      1. A single leading taxonomy digit 1-9 (not followed by another digit) is
         translated through ``code_map``.
      2. Otherwise the taxonomy keyword that occurs earliest in the text decides.
         Position is used instead of dict order so that an explanation such as
         "Implicit: ... not explicitly stated" resolves to Implicit, not Explicit.

    A leading ``0`` or a multi-digit number is not a taxonomy code, so such
    outputs go to the keyword fallback instead of being mapped from their first digit.

    Args:
        raw_output: text generated by the model.

    Returns:
        int: evasion ID in 0..8, or -1 when nothing could be parsed.
    """
    text = raw_output.strip().lower()

    # Taxonomy item number (as shown in the prompt) -> dataset evasion ID
    code_map = {'1': 5, '2': 7, '3': 6, '4': 8, '5': 4, '6': 3, '7': 2, '8': 0, '9': 1}

    # 1. Leading taxonomy code at the very start of the output
    match = re.match(r'([1-9])(?!\d)', text)
    if match:
        return code_map[match.group(1)]

    # 2. Keyword fallback if the model didn't provide a code
    word_map = {
        'explicit': 5, 'implicit': 7, 'general': 6, 'partial': 8, 'half': 8,
        'dodging': 4, 'deflection': 3, 'declining': 2, 'ignorance': 0, 'clarification': 1
    }
    hits = [(text.find(word), id_val) for word, id_val in word_map.items() if word in text]
    if hits:
        # Earliest mention wins
        return min(hits, key=lambda hit: hit[0])[1]
    return -1

In [11]:
# Inference Loop

# Evaluate on the test split only; work on a copy so the new columns stay off `df`.
is_test_row = df['split_type'] == 'test'
test_df = df[is_test_row].copy()

predictions_evasion = []
predictions_clarity = []

print(f"Starting Inference on {len(test_df)} samples...")

# Walk the question/answer columns in lockstep, one generation call per sample.
qa_pairs = zip(test_df['question'], test_df['cleaned_answer'])
for question_text, answer_text in tqdm(qa_pairs, total=len(test_df)):
    raw_res = predict_indirect(model, tokenizer, question_text, answer_text)

    # Raw text -> evasion ID -> clarity ID
    ev_id = parse_qwen_output(raw_res)
    predictions_evasion.append(ev_id)
    predictions_clarity.append(get_clarity_id(ev_id))

# Attach predictions; list order follows the row order of test_df.
test_df['pred_evasion_id'] = predictions_evasion
test_df['pred_clarity_id'] = predictions_clarity

Starting Inference on 308 samples...


  0%|          | 0/308 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 308/308 [1:07:41<00:00, 13.19s/it]


In [15]:
# Sanity check on parsing: -1 marks an output that could not be mapped to an evasion ID.
parsed_rows = test_df[test_df['pred_evasion_id'] != -1]
predicted_ids = test_df['pred_evasion_id'].unique()
true_ids = test_df['evasion_id'].unique()

summary_lines = (
    f"Total test samples: {len(test_df)}",
    f"Parsed successfully: {len(parsed_rows)}",
    f"Unique Predicted IDs: {predicted_ids}",
    f"Unique True IDs: {true_ids}",
)
for summary_line in summary_lines:
    print(summary_line)

Total test samples: 308
Parsed successfully: 308
Unique Predicted IDs: [3 8 5 2 4 7 6 0 1]
Unique True IDs: [-1]


In [24]:
# Preview the first 5 question/answer rows, then list up to 10 distinct values for
# every column whose name contains 'label' or 'id', to find the one holding numeric IDs.
text_preview = test_df[['question', 'cleaned_answer']].head()
print(text_preview)
print("\n--- Column Values Check ---")
for col in test_df.columns:
    if not ('label' in col or 'id' in col):
        continue
    first_values = test_df[col].unique()[:10]
    print(f"Column '{col}' unique values: {first_values}")

                                               question  \
3448   Inquiring about the status or information reg...   
3449  Will you invite them to the White House to neg...   
3450  Why was it necessary for Japan to drop the thr...   
3451                  When will we see this resolution?   
3452                Updating the figure of Iraqi deaths   

                                         cleaned_answer  
3448  Well, the world has made it clear that these t...  
3449  I think that anytime and anyplace that they ar...  
3450  I think that the purpose of the U.N. Security ...  
3451  I'll let Condi talk about the details of what ...  
3452  No, I don't consider it a credible report; nei...  

--- Column Values Check ---
Column 'president' unique values: [None]
Column 'annotator_id' unique values: [nan]
Column 'clarity_label' unique values: ['Ambivalent' 'Clear Reply' 'Clear Non-Reply']
Column 'evasion_label' unique values: ['']
Column 'clarity_id' unique values: [1 0 2]
Column 'evasi

In [None]:
import pandas as pd
from sklearn.metrics import classification_report

# 1. Define Taxonomy Mapping (evasion ID -> label name)
ID_TO_LABEL = {
    0: 'Claims ignorance', 1: 'Clarification', 2: 'Declining to answer',
    3: 'Deflection', 4: 'Dodging', 5: 'Explicit', 6: 'General',
    7: 'Implicit', 8: 'Partial/half-answer'
}

# 2. Ground-truth columns
# 9-class ground truth is in 'annotator1_id'
# 3-class ground truth is in 'clarity_id'
true_ev_col = 'annotator1_id'
true_cl_col = 'clarity_id'

# 3. Clean and Filter
eval_df = test_df.copy()

# Ensure all columns are numeric (converts any '' to NaN)
cols_to_fix = [true_ev_col, 'pred_evasion_id', true_cl_col, 'pred_clarity_id']
for col in cols_to_fix:
    eval_df[col] = pd.to_numeric(eval_df[col], errors='coerce')

# Drop rows that are now NaN
valid_results = eval_df.dropna(subset=cols_to_fix).copy()
valid_results[cols_to_fix] = valid_results[cols_to_fix].astype(int)

# Make any silent shrinkage of the evaluation set visible next to the metrics.
n_dropped = len(eval_df) - len(valid_results)
if n_dropped:
    print(f"WARNING: dropped {n_dropped} of {len(eval_df)} rows with missing/non-numeric labels or predictions")

# 4. Prepare targets
target_ids = sorted(ID_TO_LABEL.keys())
target_names = [ID_TO_LABEL[i] for i in target_ids]

# --- EVALUATION 1: EVASION (9-CLASS) ---
print("\n" + "="*60)
print("QWEN2.5 INDIRECT EVALUATION: EVASION (9-Class)")
print("="*60)
print(classification_report(
    valid_results[true_ev_col],
    valid_results['pred_evasion_id'],
    labels=target_ids,
    target_names=target_names,
    digits=4,
    zero_division=0
))

# --- EVALUATION 2: CLARITY (3-CLASS) ---
print("\n" + "="*60)
print("QWEN2.5 INDIRECT EVALUATION: CLARITY (3-Class)")
print("="*60)
print(classification_report(
    valid_results[true_cl_col],
    valid_results['pred_clarity_id'],
    # Pin the label set: without it, an unparsed prediction (-1) adds a fourth class
    # and the 3-entry target_names no longer matches, which makes sklearn raise.
    labels=[0, 1, 2],
    target_names=["Clear Reply (0)", "Ambivalent (1)", "Clear Non-Reply (2)"],
    digits=4,
    zero_division=0
))


QWEN2.5 INDIRECT EVALUATION: EVASION (9-Class)
                     precision    recall  f1-score   support

   Claims ignorance     0.2857    0.2222    0.2500         9
      Clarification     0.5000    0.2500    0.3333         4
Declining to answer     0.4000    0.2000    0.2667        10
         Deflection     0.1111    0.6333    0.1891        30
            Dodging     0.4167    0.1724    0.2439        58
           Explicit     1.0000    0.0283    0.0550       106
            General     0.0435    0.0345    0.0385        29
           Implicit     0.2222    0.2222    0.2222        54
Partial/half-answer     0.0000    0.0000    0.0000         8

           accuracy                         0.1623       308
          macro avg     0.3310    0.1959    0.1776       308
       weighted avg     0.5043    0.1623    0.1462       308


QWEN2.5 INDIRECT EVALUATION: CLARITY (3-Class)
                     precision    recall  f1-score   support

    Clear Reply (0)     1.0000    0.0380    0.