In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 1. Define paths
base_model_id = "Qwen/Qwen2.5-0.5B"
adapter_path = "./binaries/qwen_lora" # Path to your unzipped downloaded folder

# 2. Load Tokenizer (saved along with your LoRA weights)
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

In [4]:
# 3. Load Base Model
# IMPORTANT: Use the exact same torch_dtype you used during training (bfloat16 or float16)
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_id,
    num_labels=6,
    problem_type="regression",
    device_map="auto",
    torch_dtype=torch.bfloat16 
)
base_model.config.pad_token_id = tokenizer.pad_token_id

# 4. Snap the LoRA adapters onto the base model
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval() # Lock the model into evaluation mode

`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 290/290 [00:01<00:00, 258.86it/s, Materializing param=model.norm.weight]                              
[1mQwen2ForSequenceClassification LOAD REPORT[0m from: Qwen/Qwen2.5-0.5B
Key          | Status  | 
-------------+---------+-
score.weight | MISSING | 

[3mNotes:
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Qwen2ForSequenceClassification(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
             

In [5]:
# The order must match the target_columns from your training script exactly
emotions = [
    'sadness', 'anxiety', 'rumination', 'self_focus', 
    'hopelessness', 'emotional_volatility'
]

def analyze_text(text):
    # Tokenize the input text
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=512, 
        padding=True
    )
    
    # Move inputs to the same device (GPU/CPU) as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Run the forward pass without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        # Extract the 6 regression scores from the model's output
        raw_scores = outputs.logits[0].float().cpu().numpy()

    # Enforce the 0.0 to 1.0 bounds
    clipped_scores = np.clip(raw_scores, 0.0, 1.0)

    # Zip the emotion names with their scores into a clean JSON-like dictionary
    results = {
        emotion: round(float(score), 3) 
        for emotion, score in zip(emotions, clipped_scores)
    }
    
    return results

In [7]:
sample_text = "I want to die."

predictions = analyze_text(sample_text)
print(predictions)

# Expected Output shape: 
# {'sadness': 0.812, 'anxiety': 0.945, 'rumination': 0.88, 'self_focus': 0.72, 'hopelessness': 0.65, 'emotional_volatility': 0.41}

{'sadness': 0.578, 'anxiety': 0.0, 'rumination': 0.0, 'self_focus': 0.828, 'hopelessness': 0.0, 'emotional_volatility': 0.439}


## Clustering

In [8]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# 1. Load the scores your Qwen model generated for your dataset
df = pd.read_csv("./data/final/final_dataset.csv")
features = ['sadness', 'anxiety', 'rumination', 'self_focus', 'hopelessness', 'emotional_volatility']
X = df[features]

# 2. Scale the data
# Standardization is mandatory for distance-based clustering algorithms
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Apply K-Means Clustering
# Let's assume we want to find 4 distinct profiles
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
df['cluster_id'] = kmeans.fit_predict(X_scaled)

# 4. Profile the Centroids
# This calculates the average score for each emotion within each cluster
cluster_centroids = df.groupby('cluster_id')[features].mean()
display(cluster_centroids)

Unnamed: 0_level_0,sadness,anxiety,rumination,self_focus,hopelessness,emotional_volatility
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.793498,0.427534,0.686996,0.85583,0.778297,0.031563
1,0.756183,0.307626,0.135779,0.786356,0.652638,0.027205
2,0.190433,0.586243,0.296508,0.559637,0.10625,0.038687
3,0.705457,0.556982,0.483146,0.807384,0.593258,0.623114


In [9]:
import joblib

# 1. Define the mapping based on our interpretation
cluster_mapping = {
    0: "Severe Distress / Depressive Profile",
    1: "Passive Sadness / Apathy",
    2: "Baseline / Mild Anxiety",
    3: "Emotionally Volatile / Dysregulated"
}

# 2. Save the K-Means model and the Scaler to your local directory
joblib.dump(kmeans, "datich_kmeans_model.pkl")
joblib.dump(scaler, "datich_scaler.pkl")

['datich_scaler.pkl']