# QCHAT Classification Workflow

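This notebook walks through the full workflow for classifying autism spectrum disorder (ASD) vs. typically developing (TD) children from QCHAT data: loading and splitting the data by subject, tokenizing the text, fine-tuning a RoBERTa-large sequence classifier, and saving the resulting model.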

In [None]:

import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score
)
from scipy.special import softmax

# Seed every random number generator used in this notebook so reruns are repeatable
def set_seed(random_seed):
    """Apply one seed to the PyTorch (CPU and CUDA), NumPy, and Python `random` generators.

    Args:
        random_seed: Seed value handed unchanged to each library's seeding function.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for seed_library in seeders:
        seed_library(random_seed)


random_seed = 42
set_seed(random_seed)


## Step 1: Load and Preprocess Data

In [None]:

# Load and preprocess data
data_path = './QCHAT_T_ASD_KQCHAT_1004PJT_CL_dec052024.csv'
data = pd.read_csv(data_path)

# Collapse the raw 'Class/ASD Traits' categories to Yes/No, then to binary labels (No -> 0, Yes -> 1)
data['Class/ASD Traits2'] = data['Class/ASD Traits'].map({'ASD': 'Yes', 'ASD_HIGH': 'Yes', 'NORMAL': 'No'})
Q_ASD_mapping = {'No': 0, 'Yes': 1}
data['label'] = data['Class/ASD Traits2'].map(Q_ASD_mapping)

# Series.map leaves NaN for any category missing from the mapping. Fail here with a
# clear message instead of letting NaN labels reach the splitter or the label tensors.
unmapped_rows = data['label'].isna()
if unmapped_rows.any():
    unexpected_classes = sorted(data.loc[unmapped_rows, 'Class/ASD Traits'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a 'Class/ASD Traits' value with no label mapping: "
        f"{unexpected_classes}"
    )

# Outer split: take the first of 5 stratified, subject-grouped folds as the held-out test set.
# Grouping on 'SubjectId' keeps all rows of one subject on the same side of the split.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(sgkf.split(data, data['label'], groups=data['SubjectId']))

train_val = data.iloc[train_idx].reset_index(drop=True)
test = data.iloc[test_idx].reset_index(drop=True)

# Inner split: take the first of 4 folds of the remaining rows as the validation set.
sgkf_val = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=42)
train_inner_idx, val_idx = next(
    sgkf_val.split(train_val, train_val['label'], groups=train_val['SubjectId'])
)

train = train_val.iloc[train_inner_idx].reset_index(drop=True)
val = train_val.iloc[val_idx].reset_index(drop=True)

# Sanity checks: every row lands in exactly one split and no subject is shared between splits.
assert len(train) + len(val) + len(test) == len(data)
assert set(train['SubjectId']).isdisjoint(val['SubjectId'])
assert set(train['SubjectId']).isdisjoint(test['SubjectId'])
assert set(val['SubjectId']).isdisjoint(test['SubjectId'])


## Step 2: Tokenize Data

In [None]:

# Tokenizer and Model: both are loaded from the same pretrained checkpoint name
checkpoint_name = 'roberta-large'
tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint_name)
# num_labels=2 matches the binary 0/1 labels built in Step 1
model = RobertaForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

def tokenize_and_pad(dataset):
    """Tokenize the 'combined' text column of a split with the module-level tokenizer.

    Args:
        dataset: DataFrame-like object whose 'combined' column holds the input texts.

    Returns:
        Whatever `tokenizer` returns for the list of texts when called with
        truncation enabled, padding enabled, and max_length=512.
    """
    texts = dataset['combined'].tolist()  # Text column for tokenization
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': 512,
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize each split, in train / val / test order
train_encodings, val_encodings, test_encodings = [
    tokenize_and_pad(split_frame) for split_frame in (train, val, test)
]

# Keep each split's subject ids as plain lists, in the same row order as the split
train_subject_ids, val_subject_ids, test_subject_ids = [
    split_frame['SubjectId'].tolist() for split_frame in (train, val, test)
]


## Step 3: Create Dataset Class

In [None]:

# Define Dataset Class
class QCHATDataset(Dataset):
    """Dataset that pairs tokenizer encodings with a label and subject id per row.

    Args:
        encodings: Mapping from field name to a per-row indexable sequence.
        labels: Per-row labels; its length defines the dataset length.
        subject_ids: Per-row subject identifiers.
    """

    def __init__(self, encodings, labels, subject_ids):
        self.encodings, self.labels, self.subject_ids = encodings, labels, subject_ids

    def __len__(self):
        """Return the number of rows, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample dict for row `idx`.

        Every encoding field and the label are converted with `torch.tensor`;
        the subject id is stored as-is under 'SubjectId'.
        """
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        sample['SubjectId'] = self.subject_ids[idx]
        return sample

# Wrap each split's encodings, labels, and subject ids in a QCHATDataset (train / val / test order)
train_dataset, val_dataset, test_dataset = [
    QCHATDataset(split_encodings, split_frame['label'].tolist(), split_subject_ids)
    for split_encodings, split_frame, split_subject_ids in (
        (train_encodings, train, train_subject_ids),
        (val_encodings, val, val_subject_ids),
        (test_encodings, test, test_subject_ids),
    )
]


## Step 4: Train the Model

In [None]:

# Training Arguments
training_options = {
    # Where checkpoints and logs are written
    'output_dir': './output',
    'logging_dir': './Roberta/log',
    # Schedule: 8 epochs; batches of 8 per device with gradients accumulated over 4 steps
    'num_train_epochs': 8,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'gradient_accumulation_steps': 4,
    'learning_rate': 2e-5,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint selected by eval_loss
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
}
training_args = TrainingArguments(**training_options)

# Evaluation Metrics
def compute_metrics_binary(p):
    """Compute binary classification metrics from raw model scores.

    Args:
        p: Pair that unpacks into (predictions, labels). `predictions` holds
            per-class scores with classes along axis 1; column 1 is used as
            the positive-class score.

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall" (computed on
        the argmax class) and "roc_auc", "average_precision" (computed on the
        softmax probability of class 1).
    """
    logits, labels = p
    probabilities = softmax(logits, axis=1)
    predicted_classes = np.argmax(probabilities, axis=1)
    positive_class_scores = probabilities[:, 1]

    # Metrics that score hard class predictions
    class_metrics = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    # Metrics that score the positive-class probability
    score_metrics = {
        "roc_auc": roc_auc_score,
        "average_precision": average_precision_score,
    }

    results = {}
    for metric_name, metric_fn in class_metrics.items():
        results[metric_name] = metric_fn(labels, predicted_classes)
    for metric_name, metric_fn in score_metrics.items():
        results[metric_name] = metric_fn(labels, positive_class_scores)
    return results

# Trainer: train on the training split and score the validation split with compute_metrics_binary
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics_binary,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; left as the last expression so the cell shows the returned training output
trainer.train()


## Step 5: Save the Model

In [None]:

# Save Model: write the trainer's model and the tokenizer files into the same directory
save_directory = "./Roberta_QCHAT_Model"
trainer.save_model(output_dir=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)
