### Connect to drive and import libraries





In [None]:
# Mount Google Drive at /content/drive so the following cells can reach the
# project files stored under /content/drive/MyDrive.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import re

from scipy.special import softmax
import numpy as np

# Make the project folder the working directory: the demographic Excel sheet
# is opened later through a relative path, which is resolved against it.
os.chdir('/content/drive/MyDrive/Fairness_NLP/')

In [None]:
import re

def extract_text(input_string):
    """Return the text found inside parentheses, or the input unchanged.

    Every non-greedy ``( ... )`` group in ``input_string`` is collected and
    the group contents (without the parentheses) are joined with a single
    space. When the string contains no such group it is returned as-is.

    Args:
        input_string: The utterance to process; must be a ``str``.

    Returns:
        The space-joined contents of all parenthesised groups, or
        ``input_string`` itself when there are none.

    Raises:
        ValueError: If ``input_string`` is not a string. The offending value
            is printed before the exception is raised.
    """
    if isinstance(input_string, str):
        # Non-greedy match so that "(a) x (b)" yields ["a", "b"], not "a) x (b".
        inside_parens = re.findall(r'\((.*?)\)', input_string)
        return ' '.join(inside_parens) if inside_parens else input_string

    print(input_string)
    raise ValueError("Expected a string input")

# Below helper function creates question-answer pairs (without filtering)
def create_question_answer_pairs(interview):
    """Group an interview transcript into question/answer pairs.

    Rows are read in order. Consecutive utterances whose ``speaker`` is
    "Ellie" are merged into one question (joined with a space); the
    "Participant" utterances that follow are merged into one answer (joined
    with ". "). A pair is emitted each time Ellie speaks again after at least
    one participant response, and once more at the end of the transcript.
    Participant utterances that occur before the first question, and rows
    from any other speaker, are ignored.

    Each utterance is passed through ``extract_text`` after being converted
    to ``str``.

    Args:
        interview: DataFrame with at least the columns ``speaker`` and
            ``value``. It is not modified.

    Returns:
        A DataFrame with the columns ``question`` and ``answer``, one row per
        pair (empty when no complete pair was found).
    """
    question_answer_pairs = []
    current_question = []
    current_response = []

    for _, row in interview.iterrows():
        # Keep the cleaned text in a local variable. Assigning into the row
        # yielded by iterrows() is documented by pandas as unsafe: depending
        # on dtypes the row may be a view, so the write could leak into the
        # caller's DataFrame.
        text = extract_text(str(row['value']))
        speaker = row['speaker']

        if speaker == "Ellie":
            # A new question after at least one answer closes the open pair.
            if current_question and current_response:
                question_answer_pairs.append({
                    'question': " ".join(current_question),
                    'answer': ". ".join(current_response)
                })
                current_question = []
                current_response = []

            # Question, or a follow-up that extends the still-open question.
            current_question.append(text)

        elif speaker == "Participant" and current_question:
            current_response.append(text)

    # Flush the pair that is still open when the transcript ends.
    if current_question and current_response:
        question_answer_pairs.append({
            'question': " ".join(current_question),
            'answer': ". ".join(current_response)
        })

    return pd.DataFrame(question_answer_pairs, columns=['question', 'answer'])

In [None]:
def chunk_qa_pairs(df, max_tokens=80, max_overlap_tokens=40):
    """Pack question/answer pairs into overlapping text chunks.

    Every row of ``df`` is rendered as
    ``"Interviewer: <question> Interviewee: <answer>"``. Pairs are appended to
    the current chunk until adding the next one would push the chunk past
    ``max_tokens``; the chunk is then closed and the next chunk is seeded with
    the most recent complete pairs of the closed one (the overlap).

    "Tokens" here are whitespace-separated words, not tokenizer tokens. Pairs
    are never split, so a single pair longer than ``max_tokens`` still forms
    (or joins) a chunk that exceeds the limit.

    Args:
        df: DataFrame with the columns ``question`` and ``answer``.
        max_tokens: Word budget that triggers closing the current chunk.
        max_overlap_tokens: Trailing pairs are carried over while their
            cumulative word count stays strictly below this value.

    Returns:
        A list of chunk strings in transcript order; empty when ``df`` has no
        rows. No chunk in the list is an empty string.
    """
    chunks = []
    current_chunk = []
    current_chunk_word_count = 0

    # Render every row as one "Interviewer ... Interviewee ..." string.
    qa_pairs = [f"Interviewer: {row['question']} Interviewee: {row['answer']}" for _, row in df.iterrows()]

    for pair in qa_pairs:
        pair_word_count = len(pair.split())

        # Only close a chunk that actually holds something. Without the
        # `current_chunk` guard, a first pair that is longer than max_tokens
        # would flush the still-empty chunk and emit "" as a chunk.
        if current_chunk and current_chunk_word_count + pair_word_count > max_tokens:
            chunks.append(" ".join(current_chunk))

            # Walk backwards over the closed chunk and keep whole pairs while
            # the running word count stays below the overlap budget.
            overlap = []
            overlap_word_count = 0
            for qa in reversed(current_chunk):
                overlap_word_count += len(qa.split())
                if overlap_word_count >= max_overlap_tokens:
                    break
                overlap.append(qa)

            # Collected newest-first; restore transcript order.
            overlap.reverse()

            current_chunk = overlap
            current_chunk_word_count = sum(len(q.split()) for q in current_chunk)

        current_chunk.append(pair)
        current_chunk_word_count += pair_word_count

    # Whatever is left forms the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


In [None]:
# Installs openpyxl into the runtime; presumably required by the
# pd.read_excel call on the .xlsx metadata file in the next cell — TODO confirm.
!pip install openpyxl



In [None]:
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the 'Metadata_mapping' sheet of the demographic workbook (path is
# relative to the current working directory).
sheet_name = 'Metadata_mapping'
file_path = 'DAIC demographc data.xlsx'
data_csv = pd.read_excel(file_path, sheet_name=sheet_name)

# id_depression_label_map: {Participant_ID: PHQ_Binary}
# all_ids: every Participant_ID present in the sheet
id_depression_label_map = {}
all_ids = set()
for row_label in range(len(data_csv['Participant_ID'])):
    participant_id = data_csv['Participant_ID'][row_label]
    id_depression_label_map[participant_id] = data_csv['PHQ_Binary'][row_label]
    all_ids.add(participant_id)


In [None]:
import os

from transformers import AutoTokenizer
# Read the Hugging Face access token from the HF_TOKEN environment variable so
# that no real credential has to be written into the notebook; the original
# placeholder is kept as the fallback when the variable is not set.
access_token = os.environ.get("HF_TOKEN", "TOKEN")
tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base", token=access_token)
import torch

# Length every chunk is truncated/padded to by the tokenizer call in
# collect_train_test_data (max_length=max_len, padding="max_length").
max_len = 510
def collect_train_test_data(directory):
    """Tokenise every interview transcript in ``directory`` into labelled chunks.

    For each ``*.csv`` file (tab-separated, with the columns ``start_time``
    and ``stop_time`` dropped and missing values replaced by ``''``) the
    transcript is turned into question/answer pairs, packed into overlapping
    chunks (``max_tokens=300``, ``max_overlap_tokens=80``) and every chunk is
    tokenised to exactly ``max_len`` ids. Each chunk receives the label of its
    interview, looked up in ``id_depression_label_map`` by the first run of
    digits in the file name.

    Relies on the module-level ``tokenizer``, ``max_len`` and
    ``id_depression_label_map``.

    Args:
        directory: Folder containing the transcript CSV files.

    Returns:
        A tuple ``(X_train, Y_train, attention_masks)`` of ``torch.long``
        tensors: token ids of shape ``(num_chunks, max_len)``, labels of shape
        ``(num_chunks,)`` and attention masks of shape
        ``(num_chunks, max_len)``.

    Raises:
        KeyError: If an interview that produced chunks has no entry in
            ``id_depression_label_map``.
    """
    X_train = []
    Y_train = []
    attention_masks = []

    # os.listdir returns entries in arbitrary order. Sorting fixes the row
    # order of the returned tensors, so index-based splits made later with a
    # fixed random_state select the same rows on every run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        # The interview id is the first run of digits in the file name.
        interview_id = re.findall(r'\d+', filename)[0]
        print(interview_id)

        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path, delimiter='\t')
        df = df.drop(columns=['start_time', 'stop_time']).fillna('')

        # Step 1: Create QA pairs, then pack them into overlapping chunks.
        qa_df = create_question_answer_pairs(df)
        chunks = chunk_qa_pairs(qa_df, max_tokens=300, max_overlap_tokens=80)
        if not chunks:
            continue

        # The label is the same for every chunk of an interview: look it up
        # once and fail with a message that names the missing participant.
        participant_key = int(interview_id)
        if participant_key not in id_depression_label_map:
            raise KeyError(
                f"No PHQ_Binary label for participant {participant_key} (file {filename!r})"
            )
        label_tensor = torch.tensor(id_depression_label_map[participant_key], dtype=torch.long)

        for chunk in chunks:
            encoded_text = tokenizer(chunk, add_special_tokens=True, max_length=max_len, truncation=True, padding="max_length")

            X_train.append(torch.tensor(encoded_text['input_ids'], dtype=torch.long))
            attention_masks.append(torch.tensor(encoded_text['attention_mask'], dtype=torch.long))
            Y_train.append(label_tensor)

    X_train = torch.stack(X_train)
    Y_train = torch.stack(Y_train)
    attention_masks = torch.stack(attention_masks)
    return X_train, Y_train, attention_masks





tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:

# Load the data: tokenise every transcript CSV in the dataset folder.
# X_train holds the token ids, Y_train the per-chunk labels and
# attention_masks the matching padding masks (one row per chunk).
directory = '/content/drive/MyDrive/Fairness_NLP/Dataset'
X_train, Y_train, attention_masks = collect_train_test_data(directory)


345
305
309
306
341
372
364
356
369
381
360
379
350
338
362
352
335
304
339
300
331
358
329
333
355
371
319
346
344
367
316
337
328
314
324
383
302
326
349
353
327
340
365
370
301
378
347
336
334
307
325
373
308
323
343
363
366
320
354
315
310
322
357
312
311
351
303
359
361
318
313
317
332
321
330
368
348
384
416
425
374
466
475
465
485
472
422
387
426
471
392
396
476
479
447
440
415
431
424
487
488
432
437
409
454
382
412
397
492
399
433
385
450
441
491
478
436
453
449
484
400
411
467
375
446
410
413
469
395
404
443
489
388
463
434
391
470
457
429
452
464
448
389
386
390
402
407
468
481
423
439
393
490
474
486
376
403
459
418
417
405
483
401
428
456
461
435
427
406
421
377
482
473
408
455
477
420
380
430
414
419
438
445
442
462
444


In [None]:
# Sanity check: the three tensors must agree on the number of chunks (first dimension).
for stacked in (X_train, Y_train, attention_masks):
    print(stacked.shape)

torch.Size([1780, 510])
torch.Size([1780])
torch.Size([1780, 510])


In [None]:
import os
import re
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
# Disable the Weights & Biases integration of the Trainer.
os.environ["WANDB_DISABLED"] = "true"

# Read the Hugging Face access token from the HF_TOKEN environment variable so
# that no real credential has to be written into the notebook; the original
# placeholder is kept as the fallback when the variable is not set.
access_token = os.environ.get("HF_TOKEN", "TOKEN")

# Custom model class
class CustomSequenceClassifier(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    The encoder is loaded with ``AutoModel.from_pretrained`` (using the
    module-level ``access_token``); the classifier maps the hidden state of
    the first token of each sequence to ``num_labels`` logits.
    """

    def __init__(self, model_name, num_labels):
        """Load the encoder ``model_name`` and create the linear head."""
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name, token=access_token)
        hidden_size = self.base_model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the head.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask passed to the encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.

        Returns:
            A dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'``.
        """
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Sequence representation = last-layer hidden state at position 0
        # (the first token); the encoder's own pooler output is not used.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)

        if labels is None:
            return {'loss': None, 'logits': logits}

        criterion = nn.CrossEntropyLoss()
        return {'loss': criterion(logits, labels), 'logits': logits}



# Define a custom Dataset class
class DepressionDataset(Dataset):
    """Map-style dataset yielding one tokenised chunk per index.

    Args:
        input_ids: Indexable collection of token-id sequences (tensor or
            nested list), one entry per sample.
        labels: Indexable collection of labels, aligned with ``input_ids``.
        attention_masks: Indexable collection of attention masks, aligned
            with ``input_ids``.
    """

    def __init__(self, input_ids, labels, attention_masks):
        self.input_ids = input_ids
        self.labels = labels
        self.attention_masks = attention_masks

    def __len__(self):
        """Number of samples (length of ``input_ids``)."""
        return len(self.input_ids)

    @staticmethod
    def _as_long(value):
        """Return an independent ``torch.long`` tensor holding ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call; tensors are therefore copied with
        ``detach().clone()``, while plain Python/NumPy data still goes
        through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone().to(torch.long)
        return torch.tensor(value, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of ``torch.long`` tensors.

        Keys are ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
        size-1 dimensions of the label are squeezed away.
        """
        return {
            'input_ids': self._as_long(self.input_ids[idx]),
            'attention_mask': self._as_long(self.attention_masks[idx]),
            'labels': self._as_long(self.labels[idx]).squeeze()
        }

# Initialize KFold and metrics storage
kfold = KFold(n_splits=5, shuffle=True, random_state=17)
k_fold_iteration = 0
total_bac = []
total_precision = []
total_recall = []
total_f1 = []

trained_models_base_dir = '/content/drive/MyDrive/Fairness_NLP/'


# Defined once, outside the fold loop: it does not depend on any per-fold state.
def compute_metrics(eval_pred):
    """Compute classification metrics for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; predictions are the argmax of
            the logits over axis 1.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision``, ``recall`` (binary
        averaging) and ``balanced_accuracy``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    bac = balanced_accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'balanced_accuracy': bac
    }


# Perform k-fold cross-validation
#
# NOTE(review): the split is made over rows of X_train, i.e. over individual
# chunks. Chunks of one interview overlap and share a label, so chunks of the
# same participant can end up in both the training and the validation fold,
# which inflates the validation metrics. A participant-level split (e.g.
# GroupKFold) needs the participant id of every row, which is not available
# in X_train / Y_train / attention_masks.
for train_index, test_index in kfold.split(X_train):
    k_fold_iteration += 1

    # Split into training and validation sets for this fold
    train_inputs, val_inputs = X_train[train_index], X_train[test_index]
    train_labels, val_labels = Y_train[train_index], Y_train[test_index]
    attention_mask_train, attention_mask_val = attention_masks[train_index], attention_masks[test_index]

    # Initialize datasets for the current fold
    train_dataset = DepressionDataset(train_inputs, train_labels, attention_mask_train)
    val_dataset = DepressionDataset(val_inputs, val_labels, attention_mask_val)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'{trained_models_base_dir}/trained_models/k_fold_{k_fold_iteration}/',
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=4,
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        weight_decay=0.01,
        save_strategy='epoch',
        logging_dir=f'{trained_models_base_dir}/logs/k_fold_{k_fold_iteration}',
        load_best_model_at_end=True,
        # The checkpoint restored at the end is the one with the best plain
        # accuracy, while the figures reported below are the other metrics.
        metric_for_best_model='accuracy',
        save_total_limit=1,
        logging_steps=1,
        # Supported way to switch off external loggers; the WANDB_DISABLED
        # environment variable is deprecated.
        report_to='none'
    )

    # Fresh model for every fold so that no weights carry over between folds.
    model = CustomSequenceClassifier("mental/mental-roberta-base", num_labels=2)

    # Create a Trainer for this fold
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation set and store metrics
    metrics = trainer.evaluate(val_dataset)
    total_bac.append(metrics['eval_balanced_accuracy'])
    total_precision.append(metrics['eval_precision'])
    total_recall.append(metrics['eval_recall'])
    total_f1.append(metrics['eval_f1'])

    print(f"Fold {k_fold_iteration} - Balanced Accuracy: {metrics['eval_balanced_accuracy']}, Precision: {metrics['eval_precision']}, Recall: {metrics['eval_recall']}, F1 Score: {metrics['eval_f1']}")

# Print overall results after cross-validation
print(f"Average Balanced Accuracy: {np.mean(total_bac)}, Average Precision: {np.mean(total_precision)}, Average Recall: {np.mean(total_recall)}, Average F1 Score: {np.mean(total_f1)}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Balanced Accuracy
1,0.6457,0.513422,0.792135,0.0,0.0,0.0,0.5
2,0.4207,0.514796,0.730337,0.5,0.40678,0.648649,0.700211
3,0.1087,0.450999,0.766854,0.471338,0.445783,0.5,0.66844
4,0.1335,0.537741,0.761236,0.572864,0.456,0.77027,0.764568
5,0.6954,0.797353,0.769663,0.539326,0.461538,0.648649,0.725034
6,0.0013,1.321468,0.764045,0.533333,0.45283,0.648649,0.721487
7,0.0002,1.037883,0.831461,0.552239,0.616667,0.5,0.70922
8,0.0001,1.530869,0.769663,0.554348,0.463636,0.689189,0.739985
9,0.0001,1.496773,0.792135,0.593407,0.5,0.72973,0.76912
10,0.0001,1.404107,0.803371,0.593023,0.520408,0.689189,0.761261


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 1 - Balanced Accuracy: 0.7092198581560284, Precision: 0.6166666666666667, Recall: 0.5, F1 Score: 0.5522388059701493


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Balanced Accuracy
1,0.3971,0.470466,0.792135,0.0,0.0,0.0,0.5
2,0.6499,0.433774,0.803371,0.416667,0.543478,0.337838,0.631685
3,0.3654,0.399808,0.83427,0.486957,0.682927,0.378378,0.66614
4,0.3114,1.000777,0.671348,0.526316,0.375723,0.878378,0.7477
5,0.4519,1.249924,0.727528,0.548837,0.41844,0.797297,0.753259
6,0.0007,1.404248,0.755618,0.583732,0.451852,0.824324,0.780956
7,0.0001,1.669422,0.741573,0.566038,0.434783,0.810811,0.767108
8,0.0001,1.505535,0.775281,0.587629,0.475,0.77027,0.773433
9,0.0,1.473434,0.783708,0.605128,0.487603,0.797297,0.78872
10,0.0001,1.536279,0.775281,0.59596,0.475806,0.797297,0.7834


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2 - Balanced Accuracy: 0.6661395437991182, Precision: 0.6829268292682927, Recall: 0.3783783783783784, F1 Score: 0.48695652173913045


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Balanced Accuracy
1,0.5032,0.559355,0.764045,0.086957,0.8,0.045977,0.52113
2,0.9733,0.491438,0.789326,0.43609,0.630435,0.333333,0.635068
3,0.6244,0.48776,0.772472,0.580311,0.528302,0.643678,0.728902
4,0.1003,0.657664,0.814607,0.592593,0.64,0.551724,0.725676
5,0.6852,0.957684,0.794944,0.562874,0.5875,0.54023,0.708777
6,0.9829,1.155202,0.825843,0.557143,0.735849,0.448276,0.698116
7,0.0006,1.718234,0.727528,0.561086,0.462687,0.712644,0.722493
8,0.0001,1.551719,0.780899,0.566667,0.548387,0.586207,0.715037
9,0.0001,1.663193,0.772472,0.562162,0.530612,0.597701,0.713349
10,0.0001,1.675443,0.775281,0.569892,0.535354,0.609195,0.719096


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 3 - Balanced Accuracy: 0.698115626201769, Precision: 0.7358490566037735, Recall: 0.4482758620689655, F1 Score: 0.5571428571428572


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Balanced Accuracy
1,0.4719,0.467954,0.792135,0.0,0.0,0.0,0.5
2,0.6669,0.462704,0.783708,0.533333,0.483516,0.594595,0.713964
3,0.1934,0.430955,0.803371,0.597701,0.52,0.702703,0.766245
4,0.0702,0.599835,0.83427,0.604027,0.6,0.608108,0.750863
5,0.5147,0.767639,0.828652,0.590604,0.586667,0.594595,0.742333
6,0.0003,1.058661,0.789326,0.590164,0.495413,0.72973,0.767347
7,0.0001,1.043591,0.825843,0.630952,0.56383,0.716216,0.785413
8,0.0001,1.08434,0.83427,0.642424,0.582418,0.716216,0.790732
9,0.0001,1.306283,0.817416,0.640884,0.542056,0.783784,0.805012
10,0.0001,1.233944,0.820225,0.636364,0.54902,0.756757,0.796818


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 4 - Balanced Accuracy: 0.7508625646923519, Precision: 0.6, Recall: 0.6081081081081081, F1 Score: 0.6040268456375839


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Balanced Accuracy
1,0.2802,0.543729,0.775281,0.0,0.0,0.0,0.5
2,0.2474,0.479279,0.808989,0.392857,0.6875,0.275,0.619384
3,0.2172,0.550145,0.814607,0.484375,0.645833,0.3875,0.662953
4,0.3831,0.61908,0.808989,0.507246,0.603448,0.4375,0.677083
5,0.3697,0.811552,0.780899,0.52439,0.511905,0.5375,0.694475


In [None]:
import numpy as np

In [None]:
# Re-prints the cross-validation averages. The total_* lists are created and
# filled by the k-fold training cell, so this only works in a kernel session
# in which that cell has already run; otherwise it raises NameError.
print(f"Average Balanced Accuracy: {np.mean(total_bac)}, Average Precision: {np.mean(total_precision)}, Average Recall: {np.mean(total_recall)}, Average F1 Score: {np.mean(total_f1)}")

NameError: name 'total_bac' is not defined