In [None]:
!pip install transformers datasets



In [None]:
from datasets import load_dataset

# Step 1: Load Dataset
print("Loading dataset...")
dataset = load_dataset("halilbabacan/combined_synthetic_cognitive_distortions")
print(dataset.column_names)

# Step 2: Define Valid Labels and Label Normalization
valid_labels = [
    "Mind Reading", "Overgeneralization", "Magnification", "Labelling",
    "Personalization", "Fortune-telling", "Emotional Reasoning",
    "Mental Filter", "Should Statements", "All-or-Nothing Thinking"
]

label_mapping = {
    "Mind Reading": "Mind Reading",
    "Overgeneralization": "Overgeneralization",
    "Magnification": "Magnification",
    "Labelling": "Labelling",
    "Labeling": "Labelling",  # Handle alternate spelling
    "Personalization": "Personalization",
    "Fortune-telling": "Fortune-telling",
    "Emotional Reasoning": "Emotional Reasoning",
    "Mental Filter": "Mental Filter",
    "Mental filter": "Mental Filter",  # Handle inconsistent capitalization
    "Should Statements": "Should Statements",
    "Should statements": "Should Statements",
    "All-or-Nothing Thinking": "All-or-Nothing Thinking",
    "All-or-nothing thinking": "All-or-Nothing Thinking",  # Handle case issues
}

# Step 3: Filter and Normalize Dataset
def clean_data(example):
    """
    Filter out invalid rows and normalize labels.
    """
    if example["text"] is None or example["label"] not in label_mapping:
        return False  # Exclude this row
    example["label"] = label_mapping[example["label"]]
    return True

print("Filtering dataset...")
dataset = dataset["train"].filter(clean_data)

print(f"Dataset size after cleaning: {len(dataset)}")

# Step 4: Verify Cleaned Dataset
print("Verifying dataset...")
unique_labels = set(dataset["label"])
print(f"Unique labels: {unique_labels}")
print(f"Sample row: {dataset[0]}")

# Step 5: Save Cleaned Dataset (Optional)
dataset.save_to_disk("cleaned_dataset")
print("Cleaned dataset saved.")

# Load and push the dataset to Hugging Face
dataset_to_push = load_from_disk("cleaned_dataset")
dataset_to_push.push_to_hub("shanthi-323/cleaned_dataset_synthetic_cognitive_distortions", token="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")

print("Dataset pushed to Hugging Face successfully!")



Loading dataset...
{'train': ['text', 'label']}
Filtering dataset...
Dataset size after cleaning: 3597
Verifying dataset...
Unique labels: {'Labelling', 'Magnification', 'All-or-Nothing Thinking', 'All-or-nothing thinking', 'Personalization', 'Emotional Reasoning', 'Overgeneralization', 'Mental Filter', 'Labeling', 'Should Statements', 'Mental filter', 'Mind Reading', 'Fortune-telling', 'Should statements'}
Sample row: {'text': 'John walked past me without saying a word He must be angry at me for something', 'label': 'Mind Reading'}


Saving the dataset (0/1 shards):   0%|          | 0/3597 [00:00<?, ? examples/s]

Cleaned dataset saved.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Dataset pushed to Hugging Face successfully!


In [None]:
!pip install --upgrade transformers datasets torch

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20

In [None]:
!pip install -U transformers



In [None]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import os

from huggingface_hub import notebook_login
from huggingface_hub import login

login()

# Step 1: Log in to Hugging Face Hub
print("Logging into Hugging Face Hub...")
notebook_login()


# Step 2: Load Cleaned Dataset
print("Loading cleaned dataset...")
dataset = load_from_disk("cleaned_dataset")

# Step 3: Map label strings to integers
label_mapping = {
    "Mind Reading": 0,
    "Overgeneralization": 1,
    "Magnification": 2,
    "Labelling": 3,
    "Labeling": 3,
    "Personalization": 4,
    "Fortune-telling": 5,
    "Emotional Reasoning": 6,
    "Mental Filter": 7,
    "Mental filter": 7,
    "Should Statements": 8,
    "Should statements": 8,
    "All-or-Nothing Thinking": 9,
    "All-or-nothing thinking": 9,
    "No Distortion": 10,
}

def encode_labels(example):
    example["label"] = label_mapping[example["label"]]
    return example

print("Encoding labels...")
dataset = dataset.map(encode_labels)

# Step 4: Split dataset into train and eval sets
print("Splitting dataset...")
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

# Step 5: Tokenize the dataset
print("Tokenizing dataset...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
eval_dataset = eval_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
eval_dataset.set_format("torch")

# Step 6: Load BERT model
print("Initializing BERT model...")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_mapping),
    return_dict=True
).to("cuda")

# Override forward method to ignore unexpected arguments
class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(self, **kwargs):
        return super().forward(**kwargs)

# Step 7: Training arguments
training_args = TrainingArguments(
    output_dir="./results_",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=True,  # Enable pushing to Hugging Face Hub
    hub_model_id="shanthi-323/fine-tuned-bert-CBT",  # Replace with your username and desired model name
    report_to="none"
)

# Step 8: Trainer initialization
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Step 9: Train the model
print("Training the model...")
trainer.train()

# Step 10: Save the fine-tuned model locally
print("Saving the fine-tuned model locally...")
trainer.save_model("fine_tuned_bert_CBT")
tokenizer.save_pretrained("fine_tuned_bert_CBT")

# Push to Hugging Face Hub
repo_id = "shanthi-323/fine-tuned-bert-CBT"  # Replace with your Hugging Face username and desired repo name
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

print(f"Model pushed to Hugging Face: https://huggingface.co/{repo_id}")


print("Training complete and model saved to Hugging Face Hub!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Logging into Hugging Face Hub...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading cleaned dataset...
Encoding labels...
Splitting dataset...
Training samples: 2877
Evaluation samples: 720
Tokenizing dataset...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing BERT model...


  trainer = Trainer(


Initializing Trainer...
Training the model...


Step,Training Loss,Validation Loss
10,2.6996,2.593189
20,2.5659,2.494923
30,2.5146,2.396897
40,2.4268,2.380316
50,2.3637,2.270284
60,2.2599,2.242888
70,2.2562,2.107768
80,2.0763,2.06181
90,2.0285,2.000077
100,2.0964,1.923367


Saving the fine-tuned model locally...


README.md:   0%|          | 0.00/6.95k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Model pushed to Hugging Face: https://huggingface.co/shanthi-323/fine-tuned-bert-CBT
Training complete and model saved to Hugging Face Hub!


In [None]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_from_disk, Dataset, DatasetDict
from huggingface_hub import Repository
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# Step 1: Load the Fine-Tuned Model and Tokenizer
print("Loading model and tokenizer...")
model_name = "shanthi-323/fine-tuned-bert-CBT"
tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    token="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"
).to("cpu")  # Use CPU for debugging

print(f"Number of labels in the model: {model.config.num_labels}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, use_auth_token=True).to("cpu")
print("Model and tokenizer loaded successfully.")



# Step 2: Split the dataset into train and test sets
print("Loading evaluation dataset...")
eval_dataset = load_dataset("shanthi-323/cleaned_dataset_synthetic_cognitive_distortions", split="train")

print(f"Evaluation dataset contains {len(eval_dataset)} samples.")

print("Splitting dataset into train and test...")
ttrain_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Step 3: Create a DatasetDict with train and test splits
updated_dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

# Create a DatasetDict with train and test splits
updated_dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

# Save the split dataset locally
updated_dataset.save_to_disk("updated_cleaned_datasetcognitive_distortions")
print("Dataset split and saved locally.")


# Save the updated dataset locally
updated_dataset.save_to_disk("updated_cleaned_datasetcognitive_distortions")

# Push the updated dataset to Hugging Face
updated_dataset.push_to_hub("shanthi-323/updated_cleaned_datasetcognitive_distortions")

# Load the test split
eval_dataset = load_dataset("shanthi-323/updated_cleaned_datasetcognitive_distortions", split="test")
print(f"Evaluation dataset contains {len(eval_dataset)} samples.")


# Tokenize the evaluation dataset
print("Tokenizing evaluation dataset...")
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(["text"])  # Remove text column if necessary
eval_dataset.set_format("torch")

# Extract labels from the evaluation dataset
labels = eval_dataset['label']

# Create a label encoder
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Step 4: Create DataLoader
eval_loader = DataLoader(eval_dataset, batch_size=16)  # Adjust batch size if necessary


# Step 5: Function to Get Predictions

def convert_labels(example):
    example["label"] = label_mapping[example["label"]]
    return example

# Apply the mapping
tokenized_eval_dataset = tokenized_eval_dataset.map(convert_labels)


def get_predictions(model, dataloader):
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']

            # Then do your prediction logic
            ...

# Then you'd call it like:
y_pred, y_true = get_predictions(model, eval_loader)
# Encode evaluation dataset
encoded_eval_dataset = encode_labels(eval_dataset)

print(encoded_eval_dataset[0])


def collate_fn(batch):
    # Extract labels and encode them
    labels = [example['label'] for example in batch]
    encoded_labels = label_encoder.transform(labels)

    return {
        "input_ids": torch.stack([example["input_ids"] for example in batch]),
        "attention_mask": torch.stack([example["attention_mask"] for example in batch]),
        "labels": torch.tensor(encoded_labels, dtype=torch.long)
    }

# When creating your DataLoader, specify the collate_fn
eval_loader = DataLoader(
    eval_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=False
)

print("DataLoader created successfully!")


# Step 6: Generate Predictions
y_pred, y_true = get_predictions(model, eval_loader)

# Step 7: Generate Classification Report
print("Generating classification report...")
label_mapping = {
    "Mind Reading": 0,
    "Overgeneralization": 1,
    "Magnification": 2,
    "Labelling": 3,
    "Labeling": 3,
    "Personalization": 4,
    "Fortune-telling": 5,
    "Emotional Reasoning": 6,
    "Mental Filter": 7,
    "Mental filter": 7,
    "Should Statements": 8,
    "Should statements": 8,
    "All-or-Nothing Thinking": 9,
    "All-or-nothing thinking": 9,
    "No Distortion": 10,
    "Disqualifying the Positive": 11,
    "Jumping to Conclusions": 12,
    "Blaming": 13,
    "Comparisons": 14
}


classification_report_output = classification_report(y_true, y_pred, target_names=label_mapping.values())
print(classification_report_output)

# Step 8: Save Classification Report Locally
report_filename = "classification_report.txt"
with open(report_filename, "w") as f:
    f.write(classification_report_output)

print(f"Classification report saved locally as {report_filename}")

# Step 9: Push the Report to Hugging Face Hub
print("Pushing the classification report to Hugging Face Hub...")
repo = Repository(local_dir="./temp_repo", clone_from=model_name, use_auth_token=True)
repo.git_pull()  # Ensure the latest changes are pulled

# Copy the classification report to the repository directory
import shutil
shutil.copy(report_filename, "./temp_repo")

# Commit and push the classification report
repo.git_add("classification_report.txt")
repo.git_commit("Add classification report for fine-tuned model evaluation")
repo.git_push()

print("Classification report successfully pushed to the Hugging Face Hub!")


Loading model and tokenizer...
Number of labels in the model: 15




Model and tokenizer loaded successfully.
Loading dataset...
Splitting dataset into train and test...


Saving the dataset (0/1 shards):   0%|          | 0/2877 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/720 [00:00<?, ? examples/s]

Dataset split and saved locally.


Saving the dataset (0/1 shards):   0%|          | 0/2877 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/720 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Evaluation dataset contains 720 samples.
Tokenizing evaluation dataset...


TypeError: cannot unpack non-iterable NoneType object

In [None]:
!nvidia-smi


Sun Dec  8 19:02:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0              30W /  70W |   1535MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from sklearn.metrics import classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_from_disk
from huggingface_hub import Repository
import os
import torch
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# Step 1: Load the Fine-Tuned Model and Tokenizer
model_name = "shanthi-323/fine-tuned-bert-CBT"  # Replace with your model name
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, use_auth_token=True).cpu()


# Step 2: Load the Evaluation Dataset from Hugging Face Hub
print("Loading evaluation dataset...")
eval_dataset = load_dataset("shanthi-323/cleaned_dataset_synthetic_cognitive_distortions", split="train")

print(f"Evaluation dataset contains {len(eval_dataset)} samples.")

# Step 3: Tokenize the Evaluation Dataset
print("Tokenizing evaluation dataset...")
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(["text"])  # Remove text column if necessary
eval_dataset.set_format("torch")

# Step 4: Create DataLoader
eval_loader = DataLoader(eval_dataset, batch_size=16)  # Adjust batch size if necessary

def collate_fn(batch):
    return {
        "input_ids": torch.stack([torch.tensor(example["input_ids"]) for example in batch]),
        "attention_mask": torch.stack([torch.tensor(example["attention_mask"]) for example in batch]),
        "labels": torch.tensor([example["label"] for example in batch]),  # Labels are now integers
    }

for batch in eval_loader:
    print(batch)
    break

# Step 5: Function to Get Predictions
def get_predictions(model, dataloader):
    print("Making predictions...")
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: value.to("cuda") for key, value in batch.items() if key != "labels"}
            labels = batch["labels"].cpu()
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return predictions, true_labels

# Step 6: Generate Predictions
y_pred, y_true = get_predictions(model, eval_loader)

# Step 7: Generate Classification Report
print("Generating classification report...")
label_mapping = {
    "Mind Reading": 0,
    "Overgeneralization": 1,
    "Magnification": 2,
    "Labelling": 3,
    "Labeling": 3,
    "Personalization": 4,
    "Fortune-telling": 5,
    "Emotional Reasoning": 6,
    "Mental Filter": 7,
    "Mental filter": 7,
    "Should Statements": 8,
    "Should statements": 8,
    "All-or-Nothing Thinking": 9,
    "All-or-nothing thinking": 9,
    "No Distortion": 10,
    "Disqualifying the Positive": 11,
    "Jumping to Conclusions": 12,
    "Blaming": 13,
    "Comparisons": 14
}

def encode_labels(dataset, label_mapping):
    for sample in dataset:
        sample["label"] = label_mapping[sample["label"]]
    return dataset

eval_dataset = encode_labels(eval_dataset, label_mapping)

for sample in eval_loader:
    print(sample["label"])
    assert (sample["label"] >= 0).all() and (sample["label"] < len(label_mapping)).all(), "Label out of range!"


classification_report_output = classification_report(y_true, y_pred, target_names=label_mapping.values())
print(classification_report_output)

# Step 8: Save Classification Report Locally
report_filename = "classification_report.txt"
with open(report_filename, "w") as f:
    f.write(classification_report_output)

print(f"Classification report saved locally as {report_filename}")

# Step 9: Push the Report to Hugging Face Hub
print("Pushing the classification report to Hugging Face Hub...")
repo = Repository(local_dir="./temp_repo", clone_from=model_name, use_auth_token=True)
repo.git_pull()  # Ensure the latest changes are pulled

# Copy the classification report to the repository directory
import shutil
shutil.copy(report_filename, "./temp_repo")

# Commit and push the classification report
repo.git_add("classification_report.txt")
repo.git_commit("Add classification report for fine-tuned model evaluation")
repo.git_push()

print("Classification report successfully pushed to the Hugging Face Hub!")




Loading evaluation dataset...
Evaluation dataset contains 3597 samples.
Tokenizing evaluation dataset...
{'label': ['Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading', 'Mind Reading'], 'input_ids': tensor([[ 101, 2198, 2939,  ...,    0,    0,    0],
        [ 101, 2076, 1996,  ...,    0,    0,    0],
        [ 101, 3505, 2134,  ...,    0,    0,    0],
        ...,
        [ 101, 8129, 2134,  ...,    0,    0,    0],
        [ 101, 3782, 2134,  ...,    0,    0,    0],
        [ 101, 2043, 4869,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]

AttributeError: 'list' object has no attribute 'to'

In [None]:
from datasets import load_dataset, concatenate_datasets # Import the necessary functions
import logging
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B")
EOS_TOKEN = tokenizer.eos_token

# Define the fine-tuned model name
classification_model_name = "shanthi-323/fine-tuned-bert-CBT"

print(f"Number of labels in the model: {model.config.num_labels}")

# Load the fine-tuned classification model
distortion_classifier = BertForSequenceClassification.from_pretrained(classification_model_name,return_dict=True).to("cuda")

# Load the tokenizer associated with the fine-tuned model
classifier_tokenizer = AutoTokenizer.from_pretrained(classification_model_name)

print(f"Number of labels in the model: {model.config.num_labels}")


# Define label mapping for classification
label_mapping = {
    0: "Mind Reading",
    1: "Overgeneralization",
    2: "Magnification",
    3: "Labelling",
    4: "Personalization",
    5: "Fortune Telling",
    6: "Emotional Reasoning",
    7: "Mental Filter",
    8: "Should Statements",
    9: "All-or-Nothing Thinking",
    10: "Catastrophizing",
    11: "Disqualifying the Positive",
    12: "Jumping to Conclusions",
    13: "Blaming",
    14: "Comparisons"
}


chatbot_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to classify user input using dataset2 and BERT
def classify_distortion(user_input):
    try:
        # Move inputs to the classifier's device
        device = next(distortion_classifier.parameters()).device
        inputs = classifier_tokenizer(user_input, return_tensors="pt").to(device)

        # Predict distortion class
        outputs = distortion_classifier(**inputs)
        predicted_class = outputs.logits.argmax(dim=1).item()
        class_label = label_mapping.get(predicted_class, "Unknown class")

        # Debug output
        print(f"Predicted Class: {predicted_class}, Class Label: {class_label}")

        return predicted_class, class_label
    except Exception as e:
        print(f"Classification Error: {e}")
        return None, "Classification Failed"


# Function to Determine if Classification is Needed
def should_classify_input(user_input):
    # Simple heuristic: classify if input contains emotional or reflective content
    keywords = [
        "feel", "failure", "never", "always", "distorted",
        "guilt", "worthless", "anxious", "overwhelmed", "hopeless",
        "useless", "pointless", "unfair", "impossible",
        "my fault", "blame", "should have", "wrong", "bad",
        "ruined", "disaster", "can't", "won't", "broken",
        "abandoned", "rejected", "lonely", "lost", "betrayed",
        "tired", "sick", "hurt", "angry", "scared"
    ]
    return any(keyword in user_input.lower() for keyword in keywords)

def validate_classification(predicted_class, user_input):
    # Define contexts where classification is meaningful
    if len(user_input.split()) < 5:  # Too short for meaningful classification
        return None, "No classification needed"
    return predicted_class, label_mapping[predicted_class]

# Main Processing Function
# Process the input and dynamically adjust the instruction
def process_input(user_input):
    enable_classification = should_classify_input(user_input)
    instruction = ""

    if enable_classification:
        try:
            distortion = classify_distortion(user_input)
            label_mapping = {
                 0: "Mind Reading",
                 1: "Overgeneralization",
                 2: "Magnification",
                 3: "Labelling",
                 4: "Personalization",
                 5: "Fortune Telling",
                 6: "Emotional Reasoning",
                 7: "Mental Filter",
                 8: "Should Statements",
                 9: "All-or-Nothing Thinking",
                10: "Catastrophizing",
                11: "Disqualifying the Positive",
                12: "Jumping to Conclusions",
                13: "Blaming",
                14: "Comparisons"
            }
            instruction = f"Challenge this cognitive distortion: {label_mapping.get(distortion, 'Unknown')}"
        except Exception as e:
            print(f"Classification Error: {e}")
            instruction = "Continue the conversation."
    else:
        instruction = "Continue the conversation."

    print(f"Final Instruction: {instruction}")  # Debugging
    return instruction


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# Function to inspect dataset schema and apply formatting dynamically
# Function to Format Datasets

def formatting_prompts_func_dataset1(examples):
    instruction = examples.get("instruction", [])
    input_text = examples.get("input", [])
    output_text = examples.get("output", [])
    texts = []

    for instr, inp, out in zip(instruction, input_text, output_text):
        # Skip rows with missing values
        if not instr or not inp or not out:
            texts.append("")  # Add an empty placeholder
        else:
            text = chatbot_prompt.format(instr, inp, out)
            texts.append(text)

    return {"text": texts}

# Function to format dataset3 (cognitive distortion-specific dataset)
def formatting_prompts_func_dataset3(examples):
    system = examples.get("system", "")
    user = examples.get("user", "")
    assistant = examples.get("assistant", "")
    texts = []

    for sys, usr, asst in zip(system, user, assistant):
        if not sys or not usr or not asst:
            continue  # Skip empty rows
        # Format with chatbot_prompt
        text = chatbot_prompt.format(sys, usr, asst) + tokenizer.eos_token
        texts.append(text)
    return {"text": texts}

# Function to tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=1024)

# Load and process datasets
formatted_datasets = []

# Dataset1: General conversational dataset
dataset1 = load_dataset("fadodr/mental_health_therapy", split="train")
dataset1 = dataset1.map(formatting_prompts_func_dataset1, batched=True)
formatted_datasets.append(dataset1)

# Dataset3: Cognitive distortion instruction dataset
dataset3 = load_dataset("epsilon3/cbt-cognitive-distortions-analysis", split="train")
dataset3 = dataset3.map(formatting_prompts_func_dataset3, batched=True)
formatted_datasets.append(dataset3)

# Combine datasets after formatting
combined_dataset = concatenate_datasets(formatted_datasets)
print(f"Combined dataset contains {len(combined_dataset)} examples.")

# Tokenize combined dataset
tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)
print("Tokenized dataset is ready for training!")

# Example classification-based instruction generation
user_input = "I know I will do poorly in the presentation tomorrow."
predicted_class, class_label = classify_distortion(user_input)

# Dynamically create instruction based on classification
instruction = f"Challenge this cognitive distortion: {class_label}" if predicted_class is not None else "Continue the conversation."
print(f"Instruction: {instruction}")

# Combine all datasets after formatting
if formatted_datasets:
    combined_datasets = concatenate_datasets(formatted_datasets)
    print(f"Combined dataset contains {len(combined_datasets)} examples.")
else:
    print("No valid datasets to combine.")


Number of labels in the model: 15
Number of labels in the model: 15
Dataset Length: 8580
['instruction', 'input', 'output', 'text']
{'instruction': "You are a helpful mental health counselling assistant, please answer the mental health questions based on the patient's description.  The assistant gives helpful, comprehensive, and appropriate answers to the user's questions.", 'input': "Lately, I've been experiencing heightened levels of anxiety, particularly in social situations. Meeting new people or speaking in public causes intense feelings of nervousness, sweating, and racing thoughts. I would like to explore ways to overcome this social anxiety and improve my ability to connect with others on a deeper level.", 'output': "One possible approach to addressing your social anxiety is through gradual exposure and practice. Start by identifying specific social situations that make you anxious, such as meeting new people or speaking in public. Once you have determined these triggers, gradu

Repo card metadata block was not found. Setting CardData to empty.


Combined dataset contains 9201 examples.
Tokenized dataset is ready for training!
Predicted Class: 5, Class Label: Fortune Telling
Instruction: Challenge this cognitive distortion: Fortune Telling
Combined dataset contains 9201 examples.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_datasets,
    dataset_text_field = "text",
    max_seq_length = 1024,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)


def fix_untrained_tokens(model, tokenizer, train_dataset, IGNORED_TOKENIZER_NAMES, eps):
    embedding_matrix = model.get_input_embeddings().weight
    lm_head_matrix = (
        model.get_output_embeddings().weight if model.get_output_embeddings() else None
    )
    if lm_head_matrix is None:
        print("Skipping output embedding adjustment for classification models.")
        return


trainer_stats = trainer.train()

AttributeError: 'NoneType' object has no attribute 'weight'

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer_stats = trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss
10,0.6152,0.998551
20,0.6311,0.992628
30,0.551,1.019438
40,0.8541,1.056139
50,0.5246,0.996485
60,0.4891,0.993419
70,0.7162,1.013578
80,0.5341,1.016543
90,0.5505,1.06758
100,0.7471,1.005861


In [None]:
import unsloth_zoo.tokenizer_utils as tokenizer_utils

# Override the problematic function
def fix_untrained_tokens_override(model, tokenizer, train_dataset, IGNORED_TOKENIZER_NAMES, eps):
    print("Skipping fix_untrained_tokens for classification model.")

# Replace the function in unsloth_zoo
tokenizer_utils.fix_untrained_tokens = fix_untrained_tokens_override

trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
10,0.1639,1.333813
20,0.1023,1.34576
30,0.0634,1.39747
40,0.2962,1.383634
50,0.2121,1.340842
60,0.105,1.419428


KeyboardInterrupt: 

In [None]:
model.save_pretrained("model_cbtchatbot") # Local saving
tokenizer.save_pretrained("model_cbtchatbot")
model.push_to_hub("shanthi-323/model_cbtchatbot", token = "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") # Online saving
tokenizer.push_to_hub("shanthi-323/model_cbtchatbot", token = "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") # Online saving

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shanthi-323/model_cbtchatbot/commit/e7bcb1bca5cef74ff60d7e5a06d4c91d00425200', commit_message='Upload tokenizer', commit_description='', oid='e7bcb1bca5cef74ff60d7e5a06d4c91d00425200', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shanthi-323/model_cbtchatbot', endpoint='https://huggingface.co', repo_type='model', repo_id='shanthi-323/model_cbtchatbot'), pr_revision=None, pr_num=None)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Step 1: Load the fine-tuned model and tokenizer from Hugging Face
model_name = "shanthi-323/fine-tuned-bert-CBT"

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cpu")
print("Model and tokenizer loaded successfully.")

# Step 2: Define the label mapping
label_mapping = {
                 0: "Mind Reading",
                 1: "Overgeneralization",
                 2: "Magnification",
                 3: "Labelling",
                 4: "Personalization",
                 5: "Fortune Telling",
                 6: "Emotional Reasoning",
                 7: "Mental Filter",
                 8: "Should Statements",
                 9: "All-or-Nothing Thinking",
                10: "Catastrophizing",
                11: "Disqualifying the Positive",
                12: "Jumping to Conclusions",
                13: "Blaming",
                14: "Comparisons"
}

print(f"Model expects {model.config.num_labels} labels.")
print(f"Label mapping contains {len(label_mapping)} labels.")

# Step 3: Define a function for classification
def classify_text(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to("cpu")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label
    predicted_label = outputs.logits.argmax(dim=1).item()
    predicted_label_description = label_mapping.get(predicted_label, "Unknown Label")

    return predicted_label, predicted_label_description

# Step 4: Test with sample inputs
def test_chatbot():
    print("Testing chatbot with sample inputs...")
    user_inputs = [
        "I feel like I always fail at everything I try.",
        "People are always judging me.",
        "I should have done better in the exam.",
        "I know I will embarrass myself tomorrow during the presentation.",
        "I am happy because i scored well",
        "Ah, I’ve got finals coming up and I’ve got teaching interviews to face, so I’m a little stressed out."
    ]

    for user_input in user_inputs:
        predicted_label, label_description = classify_text(user_input)
        print(f"Input: {user_input}")
        print(f"Predicted Label: {predicted_label} - {label_description}\n")

# Step 5: Run the test
if __name__ == "__main__":
    test_chatbot()


Loading model and tokenizer...
Model and tokenizer loaded successfully.
Model expects 15 labels.
Label mapping contains 15 labels.
Testing chatbot with sample inputs...
Input: I feel like I always fail at everything I try.
Predicted Label: 1 - Overgeneralization

Input: People are always judging me.
Predicted Label: 1 - Overgeneralization

Input: I should have done better in the exam.
Predicted Label: 4 - Personalization

Input: I know I will embarrass myself tomorrow during the presentation.
Predicted Label: 5 - Fortune Telling

Input: I am happy because i scored well
Predicted Label: 4 - Personalization

Input: Ah, I’ve got finals coming up and I’ve got teaching interviews to face, so I’m a little stressed out.
Predicted Label: 7 - Mental Filter

