In [None]:
!pip install datasets
!pip install transformers



In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Fetch the three mental-health corpora from the Hugging Face Hub, in the order
# they are bound to dataset_1, dataset_2 and dataset_3.
dataset_1, dataset_2, dataset_3 = (
    load_dataset(repo_id)
    for repo_id in (
        "marmikpandya/mental-health",
        "Amod/mental_health_counseling_conversations",
        "fadodr/mental_health_therapy",
    )
)

{'instruction': "You are a helpful mental health counselling assistant, please answer the mental health questions based on the patient's description.  The assistant gives helpful, comprehensive, and appropriate answers to the user's questions.",
 'input': "Lately, I've been experiencing heightened levels of anxiety, particularly in social situations. Meeting new people or speaking in public causes intense feelings of nervousness, sweating, and racing thoughts. I would like to explore ways to overcome this social anxiety and improve my ability to connect with others on a deeper level.",
 'output': "One possible approach to addressing your social anxiety is through gradual exposure and practice. Start by identifying specific social situations that make you anxious, such as meeting new people or speaking in public. Once you have determined these triggers, gradually expose yourself to them in a controlled manner. For example, you could start by attending small social gatherings with famili

In [None]:
# model = AutoModelForCausalLM.from_pretrained('gpt2')

In [None]:
# Column names per split, before harmonising the schemas.
print(dataset_1.column_names)
print(dataset_2.column_names)
print(dataset_3.column_names)

# dataset_1 and dataset_3 already expose "input"/"output" columns, so only
# dataset_2 ("Context"/"Response") has to be renamed to the shared schema.
column_aliases = {"Context": "input", "Response": "output"}

# Rename only the columns that are still present in every split. Without this
# guard the cell fails when re-run, because the source columns no longer exist
# once dataset_2 has been renamed.
pending_renames = {
    old_name: new_name
    for old_name, new_name in column_aliases.items()
    if all(old_name in split_columns for split_columns in dataset_2.column_names.values())
}
if pending_renames:
    dataset_2 = dataset_2.rename_columns(pending_renames)

# Column names per split, after harmonising the schemas.
print(dataset_1.column_names)
print(dataset_2.column_names)
print(dataset_3.column_names)

{'train': ['instruction', 'output', 'input', 'input_ids', 'attention_mask', 'labels']}
{'train': ['Context', 'Response']}
{'train': ['instruction', 'input', 'output'], 'test': ['instruction', 'input', 'output']}
{'train': ['instruction', 'output', 'input', 'input_ids', 'attention_mask', 'labels']}
{'train': ['input', 'output']}
{'train': ['instruction', 'input', 'output'], 'test': ['instruction', 'input', 'output']}


In [None]:
# Drop everything except the prompt ("input") and reply ("output") columns so
# that all three corpora share exactly the same schema.
dataset_1, dataset_2, dataset_3 = (
    corpus.select_columns(["input", "output"])
    for corpus in (dataset_1, dataset_2, dataset_3)
)

In [None]:
# Load the GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
# GPT-2 defines no padding token of its own, so the end-of-sequence token is
# reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token

#tokenize the dataset
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    Each example becomes ``"<input> <output><eos>"``, tokenized with the
    module-level ``tokenizer``, truncated/padded to ``max_length`` tokens.

    Args:
        examples: Batch mapping with parallel ``'input'`` and ``'output'``
            sequences of strings (as passed by ``Dataset.map(batched=True)``).
        max_length: Fixed sequence length to truncate/pad to. Defaults to 512.

    Returns:
        The tokenizer's output with an added ``'labels'`` entry: a copy of
        ``input_ids`` in which every padding position (attention mask 0) is
        replaced by -100 so it is ignored by the language-modeling loss.
    """
    # The pad token is the EOS token, and padding is masked out of the labels
    # below, so an explicit EOS is appended to keep a real end-of-sequence
    # target in every (non-truncated) example.
    eos = tokenizer.eos_token
    text = [
        f"{prompt} {response}{eos}"
        for prompt, response in zip(examples['input'], examples['output'])
    ]
    inputs = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)

    # -100 is the ignore index of the loss; without it most of the loss would
    # come from predicting padding tokens rather than the conversation text.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    return inputs

# Run tokenize_function over every split of each corpus, one batch at a time.
tokenized_dataset_1, tokenized_dataset_2, tokenized_dataset_3 = (
    corpus.map(tokenize_function, batched=True)
    for corpus in (dataset_1, dataset_2, dataset_3)
)


Map:   0%|          | 0/13358 [00:00<?, ? examples/s]

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

Map:   0%|          | 0/8580 [00:00<?, ? examples/s]

Map:   0%|          | 0/3678 [00:00<?, ? examples/s]

In [None]:
from datasets import concatenate_datasets

# Stack the "train" split of each tokenized corpus into one training set.
combined_tokenized_dataset = concatenate_datasets(
    [
        tokenized["train"]
        for tokenized in (tokenized_dataset_1, tokenized_dataset_2, tokenized_dataset_3)
    ]
)


Fine-tuning the model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
# Start from the pretrained GPT-2 checkpoint.
model = AutoModelForCausalLM.from_pretrained('gpt2')

# One training epoch with evaluation at the end of each epoch; checkpoints go
# to ./result, logs to ./logs, and external experiment reporting is disabled.
training_args = TrainingArguments(
    output_dir='./result',
    logging_dir='./logs',
    report_to='none',
    eval_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
)

# Train on the combined corpus; evaluate on the "test" split of dataset_3.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_tokenized_dataset,
    eval_dataset=tokenized_dataset_3["test"],
)

# Run fine-tuning.
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be reloaded with from_pretrained().
model_output_dir = './result/model'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)


Epoch,Training Loss,Validation Loss
1,0.8971,1.335104


('./result/model/tokenizer_config.json',
 './result/model/special_tokens_map.json',
 './result/model/vocab.json',
 './result/model/merges.txt',
 './result/model/added_tokens.json',
 './result/model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_model_parameters(model):
    """Return the total element count across everything in ``model.parameters()``."""
    count = 0
    for tensor in model.parameters():
        count += tensor.numel()
    return count

def main(model_path='./result/model', max_new_tokens=50):
    """Load the fine-tuned model and generate a reply to one typed prompt.

    Args:
        model_path: Directory holding the saved model and tokenizer.
        max_new_tokens: Maximum number of tokens to generate *in addition to*
            the prompt.

    Prints the model's parameter count, reads one prompt from standard input,
    and prints the decoded generation. Returns ``None``.
    """
    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Report the size of the model being used for inference
    total_params = get_model_parameters(model)
    print(f"Total number of parameters: {total_params}")

    # Interactive input
    input_text = input("Give input.")
    if not input_text.strip():
        # A blank prompt gives the model nothing to condition on.
        print("No input provided; nothing to generate.")
        return

    # Encode the prompt as PyTorch tensors
    inputs = tokenizer(input_text, return_tensors='pt')

    # max_new_tokens bounds only the generated continuation, so long prompts
    # still get a reply (max_length would count the prompt tokens as well).
    # pad_token_id is passed explicitly because GPT-2 has no padding token;
    # this avoids the "Setting `pad_token_id` to `eos_token_id`" warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Generated Text:", generated_text)

# Run the interactive demo; main() waits on input() for a prompt from the user.
main()


Total number of parameters: 124439808
Give input.my friends ignore me.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: my friends ignore me. I'm always on the phone with them and just talk to them. I don't know how to stop them. I'm sorry to hear that you're feeling this way. It's important to remember that you are not alone


In [None]:
# NOTE(review): prints a fixed greeting and nothing else; looks like a leftover
# scratch cell — confirm and remove before sharing.
print("hello world")