In [None]:
# STEP 1: Install required packages (Colab)
!pip install transformers datasets pandas



In [None]:
# STEP 2: Mount Google Drive
# Later cells read the training pickle from, and write the fine-tuned
# checkpoint to, paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# STEP 3: Import Libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import pickle
import os

In [None]:
# STEP 4: Load Tokenizer & Model
# Tokenizer and weights come from the same checkpoint, so the id is named once.
BASE_CHECKPOINT = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# STEP 5: Load and clean the training pairs
def load_and_prepare_data(pkl_path, limit=100000):
    """Load input/output text pairs from a pickle file into a `Dataset`.

    The unpickled object is indexed with "entries"; that value is passed to
    `pd.DataFrame` and must yield "input" and "output" columns.

    Args:
        pkl_path: Path of the pickle file to read.
        limit: Maximum number of rows to keep, counted from the top of the
            table *after* filtering. `None` keeps every row.

    Returns:
        A `Dataset` holding only the "input" and "output" columns, restricted
        to rows where neither value is missing and both are longer than
        5 characters.

    Raises:
        KeyError: If "entries", "input" or "output" is not present.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only use this on pickles you created yourself or otherwise trust.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)

    # Extract only 'entries' from the dictionary and tabulate them
    df = pd.DataFrame(data["entries"])
    df = df[["input", "output"]].dropna()

    # Keep rows where both sides are longer than 5 characters, then cap the size.
    long_enough = (df["input"].str.len() > 5) & (df["output"].str.len() > 5)
    df = df[long_enough].iloc[:limit]

    # Filtering leaves gaps in the index; without preserve_index=False the
    # leftover index would be stored as an extra "__index_level_0__" column.
    return Dataset.from_pandas(df, preserve_index=False)

In [None]:
# STEP 6: Tokenization Preprocessing
def preprocess_function(example):
    """Tokenize "input"/"output" text into encoder inputs and decoder labels.

    Uses the module-level `tokenizer`.

    Args:
        example: Mapping with "input" and "output" entries. Each value may be
            a single string (plain `Dataset.map`) or a list of strings
            (`Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the "input" text, padded/truncated to 128
        tokens, with an added "labels" entry holding the "output" token ids
        (also 128 long) in which every padding position is set to -100.
    """
    model_inputs = tokenizer(example["input"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(example["output"], max_length=128, truncation=True, padding="max_length")

    # Labels are padded here with the tokenizer's pad id. Left as-is, those
    # positions would be scored by the loss, so the model would mostly be
    # trained to emit padding. -100 is the label value the loss ignores.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        model_inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat list of ids.
        model_inputs["labels"] = mask_padding(label_ids)
    return model_inputs

In [None]:
# STEP 7: Fine-Tuning Function (Save model to Google Drive)
def fine_tune_flan_t5(dataset, output_dir="/content/drive/MyDrive/flan-t5-therapy-finetuned"):
    """Fine-tune the module-level `model` on `dataset` and save it.

    Relies on the module-level `model`, `tokenizer` and `preprocess_function`.
    The global `model` object is the one handed to the trainer, so it is
    updated in place by training.

    Args:
        dataset: A `Dataset` with "input" and "output" text columns.
        output_dir: Directory that receives the per-epoch checkpoints, the
            "logs" sub-directory, and the final model and tokenizer files.

    Returns:
        None. Prints the save location when finished.
    """
    # Tokenize in batches (far fewer Python-level calls than one row at a
    # time) and drop the raw columns so only model inputs are kept.
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        learning_rate=5e-5,
        logging_dir=os.path.join(output_dir, "logs"),
        save_strategy="epoch",  # one checkpoint per epoch under output_dir
        report_to=[]  # no experiment-tracking integrations
    )

    # NOTE(review): newer transformers releases appear to rename the
    # `tokenizer=` argument below to `processing_class=` — confirm against the
    # installed version before upgrading.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
    )

    trainer.train()

    # Write the final weights and tokenizer files next to the checkpoints so
    # the folder can be passed straight to from_pretrained().
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved at {output_dir}")


In [None]:
# Build the training set: at most 100,000 (1 lakh) rows that survive filtering
TRAINING_PICKLE = "/content/drive/MyDrive/final_merged_therapy_data.pkl"
dataset = load_and_prepare_data(TRAINING_PICKLE, limit=100000)

# Fine-tune on it; the result is written to the default folder on Drive
fine_tune_flan_t5(dataset)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,3.264
1000,1.0329
1500,0.9793
2000,0.9522
2500,0.9345
3000,0.9187
3500,0.891
4000,0.8949
4500,0.8829
5000,0.8913


Step,Training Loss
500,3.264
1000,1.0329
1500,0.9793
2000,0.9522
2500,0.9345
3000,0.9187
3500,0.891
4000,0.8949
4500,0.8829
5000,0.8913


Model saved at /content/drive/MyDrive/flan-t5-therapy-finetuned


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload tokenizer and weights from the folder the training step saved to.
# This rebinds the module-level `tokenizer` and `model` used by the cells below.
model_path = "/content/drive/MyDrive/flan-t5-therapy-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
def generate_therapist_reply(user_input):
    """Generate a reply to `user_input` with the module-level model.

    Relies on the module-level `tokenizer` and `model`.

    Args:
        user_input: The user's message as a string.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
        At most 100 new tokens are generated.
    """
    # NOTE(review): preprocess_function tokenizes the raw "input" text without
    # this prefix, so unless the training data already contains it the prompt
    # format here differs from what the model was fine-tuned on — confirm.
    prompt = f"Respond like a therapist: {user_input}"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer returns CPU tensors; generation fails if the weights live
    # on another device (e.g. after model.to("cuda")), so follow the model.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke test: send one message through the fine-tuned model
test_input = "I'm feeling very anxious and can’t sleep."
print("User:", test_input)
therapist_reply = generate_therapist_reply(test_input)
print("Therapist:", therapist_reply)

User: I'm feeling very anxious and can’t sleep.
Therapist: I'm here to listen and support you. Can you tell me more about what's been going on?


In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load from Google Drive
# This repeats the reload done in an earlier cell — presumably so the demo
# cell can be run on its own once Drive is mounted.
model_path = "/content/drive/MyDrive/flan-t5-therapy-finetuned"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def generate_therapist_reply(user_input):
    """Generate a reply to `user_input`; used as the Gradio callback.

    Relies on the module-level `tokenizer` and `model`. An identically named
    function is defined in an earlier cell; this definition replaces it when
    the cell runs.

    Args:
        user_input: The text submitted by the user.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
        At most 100 new tokens are generated.
    """
    prompt = f"Respond like a therapist: {user_input}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    # The tokenizer returns CPU tensors; move them to wherever the weights
    # are so generation also works when the model has been placed on a GPU.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Wire the reply function into a single-textbox web UI and start serving it.
demo = gr.Interface(
    fn=generate_therapist_reply,
    inputs=gr.Textbox(lines=3, placeholder="What's on your mind?"),
    outputs="text",
    title="Therapist Chatbot",
    description="Trained on real therapy conversations using FLAN-T5.",
)
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3aa723e2aabf65248c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


