In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('counselchat-data.csv')
df.columns

Index(['questionID', 'questionTitle', 'questionText', 'questionUrl', 'topics',
       'therapistName', 'therapistUrl', 'answerText', 'upvotes'],
      dtype='object')

In [6]:
! pip install bert-score

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cufft_cu12-11.0.2.5

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
# You'll need to decide which column is your target variable
X = df[['questionText']]
y = df[['answerText']]

# Split the data into training and testing sets
# Let's use a 80-20 split as an example
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# If you want the train and test data as separate DataFrames including the target variable:
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# You can now save these to csv files if needed
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

In [16]:
train_df_small = train_df[:6]
train_df_small = train_df_small.dropna()

In [54]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from bert_score import BERTScorer
import torch
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Initialize the BERTScorer object
scorer = BERTScorer(lang='en')

def generate_response(question):
    prompt = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=500, num_return_sequences=1, do_sample=True)
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract only the answer part
    answer = full_output.split("Answer:")[-1].strip()
    return answer

# Task-specific metrics
def empathy_score(response):
    empathetic_phrases = ["I understand", "That must be difficult", "I hear you", "You're not alone"]
    return sum(1 for phrase in empathetic_phrases if phrase.lower() in response.lower()) / len(empathetic_phrases)

def relevance_score(question, answer):
    vectorizer = CountVectorizer().fit_transform([question, answer])
    return cosine_similarity(vectorizer)[0][1]

def safety_check(response):
    sentiment = TextBlob(response).sentiment.polarity
    return 'Safe' if sentiment > -0.5 else 'Potentially Concerning'

def structure_score(response):
    parts = ['validation', 'exploration', 'suggestion']
    return sum(1 for part in parts if part in response.lower()) / len(parts)

def client_centered_score(response):
    client_centered_phrases = ["What do you think about", "How do you feel about", "What's your perspective on"]
    return any(phrase in response for phrase in client_centered_phrases)

def comprehensive_evaluation(question, response, reference):
    # BERTScore
    P, R, F1 = scorer.score([response], [reference])
    bert_score = F1.item()

    # ROUGE
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge.score(reference, response)

    # BLEU
    reference_tokens = nltk.word_tokenize(reference)
    response_tokens = nltk.word_tokenize(response)
    smoothie = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], response_tokens,
                               weights=(0.25, 0.25, 0.25, 0.25),
                               smoothing_function=smoothie)

    # Task-specific metrics
    emp_score = empathy_score(response)
    rel_score = relevance_score(question, response)
    safety = safety_check(response)
    struct_score = structure_score(response)
    client_cent_score = client_centered_score(response)

    return {
        'bert_score': bert_score,
        'rouge1_f': rouge_scores['rouge1'].fmeasure,
        'rouge2_f': rouge_scores['rouge2'].fmeasure,
        'rougeL_f': rouge_scores['rougeL'].fmeasure,
        'bleu': bleu_score,
        'empathy': emp_score,
        'relevance': rel_score,
        'safety': safety,
        'structure': struct_score,
        'client_centered': client_cent_score
    }

# Generate responses and evaluate
results = []
for _, row in train_df_small.iterrows():
    generated_answer = generate_response(row['questionText'])
    scores = comprehensive_evaluation(row['questionText'], generated_answer, row['answerText'])
    results.append({
        'question': row['questionText'],
        'original_answer': row['answerText'],
        'generated_answer': generated_answer,
        **scores
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
for _, row in results_df.iterrows():
    print(f"Question: {row['question'][:50]}...")
    print(f"Original Answer: {row['original_answer'][:50]}...")
    print(f"Generated Answer: {row['generated_answer'][:50]}...")
    print(f"BERTScore: {row['bert_score']:.2f}")
    print(f"ROUGE-1 F1: {row['rouge1_f']:.2f}")
    print(f"ROUGE-2 F1: {row['rouge2_f']:.2f}")
    print(f"ROUGE-L F1: {row['rougeL_f']:.2f}")
    print(f"BLEU: {row['bleu']:.2f}")
    print(f"Empathy Score: {row['empathy']:.2f}")
    print(f"Relevance Score: {row['relevance']:.2f}")
    print(f"Safety: {row['safety']}")
    print(f"Structure Score: {row['structure']:.2f}")
    print(f"Client-Centered Score: {row['client_centered']}")
    print("\n")

# Save results to CSV
# results_df.to_csv('evaluation_results.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpec

Question: I've been experiencing a lot of anxiety and panic ...
Original Answer: <p>Hi New Jersey,</p><p>You talk about two very bi...
Generated Answer: I don't know. I have a little OCD so I always worr...
BERTScore: 0.82
ROUGE-1 F1: 0.15
ROUGE-2 F1: 0.01
ROUGE-L F1: 0.09
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.24
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I found out my boyfriend takes anti-depression med...
Original Answer: <p>Give him the time and space he needs.&nbsp; Obv...
Generated Answer: We'll see....
Thank you so much, dear man!...
BERTScore: 0.80
ROUGE-1 F1: 0.02
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.02
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.05
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I self-harm, and I stop for awhile. Then when I se...
Original Answer: <p>In a way, self-harm can present somewhat like a...
Generated Answer: I feel like you need to stop before you start and ...
BERTScore: 0.81

In [21]:
# Task-specific metrics
def empathy_score(response):
    empathetic_phrases = ["I understand", "That must be difficult", "I hear you", "You're not alone"]
    return sum(1 for phrase in empathetic_phrases if phrase.lower() in response.lower()) / len(empathetic_phrases)

def relevance_score(question, answer):
    vectorizer = CountVectorizer().fit_transform([question, answer])
    return cosine_similarity(vectorizer)[0][1]

def safety_check(response):
    sentiment = TextBlob(response).sentiment.polarity
    return 'Safe' if sentiment > -0.5 else 'Potentially Concerning'

def structure_score(response):
    parts = ['validation', 'exploration', 'suggestion']
    return sum(1 for part in parts if part in response.lower()) / len(parts)

def client_centered_score(response):
    client_centered_phrases = ["What do you think about", "How do you feel about", "What's your perspective on"]
    return any(phrase in response for phrase in client_centered_phrases)


In [19]:
def comprehensive_evaluation(question, response, reference):
    # BERTScore
    P, R, F1 = scorer.score([response], [reference])
    bert_score = F1.item()

    # ROUGE
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge.score(reference, response)

    # BLEU
    reference_tokens = nltk.word_tokenize(reference)
    response_tokens = nltk.word_tokenize(response)
    smoothie = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], response_tokens,
                               weights=(0.25, 0.25, 0.25, 0.25),
                               smoothing_function=smoothie)

    # Task-specific metrics
    emp_score = empathy_score(response)
    rel_score = relevance_score(question, response)
    safety = safety_check(response)
    struct_score = structure_score(response)
    client_cent_score = client_centered_score(response)

    return {
        'bert_score': bert_score,
        'rouge1_f': rouge_scores['rouge1'].fmeasure,
        'rouge2_f': rouge_scores['rouge2'].fmeasure,
        'rougeL_f': rouge_scores['rougeL'].fmeasure,
        'bleu': bleu_score,
        'empathy': emp_score,
        'relevance': rel_score,
        'safety': safety,
        'structure': struct_score,
        'client_centered': client_cent_score
    }

In [53]:
train_df_small

Unnamed: 0,questionText,answerText,generated_answer,precision,recall,f1,precision.1,recall.1,f1.1,precision.2,recall.2,f1.2,precision.3,recall.3,f1.3
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver...",The reason is that the anxiety goes away when ...,0.810383,0.817542,0.813947,0.817042,0.816246,0.816644,0.819913,0.797538,0.808571,0.857257,0.802515,0.828983
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...,I'm trying to take it upon myself to get my bo...,0.805058,0.820707,0.812807,0.804065,0.816646,0.810307,0.817334,0.796687,0.806879,0.80534,0.806111,0.805725
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li...",I am quite happy to stop self-deprecating myse...,0.80426,0.797815,0.801025,0.817696,0.795797,0.806598,0.814943,0.801714,0.808275,0.820406,0.801889,0.811042
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel...","I feel like my life is better as a person, and...",0.822138,0.804471,0.813208,0.822022,0.804264,0.813046,0.847766,0.77807,0.811424,0.833435,0.773856,0.802541
910,I'm feeling rejected and frustrated. This is n...,<p>&nbsp; &nbsp;Feeling rejected and frustrate...,I'm not at all feeling the same. I am feeling ...,0.811748,0.774582,0.792729,0.802866,0.770476,0.786337,0.802134,0.761523,0.781301,0.807608,0.765249,0.785858


In [40]:
print(train_df_small['generated_answer'].iloc[0])

This is a wonderful time as we get so invested in being the next generation and not just our younger generation. Being able to grow my business is really a privilege and one of the main things I miss is a lot of people going out and doing what they love or even have children with this product. I know that I still have no money to pay the rent, pay the bills, take


In [51]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_response(reference, candidate):
    # BERTScore
    scorer = BERTScorer(lang="en")
    P, R, F1 = scorer.score([candidate], [reference])
    bert_score = F1.item()

    # ROUGE
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge.score(reference, candidate)

    # BLEU with smoothing
    reference_tokens = nltk.word_tokenize(reference)
    candidate_tokens = nltk.word_tokenize(candidate)
    smoothie = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens,
                               weights=(0.25, 0.25, 0.25, 0.25),  # This gives equal weight to 1, 2, 3, and 4-grams
                               smoothing_function=smoothie)

    return {
        'bert_score': bert_score,
        'rouge1_f': rouge_scores['rouge1'].fmeasure,
        'rouge2_f': rouge_scores['rouge2'].fmeasure,
        'rougeL_f': rouge_scores['rougeL'].fmeasure,
        'bleu': bleu_score
    }

# Usage
reference = "The client should focus on self-care and stress management techniques."
# candidate = "I recommend that you prioritize self-care and learn some stress management strategies."
candidate = "My desk is on sale"
scores = evaluate_response(reference, candidate)
print(scores)

print("\nScore Interpretations:")
for metric, score in scores.items():
    print(f"{metric}: {score:.4f}")
    if metric in ['bert_score', 'rouge1_f', 'rouge2_f', 'rougeL_f']:
        print(f"  Interpretation: {'Excellent' if score > 0.8 else 'Good' if score > 0.6 else 'Fair' if score > 0.4 else 'Poor'}")
    elif metric == 'bleu':
        print(f"  Interpretation: {'Excellent' if score > 0.5 else 'Good' if score > 0.3 else 'Fair' if score > 0.1 else 'Poor'}")
    print()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'bert_score': 0.8567789196968079, 'rouge1_f': 0.12500000000000003, 'rouge2_f': 0.0, 'rougeL_f': 0.12500000000000003, 'bleu': 0.016182712188007015}

Score Interpretations:
bert_score: 0.8568
  Interpretation: Excellent

rouge1_f: 0.1250
  Interpretation: Poor

rouge2_f: 0.0000
  Interpretation: Poor

rougeL_f: 0.1250
  Interpretation: Poor

bleu: 0.0162
  Interpretation: Poor



In [44]:
scores

{'bert_score': 0.938726007938385,
 'rouge1_f': 0.41666666666666663,
 'rouge2_f': 0.2727272727272727,
 'rougeL_f': 0.41666666666666663,
 'bleu': 7.505697654413981e-155}

In [42]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=301c0404fab4ad79c71917405f47898b5ee6a5963e616e47967ce5e3d54df5dd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [56]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from bert_score import BERTScorer
import torch
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# Load T5 model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Initialize the BERTScorer object
scorer = BERTScorer(lang='en')

def generate_response(question):
    input_text = f"answer: {question}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)
    output = model.generate(input_ids, max_length=500, num_return_sequences=1, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate responses and evaluate
results = []
for _, row in train_df_small.iterrows():
    generated_answer = generate_response(row['questionText'])
    scores = comprehensive_evaluation(row['questionText'], generated_answer, row['answerText'])
    results.append({
        'question': row['questionText'],
        'original_answer': row['answerText'],
        'generated_answer': generated_answer,
        **scores
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
for _, row in results_df.iterrows():
    print(f"Question: {row['question'][:50]}...")
    print(f"Original Answer: {row['original_answer'][:50]}...")
    print(f"Generated Answer: {row['generated_answer'][:50]}...")
    print(f"BERTScore: {row['bert_score']:.2f}")
    print(f"ROUGE-1 F1: {row['rouge1_f']:.2f}")
    print(f"ROUGE-2 F1: {row['rouge2_f']:.2f}")
    print(f"ROUGE-L F1: {row['rougeL_f']:.2f}")
    print(f"BLEU: {row['bleu']:.2f}")
    print(f"Empathy Score: {row['empathy']:.2f}")
    print(f"Relevance Score: {row['relevance']:.2f}")
    print(f"Safety: {row['safety']}")
    print(f"Structure Score: {row['structure']:.2f}")
    print(f"Client-Centered Score: {row['client_centered']}")
    print("\n")

# Save results to CSV
results_df.to_csv('evaluation_results_t5.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: I've been experiencing a lot of anxiety and panic ...
Original Answer: <p>Hi New Jersey,</p><p>You talk about two very bi...
Generated Answer: True...
BERTScore: 0.78
ROUGE-1 F1: 0.00
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.00
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I found out my boyfriend takes anti-depression med...
Original Answer: <p>Give him the time and space he needs.&nbsp; Obv...
Generated Answer: True...
BERTScore: 0.77
ROUGE-1 F1: 0.00
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.00
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I self-harm, and I stop for awhile. Then when I se...
Original Answer: <p>In a way, self-harm can present somewhat like a...
Generated Answer: False...
BERTScore: 0.77
ROUGE-1 F1: 0.00
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.00
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure S

In [57]:
train_df_small

Unnamed: 0,questionText,answerText,generated_answer,precision,recall,f1,precision.1,recall.1,f1.1,precision.2,recall.2,f1.2,precision.3,recall.3,f1.3
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver...",The reason is that the anxiety goes away when ...,0.810383,0.817542,0.813947,0.817042,0.816246,0.816644,0.819913,0.797538,0.808571,0.857257,0.802515,0.828983
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...,I'm trying to take it upon myself to get my bo...,0.805058,0.820707,0.812807,0.804065,0.816646,0.810307,0.817334,0.796687,0.806879,0.80534,0.806111,0.805725
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li...",I am quite happy to stop self-deprecating myse...,0.80426,0.797815,0.801025,0.817696,0.795797,0.806598,0.814943,0.801714,0.808275,0.820406,0.801889,0.811042
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel...","I feel like my life is better as a person, and...",0.822138,0.804471,0.813208,0.822022,0.804264,0.813046,0.847766,0.77807,0.811424,0.833435,0.773856,0.802541
910,I'm feeling rejected and frustrated. This is n...,<p>&nbsp; &nbsp;Feeling rejected and frustrate...,I'm not at all feeling the same. I am feeling ...,0.811748,0.774582,0.792729,0.802866,0.770476,0.786337,0.802134,0.761523,0.781301,0.807608,0.765249,0.785858


In [63]:
print(train_df_small['generated_answer'].iloc[1])

I'm trying to take it upon myself to get my boyfriend back on the drug. It probably isn't good for him before we're old and his body is not strong enough to take it. We'd rather get some time to talk, and he's never been to my hospital before. He's not quite ready yet, but I'm waiting for him to show up tomorrow. He doesn't want to be alone. Let me see if he's alright!
Now we're working on it! I got a lot on mine. I think it feels good, so I'm not worried about anyone sleeping with us.
We've been putting a lot of time in our relationships. Sometimes he's out on medications (even with a few pills being in the bag), then in the afternoon he doesn't seem to even notice or notice anything bad until I try and help him stop taking them. He's not going to stop, he just refuses to give up. He says it's only a matter of time before they are gone, so I thought it might be some time for an emergency pill and then a prescription.
Now, I'm concerned about something. I went home and woke up this mor

In [69]:
from transformers import BartForConditionalGeneration, BartTokenizer
from bert_score import BERTScorer
import torch
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# Load BART model and tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Initialize the BERTScorer object
scorer = BERTScorer(lang='en')

def generate_response(question):
    inputs = tokenizer(f"Respond as a therapist to this question: {question}", return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(inputs.input_ids, max_length=150, num_return_sequences=1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# The rest of the functions (empathy_score, relevance_score, etc.) remain the same

for _, row in train_df_small.iterrows():
    generated_answer = generate_response(row['questionText'])
    scores = comprehensive_evaluation(row['questionText'], generated_answer, row['answerText'])
    results.append({
        'question': row['questionText'],
        'original_answer': row['answerText'],
        'generated_answer': generated_answer,
        **scores
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
for _, row in results_df.iterrows():
    print(f"Question: {row['question'][:50]}...")
    print(f"Original Answer: {row['original_answer'][:50]}...")
    print(f"Generated Answer: {row['generated_answer'][:50]}...")
    print(f"BERTScore: {row['bert_score']:.2f}")
    print(f"ROUGE-1 F1: {row['rouge1_f']:.2f}")
    print(f"ROUGE-2 F1: {row['rouge2_f']:.2f}")
    print(f"ROUGE-L F1: {row['rougeL_f']:.2f}")
    print(f"BLEU: {row['bleu']:.2f}")
    print(f"Empathy Score: {row['empathy']:.2f}")
    print(f"Relevance Score: {row['relevance']:.2f}")
    print(f"Safety: {row['safety']}")
    print(f"Structure Score: {row['structure']:.2f}")
    print(f"Client-Centered Score: {row['client_centered']}")
    print("\n")

# Save results to CSV
results_df.to_csv('evaluation_results_bart.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: I've been experiencing a lot of anxiety and panic ...
Original Answer: <p>Hi New Jersey,</p><p>You talk about two very bi...
Generated Answer: ? . . . ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ...
BERTScore: 0.71
ROUGE-1 F1: 0.01
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.01
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I found out my boyfriend takes anti-depression med...
Original Answer: <p>Give him the time and space he needs.&nbsp; Obv...
Generated Answer: ? . . . . . . . . . ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ...
BERTScore: 0.71
ROUGE-1 F1: 0.00
ROUGE-2 F1: 0.00
ROUGE-L F1: 0.00
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I self-harm, and I stop for awhile. Then when I se...
Original Answer: <p>In a way, self-harm can present somewhat like a...
Generated Answer: . . . . . . . . . . . . . . . . . . ( " " " " ) . ...
BERTSco

In [70]:
train_df_small

Unnamed: 0,questionText,answerText,generated_answer,precision,recall,f1,precision.1,recall.1,f1.1,precision.2,recall.2,f1.2,precision.3,recall.3,f1.3
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver...",The reason is that the anxiety goes away when ...,0.810383,0.817542,0.813947,0.817042,0.816246,0.816644,0.819913,0.797538,0.808571,0.857257,0.802515,0.828983
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...,I'm trying to take it upon myself to get my bo...,0.805058,0.820707,0.812807,0.804065,0.816646,0.810307,0.817334,0.796687,0.806879,0.80534,0.806111,0.805725
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li...",I am quite happy to stop self-deprecating myse...,0.80426,0.797815,0.801025,0.817696,0.795797,0.806598,0.814943,0.801714,0.808275,0.820406,0.801889,0.811042
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel...","I feel like my life is better as a person, and...",0.822138,0.804471,0.813208,0.822022,0.804264,0.813046,0.847766,0.77807,0.811424,0.833435,0.773856,0.802541
910,I'm feeling rejected and frustrated. This is n...,<p>&nbsp; &nbsp;Feeling rejected and frustrate...,I'm not at all feeling the same. I am feeling ...,0.811748,0.774582,0.792729,0.802866,0.770476,0.786337,0.802134,0.761523,0.781301,0.807608,0.765249,0.785858


In [76]:
print(train_df_small['generated_answer'].iloc[3])

I feel like my life is better as a person, and therefore no two people are


In [77]:
train_df

Unnamed: 0,questionText,answerText
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver..."
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...
672,,<p>This can be tough to do in this money-drive...
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li..."
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel..."
...,...,...
1130,"Sometimes, I'm fine and can go out or meet peo...",<p>Feelings of anxiety can be scary and someti...
1294,My dad doesn't like the fact that I'm a boy. H...,<p>Maybe this is emotional abuse.</p><p>It cer...
860,"I’m a man, and I’m soon to be married. I have ...","<p>Hello, and thank you for your question. Whe..."
1459,I know that I need to get past my feelings for...,<p>There is no wrong or right way to define a ...


In [8]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [8]:
%pip install accelerate -U
%pip install transformers[torch]



In [53]:
train_df

Unnamed: 0,questionText,answerText
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver..."
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...
672,,<p>This can be tough to do in this money-drive...
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li..."
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel..."
...,...,...
1130,"Sometimes, I'm fine and can go out or meet peo...",<p>Feelings of anxiety can be scary and someti...
1294,My dad doesn't like the fact that I'm a boy. H...,<p>Maybe this is emotional abuse.</p><p>It cer...
860,"I’m a man, and I’m soon to be married. I have ...","<p>Hello, and thank you for your question. Whe..."
1459,I know that I need to get past my feelings for...,<p>There is no wrong or right way to define a ...


In [6]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch


# Load pre-trained model and tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Prepare the dataset
def prepare_data(examples):
    inputs = [f"Respond as a therapist to this question: {q}" for q in examples['questionText']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Tokenize targets
    labels = tokenizer(examples['answerText'], max_length=512, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(train_df)

# Tokenize and prepare the dataset
tokenized_dataset = dataset.map(prepare_data, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_bart_therapist")
tokenizer.save_pretrained("./fine_tuned_bart_therapist")

print("Fine-tuning complete. Model saved.")

Map:   0%|          | 0/1185 [00:00<?, ? examples/s]

Step,Training Loss
10,13.0131
20,11.3565
30,11.5163
40,8.7473
50,9.3899
60,8.6008
70,7.7803
80,7.3331
90,6.794
100,6.0174


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Fine-tuning complete. Model saved.


In [54]:
def generate_response(question, max_length=1000, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.0, no_repeat_ngram_size=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(f"Respond as a therapist to this question: {question}", return_tensors="pt", max_length=512, truncation=True).to(device)

    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Make sure the model is on the correct device
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Test with a sample question
sample_question = """I've been experiencing a lot of anxiety and panic attacks lately. I was recently diagnosed by my psychiatrist with obsessive-compulsive disorder. Lately, I've been questioning everything from my career to my relationship. My boyfriend and I just moved in a few months ago. All of a sudden, I don't feel as comfortable around him as I used to, although I can't seem to find a reason as to why I feel this way."""

response = generate_response(sample_question)
print(f"Sample question: {sample_question}")
print(f"Generated response: {response}")

Sample question: I've been experiencing a lot of anxiety and panic attacks lately. I was recently diagnosed by my psychiatrist with obsessive-compulsive disorder. Lately, I've been questioning everything from my career to my relationship. My boyfriend and I just moved in a few months ago. All of a sudden, I don't feel as comfortable around him as I used to, although I can't seem to find a reason as to why I feel this way.
Generated response: <p>Hello, and thank you for your question. &nbsp;It sounds like you're experiencing a lot of anxiety and panic attacks, and it sounds like there are some other things going on as well that you may want to talk with your therapist about some of the things that you are experiencing in your relationship and what is going on.</p><p>If you are feeling anxious and anxious about your relationship, I'd recommend talking with your primary care physician.</p>


In [14]:
! pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=f950937028e84e159078482fd6e3b0a7ac8a651d6ffb76e5331a762bfb66e4ac
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [52]:
from transformers import BartForConditionalGeneration, BartTokenizer
from bert_score import BERTScorer
import torch
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# # Load BART model and tokenizer
# model_name = "facebook/bart-base"
# tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartForConditionalGeneration.from_pretrained(model_name)

# Initialize the BERTScorer object
scorer = BERTScorer(lang='en')

def generate_response(question):
    # Ensure inputs are on the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(f"Respond as a therapist to this question: {question}", return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = model.generate(inputs.input_ids, max_length=500, num_return_sequences=1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Make sure the model is on the correct device
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


# The rest of the functions (empathy_score, relevance_score, etc.) remain the same
results = []
for _, row in train_df_small.iterrows():
    generated_answer = generate_response(row['questionText'])
    scores = comprehensive_evaluation(row['questionText'], generated_answer, row['answerText'])
    results.append({
        'question': row['questionText'],
        'original_answer': row['answerText'],
        'generated_answer': generated_answer,
        **scores
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
for _, row in results_df.iterrows():
    print(f"Question: {row['question'][:50]}...")
    print(f"Original Answer: {row['original_answer'][:50]}...")
    print(f"Generated Answer: {row['generated_answer'][:50]}...")
    print(f"BERTScore: {row['bert_score']:.2f}")
    print(f"ROUGE-1 F1: {row['rouge1_f']:.2f}")
    print(f"ROUGE-2 F1: {row['rouge2_f']:.2f}")
    print(f"ROUGE-L F1: {row['rougeL_f']:.2f}")
    print(f"BLEU: {row['bleu']:.2f}")
    print(f"Empathy Score: {row['empathy']:.2f}")
    print(f"Relevance Score: {row['relevance']:.2f}")
    print(f"Safety: {row['safety']}")
    print(f"Structure Score: {row['structure']:.2f}")
    print(f"Client-Centered Score: {row['client_centered']}")
    print("\n")

# Save results to CSV
results_df.to_csv('evaluation_results_bart.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: I've been experiencing a lot of anxiety and panic ...
Original Answer: <p>Hi New Jersey,</p><p>You talk about two very bi...
Generated Answer: <p>I'm sorry to hear about your anxiety and panic ...
BERTScore: 0.84
ROUGE-1 F1: 0.07
ROUGE-2 F1: 0.02
ROUGE-L F1: 0.06
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.28
Safety: Potentially Concerning
Structure Score: 0.00
Client-Centered Score: False


Question: I found out my boyfriend takes anti-depression med...
Original Answer: <p>Give him the time and space he needs.&nbsp; Obv...
Generated Answer: <p>Hi,</p><p><br></p>...
BERTScore: 0.84
ROUGE-1 F1: 0.05
ROUGE-2 F1: 0.02
ROUGE-L F1: 0.05
BLEU: 0.00
Empathy Score: 0.00
Relevance Score: 0.00
Safety: Safe
Structure Score: 0.00
Client-Centered Score: False


Question: I self-harm, and I stop for awhile. Then when I se...
Original Answer: <p>In a way, self-harm can present somewhat like a...
Generated Answer: <p>I would suggest looking at what is motivating y...
BERTScore: 0.85
RO

In [24]:
train_df_small

Unnamed: 0,questionText,answerText
381,I've been experiencing a lot of anxiety and pa...,"<p>Hi New Jersey,</p><p>You talk about two ver..."
532,I found out my boyfriend takes anti-depression...,<p>Give him the time and space he needs.&nbsp;...
482,"I self-harm, and I stop for awhile. Then when ...","<p>In a way, self-harm can present somewhat li..."
405,"I'm a young adult woman, and I have trouble fi...","<p>What a tough situation you must be in, feel..."
910,I'm feeling rejected and frustrated. This is n...,<p>&nbsp; &nbsp;Feeling rejected and frustrate...


In [37]:
print(results_df['question'].iloc[0])

I've been experiencing a lot of anxiety and panic attacks lately. I was recently diagnosed by my psychiatrist with obsessive-compulsive disorder. Lately, I've been questioning everything from my career to my relationship. My boyfriend and I just moved in a few months ago. All of a sudden, I don't feel as comfortable around him as I used to, although I can't seem to find a reason as to why I feel this way.
