In [1]:
# In[1]:
# Importing necessary libraries
import re

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm.notebook import tqdm  # Importing the notebook version of tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [2]:
# In[2]:
# Load the labelled corpus from a local CSV file. Nothing is downloaded automatically: obtain a dataset
# first (for example the Reddit Self-reported Depression Diagnosis Dataset from Kaggle) and upload it
# to your environment.
# Later cells read the `text` and `label` columns of this frame.
df = pd.read_csv('mental_health.csv')  # Replace with your path

In [3]:
# In[3]:
# Custom dataset for our data loading
class MentalHealthDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per index.

    Args:
        texts: Indexable sequence of raw texts; each item is converted with ``str``.
        labels: Indexable sequence of labels aligned with ``texts``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Token length every encoded sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode sample ``item``.

        Returns:
            dict with the original ``text``, 1-D ``input_ids`` and ``attention_mask``
            tensors, and ``label`` as a ``torch.long`` tensor.
        """
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `pad_to_max_length` is deprecated; `padding='max_length'` is its replacement.
            padding='max_length',
            # Truncate explicitly so long texts cannot exceed max_len and the tokenizer
            # does not have to fall back to an implicit strategy (and warn about it).
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch dimension of 1; flatten it away so the
            # DataLoader can stack samples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# In[4]:
# Pretrained BERT tokenizer plus a sequence-classification head with a single output logit,
# which is how the binary task is modelled in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
)

# Positional 80/20 split: the leading rows train the model, the trailing rows validate it.
# NOTE(review): rows are not shuffled before splitting — confirm the CSV is not ordered by label.
train_size = int(0.8 * len(df))
val_size = len(df) - train_size
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]

# Training batches are reshuffled on every pass; validation batches keep the frame's row order.
train_dataset = MentalHealthDataset(
    texts=train_df['text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer,
    max_len=256,
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = MentalHealthDataset(
    texts=val_df['text'].values,
    labels=val_df['label'].values,
    tokenizer=tokenizer,
    max_len=256,
)
val_loader = DataLoader(val_dataset, batch_size=16)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# In[5]:
# Model training setup
# The AdamW class shipped with transformers is deprecated upstream in favour of the PyTorch
# implementation. eps and weight_decay are passed explicitly so the optimizer keeps the values the
# transformers class used by default (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Binary cross-entropy applied to the raw logit
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its output must
            expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable taking ``(logits, targets)`` and returning a scalar loss tensor.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The arithmetic mean of the per-batch loss values.
    """
    model = model.train()
    batch_losses = []

    # tqdm wraps the loader so a progress bar is shown while iterating
    progress = tqdm(data_loader, desc="Training", unit="batch")
    for batch in progress:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        # Labels become float with a trailing dimension of 1 before being handed to loss_fn
        targets = batch['label'].float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        logits = model(input_ids=token_ids, attention_mask=mask).logits
        batch_loss = loss_fn(logits, targets)
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)



In [6]:
# In[6]:
# Model training loop
# Runs EPOCHS passes over train_loader and prints the mean training loss returned by each pass.
EPOCHS = 3
for epoch in range(EPOCHS):
    avg_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    # epoch is zero-based, so it is shifted by one for display
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}")

Training:   0%|          | 0/1399 [00:00<?, ?batch/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/3, Loss: 0.1668


Training:   0%|          | 0/1399 [00:00<?, ?batch/s]

Epoch 2/3, Loss: 0.0712


Training:   0%|          | 0/1399 [00:00<?, ?batch/s]

Epoch 3/3, Loss: 0.0377


In [10]:
def evaluate(model, data_loader, device):
    """Return the fraction of samples in ``data_loader`` that the model classifies correctly.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its output must
            expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Number of correct predictions divided by the number of samples seen.
    """
    model = model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].float().unsqueeze(1).to(device)

            logits = model(input_ids=token_ids, attention_mask=mask).logits
            # A sigmoid probability above 0.5 counts as the positive class
            probabilities = torch.sigmoid(logits)
            predicted = (probabilities > 0.5).float()

            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    return n_correct / n_seen

# Score the model on the validation batches and report the accuracy as a percentage.
val_accuracy = evaluate(model, val_loader, device)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

Validation Accuracy: 96.18%


In [7]:
# In[7]:
# Single-text inference
def predict(model, text, tokenizer, device, max_len=256):
    """Return the model's sigmoid probability for a single piece of text.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its output must
            expose ``.logits`` holding exactly one value.
        text: Text to score.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        device: Device the encoded tensors are moved to before the forward pass.
        max_len: Token length the text is padded or truncated to. Defaults to 256,
            the length used when the datasets in this notebook are built.

    Returns:
        The sigmoid of the single output logit, as a Python float.
    """
    model = model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        # `pad_to_max_length` is deprecated; `padding='max_length'` is its replacement.
        padding='max_length',
        # Truncate explicitly so over-long input cannot exceed max_len.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        prediction = torch.sigmoid(outputs.logits)
    return prediction.item()

In [8]:
# In[8]:
# Risk prediction
def risk_score(text, model, tokenizer, device):
    """Map the value returned by ``predict`` for ``text`` linearly onto the 1-10 risk scale.

    A prediction of 0 yields 1 and a prediction of 1 yields 10.
    """
    probability = predict(model, text, tokenizer, device)
    lowest, highest = 1, 10
    return lowest + (highest - lowest) * probability

In [9]:
# In[9]:
# Trigger word flagging
trigger_words = ['suicide', 'kill', 'death', 'die', 'end my life', 'hurt myself']

def has_trigger_words(text):
    """Return True when ``text`` contains any entry of ``trigger_words``.

    Matching is case-insensitive and anchored on word boundaries, so "DIE" is flagged
    while "diet", "skills" or "soldier" are not. A trailing "s", "d", "ed" or "ing" is
    accepted so common inflections ("killed", "killing", "died") are still caught.
    Words inside a multi-word phrase may be separated by any run of whitespace.

    Args:
        text: Text to scan. ``None`` or an empty string yields False.

    Returns:
        bool: whether at least one trigger word or phrase occurs in ``text``.
    """
    if not text:
        return False

    # Escape every word literally and let whitespace between the words of a phrase vary.
    alternatives = [
        r"\s+".join(re.escape(part) for part in phrase.split())
        for phrase in trigger_words
        if phrase.strip()
    ]
    if not alternatives:
        # An empty alternation would match at every word boundary.
        return False

    pattern = rf"\b(?:{'|'.join(alternatives)})(?:s|d|ed|ing)?\b"
    return re.search(pattern, text, flags=re.IGNORECASE) is not None

In [14]:
# Demo: run one sentence through the model, the risk scale and the keyword screen
sample_text = "I've been feeling really down lately."

# Raw value returned by predict() for the sample
prediction_probability = predict(model, sample_text, tokenizer, device)
print(f"Probability of depression/suicidal tendencies: {prediction_probability:.2f}")

# The prediction mapped onto the 1-10 scale by risk_score()
risk = risk_score(sample_text, model, tokenizer, device)
print(f"Risk score (1-10): {risk:.2f}")

# Keyword screen; it looks only at the text, not at the model output
print(
    "Warning: The text contains triggering words."
    if has_trigger_words(sample_text)
    else "This text contains no triggering words"
)



Probability of depression/suicidal tendencies: 0.52
Risk score (1-10): 5.70
This text contains no triggering words


In [18]:
# Helper that gathers the ground-truth labels served by a DataLoader
def extract_true_labels(data_loader):
    """Return every label yielded by ``data_loader`` as one flat list.

    Each batch's ``'label'`` tensor is converted with ``.numpy()`` and its elements are
    appended in iteration order.
    """
    return [
        value
        for batch in data_loader
        for value in batch['label'].numpy()
    ]

# Extract true labels for validation data
labels_val_true = extract_true_labels(val_loader)

# Now, use these labels for evaluation
# NOTE(review): `labels_pred`, `classification_report`, `precision_score`, `recall_score` and
# `f1_score` are not defined or imported in any cell visible in this notebook (the metric functions
# are presumably from sklearn.metrics — confirm). This cell appears to rely on kernel state left by a
# cell that is no longer present. TODO: restore the import and the code that builds `labels_pred`
# so the notebook survives Restart & Run All.
# Option 1: Using classification_report
print(classification_report(labels_val_true, labels_pred))

# Option 2: Using individual functions
precision = precision_score(labels_val_true, labels_pred)
recall = recall_score(labels_val_true, labels_pred)
f1 = f1_score(labels_val_true, labels_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")



              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2827
           1       0.96      0.96      0.96      2769

    accuracy                           0.96      5596
   macro avg       0.96      0.96      0.96      5596
weighted avg       0.96      0.96      0.96      5596

Precision: 0.9627
Recall: 0.9599
F1 Score: 0.9613


In [None]:
import tkinter as tk
from tkinter import ttk, messagebox
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

def get_results():
    """Analyze the text in the input box and refresh the chart and the trigger-word label.

    Reads the module-level widgets ``text_input`` and ``trigger_label`` and the
    module-level ``model``, ``tokenizer`` and ``device``. Empty or whitespace-only input
    is rejected with a warning dialog instead of being scored.
    """
    # Extracting text from the GUI input field ("end-1c" drops the trailing newline Tk appends)
    sample_text = text_input.get("1.0", "end-1c")

    if not sample_text.strip():
        messagebox.showwarning("Mental Health Analysis", "Please enter some text to analyze.")
        return

    # risk_score() runs the model itself, so a separate predict() call would only
    # repeat the forward pass.
    risk = risk_score(sample_text, model, tokenizer, device)

    # Update the canvas
    draw_severity_indicator(risk)

    # Trigger words check. trigger_label is a ttk.Label, which has no `fg` option
    # (that raises TclError), so the full option name `foreground` is used.
    if has_trigger_words(sample_text):
        trigger_label.config(text="Warning: The text contains triggering words.", foreground='red')
    else:
        trigger_label.config(text="This text contains no triggering words.", foreground='black')

def draw_severity_indicator(risk):
    """Draw ``risk`` as a two-slice pie chart inside ``chart_frame``, replacing any earlier chart.

    Args:
        risk: Risk score; it is plotted against ``10 - risk``, so it is expected to lie
            within 0-10.
    """
    # Remove the canvas left by a previous analysis; without this every call packs one
    # more chart underneath the old ones.
    for stale_widget in chart_frame.winfo_children():
        stale_widget.destroy()

    # Colour of the risk slice escalates with the score: above 5 red, above 3 yellow, else green.
    if risk > 5:
        colors = ['red', 'green']
    elif risk > 3:
        colors = ['yellow', 'green']
    else:
        colors = ['green', 'lightgreen']

    # Pie chart
    fig = Figure(figsize=(4, 4))
    ax = fig.add_subplot()
    ax.pie(
        [risk, 10 - risk],
        labels=['Risk Score', 'Remaining'],
        colors=colors,
        explode=[0.1, 0],  # pull the risk slice slightly out of the pie
        autopct='%1.1f%%',
        startangle=140,
    )
    ax.set_title(f"Risk Score: {risk:.2f}")

    # Embed the chart into tkinter
    chart = FigureCanvasTkAgg(fig, master=chart_frame)
    chart.draw()
    chart.get_tk_widget().pack(pady=15)

# GUI setup
# Builds a window with two side-by-side panels: "Input Text" on the left (text box + Analyze button)
# and "Results" on the right (trigger-word label + chart area). The widgets created here are the
# module-level names that get_results() and draw_severity_indicator() read.
app = tk.Tk()
app.title("Mental Health Analysis")

# Outer container that holds both panels
main_frame = ttk.Frame(app)
main_frame.pack(padx=20, pady=20)

# Label and Text input field for sample text
text_frame = ttk.LabelFrame(main_frame, text="Input Text")
text_frame.grid(row=0, column=0, padx=10, pady=10)

text_input = tk.Text(text_frame, height=5, width=50)
text_input.pack(padx=5, pady=5)

# Button to compute results; clicking it calls get_results()
compute_button = ttk.Button(text_frame, text="Analyze", command=get_results)
compute_button.pack(pady=15)

# Results frame
results_frame = ttk.LabelFrame(main_frame, text="Results")
results_frame.grid(row=0, column=1, padx=10, pady=10)

# Starts empty; get_results() sets its text
trigger_label = ttk.Label(results_frame, text="", font=('Arial', 12))
trigger_label.pack(pady=10)

# Chart frame: parent widget for the canvas created in draw_severity_indicator()
chart_frame = ttk.Frame(results_frame)
chart_frame.pack(pady=15)

# Start the Tk event loop
app.mainloop()