In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Read the raw dataset from the CSV file in the working directory.
data = pd.read_csv('CombinedData.csv')

# Fetch the NLTK resources used by this notebook. NLTK reports a package as
# "already up-to-date" when it is present locally, so re-running is cheap.
for _nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_package)

# Shared text-processing helpers: a WordNet lemmatizer and the English
# stop-word list, stored as a set so membership checks are fast.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Data Cleaning Function
def clean_text(text, stopword_set=None):
    """Normalize a raw statement into a space-separated string of content words.

    Steps, in order: lowercase, strip HTML tags, strip e-mail addresses,
    strip http/https URLs, replace every non-letter character with a space,
    split on whitespace, and drop stop words.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of words to drop. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Lowercase first so stop-word matching is case-insensitive.
    text = text.lower()
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Remove e-mail addresses (and one trailing whitespace character, if any).
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove whole URLs: match up to the next whitespace, not just the scheme,
    # so host and path fragments do not leak into the tokens.
    text = re.sub(r'https?://\S+', '', text)
    # Replace everything that is not an ASCII letter (digits, punctuation, ...)
    # with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Only letters and whitespace remain at this point, so splitting on
    # whitespace is a sufficient tokenizer and needs no extra NLTK resources.
    kept_words = [word for word in text.split() if word not in stopword_set]

    return ' '.join(kept_words)

# Build the working frame from `data` (expected to be a pandas DataFrame):
# keep only the text and label columns, drop rows where either is missing,
# and attach the normalized text as a new `cleaned_statement` column.
data_cleaned = (
    data[['statement', 'status']]
    .dropna()
    .assign(cleaned_statement=lambda frame: frame['statement'].apply(clean_text))
)



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kanda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kanda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
%pip install torchbnn


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import torchbnn as bnn
from sklearn.preprocessing import LabelEncoder

# --- Label encoding ---
# Map each status string to an integer class id. The fitted encoder is kept
# so predicted ids can be decoded back to status names later.
label_encoder = LabelEncoder()
data_cleaned['status_encoded'] = label_encoder.fit_transform(data_cleaned['status'])

# --- Train/test split ---
# 80/20 split, stratified on the encoded label, with a fixed random_state.
# NOTE(review): the split uses the raw `statement` column rather than
# `cleaned_statement` -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    data_cleaned['statement'],
    data_cleaned['status_encoded'],
    stratify=data_cleaned['status_encoded'],
    test_size=0.2,
    random_state=42,
)

# --- TF-IDF features ---
# Unigrams and bigrams, capped at 3000 features. The vocabulary is fitted on
# the training split only and then reused to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- Tensors ---
# Densify the TF-IDF matrices into float32 tensors; labels become int64
# tensors, the dtype used for class-index targets below.
X_train_tensor, X_test_tensor = (
    torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)
    for tfidf_matrix in (X_train_tfidf, X_test_tfidf)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(label_series.values, dtype=torch.long)
    for label_series in (y_train, y_test)
)

# --- Datasets and loaders ---
# Pair features with labels; only the training loader shuffles its batches.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Fully connected classifier built from torchbnn Bayesian linear layers.
class BayesianNN(nn.Module):
    """Four-layer feed-forward network made of ``bnn.BayesLinear`` layers.

    Every layer is created with ``prior_mu=0`` and ``prior_sigma=0.1``.
    The first three layers are followed by ReLU; the last layer's output is
    returned as-is (no softmax is applied here). The network contains no
    dropout layers.

    Args:
        input_size: Number of input features.
        hidden_size: Width of each of the three hidden layers.
        output_size: Number of output units (one per class).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Prior settings shared by all four layers.
        prior = {"prior_mu": 0, "prior_sigma": 0.1}
        self.blinear1 = bnn.BayesLinear(in_features=input_size, out_features=hidden_size, **prior)
        self.blinear2 = bnn.BayesLinear(in_features=hidden_size, out_features=hidden_size, **prior)
        self.blinear3 = bnn.BayesLinear(in_features=hidden_size, out_features=hidden_size, **prior)
        self.blinear4 = bnn.BayesLinear(in_features=hidden_size, out_features=output_size, **prior)

    def forward(self, x):
        """Return the raw output of the final layer for input batch ``x``."""
        for hidden_layer in (self.blinear1, self.blinear2, self.blinear3):
            x = F.relu(hidden_layer(x))
        return self.blinear4(x)

# Model parameters
# Read the input width from the actual TF-IDF tensor rather than assuming
# 3000: `max_features` is only an upper bound, so a smaller vocabulary would
# produce fewer columns and a shape mismatch in the first layer.
input_size = X_train_tensor.shape[1]  # Number of features (TF-IDF)
hidden_size = 256  # Number of neurons in hidden layers
output_size = len(label_encoder.classes_)  # Number of categories (mental health statuses)

# Seed PyTorch's global RNG so weight initialisation (and other consumers of
# the default generator) are repeatable across runs, matching the fixed
# random_state used for the train/test split.
torch.manual_seed(42)

# Instantiate model, loss, and optimizer
model = BayesianNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=100):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module to optimise; it is switched to training mode.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        None. Progress is reported through ``print`` only.
    """
    model.train()  # training mode is set once, before the first epoch
    for epoch_number in range(1, num_epochs + 1):
        epoch_loss_total = 0.0
        for batch_inputs, batch_labels in train_loader:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            epoch_loss_total += batch_loss.item()

        # Average over the number of batches, not the number of samples.
        mean_loss = epoch_loss_total / len(train_loader)
        print(f"Epoch [{epoch_number}/{num_epochs}], Loss: {mean_loss:.4f}")

# Testing function
def test_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')

# Fit the network for 50 epochs; the mean loss of each epoch is printed as it completes.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=50,
)

# Report accuracy on the held-out test split.
test_model(model=model, test_loader=test_loader)

# Function to classify new input text
def classify_text(model, text, vectorizer, label_encoder):
    """Predict the label of a single text with a trained model.

    Args:
        model: Module mapping a feature batch to per-class scores; it is
            switched to evaluation mode.
        text: The string to classify.
        vectorizer: Fitted object whose ``transform`` accepts a list of
            strings and returns something with ``toarray()``.
        label_encoder: Fitted object whose ``inverse_transform`` maps class
            indices back to labels.

    Returns:
        The decoded label of the highest-scoring class.
    """
    model.eval()

    # The vectorizer works on collections of documents, so wrap the single
    # text in a list; the result is a one-row feature batch.
    features = torch.tensor(
        vectorizer.transform([text]).toarray(),
        dtype=torch.float32,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_scores = model(features)
        best_class = torch.max(class_scores, 1).indices

    # Turn the winning class index back into its original label.
    decoded = label_encoder.inverse_transform([best_class.item()])
    return decoded[0]




[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
Epoch [1/50], Loss: 1.4155
Epoch [2/50], Loss: 0.9514
Epoch [3/50], Loss: 0.7985
Epoch [4/50], Loss: 0.7001
Epoch [5/50], Loss: 0.6265
Epoch [6/50], Loss: 0.5683
Epoch [7/50], Loss: 0.5176
Epoch [8/50], Loss: 0.4752
Epoch [9/50], Loss: 0.4309
Epoch [10/50], Loss: 0.3921
Epoch [11/50], Loss: 0.3540
Epoch [12/50], Loss: 0.3106
Epoch [13/50], Loss: 0.2721
Epoch [14/50], Loss: 0.2417
Epoch [15/50], Loss: 0.2118
Epoch [16/50], Loss: 0.1773
Epoch [17/50], Loss: 0.1546
Epoch [18/50], Loss: 0.1305
Epoch [19/50], Loss: 0.1136
Epoch [20/50], Loss: 0.0957
Epoch [21/50], Loss: 0.0835
Epoch [22/50], Loss: 0.0697
Epoch [23/50], Loss: 0.0618
Epoch [24/50], Loss: 0.0580
Epoch [25/50], Loss: 0.0512
Epoch [26/50], Loss: 0.0470
Epoch [27/50], Loss: 0.0417
Epoch [28/50], Loss: 0.0365
Epoch [29/50], Loss: 0.0339
Epoch [30/50], Loss: 0.0340
Epoch [31/50], Loss: 0.0291
Epoch [32/50], Loss: 0.0287
Epoch [33/50], Loss: 0.0260
Epoch [34/50], Loss

In [3]:
# Demo: classify one hand-written sentence with the trained model and print
# the decoded status label.
new_text = "I am feeling very anxious and stressed about my work."
predicted_category = classify_text(
    model=model,
    text=new_text,
    vectorizer=vectorizer,
    label_encoder=label_encoder,
)
print(f"The predicted mental health category is: {predicted_category}")


The predicted mental health category is: Anxiety
