In [1]:
# %%
# In[1]:
# Importing necessary libraries and modules
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, log_loss
from tqdm.notebook import tqdm

In [2]:
# %%
# In[2]:
# Read the raw dataset from CSV into a DataFrame.
# The path is relative to the working directory -- replace it with your own.
df = pd.read_csv(filepath_or_buffer='mental_health.csv')

In [3]:
# %%
# In[3]:
# Pull the raw documents and their targets out of the frame as arrays.
texts, labels = (df[column].values for column in ('text', 'label'))

# Hold out 20% of the rows for validation; the fixed seed makes the split
# identical on every run.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# %%
# In[4]:
# Turn the raw text into TF-IDF feature matrices.
# The vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)

# Learn the vocabulary / IDF weights from the training texts only, then reuse
# that fitted vectorizer (no re-fitting) to encode the validation texts.
texts_train_tfidf = tfidf_vectorizer.fit_transform(texts_train)
texts_val_tfidf = tfidf_vectorizer.transform(texts_val)

In [5]:
# %%
# In[5]:
# Initialize and train the Random Forest model, one tree per progress-bar tick.
#
# warm_start=True makes each fit() call keep the trees that already exist and
# only grow the extra ones requested by the larger n_estimators. Without it,
# every fit() call throws the forest away and retrains it from scratch, so the
# loop would build 1 + 2 + ... + 100 = 5050 trees to end up with 100.
#
# random_state is fixed (same seed as the train/validation split) so repeated
# runs of the notebook build the same forest and report the same metrics.
n_estimators = 100
rf_model = RandomForestClassifier(
    n_estimators=1,
    warm_start=True,
    random_state=42,
)
for i in tqdm(range(n_estimators), desc="Training Random Forest"):
    # Ask for exactly one more tree than the forest currently holds.
    rf_model.set_params(n_estimators=i + 1)
    rf_model.fit(texts_train_tfidf, labels_train)

# Training is done: switch warm_start back off so that any later
# rf_model.fit(...) call retrains from scratch instead of silently reusing
# the trees built above.
rf_model.set_params(warm_start=False)

Training Random Forest:   0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# %%
# In[6]:
# Score the validation set.
# labels_pred holds the hard class predictions; labels_pred_proba holds the
# second column of predict_proba, i.e. the predicted probability of class 1
# for the binary 0/1 labels used here.
labels_pred = rf_model.predict(texts_val_tfidf)
labels_pred_proba = rf_model.predict_proba(texts_val_tfidf)[:, 1]

# Metrics: accuracy is computed from the hard labels, log loss from the
# class-1 probabilities.
accuracy = accuracy_score(labels_val, labels_pred)
loss = log_loss(labels_val, labels_pred_proba)

# Report the summary numbers followed by the per-class breakdown.
print(
    f"Loss: {loss:.4f}",
    f"Accuracy: {accuracy * 100:.2f}%",
    classification_report(labels_val, labels_pred),
    sep="\n",
)

Loss: 0.3117
Accuracy: 89.08%
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2802
           1       0.88      0.90      0.89      2794

    accuracy                           0.89      5596
   macro avg       0.89      0.89      0.89      5596
weighted avg       0.89      0.89      0.89      5596

