In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
df = pd.read_csv("/content/Combined Data.csv")

In [9]:
df.shape

(53043, 3)

In [10]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [25]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, remove URLs, mentions, special chars, and stopwords"""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|@\w+|#\w+|[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join([t for t in tokens if t not in stop_words and len(t) > 2])

# Apply preprocessing
df['processed_statement'] = df['statement'].apply(preprocess_text)
df = df[df['processed_statement'].str.len() > 0]  # Remove empty entries

# Prepare features and labels
X = df['processed_statement'].values
label_encoder = LabelEncoder() # Define label_encoder here
y = label_encoder.fit_transform(df['status'].values) # Fit label_encoder here

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
VOCAB_SIZE = 10000
MAX_LENGTH = 100

# Initialize and fit tokenizer
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert text to padded sequences
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LENGTH, padding='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LENGTH, padding='post')

In [13]:
RNN_UNITS = 64
NUM_CLASSES = len(set(y_train))  # You can use label_encoder.classes_ if needed

# Build the model
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),

    SimpleRNN(RNN_UNITS, return_sequences=True, dropout=0.2),
    SimpleRNN(RNN_UNITS // 2, dropout=0.2),

    Dense(64, activation='relu'),
    Dropout(0.5),

    Dense(32, activation='relu'),
    Dropout(0.3),

    Dense(NUM_CLASSES, activation='softmax' if NUM_CLASSES > 2 else 'sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy' if NUM_CLASSES > 2 else 'binary_crossentropy',
    metrics=['accuracy']
)

In [14]:
# Hyperparameters
EPOCHS = 50
BATCH_SIZE = 32

# Callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=5, min_lr=1e-4, verbose=1
    )
]

# Train the model
history = model.fit(
    X_train_padded, y_train,
    validation_data=(X_test_padded, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 25ms/step - accuracy: 0.3190 - loss: 1.6745 - val_accuracy: 0.3783 - val_loss: 1.5630 - learning_rate: 0.0010
Epoch 2/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.3772 - loss: 1.5793 - val_accuracy: 0.3768 - val_loss: 1.5648 - learning_rate: 0.0010
Epoch 3/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 20ms/step - accuracy: 0.3231 - loss: 1.6377 - val_accuracy: 0.3631 - val_loss: 1.5729 - learning_rate: 0.0010
Epoch 4/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 20ms/step - accuracy: 0.3558 - loss: 1.5886 - val_accuracy: 0.3662 - val_loss: 1.6096 - learning_rate: 0.0010
Epoch 5/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 20ms/step - accuracy: 0.3618 - loss: 1.5829 - val_accuracy: 0.3709 - val_loss: 1.5442 - learning_rate: 0.0010
Epoch 6/50
[1m1313/1313[0m [32m━━━━━━━━━━━━━━━━

In [26]:
def predict_sentiment(text):
    """Predict the sentiment class and confidence for a single input text."""
    processed = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post')

    pred = model.predict(padded, verbose=0)

    if NUM_CLASSES > 2:
        class_idx = np.argmax(pred[0])
        confidence = pred[0][class_idx]
    else:
        class_idx = int(pred[0] > 0.5)
        confidence = pred[0][0] if class_idx == 1 else 1 - pred[0][0]

    # Use the globally defined label_encoder
    label = str(label_encoder.inverse_transform([class_idx])[0])

    print(f"Text: {text}")
    print(f"➤ Predicted Sentiment: {label.upper()}")
    print(f"➤ Confidence: {confidence:.2f}")
    print(f"➤ Probabilities: {dict(zip(label_encoder.classes_, map(float, pred[0])))}")

In [22]:
# Predict sentiment for a single input
input_text = "I feel so anxious and worried about everything in my life"
predict_sentiment(input_text)


Text: I feel so anxious and worried about everything in my life
➤ Predicted Sentiment: 3
➤ Confidence: 0.37
➤ Probabilities: {np.int64(0): 0.045296963304281235, np.int64(1): 0.02715577930212021, np.int64(2): 0.28780049085617065, np.int64(3): 0.3718903660774231, np.int64(4): 0.009853990748524666, np.int64(5): 0.03733086213469505, np.int64(6): 0.22067156434059143}


In [23]:
model.save('mental_health_sentiment_rnn.h5')
print("Model saved as 'mental_health_sentiment_rnn.h5'")



Model saved as 'mental_health_sentiment_rnn.h5'


In [28]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
     pickle.dump(tokenizer, f)
print("Tokenizer saved as 'tokenizer.pkl'")

with open('label_encoder.pkl', 'wb') as f:
     pickle.dump(label_encoder, f)
print("Label encoder saved as 'label_encoder.pkl'")

Tokenizer saved as 'tokenizer.pkl'
Label encoder saved as 'label_encoder.pkl'
