In [None]:
import re
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [None]:
# Read the combined dataset and keep only rows that have a statement;
# rows with a missing statement carry no text to tokenize.
data = pd.read_csv('/content/Combined Data.csv').dropna(subset=['statement'])


In [None]:

# Preprocess text
def preprocess_text(text):
    """Normalize one raw statement before tokenization.

    Steps, in order:
      1. Delete URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace).
      2. Replace every run of digits and every run of non-word characters
         (punctuation, whitespace, symbols) with a single space.
      3. Lowercase the result.

    Args:
        text: The statement to clean, as a ``str``.

    Returns:
        The cleaned, lowercased string. Consecutive replacements can leave
        multiple adjacent spaces; they are not collapsed here.
    """
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\d+|\W+', ' ', text)  # Remove special characters, numbers, punctuations
    return text.lower()  # Convert to lowercase


# Clean the statement column. This statement must sit at cell level, outside
# the function body: placed after the `return` inside `preprocess_text` it is
# unreachable and the column is left uncleaned.
data['statement'] = data['statement'].apply(preprocess_text)

In [None]:

# Model input is the statement text; the prediction target is the status column.
X, y = data['statement'], data['status']

# Encode labels: each distinct status value becomes an integer class id.
# The fitted encoder is kept so predictions can be mapped back to names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# labels so both parts keep the same class proportions, and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


In [None]:

# Tokenizer settings
max_sequence_length = 100  # every sequence is padded/truncated to this many tokens
max_vocab_size = 10000  # passed as `num_words` to cap the vocabulary size

# The vocabulary is learned from the training split only; words outside it are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    num_words=max_vocab_size,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(X_train)

In [None]:
def _encode_and_pad(texts):
    """Convert texts to integer sequences with the fitted tokenizer and pad them.

    Returns a ``(sequences, padded)`` pair: the raw variable-length sequences
    and the array padded/truncated at the end to ``max_sequence_length``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_sequence_length, padding="post", truncating="post")
    return sequences, padded


# Train and test texts go through exactly the same encoding and padding.
X_train_seq, X_train_padded = _encode_and_pad(X_train)
X_test_seq, X_test_padded = _encode_and_pad(X_test)

In [None]:

# One-hot encode the integer labels, as required by the categorical
# cross-entropy loss. The width is fixed to the number of classes known to the
# label encoder so both splits get the same shape.
num_classes = len(label_encoder.classes_)
y_train_categorical, y_test_categorical = (
    to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (y_train, y_test)
)


In [None]:

# "balanced" weights, computed from the training labels so that rarer classes
# are not drowned out during training.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)
# Keras expects a {class_id: weight} mapping.
class_weights_dict = {label_id: class_weights[label_id] for label_id in range(num_classes)}


In [None]:
# Build the optimized LSTM model: embedding -> single LSTM -> dense head.
model = Sequential([
    # `input_length` is intentionally not passed: it is deprecated in Keras 3,
    # and the sequence length is inferred from the data on the first call.
    Embedding(input_dim=max_vocab_size, output_dim=100),  # Reduced embedding dim
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),  # Single LSTM layer
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')  # one probability per class
])

# Compile the model (targets are one-hot, hence categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks for faster training: stop once val_loss stalls for 3 epochs and
# roll back to the best weights; halve the learning rate after 2 stalled epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

# Train the model.
# Validation data is carved out of the training set (`validation_split`)
# rather than taken from the test split: early stopping and the LR schedule
# select the model on val_loss, so monitoring the test set would leak it into
# model selection and bias the test metrics reported later.
Train = model.fit(
    X_train_padded,
    y_train_categorical,
    epochs=15,
    batch_size=128,  # Larger batch size
    validation_split=0.1,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr]
)



Epoch 1/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 185ms/step - accuracy: 0.3413 - loss: 1.8323 - val_accuracy: 0.3341 - val_loss: 1.5245 - learning_rate: 0.0010
Epoch 2/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 188ms/step - accuracy: 0.3649 - loss: 1.6982 - val_accuracy: 0.5332 - val_loss: 1.3189 - learning_rate: 0.0010
Epoch 3/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 188ms/step - accuracy: 0.4994 - loss: 1.4949 - val_accuracy: 0.5398 - val_loss: 1.0336 - learning_rate: 0.0010
Epoch 4/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 184ms/step - accuracy: 0.5549 - loss: 1.2745 - val_accuracy: 0.5938 - val_loss: 0.9017 - learning_rate: 0.0010
Epoch 5/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 188ms/step - accuracy: 0.6097 - loss: 1.0472 - val_accuracy: 0.6006 - val_loss: 0.8662 - learning_rate: 0.0010
Epoch 6/15
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [None]:
# Score the trained model on the held-out test split.
# `evaluate` returns the loss followed by the compiled metrics (accuracy here).
evaluation = model.evaluate(X_test_padded, y_test_categorical)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 46ms/step - accuracy: 0.7173 - loss: 0.6884
Test Accuracy: 71.79%


In [None]:

# Per-class precision / recall / F1 on the test split
from sklearn.metrics import classification_report

# Predicted class = index of the highest softmax probability in each row.
y_pred = model.predict(X_test_padded)
y_pred_labels = y_pred.argmax(axis=1)

report = classification_report(
    y_test,
    y_pred_labels,
    target_names=label_encoder.classes_,
)
print(report)


[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 59ms/step
                      precision    recall  f1-score   support

             Anxiety       0.79      0.77      0.78       768
             Bipolar       0.79      0.79      0.79       556
          Depression       0.79      0.41      0.54      3081
              Normal       0.94      0.90      0.92      3269
Personality disorder       0.51      0.69      0.59       215
              Stress       0.47      0.75      0.57       517
            Suicidal       0.53      0.83      0.65      2131

            accuracy                           0.72     10537
           macro avg       0.69      0.74      0.69     10537
        weighted avg       0.76      0.72      0.71     10537



In [None]:

# Persist the trained model together with the fitted tokenizer: inference needs
# the same word-to-index mapping that was used for training.
import pickle

model.save('optimized_lstm_text_classifier.h5')

with open('optimized_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
# Example input to classify.
text = " Self Massage for Stress Relief: 4 Relaxation Techniques to Try Stress levels are on the rise. 84% of Americans feel stressed at least one day a week. With financial "

# Clean the text, then encode and pad it with the same tokenizer and padding
# settings used for the training sequences.
text = preprocess_text(text)
text_seq = tokenizer.texts_to_sequences([text])
text_padded = pad_sequences(text_seq, maxlen=max_sequence_length, padding="post", truncating="post")

# Predict using the Keras model: one row of class probabilities per input text.
sentiment_probs = model.predict(text_padded)
predicted_label_index = sentiment_probs.argmax(axis=1)

# Map the winning class index back to its original label name.
sentiment_label = label_encoder.inverse_transform(predicted_label_index)
print(f'Predicted Sentiment: {sentiment_label[0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Predicted Sentiment: Stress
