In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

# Load HDF5 datasets
train_df = pd.read_hdf(r"C:\Users\sagni\Downloads\Emotion Chatbot\train.h5", key='train_data')
intents_df = pd.read_hdf(r"C:\Users\sagni\Downloads\Emotion Chatbot\intents_dataset.h5", key='intents')  # Optional

# Check columns
print("📋 Columns in train_df:", train_df.columns.tolist())

# Use the actual column names
texts = train_df['Context'].astype(str).values
labels = train_df['Response'].astype(str).values

# Tokenize user input
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
y = to_categorical(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define vocab size and output classes
vocab_size = len(tokenizer.word_index) + 1
num_classes = y.shape[1]

# Build simple intent classification model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    GlobalAveragePooling1D(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print("\n✅ Final Test Accuracy:", round(acc * 100, 2), "%")
print("✅ X_train shape:", X_train.shape)
print("✅ y_train shape:", y_train.shape)
print("✅ Classes:", label_encoder.classes_)
print("✅ Vocabulary size:", vocab_size)


📋 Columns in train_df: ['Context', 'Response']
Epoch 1/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 7.5013e-04 - loss: 7.8302 - val_accuracy: 0.0000e+00 - val_loss: 7.9042
Epoch 2/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 7.7816 - val_accuracy: 0.0000e+00 - val_loss: 8.0282
Epoch 3/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 7.8513e-04 - loss: 7.7525 - val_accuracy: 0.0000e+00 - val_loss: 8.1408
Epoch 4/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 7.7329 - val_accuracy: 0.0000e+00 - val_loss: 8.2553
Epoch 5/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0012 - loss: 7.7224 - val_accuracy: 0.0000e+00 - val_loss: 8.3508
Epoch 6/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 5.3083e-0