In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
import pandas as pd

# Step 1: Clean the original file
# Every non-empty input line is expected to look like `<label>,<sentence>`.
# Only the FIRST comma is treated as the delimiter, so commas inside the
# sentence survive; the sentence is then re-quoted so pandas can parse it.
input_file = "output_trained.csv"
output_file = "output_trained_fixed.csv"
expected_header = ("label", "sentence")  # compared case-insensitively

fixed_lines = []   # rewritten `label,"sentence"` rows, newline-terminated
bad_lines = []     # (line number, raw text) for rows without any comma
header_checked = False

with open(input_file, "r", encoding="utf-8") as f:
    # Stream the file instead of loading it whole; numbering starts at 1 so
    # reported line numbers match what a text editor shows.
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # skip empty lines
        parts = line.split(",", 1)  # split only at the first comma
        if len(parts) != 2:
            bad_lines.append((i, line))
            continue

        label = parts[0].strip()
        sentence = parts[1].strip()

        # The source file may already carry its own `Label,Sentence` header.
        # Step 2 writes a fresh header, so copying the old one through would
        # turn it into a data row (and later into a spurious "Label" class).
        # Only the first parsable line is inspected.
        if not header_checked:
            header_checked = True
            if (label.lower(), sentence.strip('"').lower()) == expected_header:
                continue

        sentence = sentence.replace('"', '""')  # escape inner quotes
        fixed_lines.append(f'{label},"{sentence}"\n')

# Step 2: Save the cleaned file
with open(output_file, "w", encoding="utf-8") as f:
    f.write("Label,Sentence\n")  # CSV header
    f.writelines(fixed_lines)

# Step 3: Report any issues
if bad_lines:
    print(f"⚠️ {len(bad_lines)} malformed lines found:")
    for line_num, content in bad_lines:
        print(f"Line {line_num}: {content}")
else:
    print("✅ All lines processed correctly.")

# Step 4: Read cleaned CSV
df = pd.read_csv(output_file)
print("✅ Fixed file loaded successfully.")
print("📊 Number of rows:", len(df))
df.head()


✅ All lines processed correctly.
✅ Fixed file loaded successfully.
📊 Number of rows: 8499


Unnamed: 0,Label,Sentence
0,Label,Sentence
1,neutral,आप अपने हाथ भरे होंगे।
2,neutral,कि मैंने किया। कि मैंने किया।
3,neutral,तो चलो अपने कर्तव्यों के बारे में थोड़ा बात करें।
4,surprise,मेरे कर्तव्य? ठीक है.


In [None]:
# 📌 Step 2: Pull the targets and the model inputs out of the cleaned frame
# Both columns are cast to str and converted to plain Python lists.
labels = df['Label'].astype(str).to_list()
texts = df['Sentence'].astype(str).to_list()


In [None]:
# 📌 Step 3: Map the string labels to integer class ids
# Fit once, then transform the same list; the fitted encoder is kept so
# predictions can be decoded back to label names later.
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)
num_classes = len(label_encoder.classes_)

In [None]:
# 📌 Step 4: Turn sentences into fixed-length id sequences and one-hot targets
max_words = 8000  # passed to the tokenizer as num_words and to the embedding as input_dim
max_len = 30      # maxlen handed to pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
)
y = tf.keras.utils.to_categorical(
    labels_encoded,
    num_classes=num_classes,
)

In [None]:
# 📌 Step 5: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 📌 Step 6: Build the model
# Layers are appended in order: embedding -> bidirectional LSTM (full sequence
# output) -> max pooling over time -> dense head with dropout -> softmax.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=64))
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )
)
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

In [None]:
# Configure training (Adam optimizer, categorical cross-entropy loss, accuracy
# metric), build the model for inputs of width max_len, then print its summary.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.build(input_shape=(None, max_len))
model.summary()

In [None]:
# 📌 Step 7: Train the model
# In the recorded run, validation loss was lowest at epoch 3 and climbed
# afterwards while training loss kept falling, so a fixed 10-epoch run ends on
# overfit weights. Stop when val_loss has not improved for 2 epochs and restore
# the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

# NOTE(review): the test split doubles as validation data here, so it also
# drives early stopping; metrics on it are optimistic. Consider carving a
# separate validation split out of the training data.
# The History object is kept in `history` so the curves can be inspected later.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 48ms/step - accuracy: 0.4310 - loss: 1.7331 - val_accuracy: 0.4635 - val_loss: 1.5661
Epoch 2/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.4639 - loss: 1.5296 - val_accuracy: 0.4694 - val_loss: 1.5227
Epoch 3/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.4874 - loss: 1.4099 - val_accuracy: 0.4859 - val_loss: 1.4952
Epoch 4/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 45ms/step - accuracy: 0.5664 - loss: 1.2327 - val_accuracy: 0.4753 - val_loss: 1.5419
Epoch 5/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6353 - loss: 1.0710 - val_accuracy: 0.4482 - val_loss: 1.6090
Epoch 6/10
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 44ms/step - accuracy: 0.6767 - loss: 0.9655 - val_accuracy: 0.4471 - val_loss: 1.8099
Epoch 7/10
[1m213/

<keras.src.callbacks.history.History at 0x79cc46419cd0>

In [None]:
def predict_emotion(text):
    """Print the predicted emotion label and its score for a single sentence.

    The text is converted to token ids with the fitted ``tokenizer``, padded
    to ``max_len``, and passed to ``model``. The index of the largest softmax
    output is decoded through ``label_encoder``. Nothing is returned; the
    result is written to stdout.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_len,
        padding='post',
    )
    # predict() returns one row per input; a single sentence was passed in.
    scores = model.predict(padded)[0]
    best_index = np.argmax(scores)
    emotion = label_encoder.inverse_transform([best_index])[0]
    confidence = scores[best_index]
    print(f"Prediction: {emotion.capitalize()} ({confidence:.2f} confidence)")

# 🔍 Quick smoke test on one sentence
predict_emotion("कल मौसम साफ रहेगा।")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Prediction: Neutral (1.00 confidence)
