In [1]:
import warnings
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
data = pd.read_csv('cleaned_train.csv')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
max_words = 10000  # Limit the number of unique words to 10,000
max_sequence_length = 100


# Preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text
    else:
        return ""


data['text'] = data['text'].apply(preprocess_text)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X, maxlen=max_sequence_length)
# Label Encoding the categories (You can also encode subcategories in a similar way if needed)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['category'])
y = to_categorical(y)  # Convert labels to one-hot encoding
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the LSTM Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer for multi-class classification
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
# Save the model
model.save('lstm_model.h5')
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 1. Load the Pre-Trained Model
model = load_model('lstm_model.h5')  # Load your pre-trained model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 2. Load the Dataset (cleaned_test.csv)
df = pd.read_csv("cleaned_test.csv")

# Ensure the 'text' column contains strings
df['text'] = df['text'].astype(str).fillna("")
X_test_data = df['text']

# Ensure the 'category' column has no NaN values
df['category'] = df['category'].fillna("Unknown")
y_test = df['category']

# 3. Tokenization and Padding
# Initialize the Tokenizer (same parameters as during training)
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_test_data)  # Use the same tokenizer as during training

# Tokenize and pad test data
X_test_sequences = tokenizer.texts_to_sequences(X_test_data)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, padding='post', truncating='post')

# 4. Predict Using the Pre-Trained Model
y_pred_prob = model.predict(X_test_padded)  # Get predicted probabilities

# Convert predicted probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# 5. Encode the True Labels (if LabelEncoder was used during training)
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # Encode the true labels
warnings.filterwarnings("ignore", category=UserWarning, module="absl")
# 6. Calculate the Evaluation Metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')
f1 = f1_score(y_test_encoded, y_pred, average='weighted')

# 7. Print the Metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 315ms/step - accuracy: 0.6912 - loss: 1.0392 - val_accuracy: 0.7426 - val_loss: 0.7685
Epoch 2/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 234ms/step - accuracy: 0.7461 - loss: 0.7529 - val_accuracy: 0.7504 - val_loss: 0.7368
Epoch 3/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 92ms/step - accuracy: 0.7641 - loss: 0.6831 - val_accuracy: 0.7460 - val_loss: 0.7240
Epoch 4/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 97ms/step - accuracy: 0.7819 - loss: 0.6307 - val_accuracy: 0.7547 - val_loss: 0.7288
Epoch 5/5
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 96ms/step - accuracy: 0.8061 - loss: 0.5631 - val_accuracy: 0.7491 - val_loss: 0.7808
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.7437 - loss: 0.7988




Test Accuracy: 74.91%




[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step
Accuracy: 0.0035
Precision: 0.0284
Recall: 0.0035
F1 Score: 0.0030


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
