In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping


# Load data
df = pd.read_csv('data.csv')

# Encode labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['LABEL'])

# Tokenize text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['TEXT'])
sequences = tokenizer.texts_to_sequences(df['TEXT'])
X = pad_sequences(sequences, maxlen=200)
y = df['label_encoded'].values

# Build deeper Bi-LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)))
model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Save model checkpoint
checkpoint = ModelCheckpoint('deep_lstm_model.h5', monitor='accuracy', save_best_only=True, verbose=1)

# Train on full data (no validation split)

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X, y, 
          validation_split=0.1, 
          epochs=50, 
          batch_size=32, 
          callbacks=[checkpoint, early_stop])

# Save final model (best weights)
model.save('deep_lstm_model_final.h5')

# Save tokenizer and label encoder
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)


Epoch 1/50




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7354 - loss: 1.4305
Epoch 1: accuracy improved from -inf to 0.80084, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 1s/step - accuracy: 0.7357 - loss: 1.4281 - val_accuracy: 0.9696 - val_loss: 0.1720
Epoch 2/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8663 - loss: 0.5361
Epoch 2: accuracy improved from 0.80084 to 0.87162, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 1s/step - accuracy: 0.8663 - loss: 0.5360 - val_accuracy: 0.9787 - val_loss: 0.1750
Epoch 3/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8645 - loss: 0.4787
Epoch 3: accuracy improved from 0.87162 to 0.88361, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 1s/step - accuracy: 0.8646 - loss: 0.4784 - val_accuracy: 0.9711 - val_loss: 0.1253
Epoch 4/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9017 - loss: 0.3365
Epoch 4: accuracy improved from 0.88361 to 0.90405, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 1s/step - accuracy: 0.9017 - loss: 0.3365 - val_accuracy: 0.9605 - val_loss: 0.1244
Epoch 5/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9244 - loss: 0.2573
Epoch 5: accuracy improved from 0.90405 to 0.92534, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 1s/step - accuracy: 0.9244 - loss: 0.2573 - val_accuracy: 0.9347 - val_loss: 0.2411
Epoch 6/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749ms/step - accuracy: 0.9266 - loss: 0.2498
Epoch 6: accuracy improved from 0.92534 to 0.92703, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 775ms/step - accuracy: 0.9266 - loss: 0.2498 - val_accuracy: 0.9742 - val_loss: 0.1143
Epoch 7/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749ms/step - accuracy: 0.9268 - loss: 0.2262
Epoch 7: accuracy improved from 0.92703 to 0.93091, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 765ms/step - accuracy: 0.9268 - loss: 0.2262 - val_accuracy: 0.9681 - val_loss: 0.1205
Epoch 8/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 703ms/step - accuracy: 0.9377 - loss: 0.1928
Epoch 8: accuracy improved from 0.93091 to 0.93378, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 720ms/step - accuracy: 0.9377 - loss: 0.1929 - val_accuracy: 0.9726 - val_loss: 0.1098
Epoch 9/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663ms/step - accuracy: 0.9434 - loss: 0.1852
Epoch 9: accuracy improved from 0.93378 to 0.93784, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 685ms/step - accuracy: 0.9434 - loss: 0.1852 - val_accuracy: 0.9514 - val_loss: 0.1615
Epoch 10/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 558ms/step - accuracy: 0.9396 - loss: 0.1907
Epoch 10: accuracy improved from 0.93784 to 0.93953, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 573ms/step - accuracy: 0.9396 - loss: 0.1906 - val_accuracy: 0.9666 - val_loss: 0.1540
Epoch 11/50
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 535ms/step - accuracy: 0.9365 - loss: 0.1884
Epoch 11: accuracy improved from 0.93953 to 0.94172, saving model to deep_lstm_model.h5




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 550ms/step - accuracy: 0.9365 - loss: 0.1883 - val_accuracy: 0.9559 - val_loss: 0.1595




In [2]:
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load tokenizer, label encoder, and model
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

model = load_model('deep_lstm_model_final.h5')

def predict_texts(texts, max_len=200):
    # Tokenize and pad
    sequences = tokenizer.texts_to_sequences(texts)
    X = pad_sequences(sequences, maxlen=max_len)
    
    probs = model.predict(X)
    
    # Predicted class indices
    preds = np.argmax(probs, axis=1)
    
    # Decode labels
    decoded_preds = le.inverse_transform(preds)
    
    # Confidence scores
    confidence_scores = probs[np.arange(len(probs)), preds]
    
    # Return predictions with confidence
    return list(zip(texts, decoded_preds, confidence_scores))

# Example usage:
if __name__ == '__main__':
    sample_texts = [
        "We would like to inform you that there is an order placed for Apple iPhone 11 Pro using your Amazon account. If you do not authorize this order, press 1 or press 2 to authorize this order.",
        "Please confirm your identity to avoid account freeze.",
        "Thank you for reaching out to us regarding your invoice.",
        "Your account has been compromised. Please reset your password immediately.",
    ]
    
    results = predict_texts(sample_texts)
    for text, label, conf in results:
        print(f"Text: {text}\nPredicted Label: {label}\nConfidence: {conf:.4f}\n")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Text: We would like to inform you that there is an order placed for Apple iPhone 11 Pro using your Amazon account. If you do not authorize this order, press 1 or press 2 to authorize this order.
Predicted Label: fraud
Confidence: 0.9815

Text: Please confirm your identity to avoid account freeze.
Predicted Label: normal
Confidence: 0.7133

Text: Thank you for reaching out to us regarding your invoice.
Predicted Label: normal
Confidence: 0.9947

Text: Your account has been compromised. Please reset your password immediately.
Predicted Label:  scam
Confidence: 0.3560

