<a href="https://colab.research.google.com/github/shabahmd/Machine-Learning-Notebooks/blob/main/Text_Classification_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


Load Dataset

In [7]:
# Download the tab-separated SMS file. It is read with header=None, so the two
# columns are named explicitly here.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv'
data = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'text'],
)


Preprocessing

In [9]:
# Replace the values of the `label` column with integer class codes. The fitted
# encoder stays available as `le`, so codes can be mapped back via `le.classes_`.
# NOTE(review): this overwrites the column it reads from, so re-running the cell
# re-fits the encoder on the already-encoded integers.
le = LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])

In [11]:
# Learn a word index from the messages, then turn each message into a list of
# integer word indices. `num_words` caps the indices that texts_to_sequences
# emits; words outside that cap (or never seen) map to the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token="<OOV>",
)
texts = data['text']
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

Padding sequences to the same length

In [12]:
# Bring every sequence to exactly `max_sequence_len` entries: shorter sequences
# get zeros appended ('post' padding), longer ones are cut down by pad_sequences.
max_sequence_len = 100
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_len,
    padding='post',
)

In [13]:
# Hold out a test partition (train_test_split's default size is used).
# random_state pins the shuffle so the split, and every metric derived from it,
# is the same after a kernel restart. stratify keeps the class proportions of
# `label` equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    data['label'],
    random_state=42,
    stratify=data['label'],
)

In [14]:
# Length of each word vector; passed to the Embedding layer as `output_dim`.
embedding_dim = 64

Building the bidirectional LSTM model

In [28]:
# Binary text classifier: token ids -> word embeddings -> bidirectional LSTM
# -> dense head ending in a single sigmoid unit.
model = Sequential([
    # Declare the input shape explicitly rather than passing `input_length`
    # to Embedding, which Keras 3 deprecates and ignores.
    tf.keras.Input(shape=(max_sequence_len,), dtype='int32'),
    # input_dim is taken from the tokenizer so the embedding table size and the
    # tokenizer's `num_words` cap cannot drift apart.
    Embedding(input_dim=tokenizer.num_words, output_dim=embedding_dim),
    # return_sequences=False: only the final output of each direction is kept.
    Bidirectional(LSTM(256, return_sequences=False)),
    Dropout(0.6),
    Dense(128, activation='relu'),
    Dropout(0.6),
    Dense(1, activation='sigmoid')
])


Compiling the model

In [30]:
# Configure training: Nadam optimizer, binary cross-entropy loss, and accuracy
# as the metric reported during fit/evaluate.
model.compile(
    optimizer='Nadam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [31]:
# Train for a fixed number of epochs and keep the per-epoch metrics in `history`.
# NOTE(review): the test split is also passed as validation data, so the val_*
# numbers and the later test score are computed on the same samples.
epochs = 5
batch_size = 32
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 648ms/step - accuracy: 0.8827 - loss: 0.3230 - val_accuracy: 0.9907 - val_loss: 0.0453
Epoch 2/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 649ms/step - accuracy: 0.9783 - loss: 0.0649 - val_accuracy: 0.9914 - val_loss: 0.0363
Epoch 3/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 646ms/step - accuracy: 0.9940 - loss: 0.0319 - val_accuracy: 0.9907 - val_loss: 0.0338
Epoch 4/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 645ms/step - accuracy: 0.9979 - loss: 0.0088 - val_accuracy: 0.9892 - val_loss: 0.0487
Epoch 5/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 625ms/step - accuracy: 0.9968 - loss: 0.0088 - val_accuracy: 0.9914 - val_loss: 0.0502


In [32]:
# Score the trained model on the test split. The model was compiled with a
# single metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test accuracy:{accuracy *100:.2f}%')



[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 155ms/step - accuracy: 0.9888 - loss: 0.0818
Test accuracy:99.14%


In [33]:
# Cross-check evaluate(): threshold the predicted probabilities by hand and
# score them with scikit-learn.
# The probabilities keep their own name instead of being overwritten by labels.
pred_probs = model.predict(X_test)  # one sigmoid output per sample, shape (n, 1)
# Vectorized threshold: flatten to 1-D and compare once, rather than looping in
# Python and relying on the truth value of 1-element arrays.
preds = (pred_probs.ravel() > 0.5).astype(int)
print(f"Test Accuracy (manual): {accuracy_score(y_test, preds) * 100:.2f}%")


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 138ms/step
Test Accuracy (manual): 99.14%
