# LSTM


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
df = pd.read_csv('Embedded_data.csv')

# Drop unnecessary columns
train = df.drop(['Tokens'], axis=1)

# Handle missing values (if any)
train = train.dropna()

# Prepare input and output
X = train['Text']
Y = train['Label']

# Encode labels
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

# Convert X_train to string type
X_train = X_train.astype(str)

# Tokenization and sequence padding parameters
max_words = 1000
max_len = 200

# Tokenize and pad sequences
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences_train = tok.texts_to_sequences(X_train)
sequences_matrix_train = pad_sequences(sequences_train, maxlen=max_len)

# Build LSTM model
def LSTM_model():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 1000, input_length=max_len)(inputs)
    layer = LSTM(256)(layer)  # LSTM layer with 512 units
    layer = Dense(128, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

# Create an instance of the LSTM model
model = LSTM_model()

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

from keras import utils
from tensorflow.keras.callbacks import EarlyStopping

sequences = tok.texts_to_sequences(X_train)
sequences_matrix = utils.pad_sequences(sequences, maxlen=max_len)

model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = utils.pad_sequences(test_sequences,maxlen=max_len)

accr = model.evaluate(test_sequences_matrix,Y_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# Predict probabilities using the model
y_pred_proba = model.predict(test_sequences_matrix)

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and print evaluation metrics
print("Precision:", precision_score(Y_test, y_pred))
print("Recall:", recall_score(Y_test, y_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(Y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))


print(' Loss: {:0.3f}\n'.format(accr[0],accr[1]))




Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 3s/step - accuracy: 0.6112 - loss: 0.6517 - val_accuracy: 0.6843 - val_loss: 0.5837
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m413s[0m 4s/step - accuracy: 0.7246 - loss: 0.5438 - val_accuracy: 0.6853 - val_loss: 0.5841
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 670ms/step - accuracy: 0.6807 - loss: 0.5731
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 626ms/step
Precision: 0.578860445912469
Recall: 0.6865817825661117
Accuracy: 0.6854869268662372
ROC-AUC Score: 0.6856889135327469
Confusion Matrix:
 [[1108  510]
 [ 320  701]]
 Loss: 0.579



Changing Layers (256,64)


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
df = pd.read_csv('Embedded_data.csv')

# Drop unnecessary columns
train = df.drop(['Tokens'], axis=1)

# Handle missing values (if any)
train = train.dropna()

# Prepare input and output
X = train['Text']
Y = train['Label']

# Encode labels
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

# Convert X_train to string type
X_train = X_train.astype(str)

# Tokenization and sequence padding parameters
max_words = 1000
max_len = 200

# Tokenize and pad sequences
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences_train = tok.texts_to_sequences(X_train)
sequences_matrix_train = pad_sequences(sequences_train, maxlen=max_len)

# Build LSTM model
def LSTM_model():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 1000, input_length=max_len)(inputs)
    layer = LSTM(256)(layer)  # LSTM layer with 512 units
    layer = Dense(64, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

# Create an instance of the LSTM model
model = LSTM_model()

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()


from keras import utils
from tensorflow.keras.callbacks import EarlyStopping

sequences = tok.texts_to_sequences(X_train)
sequences_matrix = utils.pad_sequences(sequences, maxlen=max_len)

model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = utils.pad_sequences(test_sequences,maxlen=max_len)

accr = model.evaluate(test_sequences_matrix,Y_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# Predict probabilities using the model
y_pred_proba = model.predict(test_sequences_matrix)

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and print evaluation metrics
print("Precision:", precision_score(Y_test, y_pred))
print("Recall:", recall_score(Y_test, y_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(Y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))


print(' Loss: {:0.3f}\n'.format(accr[0],accr[1]))




Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 4s/step - accuracy: 0.6055 - loss: 0.6578 - val_accuracy: 0.6712 - val_loss: 0.5825
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 4s/step - accuracy: 0.7204 - loss: 0.5478 - val_accuracy: 0.7110 - val_loss: 0.5567
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 4s/step - accuracy: 0.7353 - loss: 0.5192 - val_accuracy: 0.7157 - val_loss: 0.5608
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 635ms/step - accuracy: 0.7013 - loss: 0.5676
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 720ms/step
Precision: 0.6502793296089385
Recall: 0.5454545454545454
Accuracy: 0.6976127320954907
ROC-AUC Score: 0.6731725653481377
Confusion Matrix:
 [[1259  313]
 [ 485  582]]
 Loss: 0.572



changing layers (128,64)

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
df = pd.read_csv('Embedded_data.csv')

# Drop unnecessary columns
train = df.drop(['Tokens'], axis=1)

# Handle missing values (if any)
train = train.dropna()

# Prepare input and output
X = train['Text']
Y = train['Label']

# Encode labels
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

# Convert X_train to string type
X_train = X_train.astype(str)

# Tokenization and sequence padding parameters
max_words = 1000
max_len = 200

# Tokenize and pad sequences
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences_train = tok.texts_to_sequences(X_train)
sequences_matrix_train = pad_sequences(sequences_train, maxlen=max_len)

# Build LSTM model
def LSTM_model():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 1000, input_length=max_len)(inputs)
    layer = LSTM(128)(layer)  # LSTM layer with 512 units
    layer = Dense(64, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

# Create an instance of the LSTM model
model = LSTM_model()

X = train['Text']
Y = train['Label']



# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()



from keras import utils
from tensorflow.keras.callbacks import EarlyStopping

sequences = tok.texts_to_sequences(X_train)
sequences_matrix = utils.pad_sequences(sequences, maxlen=max_len)

model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])


test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = utils.pad_sequences(test_sequences,maxlen=max_len)


accr = model.evaluate(test_sequences_matrix,Y_test)

from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# Predict probabilities using the model
y_pred_proba = model.predict(test_sequences_matrix)

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and print evaluation metrics
print("Precision:", precision_score(Y_test, y_pred))
print("Recall:", recall_score(Y_test, y_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(Y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))


print(' Loss: {:0.3f}\n'.format(accr[0],accr[1]))




Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 849ms/step - accuracy: 0.6076 - loss: 0.6578 - val_accuracy: 0.6946 - val_loss: 0.5756
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 836ms/step - accuracy: 0.7183 - loss: 0.5583 - val_accuracy: 0.7184 - val_loss: 0.5568
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 676ms/step - accuracy: 0.7537 - loss: 0.5088 - val_accuracy: 0.7114 - val_loss: 0.5668
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 66ms/step - accuracy: 0.6841 - loss: 0.6013
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 66ms/step
Precision: 0.6220614828209765
Recall: 0.6346863468634686
Accuracy: 0.691549829480864
ROC-AUC Score: 0.6829380287372004
Confusion Matrix:
 [[1137  418]
 [ 396  688]]
 Loss: 0.595



changing layers (128,128)

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
df = pd.read_csv('Embedded_data.csv')

# Drop unnecessary columns
train = df.drop(['Tokens'], axis=1)

# Handle missing values (if any)
train = train.dropna()

# Prepare input and output
X = train['Text']
Y = train['Label']

# Encode labels
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

# Convert X_train to string type
X_train = X_train.astype(str)

# Tokenization and sequence padding parameters
max_words = 1000
max_len = 200

# Tokenize and pad sequences
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences_train = tok.texts_to_sequences(X_train)
sequences_matrix_train = pad_sequences(sequences_train, maxlen=max_len)

# Build LSTM model
def LSTM_model():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 1000, input_length=max_len)(inputs)
    layer = LSTM(128)(layer)  # LSTM layer with 512 units
    layer = Dense(128, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

# Create an instance of the LSTM model
model = LSTM_model()

X = train['Text']
Y = train['Label']



# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()



from keras import utils
from tensorflow.keras.callbacks import EarlyStopping

sequences = tok.texts_to_sequences(X_train)
sequences_matrix = utils.pad_sequences(sequences, maxlen=max_len)

model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])


test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = utils.pad_sequences(test_sequences,maxlen=max_len)


accr = model.evaluate(test_sequences_matrix,Y_test)

from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# Predict probabilities using the model
y_pred_proba = model.predict(test_sequences_matrix)

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate and print evaluation metrics
print("Precision:", precision_score(Y_test, y_pred))
print("Recall:", recall_score(Y_test, y_pred))
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(Y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))


print(' Loss: {:0.3f}\n'.format(accr[0],accr[1]))




Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 769ms/step - accuracy: 0.6081 - loss: 0.6589 - val_accuracy: 0.6953 - val_loss: 0.5776
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 825ms/step - accuracy: 0.7176 - loss: 0.5488 - val_accuracy: 0.7030 - val_loss: 0.5628
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 806ms/step - accuracy: 0.7552 - loss: 0.5061 - val_accuracy: 0.6916 - val_loss: 0.5780
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 81ms/step - accuracy: 0.6761 - loss: 0.6042
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 78ms/step
Precision: 0.6090225563909775
Recall: 0.6044776119402985
Accuracy: 0.6816976127320955
ROC-AUC Score: 0.6695010905904428
Confusion Matrix:
 [[1151  416]
 [ 424  648]]
 Loss: 0.601

