In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv("Spam-Data.csv")

In [3]:
X = df['Message']
y = df['Category']

In [4]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [5]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

In [6]:
# Convert labels to numeric using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[0 0 1 ... 0 0 0]


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000)

# Fit the tokenizer on the text data
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_test)

# Convert the text data into integer sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to a maximum length of 10
max_length = 10
padded_sequences = pad_sequences(X_train, maxlen=max_length, padding='post')
padded_sequences = pad_sequences(X_test, maxlen=max_length, padding='post')



In [10]:
# Convert the labels to numpy arrays
import numpy as np
y_train = np.array(y_train)
y_test = np.array(y_test)


In [11]:
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')

In [12]:
# Convert the labels into one-hot encoded vectors
num_classes = np.max(y_train) + 1
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

In [13]:
# Create the sequential model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(10000,)))
model.add(Dense(num_classes, activation='softmax'))

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [15]:
# Train the model on the training data
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa95ca3ad70>

In [16]:
# Create the Autoencoder
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

input_layer = Input(shape=(X_train.shape[1],))

# Encoder
encoder_layer1 = Dense(512, activation='relu')(input_layer)
encoder_layer2 = Dense(256, activation='relu')(encoder_layer1)
encoder_layer3 = Dense(128, activation='relu')(encoder_layer2)
encoder_layer4 = Dense(64, activation='relu')(encoder_layer3)


In [17]:
# Latent space
latent_space = Dense(32, activation='relu')(encoder_layer4)

In [18]:
# Decoder
decoder_layer1 = Dense(64, activation='relu')(latent_space)
decoder_layer2 = Dense(128, activation='relu')(decoder_layer1)
decoder_layer3 = Dense(256, activation='relu')(decoder_layer2)
decoder_layer4 = Dense(512, activation='relu')(decoder_layer3)

In [19]:
# Output
output_layer = Dense(X_train.shape[1], activation='sigmoid')(decoder_layer4)

In [20]:

# Define the Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=output_layer)

In [21]:
# Compile the Autoencoder model
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [22]:
# Fit the Autoencoder on the training data
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa93a593cd0>

In [23]:
# Generate the malicious data
num_poison_samples = int(len(X_train) * 0.1)
poisoned_data = X_train[:num_poison_samples]
malicious_data = autoencoder.predict(poisoned_data)



In [24]:
# Create the labels for the malicious data
malicious_labels = np.full((num_poison_samples,), num_classes - 1)

In [25]:
# Add the malicious data to the training set
train_data_malicious = np.concatenate((X_train, malicious_data), axis=0)
train_labels_malicious = np.concatenate((y_train, to_categorical(malicious_labels, num_classes)), axis=0)

In [26]:
# Retrain the model on the poisoned data
model.fit(train_data_malicious, train_labels_malicious, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa93a5634c0>

In [27]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.9919282793998718
