In [3]:
!pip install -q tensorflow pandas numpy scikit-learn

In [6]:
import numpy as np
import pandas as pd
import os
import re
import string

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from sklearn.model_selection import train_test_split

# Load your dataset (example, replace with your actual data)
# Assume df has two columns: 'question_text' and 'target' (1 = spam, 0 = not spam)
df = pd.read_csv("/content/train (1).csv")  # adjust filename if different
texts = df['question_text'].astype(str).tolist()
labels = df['target'].values

# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

texts = [clean_text(text) for text in texts]

# Tokenization
MAX_NUM_WORDS = 200000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(labels)
print("Shape of data tensor:", data.shape)
print("Shape of label tensor:", labels.shape)

# Load GloVe embeddings
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors in GloVe.")

# Create embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build CNN model
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)

x = Conv1D(128, 5, activation='relu')(x)
# Skip additional MaxPooling to prevent dimension reduction below 5

x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)

x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train/Test split and fit
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)
model.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=(X_val, y_val))


Found 258289 unique tokens.
Shape of data tensor: (1306122, 100)
Shape of label tensor: (1306122,)
Found 400000 word vectors in GloVe.




Epoch 1/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1104s[0m 135ms/step - accuracy: 0.9463 - loss: 0.1464 - val_accuracy: 0.9535 - val_loss: 0.1218
Epoch 2/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1157s[0m 142ms/step - accuracy: 0.9523 - loss: 0.1244 - val_accuracy: 0.9540 - val_loss: 0.1209
Epoch 3/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1147s[0m 140ms/step - accuracy: 0.9541 - loss: 0.1185 - val_accuracy: 0.9551 - val_loss: 0.1175
Epoch 4/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1115s[0m 137ms/step - accuracy: 0.9556 - loss: 0.1141 - val_accuracy: 0.9551 - val_loss: 0.1179
Epoch 5/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1152s[0m 135ms/step - accuracy: 0.9571 - loss: 0.1101 - val_accuracy: 0.9543 - val_loss: 0.1208


<keras.src.callbacks.history.History at 0x7c75a5455010>

In [7]:
import pandas as pd
from keras.preprocessing.sequence import pad_sequences


MAX_SEQUENCE_LENGTH = 100


from google.colab import files
uploaded = files.upload()


df = pd.read_csv(next(iter(uploaded)))
print("Preview of uploaded data:")
print(df.head())


texts = df['question_text'].astype(str).tolist()
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


predictions = model.predict(padded)
df['spam_probability'] = predictions


df.to_csv('predicted_spam_output.csv', index=False)
print("Saved to predicted_spam_output.csv")


files.download('predicted_spam_output.csv')


Saving test_spam_questions.csv to test_spam_questions.csv
Preview of uploaded data:
                                       question_text
0           Earn $5000 a week from home. Is it real?
1    Click here to get free iPhones – is this legit?
2  Visit this link and lose 10kg in a week – does...
3  How can I make money fast online without working?
4  Is this the best crypto investment site: www.f...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
Saved to predicted_spam_output.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:

model.save("spam_classifier_model.h5")




In [9]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [10]:
from keras.models import load_model
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
model = load_model("spam_classifier_model.h5")




In [11]:

model.save("quora_spam_cnn.h5")

import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

from google.colab import files

files.download("quora_spam_cnn.h5")
files.download("tokenizer.pkl")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>