# Spam-News Detection using Generic Models and RNN

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Dataset

In [2]:
data = {
    "text": [
        "This is a genuine news article",
        "Click here to win $1,000,000!",
        "Breaking: Important event just happened",
        "Cheap medications available online",
        "Trusted source for daily updates",
        "You are a winner! Claim your prize now",
    ],
    "label": [0, 1, 0, 1, 0, 1],  # 0 = genuine, 1 = spam
}
df = pd.DataFrame(data)

# Preprocessing

In [3]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"]).toarray()
y = df["label"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Random Forest

In [4]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf, zero_division=1))

Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# Gradient Boosting

In [5]:
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print("\nGradient Boosting:")
print(classification_report(y_test, y_pred_gb, zero_division=1))


Gradient Boosting:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# XGBoost

In [6]:
xgb = XGBClassifier(random_state=42, eval_metric="logloss")
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost Classifier:")
print(classification_report(y_test, y_pred_xgb, zero_division=1))


XGBoost Classifier:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# RNN Model

In [8]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(sequences, maxlen=100)

# Build the RNN model

In [9]:
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=42
)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    SimpleRNN(64, activation="tanh"),
    Dense(1, activation="sigmoid"),
])

In [17]:
if 'rnn_model' in locals():
    rnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
else:
    print("Error: rnn_model is not defined.")

# Compile the model

In [18]:
rnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    SimpleRNN(64, activation="tanh"),
    Dense(1, activation="sigmoid"),
])

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(sequences, maxlen=100)

X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    padded_sequences, df["label"], test_size=0.2, random_state=42
)

 # Train the model

In [23]:
rnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
rnn_model.fit(X_train_rnn, y_train_rnn, epochs=5, batch_size=32, validation_data=(X_test_rnn, y_test_rnn))

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5000 - loss: 0.6841 - val_accuracy: 0.5000 - val_loss: 0.6991
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - accuracy: 1.0000 - loss: 0.6359 - val_accuracy: 0.5000 - val_loss: 0.6951
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - accuracy: 1.0000 - loss: 0.5920 - val_accuracy: 0.5000 - val_loss: 0.6912
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - accuracy: 1.0000 - loss: 0.5498 - val_accuracy: 0.5000 - val_loss: 0.6874
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step - accuracy: 1.0000 - loss: 0.5079 - val_accuracy: 0.5000 - val_loss: 0.6832


<keras.src.callbacks.history.History at 0x1e40ac92f70>