# Spam-News Detection using Classical ML Models and an RNN

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Dataset (inline toy sample of six labelled headlines)

In [4]:
# Toy corpus: six hand-written headlines, each paired with its class
# (0 = genuine, 1 = spam).
labelled_headlines = [
    ("This is a genuine news article", 0),
    ("Click here to win $1,000,000!", 1),
    ("Breaking: Important event just happened", 0),
    ("Cheap medications available online", 1),
    ("Trusted source for daily updates", 0),
    ("You are a winner! Claim your prize now", 1),
]

# Column-oriented view of the same pairs, in the same order.
data = {
    "text": [headline for headline, _label in labelled_headlines],
    "label": [label for _headline, label in labelled_headlines],
}
df = pd.DataFrame(data)

# Preprocessing

In [5]:
# Hold out the test rows BEFORE fitting the vectorizer, so the TF-IDF
# vocabulary and IDF weights are learned from training text only. Fitting on
# the full corpus first would leak test-set statistics into the features.
y = df["label"]
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train).toarray()  # fit on training text only
X_test = tfidf.transform(text_test).toarray()  # reuse the training vocabulary

# Every row encoded with the training vocabulary; kept so the name `X`
# remains defined for any cell that still refers to it.
X = tfidf.transform(df["text"]).toarray()


# Random Forest

In [7]:
# Fit a random forest on the training features and predict the held-out rows.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# zero_division=1 reports a metric as 1.00 where its denominator is zero.
rf_report = classification_report(y_test, y_pred_rf, zero_division=1)
print("Random Forest:", rf_report, sep="\n")

Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# Gradient Boosting

In [10]:
# Same train/predict/report sequence as the random forest, with gradient boosting.
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# zero_division=1 reports a metric as 1.00 where its denominator is zero.
gb_report = classification_report(y_test, y_pred_gb, zero_division=1)
print("\nGradient Boosting:", gb_report, sep="\n")


Gradient Boosting:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# XGBoost

In [16]:
# XGBoost on the same features; log-loss is set as its evaluation metric.
xgb = XGBClassifier(random_state=42, eval_metric="logloss")
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# zero_division=1 reports a metric as 1.00 where its denominator is zero.
xgb_report = classification_report(y_test, y_pred_xgb, zero_division=1)
print(f"\nXGBoost Classifier:\n{xgb_report}")


XGBoost Classifier:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2



# RNN Model

In [46]:
# Tunable text-encoding limits for the RNN input.
VOCAB_SIZE = 5000  # passed to Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 100  # every sequence is padded/truncated to this length

# Learn the word index from the headlines, then encode each headline as a
# fixed-length sequence of integer token ids.
headline_texts = df["text"]
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(headline_texts)
sequences = tokenizer.texts_to_sequences(headline_texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

NameError: name 'Tokenizer' is not defined

# Build the RNN model

In [None]:
# Hold out 20% of the padded sequences, using the same seed as the TF-IDF split.
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=42
)

# Define the network that the compile and fit cells below operate on. Without
# this assignment `rnn_model` does not exist and those cells raise NameError.
# The embedding width (64) and RNN units (32) are small starting values; tune
# them as needed.
rnn_model = Sequential(
    [
        # input_dim matches Tokenizer(num_words=5000): token ids fall in [0, 5000).
        Embedding(input_dim=5000, output_dim=64),
        SimpleRNN(32),
        # One sigmoid unit gives the probability of class 1 (spam), which is
        # the output form the binary_crossentropy loss expects.
        Dense(1, activation="sigmoid"),
    ]
)

# Compile the model

In [None]:
# Binary-classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
rnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


# Train the model

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data, so its
# loss and accuracy are evaluated at the end of each epoch.
rnn_model.fit(
    X_train_rnn,
    y_train_rnn,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_rnn, y_test_rnn),
)