In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Multinomial Naive Bayes with TF-IDF
# ----------------------
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

train_acc_tfidf = accuracy_score(y_train, nb_tfidf.predict(X_train_tfidf))
test_acc_tfidf = accuracy_score(y_test, nb_tfidf.predict(X_test_tfidf))

print("Naive Bayes with TF-IDF")
print("Training Accuracy:", round(train_acc_tfidf * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_tfidf * 100, 2), "%")

# ----------------------
# Multinomial Naive Bayes with Count Vectorizer
# ----------------------
nb_count = MultinomialNB()
nb_count.fit(X_train_count, y_train)

train_acc_count = accuracy_score(y_train, nb_count.predict(X_train_count))
test_acc_count = accuracy_score(y_test, nb_count.predict(X_test_count))

print("\nNaive Bayes with Count Vectorizer")
print("Training Accuracy:", round(train_acc_count * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_count * 100, 2), "%")


Naive Bayes with TF-IDF
Training Accuracy: 88.77 %
Testing Accuracy: 85.33 %

Naive Bayes with Count Vectorizer
Training Accuracy: 91.57 %
Testing Accuracy: 86.68 %


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# Compute sample weights to handle imbalance
weight_real = len(y_train) / (2 * sum(y_train == 1))
weight_fake = len(y_train) / (2 * sum(y_train == 0))
sample_weights = np.array([weight_real if y==1 else weight_fake for y in y_train])

# ----------------------
# Multinomial Naive Bayes with TF-IDF (weighted)
# ----------------------
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train, sample_weight=sample_weights)

y_train_pred_tfidf = nb_tfidf.predict(X_train_tfidf)
y_test_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

print("Naive Bayes with TF-IDF (Weighted)")
print("\nTraining Metrics:\n", classification_report(y_train, y_train_pred_tfidf))
print("Testing Metrics:\n", classification_report(y_test, y_test_pred_tfidf))

# ----------------------
# Multinomial Naive Bayes with Count Vectorizer (weighted)
# ----------------------
nb_count = MultinomialNB()
nb_count.fit(X_train_count, y_train, sample_weight=sample_weights)

y_train_pred_count = nb_count.predict(X_train_count)
y_test_pred_count = nb_count.predict(X_test_count)

print("\nNaive Bayes with Count Vectorizer (Weighted)")
print("\nTraining Metrics:\n", classification_report(y_train, y_train_pred_count))
print("Testing Metrics:\n", classification_report(y_test, y_test_pred_count))


Naive Bayes with TF-IDF (Weighted)

Training Metrics:
               precision    recall  f1-score   support

           0       0.98      0.88      0.92      5929
           1       0.72      0.93      0.82      2029

    accuracy                           0.89      7958
   macro avg       0.85      0.91      0.87      7958
weighted avg       0.91      0.89      0.90      7958

Testing Metrics:
               precision    recall  f1-score   support

           0       0.95      0.84      0.89      1482
           1       0.66      0.88      0.75       508

    accuracy                           0.85      1990
   macro avg       0.81      0.86      0.82      1990
weighted avg       0.88      0.85      0.86      1990


Naive Bayes with Count Vectorizer (Weighted)

Training Metrics:
               precision    recall  f1-score   support

           0       0.97      0.88      0.92      5929
           1       0.73      0.92      0.81      2029

    accuracy                           0.89

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# TF-IDF Vectorizer with bigrams and more features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=15000, ngram_range=(1,2))
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)

# Compute sample weights to handle imbalance
weight_real = len(y_train) / (2 * sum(y_train == 1))
weight_fake = len(y_train) / (2 * sum(y_train == 0))
sample_weights = np.array([weight_real if y==1 else weight_fake for y in y_train])

# Train Naive Bayes
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train, y_train, sample_weight=sample_weights)

# Predict probabilities
y_test_probs = nb_tfidf.predict_proba(X_test)[:,1]  # probability for class 1 (real news)

# Adjust threshold for minority class
threshold = 0.45  # can be tuned
y_test_pred = (y_test_probs >= threshold).astype(int)

y_train_probs = nb_tfidf.predict_proba(X_train)[:,1]
y_train_pred = (y_train_probs >= threshold).astype(int)

# Evaluation
print("Naive Bayes TF-IDF (Weighted + Bigrams + Threshold Adjustment)")
print("\nTraining Metrics:\n", classification_report(y_train, y_train_pred))
print("Testing Metrics:\n", classification_report(y_test, y_test_pred))


Naive Bayes TF-IDF (Weighted + Bigrams + Threshold Adjustment)

Training Metrics:
               precision    recall  f1-score   support

           0       0.99      0.85      0.92      5929
           1       0.69      0.98      0.81      2029

    accuracy                           0.88      7958
   macro avg       0.84      0.92      0.86      7958
weighted avg       0.92      0.88      0.89      7958

Testing Metrics:
               precision    recall  f1-score   support

           0       0.97      0.79      0.87      1482
           1       0.60      0.93      0.73       508

    accuracy                           0.82      1990
   macro avg       0.79      0.86      0.80      1990
weighted avg       0.88      0.82      0.83      1990



In [5]:
import numpy as np
from sklearn.metrics import f1_score

# Predict probabilities for real news class
y_probs = nb_tfidf.predict_proba(X_test)[:, 1]

# Search for the threshold that maximizes F1-score
thresholds = np.linspace(0.1, 0.9, 81)  # from 0.1 to 0.9 in steps of 0.01
f1_scores = []

for t in thresholds:
    y_pred_thresh = (y_probs >= t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, pos_label=1)
    f1_scores.append(f1)

# Best threshold
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Optimal Threshold: {best_threshold:.2f}")
print(f"Best F1-score for Real News: {best_f1:.3f}")

# Predict using the best threshold
y_test_pred_opt = (y_probs >= best_threshold).astype(int)

# Final evaluation report
from sklearn.metrics import classification_report
print("\nFinal Metrics with Optimal Threshold:")
print(classification_report(y_test, y_test_pred_opt))


Optimal Threshold: 0.58
Best F1-score for Real News: 0.787

Final Metrics with Optimal Threshold:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      1482
           1       0.74      0.85      0.79       508

    accuracy                           0.88      1990
   macro avg       0.84      0.87      0.85      1990
weighted avg       0.89      0.88      0.89      1990



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score

class WeightedNBDetector:
    def __init__(self, max_features=15000, ngram_range=(1,2)):
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features, ngram_range=ngram_range)
        self.model = MultinomialNB()
        self.threshold = 0.5  # default threshold
        self.fitted = False

    def fit(self, texts, labels):
        # Vectorize
        X = self.vectorizer.fit_transform(texts)

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels, test_size=0.2, random_state=42, stratify=labels
        )

        # Compute sample weights for imbalance
        weight_real = len(y_train) / (2 * sum(y_train == 1))
        weight_fake = len(y_train) / (2 * sum(y_train == 0))
        sample_weights = np.array([weight_real if y==1 else weight_fake for y in y_train])

        # Train Naive Bayes
        self.model.fit(X_train, y_train, sample_weight=sample_weights)

        # Optimize threshold for class 1 (real news)
        y_probs = self.model.predict_proba(X_test)[:,1]
        thresholds = np.linspace(0.1, 0.9, 81)
        f1_scores = [f1_score(y_test, (y_probs >= t).astype(int), pos_label=1) for t in thresholds]
        best_idx = np.argmax(f1_scores)
        self.threshold = thresholds[best_idx]

        # Evaluate
        y_test_pred = (y_probs >= self.threshold).astype(int)
        print(f"Optimal Threshold for Real News: {self.threshold:.2f}")
        print("Test Metrics with Optimal Threshold:")
        print(classification_report(y_test, y_test_pred))

        self.fitted = True
        return self

    def predict(self, texts):
        if not self.fitted:
            raise ValueError("Model is not fitted yet. Call fit() first.")
        X = self.vectorizer.transform(texts)
        probs = self.model.predict_proba(X)[:,1]
        return (probs >= self.threshold).astype(int)

    def predict_proba(self, texts):
        if not self.fitted:
            raise ValueError("Model is not fitted yet. Call fit() first.")
        X = self.vectorizer.transform(texts)
        return self.model.predict_proba(X)

# -------------------------
# Usage
# -------------------------
# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")
texts = df['content'].astype(str)
labels = df['label']

# Initialize and train
detector = WeightedNBDetector()
detector.fit(texts, labels)

# Predict on new texts
# new_texts = ["Some news article here...", "Another example..."]
# predictions = detector.predict(new_texts)
# probabilities = detector.predict_proba(new_texts)


Optimal Threshold for Real News: 0.58
Test Metrics with Optimal Threshold:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      1482
           1       0.74      0.85      0.79       508

    accuracy                           0.88      1990
   macro avg       0.84      0.87      0.85      1990
weighted avg       0.89      0.88      0.89      1990



<__main__.WeightedNBDetector at 0x7e3dfdf75550>

In [8]:
import joblib
import os

# Path to save the model
folder_path = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/NB"
save_path = os.path.join(folder_path, "weighted_nb_detector.pkl")

# Create folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the entire detector object
joblib.dump(detector, save_path)
print(f"Model saved to {save_path}")


Model saved to /content/drive/MyDrive/Colab Notebooks/Checkpoints/NB/weighted_nb_detector.pkl


In [11]:
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/
