In [1]:
# ==========================================================
# NewsSense — Full Prediction Script
# Trains TF-IDF + Logistic Regression, evaluates, saves:
#  - model (news_model.pkl)
#  - test predictions (news_test_predictions.csv)
#  - metrics (news_metrics.json)
#  - plots: confusion, ROC, PR
#  - quick single-text prediction helper at the end
# ==========================================================

from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import shorten

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve,
)

import joblib

# ----------------------------
# Paths
# ----------------------------
# Output folder: created up front so every later save can assume it exists.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\News Sense")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input CSVs.
TRUE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv")
FAKE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv")

# Artifacts, all written directly inside OUT_DIR.
MODEL_PKL = OUT_DIR.joinpath("news_model.pkl")
PRED_CSV = OUT_DIR.joinpath("news_test_predictions.csv")
METRICS_JS = OUT_DIR.joinpath("news_metrics.json")
CM_PNG = OUT_DIR.joinpath("news_confusion_matrix.png")
ROC_PNG = OUT_DIR.joinpath("news_roc_curve.png")
PR_PNG = OUT_DIR.joinpath("news_pr_curve.png")
REPORT_TXT = OUT_DIR.joinpath("news_classification_report.txt")

# ----------------------------
# Load & label
# ----------------------------
def _read_labeled(csv_path, label_value):
    """Read one CSV into a DataFrame and add a constant ``label`` column."""
    frame = pd.read_csv(csv_path, low_memory=False)
    frame["label"] = label_value
    return frame


true_df = _read_labeled(TRUE_PATH, 0)  # 0 = True/Real
fake_df = _read_labeled(FAKE_PATH, 1)  # 1 = Fake

# Stack both sources; ignore_index gives the combined frame a fresh 0..n-1 index.
df = pd.concat((true_df, fake_df), ignore_index=True)
print("[INFO] Combined shape:", df.shape)

# ----------------------------
# Build text field (title + text if present)
# ----------------------------
def build_text(series_title, series_text):
    """Combine the title and body columns into one text Series.

    Parameters
    ----------
    series_title : pandas.Series or None
        Headline column, or None when the column is absent.
    series_text : pandas.Series or None
        Body column, or None when the column is absent.

    Returns
    -------
    pandas.Series
        One string per row. When both columns are given the result is
        ``"<title> <text>"``; missing values become empty strings. When
        neither column is given, one empty string per row of the
        module-level ``df`` is returned, aligned to ``df.index``.
    """
    def _as_text(column):
        # Fill missing values first: casting NaN straight to str would
        # produce the literal "nan" in the document text.
        return column.fillna("").astype(str)

    if series_title is None and series_text is None:
        # Nothing to read from: fall back to empty documents that line up
        # with the rows (and index) of the combined frame.
        return pd.Series("", index=df.index)
    if series_title is None:
        return _as_text(series_text)
    if series_text is None:
        return _as_text(series_title)
    # Each side is cast to str *before* joining so that a non-string column
    # (e.g. numeric) cannot raise TypeError during concatenation.
    return _as_text(series_title) + " " + _as_text(series_text)

# Use each column only when the combined frame actually has it;
# build_text treats None as "column not available".
title_series = None
if "title" in df.columns:
    title_series = df["title"]

text_series = None
if "text" in df.columns:
    text_series = df["text"]

X_text = build_text(title_series, text_series)
y = df["label"].astype(int).values

# ----------------------------
# Train / test split (stratified)
# ----------------------------
# 20% of the rows are held out for testing; stratifying on y preserves the
# class ratio in both parts and the fixed seed makes the split repeatable.
_split_parts = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = _split_parts

# ----------------------------
# Pipeline: TF-IDF + Logistic Regression (balanced)
# ----------------------------
# Text features: lower-cased, accent-stripped unigrams and bigrams that occur
# in at least two documents, capped at 200k vocabulary entries.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    strip_accents="unicode",
    ngram_range=(1, 2),
    min_df=2,
    max_features=200_000,
)

# Classifier: L2-regularised logistic regression fitted with the SAGA solver.
clf_step = LogisticRegression(
    penalty="l2",
    solver="saga",
    class_weight="balanced",  # handle class imbalance
    max_iter=2000,
    n_jobs=-1,
    random_state=42,
)

pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

print("[INFO] Training model...")
pipe.fit(X_train, y_train)

# ----------------------------
# Predict on test set
# ----------------------------
# Column 1 of predict_proba is the probability of label 1 (Fake);
# hard predictions use a 0.5 cut-off on that probability.
test_proba = pipe.predict_proba(X_test)
y_prob = test_proba[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

# Threshold-dependent summaries (use the hard predictions).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
report = classification_report(
    y_test,
    y_pred,
    digits=4,
    target_names=["True", "Fake"],
)
acc = accuracy_score(y_test, y_pred)

# Threshold-free summaries (use the raw probabilities).
rocauc = roc_auc_score(y_test, y_prob)
prau = average_precision_score(y_test, y_prob)

print(f"[INFO] Acc: {acc:.4f} | ROC-AUC: {rocauc:.4f} | PR-AUC: {prau:.4f}")
print("[INFO] Confusion matrix:\n", cm)

# ----------------------------
# Save predictions CSV (with text preview)
# ----------------------------
def preview(s, n=160):
    """Return ``s`` as a one-line snippet no longer than ``n`` characters.

    The value is converted with ``str`` and handed to ``textwrap.shorten``,
    which collapses whitespace runs and, when the text does not fit, drops
    trailing words and appends "…".
    """
    as_text = str(s)
    return shorten(as_text, width=n, placeholder="…")

# One row per test document: shortened text, true label, fake-probability
# and the thresholded prediction.
test_previews = list(map(preview, X_test))
pred_df = pd.DataFrame(
    {
        "text_preview": test_previews,
        "y_true": y_test,
        "prob_fake": y_prob,
        "y_pred": y_pred,
    }
)
pred_df.to_csv(PRED_CSV, index=False)
print(f"[SAVED] Predictions -> {PRED_CSV}")

# ----------------------------
# Save metrics
# ----------------------------
# cm rows are the true class and columns the predicted class, in label order
# [0, 1], so the four cells unpack as (tn, fp) / (fn, tp).
(tn, fp), (fn, tp) = cm

# Cast NumPy scalars to plain Python numbers so json can serialise them.
metrics = {
    "accuracy": float(acc),
    "roc_auc": float(rocauc),
    "pr_auc": float(prau),
    "confusion_matrix": {
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    },
}
METRICS_JS.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
print(f"[SAVED] Metrics JSON -> {METRICS_JS}")

report_body = "=== NewsSense: TF-IDF + Logistic Regression (Test) ===\n\n" + report + "\n"
REPORT_TXT.write_text(report_body, encoding="utf-8")
print(f"[SAVED] Classification report -> {REPORT_TXT}")

# ----------------------------
# Plots: Confusion, ROC, PR
# ----------------------------
# Confusion matrix heatmap
# Drawn through pyplot's implicit "current figure / current axes" state, so
# every call below acts on the figure opened by plt.figure().
plt.figure(figsize=(5.6, 4.8))
im = plt.imshow(cm, aspect='equal')  # 'equal' keeps the matrix cells square
plt.title("NewsSense: Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar(im)
# Tick positions 0/1 correspond to the label order used to build cm.
plt.xticks([0,1], ["True (0)", "Fake (1)"])
plt.yticks([0,1], ["True (0)", "Fake (1)"])
# Print each count in the middle of its cell: imshow maps the column index
# to x and the row index to y, hence text(j, i, ...).
for (i, j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close()  # close the current figure once it has been written to disk
print(f"[SAVED] Confusion matrix -> {CM_PNG}")

# ROC curve
# Same figure as before, built with explicit Figure/Axes objects instead of
# pyplot's implicit current-figure state.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"ROC-AUC={rocauc:.3f}")
roc_ax.plot([0, 1], [0, 1], "--")  # diagonal reference line
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate (Recall)")
roc_ax.set_title("NewsSense: ROC Curve")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
roc_fig.tight_layout()
roc_fig.savefig(ROC_PNG, dpi=220)
plt.close(roc_fig)
print(f"[SAVED] ROC curve -> {ROC_PNG}")

# PR curve
# Precision against recall over all probability thresholds, drawn with
# explicit Figure/Axes objects.
prec, rec, _ = precision_recall_curve(y_test, y_prob)
pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
pr_ax.plot(rec, prec, label=f"PR-AUC={prau:.3f}")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("NewsSense: Precision-Recall Curve")
pr_ax.legend(loc="upper right")
pr_ax.grid(True)
pr_fig.tight_layout()
pr_fig.savefig(PR_PNG, dpi=220)
plt.close(pr_fig)
print(f"[SAVED] PR curve -> {PR_PNG}")

# ----------------------------
# Save trained model
# ----------------------------
# The whole fitted pipeline (vectorizer + classifier) is stored in one file,
# so a reloaded model can score raw text directly.
joblib.dump(pipe, filename=MODEL_PKL)
print(f"[SAVED] Model -> {MODEL_PKL}")

print("\n[DONE] All artifacts saved in:", OUT_DIR)

# ----------------------------
# Quick single-text prediction helper
# ----------------------------
def predict_single(text: str, threshold: float = 0.5):
    """Classify one piece of text with the pipeline stored at ``MODEL_PKL``.

    The pipeline is re-loaded from disk on every call.

    Args:
        text: Raw text to score.
        threshold: Probability of the "Fake" class at or above which the
            text is labelled "Fake".

    Returns:
        A ``(label, prob_fake)`` tuple where ``label`` is ``"Fake"`` or
        ``"True"`` and ``prob_fake`` is a plain Python float.
    """
    pipeline = joblib.load(MODEL_PKL)
    # predict_proba expects a batch, so wrap the text in a one-item list and
    # read row 0, column 1 (the probability of label 1 = Fake).
    fake_probability = float(pipeline.predict_proba([text])[0, 1])
    if fake_probability >= threshold:
        return "Fake", fake_probability
    return "True", fake_probability

# Example:
# lbl, p = predict_single("Breaking!!! You won't believe what happened...")
# print(lbl, p)


[INFO] Combined shape: (44898, 5)
[INFO] Training model...
[INFO] Acc: 0.9891 | ROC-AUC: 0.9993 | PR-AUC: 0.9993
[INFO] Confusion matrix:
 [[4240   44]
 [  54 4642]]
[SAVED] Predictions -> C:\Users\sagni\Downloads\News Sense\news_test_predictions.csv
[SAVED] Metrics JSON -> C:\Users\sagni\Downloads\News Sense\news_metrics.json
[SAVED] Classification report -> C:\Users\sagni\Downloads\News Sense\news_classification_report.txt
[SAVED] Confusion matrix -> C:\Users\sagni\Downloads\News Sense\news_confusion_matrix.png
[SAVED] ROC curve -> C:\Users\sagni\Downloads\News Sense\news_roc_curve.png
[SAVED] PR curve -> C:\Users\sagni\Downloads\News Sense\news_pr_curve.png
[SAVED] Model -> C:\Users\sagni\Downloads\News Sense\news_model.pkl

[DONE] All artifacts saved in: C:\Users\sagni\Downloads\News Sense
