In [1]:
# ==========================================================
# NewsSense — Predictions CSV + Prediction Graphs (Full Code)
#  - Trains TF-IDF + Logistic Regression (balanced)
#  - Saves predictions CSV on test split
#  - Saves prediction histogram + ROC + PR curves
# ==========================================================

from pathlib import Path
import json
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import shorten

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve
)
import joblib

# ----------------------------
# Paths
# ----------------------------
# Raw inputs: the two CSV files that are read and labelled further down.
TRUE_PATH = Path(
    r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv"
)
FAKE_PATH = Path(
    r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv"
)

# Every artifact produced by this script is written into OUT_DIR; create it
# (and any missing parents) up front so the later save calls find it.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\News Sense")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Output artifacts, all direct children of OUT_DIR.
MODEL_PKL = OUT_DIR.joinpath("news_model.pkl")
PRED_CSV = OUT_DIR.joinpath("news_test_predictions.csv")
HIST_PNG = OUT_DIR.joinpath("news_prediction_hist.png")
ROC_PNG = OUT_DIR.joinpath("news_roc_curve.png")
PR_PNG = OUT_DIR.joinpath("news_pr_curve.png")
METRICS_JS = OUT_DIR.joinpath("news_metrics.json")

# ----------------------------
# Load & label
# ----------------------------
# Read each source file and attach its class id as a new "label" column,
# then stack both frames under a fresh 0..n-1 index.
true_df = pd.read_csv(TRUE_PATH, low_memory=False).assign(label=0)  # 0 = True/Real
fake_df = pd.read_csv(FAKE_PATH, low_memory=False).assign(label=1)  # 1 = Fake
frames = [true_df, fake_df]
df = pd.concat(frames, ignore_index=True)
print("[INFO] Combined shape:", df.shape)

# ----------------------------
# Build unified text field
# ----------------------------
def make_text(df):
    """Build one text string per row from the ``title`` and/or ``text`` columns.

    Missing values are replaced with the empty string and every value is cast
    to ``str``. When both columns exist the result is ``title + " " + text``;
    when only one exists that column is returned on its own.

    Args:
        df: DataFrame holding a ``title`` column, a ``text`` column, or both.

    Returns:
        A pandas Series of strings sharing ``df``'s index.

    Raises:
        ValueError: If ``df`` has neither a ``title`` nor a ``text`` column.
            (Previously a bare ``""`` was returned in that case, which only
            failed later, far from the real cause.)
    """
    # Collect the cleaned columns that actually exist, title first so the
    # concatenation order stays "title text".
    parts = [
        df[col].fillna("").astype(str)
        for col in ("title", "text")
        if col in df.columns
    ]
    if not parts:
        raise ValueError(
            "make_text needs a 'title' and/or 'text' column; "
            f"got columns: {list(df.columns)}"
        )
    if len(parts) == 1:
        return parts[0]
    title_part, body_part = parts
    return title_part + " " + body_part

# Features: the per-row string built by make_text; target: the integer label.
X_text = make_text(df)
y = df["label"].astype(int).to_numpy()

# ----------------------------
# Train/test split
# ----------------------------
# 80/20 split with a fixed seed; stratifying on y keeps the class ratio the
# same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ----------------------------
# Pipeline: TF-IDF + Logistic Regression
# ----------------------------
# Vectoriser: lower-cased, accent-stripped unigrams and bigrams; terms seen in
# fewer than 2 documents are dropped and the vocabulary is capped at 200k.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    strip_accents="unicode",
    ngram_range=(1, 2),
    min_df=2,
    max_features=200_000,
)

# Classifier: L2-regularised logistic regression fitted with SAGA, using
# balanced class weights and a fixed seed for reproducibility.
clf_step = LogisticRegression(
    penalty="l2",
    solver="saga",
    class_weight="balanced",
    max_iter=2000,
    random_state=42,
    n_jobs=-1,
)

pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

print("[INFO] Training model...")
pipe.fit(X_train, y_train)

# ----------------------------
# Predict on test set
# ----------------------------
# Column 1 of predict_proba corresponds to label 1, i.e. P(class=Fake).
y_prob = pipe.predict_proba(X_test)[:, 1]
# Hard decision at the 0.5 probability threshold.
y_pred = np.where(y_prob >= 0.5, 1, 0)

# Cast to plain floats so the values can be written to JSON later on.
acc = float(accuracy_score(y_test, y_pred))
rocauc = float(roc_auc_score(y_test, y_prob))
prau = float(average_precision_score(y_test, y_prob))
print(f"[INFO] Test Accuracy={acc:.4f} | ROC-AUC={rocauc:.4f} | PR-AUC={prau:.4f}")

# ----------------------------
# Save predictions CSV
# ----------------------------
def preview_text(s, width=180):
    """Return ``s`` as a single-line preview of at most ``width`` characters.

    The value is converted with ``str`` and passed to ``textwrap.shorten``,
    which collapses runs of whitespace and, when the text is too long, cuts it
    at a word boundary and appends "…".
    """
    as_text = str(s)
    return shorten(as_text, placeholder="…", width=width)

# One row per test sample: a shortened text preview, the true label, the
# predicted probability of "Fake" and the thresholded prediction.
previews = list(map(preview_text, X_test))
pred_df = pd.DataFrame(
    {
        "text_preview": previews,
        "y_true": y_test,
        "prob_fake": y_prob,
        "y_pred": y_pred,
    }
)
pred_df.to_csv(PRED_CSV, index=False)
print(f"[SAVED] Predictions CSV -> {PRED_CSV}")

# ----------------------------
# Prediction graph: histogram of probabilities
# ----------------------------
fig, ax = plt.subplots(figsize=(8, 4.8))
# One overlaid histogram of P(Fake) per true class, drawn in label order.
for cls, cls_label in ((0, "True (y=0)"), (1, "Fake (y=1)")):
    ax.hist(y_prob[y_test == cls], bins=30, alpha=0.7, label=cls_label)
# Dashed marker at the 0.5 decision threshold.
ax.axvline(0.5, linestyle="--", linewidth=1)
ax.set_xlabel("Predicted Probability of Fake")
ax.set_ylabel("Count")
ax.set_title("NewsSense: Prediction Probability Histogram")
ax.legend()
fig.tight_layout()
fig.savefig(HIST_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] Prediction histogram -> {HIST_PNG}")

# ----------------------------
# ROC & PR curves (handy for model quality)
# ----------------------------
# ROC: true-positive rate against false-positive rate over all thresholds.
fpr, tpr, _ = roc_curve(y_test, y_prob)
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC-AUC={rocauc:.3f}")
# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], "--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("NewsSense: ROC Curve")
ax.legend(loc="lower right")
ax.grid(True)
fig.tight_layout()
fig.savefig(ROC_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] ROC curve -> {ROC_PNG}")

# PR: precision against recall over all thresholds.
prec, rec, _ = precision_recall_curve(y_test, y_prob)
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(rec, prec, label=f"PR-AUC={prau:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("NewsSense: Precision-Recall Curve")
ax.legend(loc="upper right")
ax.grid(True)
fig.tight_layout()
fig.savefig(PR_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] PR curve -> {PR_PNG}")

# ----------------------------
# Save metrics JSON + model
# ----------------------------
# Persist the three headline metrics as pretty-printed UTF-8 JSON.
metrics = {"accuracy": acc, "roc_auc": rocauc, "pr_auc": prau}
with METRICS_JS.open("w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
print(f"[SAVED] Metrics JSON -> {METRICS_JS}")

# Serialise the whole fitted pipeline (vectoriser + classifier) in one file.
joblib.dump(pipe, MODEL_PKL)
print(f"[SAVED] Trained model -> {MODEL_PKL}")

print("\n[DONE] Files saved in:", OUT_DIR)


[INFO] Combined shape: (44898, 5)
[INFO] Training model...
[INFO] Test Accuracy=0.9891 | ROC-AUC=0.9993 | PR-AUC=0.9993
[SAVED] Predictions CSV -> C:\Users\sagni\Downloads\News Sense\news_test_predictions.csv
[SAVED] Prediction histogram -> C:\Users\sagni\Downloads\News Sense\news_prediction_hist.png
[SAVED] ROC curve -> C:\Users\sagni\Downloads\News Sense\news_roc_curve.png
[SAVED] PR curve -> C:\Users\sagni\Downloads\News Sense\news_pr_curve.png
[SAVED] Metrics JSON -> C:\Users\sagni\Downloads\News Sense\news_metrics.json
[SAVED] Trained model -> C:\Users\sagni\Downloads\News Sense\news_model.pkl

[DONE] Files saved in: C:\Users\sagni\Downloads\News Sense
