In [1]:
# ==========================================================
# NewsSense — Accuracy Graph + Heatmaps (Full Script)
# ==========================================================
# Inputs:
#   "C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv"
#   "C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv"
#
# Outputs (to C:\Users\sagni\Downloads\News Sense):
#   - news_features_corr_heatmap.png            (correlation heatmap of engineered features)
#   - news_confusion_matrix.png                 (confusion-matrix heatmap)
#   - news_accuracy_over_epochs.png             (train/test accuracy curve)
#   - news_accuracy_over_epochs.csv             (epoch-wise accuracies)
#   - news_classification_report.txt            (precision/recall/F1 per class)
#   - news_heatmap_and_accuracy_meta.json       (columns, engineered features, epochs, split sizes)
# ==========================================================

from pathlib import Path
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_recall_curve, roc_curve, roc_auc_score, average_precision_score
)

# ----------------------------
# Paths
# ----------------------------
# Two input CSVs (one per class) and the directory that receives every artifact.
TRUE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv")
FAKE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv")
OUT_DIR   = Path(r"C:\Users\sagni\Downloads\News Sense")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Every output file lives directly inside OUT_DIR.
HEATMAP_PNG = OUT_DIR.joinpath("news_features_corr_heatmap.png")
CM_PNG      = OUT_DIR.joinpath("news_confusion_matrix.png")
ACC_PNG     = OUT_DIR.joinpath("news_accuracy_over_epochs.png")
ACC_CSV     = OUT_DIR.joinpath("news_accuracy_over_epochs.csv")
CLS_TXT     = OUT_DIR.joinpath("news_classification_report.txt")
META_JSON   = OUT_DIR.joinpath("news_heatmap_and_accuracy_meta.json")

# ----------------------------
# Load & label
# ----------------------------
# Each file is tagged with a constant class id before the two are stacked.
true_df = pd.read_csv(TRUE_PATH, low_memory=False).assign(label=0)   # 0 = True/Real
fake_df = pd.read_csv(FAKE_PATH, low_memory=False).assign(label=1)   # 1 = Fake

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([true_df, fake_df], ignore_index=True)
print("[INFO] Combined shape:", df.shape)
print("[INFO] Columns:", df.columns.tolist())

# ----------------------------
# Build a single text field (title + text if available)
# ----------------------------
def get_text_col(df):
    """Build one text Series per row from the ``title`` and ``text`` columns.

    Missing cells are treated as empty strings. Calling ``astype(str)`` on
    its own would turn them into the literal text "nan"/"None", which would
    then leak into the combined text.

    Args:
        df: DataFrame that may contain a ``title`` column, a ``text`` column,
            both, or neither.

    Returns:
        A Series aligned with ``df.index``:
        - both columns present: ``title + " " + text``
        - only one present: that column as strings
        - neither present: a Series of empty strings (always a Series, so
          callers can keep using Series methods on the result)
    """
    parts = []
    for name in ("title", "text"):
        if name in df.columns:
            col = df[name]
            # Stringify first, then blank out the positions that were missing
            # in the original column.
            parts.append(col.astype(str).where(col.notna(), ""))

    if not parts:
        return pd.Series("", index=df.index, dtype=object)
    if len(parts) == 1:
        return parts[0]
    return parts[0] + " " + parts[1]

# Model inputs: the combined text as strings and the class ids as an integer array.
text_series = get_text_col(df).astype(str)
labels = df["label"].astype(int).to_numpy()

# ----------------------------
# Engineer small numeric features for HEATMAP
# (lengths, punctuation, urls, uppercase ratio, etc.)
# ----------------------------
# A scheme immediately followed by "www." is one URL, so the first alternative
# consumes both. The second alternative catches scheme-less "www." links and
# requires a word boundary so that words such as "awww." are not counted.
_URL_START_RE = re.compile(r"https?://(?:www\.)?|\bwww\.")


def count_urls(s: str) -> int:
    """Return the number of URL starts found in *s*.

    A URL start is ``http://`` or ``https://`` (optionally followed by
    ``www.``), or a standalone ``www.`` at a word boundary. A link such as
    ``http://www.example.com`` therefore counts once, not twice.

    Args:
        s: Text to scan.

    Returns:
        Count of non-overlapping URL starts; 0 for an empty string.
    """
    return sum(1 for _ in _URL_START_RE.finditer(s))

def count_digits(s: str) -> int:
    """Count the characters in *s* that match the regex digit class."""
    return sum(1 for _ in re.finditer(r"\d", s))

def upper_ratio(s: str) -> float:
    """Return the share of ASCII letters in *s* that are uppercase.

    Only characters matching ``[A-Za-z]`` are considered; the result is 0.0
    when *s* contains none of them.
    """
    alpha = re.findall(r"[A-Za-z]", s)
    total = len(alpha)
    if total == 0:
        return 0.0
    capitals = len([ch for ch in alpha if ch.isupper()])
    return capitals / total

# Per-row numeric features used by the correlation heatmap.
# Lengths are taken from the raw columns with missing cells counted as empty
# text. A bare astype(str) would turn NaN into the literal "nan" and report a
# length of 3 for rows that have no title/text at all.
eng = pd.DataFrame({
    "title_len": (
        df["title"].astype(str).where(df["title"].notna(), "").str.len()
        if "title" in df.columns
        # Zero-filled fallback, indexed like df so it aligns with the other columns.
        else pd.Series(0, index=df.index)
    ),
    "text_len": (
        df["text"].astype(str).where(df["text"].notna(), "").str.len()
        if "text" in df.columns
        # Without a body column, fall back to the length of the combined text.
        else text_series.str.len()
    ),
    "num_exclaim": text_series.str.count("!"),
    "num_question": text_series.str.count(r"\?"),
    "num_urls": text_series.apply(count_urls),
    "num_digits": text_series.apply(count_digits),
    "upper_ratio": text_series.apply(upper_ratio),
    "label": labels
})

# Correlation heatmap (engineered features + label), drawn through the
# figure/axes objects rather than pyplot's implicit current figure.
corr = eng.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(9.5, 7.2))
im = ax.imshow(corr.values, aspect='auto')
# One tick per matrix column/row, labelled with the feature names.
ax.set_xticks(range(corr.shape[1]))
ax.set_xticklabels(corr.columns, rotation=45, ha='right')
ax.set_yticks(range(corr.shape[0]))
ax.set_yticklabels(corr.index)
ax.set_title("NewsSense: Correlation Heatmap (engineered features + label)")
fig.colorbar(im)
fig.tight_layout()
fig.savefig(HEATMAP_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] {HEATMAP_PNG}")

# ----------------------------
# Train/Test split (stratified)
# ----------------------------
# 80/20 split; stratify=labels keeps the class proportions in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": labels}
X_train, X_test, y_train, y_test = train_test_split(
    text_series, labels, **split_options
)

# ----------------------------
# Vectorizer + Classifier
# Use SGD (logistic) with partial_fit to plot epoch-wise accuracy
# ----------------------------
tfidf_options = {
    "ngram_range": (1, 2),
    "min_df": 2,
    "max_features": 200_000,
    "strip_accents": "unicode",
    "lowercase": True,
}
tfidf = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train_t = tfidf.fit_transform(X_train)
X_test_t = tfidf.transform(X_test)

# Logistic-loss linear model; it is trained with one partial_fit call per
# loop iteration below so accuracy can be recorded after every pass.
clf = SGDClassifier(
    loss="log_loss",
    alpha=1e-4,
    max_iter=1,
    learning_rate="optimal",
    random_state=42,
    warm_start=True,
)
classes = np.array([0, 1])
epochs = 12
train_acc = []
test_acc = []

for ep in range(epochs):
    # The label set is passed on the first call only; later calls omit it.
    first_call_kwargs = {"classes": classes} if ep == 0 else {}
    clf.partial_fit(X_train_t, y_train, **first_call_kwargs)

    # Record accuracy on both splits after this pass.
    train_acc.append(accuracy_score(y_train, clf.predict(X_train_t)))
    test_acc.append(accuracy_score(y_test, clf.predict(X_test_t)))

# Plot accuracy curve (epochs are numbered from 1 on the x-axis)
epoch_axis = list(range(1, epochs + 1))
fig, ax = plt.subplots(figsize=(8, 4.6))
ax.plot(epoch_axis, train_acc, marker='o', label="Train Accuracy")
ax.plot(epoch_axis, test_acc, marker='s', label="Test Accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("NewsSense: Accuracy over Epochs (TF-IDF + SGD)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(ACC_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] {ACC_PNG}")

# Same numbers as the plot, one row per epoch.
acc_table = pd.DataFrame({
    "epoch": epoch_axis,
    "train_accuracy": train_acc,
    "test_accuracy": test_acc,
})
acc_table.to_csv(ACC_CSV, index=False)
print(f"[SAVED] {ACC_CSV}")

# ----------------------------
# Final evaluation (on test set)
# ----------------------------
y_pred = clf.predict(X_test_t)
# labels=[0,1] fixes the row/column order: index 0 = True, index 1 = Fake.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])

# Confusion matrix heatmap
fig, ax = plt.subplots(figsize=(5.6, 4.8))
im = ax.imshow(cm, aspect='equal')
ax.set_title("NewsSense: Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.colorbar(im)
class_ticks = ["True (0)", "Fake (1)"]
ax.set_xticks([0,1])
ax.set_xticklabels(class_ticks)
ax.set_yticks([0,1])
ax.set_yticklabels(class_ticks)
# Write each count into its cell; x is the column (predicted), y is the row (true).
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(col, row, str(cm[row, col]), ha='center', va='center')
fig.tight_layout()
fig.savefig(CM_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] {CM_PNG}")

# Optional extra metrics.
# These are best-effort: a failure must not stop the remaining reports from
# being written, but the reason is printed instead of being silently dropped.
try:
    # Probabilities for curves (SGD with log_loss exposes predict_proba)
    probs = clf.predict_proba(X_test_t)[:, 1]
    roc_auc = roc_auc_score(y_test, probs)
    pr_auc  = average_precision_score(y_test, probs)
except Exception as exc:
    print(f"[WARN] Skipping ROC-AUC / PR-AUC: {type(exc).__name__}: {exc}")
else:
    print(f"[INFO] ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f}")

# Classification report: a short header followed by sklearn's text table.
report = classification_report(y_test, y_pred, digits=4, target_names=["True","Fake"])
CLS_TXT.write_text(
    "=== NewsSense: TF-IDF + SGD (Test) ===\n\n" + report + "\n",
    encoding="utf-8",
)
print(f"[SAVED] {CLS_TXT}")

# Save small meta: what was loaded, which features were built, and run sizes.
meta = {
    "columns_available": list(df.columns),
    "engineered_features": list(eng.columns),
    "epochs": epochs,
    "train_size": int(len(X_train)),
    "test_size": int(len(X_test)),
}
META_JSON.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print(f"[SAVED] {META_JSON}")

print("\n[DONE] All graphs and reports saved in:", OUT_DIR)


[INFO] Combined shape: (44898, 5)
[INFO] Columns: ['title', 'text', 'subject', 'date', 'label']
[SAVED] C:\Users\sagni\Downloads\News Sense\news_features_corr_heatmap.png
[SAVED] C:\Users\sagni\Downloads\News Sense\news_accuracy_over_epochs.png
[SAVED] C:\Users\sagni\Downloads\News Sense\news_accuracy_over_epochs.csv
[SAVED] C:\Users\sagni\Downloads\News Sense\news_confusion_matrix.png
[INFO] ROC-AUC: 0.9983 | PR-AUC: 0.9984
[SAVED] C:\Users\sagni\Downloads\News Sense\news_classification_report.txt
[SAVED] C:\Users\sagni\Downloads\News Sense\news_heatmap_and_accuracy_meta.json

[DONE] All graphs and reports saved in: C:\Users\sagni\Downloads\News Sense
