In [10]:
# ---------------------------------------
# Fake News Detection System
# Using Python, Scikit-learn, and TF-IDF
# ---------------------------------------

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# -----------------------------
# 1. File Paths and Validation
# -----------------------------
TRUE_PATH = r"C:\Users\sahil\Desktop\ML Pred Project\True.csv"
FAKE_PATH = r"C:\Users\sahil\Desktop\ML Pred Project\Fake.csv"

assert os.path.exists(TRUE_PATH), f"Missing file: {TRUE_PATH}"
assert os.path.exists(FAKE_PATH), f"Missing file: {FAKE_PATH}"

# -----------------------------
# 2. Load Datasets
# -----------------------------
df_true = pd.read_csv(TRUE_PATH)
df_fake = pd.read_csv(FAKE_PATH)

# Label encoding: 0 = Real, 1 = Fake
df_true["label"] = 0
df_fake["label"] = 1

# -----------------------------
# 3. Data Cleaning
# -----------------------------
for df in (df_true, df_fake):
    for col in ["title", "text"]:
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].fillna("").astype(str)

# Combine title and text into a single column
for df in (df_true, df_fake):
    df["text_all"] = (df["title"].fillna("") + " " + df["text"].fillna("")).str.strip()

# -----------------------------
# 4. Merge and Shuffle Data
# -----------------------------
df = pd.concat(
    [df_true[["text_all", "label"]], df_fake[["text_all", "label"]]],
    ignore_index=True
).sample(frac=1.0, random_state=42).reset_index(drop=True)

print(df.head(3))
print("\nClass balance:\n", df["label"].value_counts())

# -----------------------------
# 5. Feature Extraction
# -----------------------------
X_text = df["text_all"].astype(str).values
y = df["label"].values

vectorizer = TfidfVectorizer(
    stop_words="english",
    lowercase=True,
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2
)

X = vectorizer.fit_transform(X_text)

# -----------------------------
# 6. Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 7. Model Training
# -----------------------------
clf = LinearSVC()
clf.fit(X_train, y_train)

# -----------------------------
# 8. Model Evaluation
# -----------------------------
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}\n")

print("Classification report:\n",
      classification_report(y_test, y_pred, digits=4, target_names=["Real (0)", "Fake (1)"]))

# -----------------------------
# 9. Save Model and Vectorizer
# -----------------------------
joblib.dump(clf, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
print("Saved: model.pkl, vectorizer.pkl")

# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_label(text: str) -> str:
    vec = vectorizer.transform([text])
    pred = clf.predict(vec)[0]
    return "Fake (1)" if pred == 1 else "Real (0)"

# Example Predictions
samples = [
    "Parliament passes the annual budget with increased healthcare spending.",
    "Scientists claim a secret herb cures every disease overnight."
]

for sample in samples:
    print(sample, "->", predict_label(sample))

                                            text_all  label
0  BREAKING: GOP Chairman Grassley Has Had Enough...      1
1  Failed GOP Candidates Remembered In Hilarious ...      1
2  Mike Pence’s New DC Neighbors Are HILARIOUSLY ...      1

Class balance:
 label
1    23481
0    21417
Name: count, dtype: int64
Accuracy: 0.9960

Classification report:
               precision    recall  f1-score   support

    Real (0)     0.9958    0.9958    0.9958      4284
    Fake (1)     0.9962    0.9962    0.9962      4696

    accuracy                         0.9960      8980
   macro avg     0.9960    0.9960    0.9960      8980
weighted avg     0.9960    0.9960    0.9960      8980

Saved: model.pkl, vectorizer.pkl
Parliament passes the annual budget with increased healthcare spending. -> Real (0)
Scientists claim a secret herb cures every disease overnight. -> Fake (1)


In [11]:
import sys
import joblib

# Load trained model and vectorizer
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def predict_label(text: str) -> str:
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]
    return "Fake (1)" if pred == 1 else "Real (0)"

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python predict.py "your news text here"')
        sys.exit(1)
    print(predict_label(sys.argv[1]))

Real (0)


In [12]:
!python predict.py "NASA confirms water found on Mars in large quantities."

Fake (1)


In [13]:
!python predict.py "The following statementsÂ were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own.Â Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : - â€œOn 1/20 - the day Trump was inaugurated - an estimated 35,000 ISIS fighters held approx 17,500 square miles of territory in both Iraq and Syria. As of 12/21, the U.S. military estimates the remaining 1,000 or so fighters occupy roughly 1,900 square miles...â€ via @jamiejmcintyre  [1749 EST] - Just left West Palm Beach Fire & Rescue #2. Met with great men and women as representatives of those who do so much for all of us. Firefighters, paramedics, first responders - what amazing people they are! [1811 EST] - â€œOn 1/20 - the day Trump was inaugurated - an estimated 35,000 ISIS fighters held approx 17,500 square miles of territory in both Iraq and Syria. As of 12/21, the U.S. military est the remaining 1,000 or so fighters occupy roughly 1,900 square miles..â€ @jamiejmcintyre @dcexaminer [2109 EST] - "Arrests of MS-13 Members, Associates Up 83% Under Trump" bit.ly/2liRH3b [2146 EST] -- Source link: (bit.ly/2jBh4LU) (bit.ly/2jpEXYR) "

Real (0)


In [14]:
!python predict.py "WASHINGTON (Reuters) - The Trump administration briefed congressional staff this week on how the White House was considering non-proliferation standards in a potential pact to sell nuclear reactor technology to Saudi Arabia, but did not indicate whether allowing uranium enrichment would be part of any deal, congressional aides said. Non-proliferation advocates worry that allowing Saudi Arabia to enrich fuel in a nuclear power deal could also enable it to one day covertly produce fissile material and set off an arms race with arch-rival Iran that could spread more broadly throughout the Middle East.  Senate Foreign Relations Committee staff members were briefed by State Department and Department of Energy officials in a meeting on Wednesday, the aides said. They learned the administration â€œis working to develop a position on non-proliferation standardsâ€ should they begin talks with Saudi Arabia on a civilian nuclear cooperation pact known as a 123 agreement, a committee aide said.  The administration is still mulling whether any agreement would allow uranium enrichment, the aide said.  The race to build Saudi Arabiaâ€™s first nuclear power reactors is heating up among U.S., South Korean, Chinese and Russian companies.  U.S. Energy Secretary Rick Perry visited Saudi Arabia last week, telling Reuters then that new talks between the two allies on a 123 agreement would start soon. An agreement would allow U.S. companies to participate in Saudi Arabiaâ€™s civilian nuclear program. Riyadh has said it wants to be self-sufficient in producing nuclear fuel and that it is not interested in diverting nuclear technology to military use. In previous talks, Saudi Arabia has refused to sign an agreement with Washington that would deprive it of enriching uranium. Uranium fuel for reactors is enriched to only about 5 percent, lower than the 90 percent level for fissile material in nuclear bombs. Some senators with proliferation concerns worry the administration is moving too quickly on talks about nuclear plants and enrichment with Saudi without consulting Congress. As required by a 2008 law, the president is required to keep the committees in the House and Senate that deal with foreign relations â€œfully and currently informedâ€ on any initiative and talks relating to new or amended 123 agreements. â€œWeâ€™re frustrated by the lack of briefings and having to yet again learn about potential foreign policy developments from the press,â€ a congressional aide said. A day before the senate briefing, a report by Bloomberg citing sources said that the administration may allow uranium enrichment as part of an agreement. The congressional aide said there are concerns that plans for an agreement are only being conducted by a small number of people controlled by the White House. â€œIt also appears that this is policy being driven out of the White House, which makes congressional oversight that much harder,â€ said the aide. If lawmakers oppose a civilian nuclear deal signed by the president they can try to fight it with legislation or other measures. The Trump administration and the previous Obama administration have pushed for selling nuclear power technology abroad, partly to keep the country competitive with Russia and China in nuclear innovation. A State Department official said the United States and Saudi Arabia have been in talks since 2012 regarding a 123 agreement but declined to comment on the discussions. Energy Department officials did not immediately comment on the briefing. Toshiba-owned Westinghouse is in talks with other U.S.- based companies to form a consortium for a bid in a multibillion-dollar tender for two nuclear reactors in Saudi Arabia. Winning a bid would be a big step for Westinghouse. It went into Chapter 11 bankruptcy this year and abandoned plans to build two advanced AP1000 reactors in the United States."

Real (0)
