In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import re#helps for searching words in texts or paragraph
import nltk#natural language tool kit
from nltk.corpus import stopwords#contains non important words
from nltk.stem import PorterStemmer#removes prefix or suffix and gives us root word
from sklearn.feature_extraction.text import TfidfVectorizer#coverts str into number for understanding of machine

In [2]:
# Download stopwords if not already
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Roshan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load datasets
fake_news_data = pd.read_csv(r"C:\Users\Roshan\Downloads\Fake.csv.zip")
true_news_data = pd.read_csv(r"C:\Users\Roshan\Downloads\True.csv.zip")


In [4]:
# Initialize stemmer
port_stem = PorterStemmer()

In [5]:
# Stemming function
def stemming(content):
    # Remove special characters and numbers
    stemmed_content = re.sub('[^a-zA-Z]', ' ', str(content))
    # Convert to lowercase
    stemmed_content = stemmed_content.lower()
    # Tokenize into words
    words = stemmed_content.split()
    # Stem each word
    stemmed_words = [port_stem.stem(word) for word in words]
    # Join back into a single string
    return " ".join(stemmed_words)

In [6]:
# Step 1: Add labels
fake_news_data["label"] = 0   # 0 → Fake
true_news_data["label"] = 1   # 1 → True

In [7]:
# Step 2: Combine datasets
news_data = pd.concat([fake_news_data, true_news_data], axis=0)
news_data = news_data.sample(frac=1) # shuffle rows becz If you split this
#directly into train/test, the train set might get mostly fake and the test set
#  mostly true (or vice versa).

#That makes the model biased and accuracy meaningless.

In [8]:
# Step 3: Apply stemming
news_data['text'] = news_data['text'].apply(stemming)

In [9]:
# Step 4: Features & labels
X = news_data['text'].values
y = news_data['label'].values

In [10]:
# Step 5: TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(X)

'''
TfidfVectorizer

Converts text into numerical feature vectors so ML models can understand it.

TF-IDF = Term Frequency – Inverse Document Frequency

TF: How often a word appears in a document.

IDF: How unique the word is across all documents (rare words get more weight).

Common words like “the”, “is”, “and” → low weight.

Important words like “election”, “vaccine”, “policy” → high weight.

2. Parameters

stop_words='english' → removes common English stopwords (“a”, “an”, “the”, “and”…).

max_df=0.7 → ignores words that appear in 70%+ of documents, since they don’t help distinguish fake vs true.
'''

"\nTfidfVectorizer\n\nConverts text into numerical feature vectors so ML models can understand it.\n\nTF-IDF = Term Frequency – Inverse Document Frequency\n\nTF: How often a word appears in a document.\n\nIDF: How unique the word is across all documents (rare words get more weight).\n\nCommon words like “the”, “is”, “and” → low weight.\n\nImportant words like “election”, “vaccine”, “policy” → high weight.\n\n2. Parameters\n\nstop_words='english' → removes common English stopwords (“a”, “an”, “the”, “and”…).\n\nmax_df=0.7 → ignores words that appear in 70%+ of documents, since they don’t help distinguish fake vs true.\n"

In [11]:
# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2
)

# Step 7: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 8: Accuracy
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.9893646639567905
Testing Accuracy: 0.9810690423162584


In [12]:
def predict_news(text):
    # clean & stem the text
    text = stemming(text)
    # convert text into numbers
    text_vector = vectorizer.transform([text])
    # make prediction
    pred = model.predict(text_vector)
    # return result
    if pred == 1:
        return "✅ True News"
    else:
        return "❌ Fake News"


In [13]:
# Example test
print("Prediction:", predict_news("The following statementsÂ were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own.Â Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : -"))

Prediction: ✅ True News
