## Load Dataset

In [1]:
import pandas as pd

# Paths
fake_path = "Fake.csv"
real_path = "True.csv"

fake = pd.read_csv(fake_path)
real = pd.read_csv(real_path)

# Add labels
fake["label"] = 1   # Fake = 1
real["label"] = 0   # Real = 0

# Combine & shuffle
df = pd.concat([fake, real], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.shape)
print(df["label"].value_counts())
df.head()

(44898, 5)
label
1    23481
0    21417
Name: count, dtype: int64


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [2]:
df["text"] = df["title"].fillna("") + " " + df["text"].fillna("")

## Data Preprocessing

In [4]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('omw-1.4')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def clean_text(text, max_words=200):
    # Lowercase
    text = str(text).lower()
    # Remove HTML Tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    # Keep only letters and numbers
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and Lemmatize
    words = text.split()[:max_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)

[nltk_data] Downloading package omw-1.4 to C:\Users\BALRAM
[nltk_data]     MANDAL\AppData\Roaming\nltk_data...
[nltk_data] Downloading package wordnet to C:\Users\BALRAM
[nltk_data]     MANDAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
df["clean_text"] = df["text"].apply(lambda x: clean_text(x, max_words=200))
print(df[["text", "clean_text"]].head())

                                                text  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                          clean_text  
0  ben stein call out 9th circuit court committed...  
1  trump drop steve bannon from national security...  
2  puerto rico expects u s to lift jones act ship...  
3  oops trump just accidentally confirmed he leak...  
4  donald trump head for scotland to reopen a gol...  


## Verify Processing

In [7]:
for i in range(3):
    print("Original:", df.loc[i, "text"][:200])
    print("Cleaned: ", df.loc[i, "clean_text"][:200])
    print("-" * 50)

Original: Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame
Cleaned:  ben stein call out 9th circuit court committed a coup d tat against the constitution 21st century wire say ben stein reputable professor from pepperdine university also of some hollywood fame appearin
--------------------------------------------------
Original: Trump drops Steve Bannon from National Security Council WASHINGTON (Reuters) - U.S. President Donald Trump removed his chief strategist Steve Bannon from the National Security Council on Wednesday, re
Cleaned:  trump drop steve bannon from national security council washington reuters u s president donald trump removed his chief strategist steve bannon from the national security council on wednesday reversing
--------------------------------------------------
Original: Puerto Rico expects U.S. to lift Jones Act s

## TF-IDF Vectorization

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    stop_words='english'
)

# Fit and Transform
X = tfidf.fit_transform(df["clean_text"])
y = df["label"].values

print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (44898, 20000)


In [9]:
# Inspect Features
print("Sample features:", tfidf.get_feature_names_out()[:50])

Sample features: ['00' '00 pm' '000' '000 000' '000 email' '000 euro' '000 job'
 '000 muslim' '000 page' '000 people' '000 refugee' '000 rohingya'
 '000 syrian' '000 troop' '000 vote' '000 year' '10' '10 000' '10 billion'
 '10 day' '10 million' '10 minute' '10 month' '10 people' '10 percent'
 '10 point' '10 year' '100' '100 000' '100 day' '100 fed' '100 member'
 '100 million' '100 people' '100 percent' '100 seat' '100 year' '100th'
 '101' '10th' '11' '11 2001' '11 attack' '11 memorial' '11 million'
 '11 percent' '11 year' '110' '1100' '1100 kfnx']


In [10]:
# Train-Test Split (before modeling)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (35918, 20000)
Test size: (8980, 20000)


## Model Training

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize Logistic Regression
model = LogisticRegression(
    max_iter=200,     # more iterations for convergence
    solver="liblinear" # CPU-friendly solver
)

# Train on training set
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9900890868596882

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4284
           1       0.99      0.99      0.99      4696

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [12]:
import joblib

joblib.dump(model, "fake_news_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

## Testing

In [13]:
def predict_news(text):
    # Clean the text using same preprocessing
    cleaned = clean_text(text)
    # Transform into TF-IDF vector
    vectorized = tfidf.transform([cleaned])
    # Predict
    prediction = model.predict(vectorized)[0]
    return "FAKE" if prediction == 1 else "REAL"

# Example tests
sample_news = [
    "Donald Trump sends his own plane to transport 200 stranded marines",
    "NASA confirms water on Mars with new satellite evidence"
]

for news in sample_news:
    print(news, "->", predict_news(news))

Donald Trump sends his own plane to transport 200 stranded marines -> FAKE
NASA confirms water on Mars with new satellite evidence -> FAKE
