In [2]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [20]:
# Load the two source files; every row of Fake.csv is tagged FAKE and every
# row of True.csv is tagged REAL before the frames are combined.
fake_df = pd.read_csv("Fake.csv")
true_df = pd.read_csv("True.csv")

fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'

# Stack both frames, then shuffle the rows. The shuffle is seeded: without a
# fixed random_state the row order changes on every run, which makes the
# downstream (seeded) train/test split and the reported metrics irreproducible.
df = pd.concat([fake_df, true_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,title,text,subject,date,label
0,ROMNEY REFUSED TO INSIST OBAMA Expose His Seal...,Wouldn t it have been great to see Mitt fight ...,politics,"May 12, 2016",FAKE
1,PROVOCATION? Republican Senators Introduce New...,21st Century Wire says Donald Trump s campaign...,Middle-east,"January 4, 2017",FAKE
2,Liberia's Liberty Party to appeal election fra...,MONROVIA (Reuters) - Liberia s opposition Libe...,worldnews,"November 26, 2017",REAL
3,Here Are Photos Of Detroit’s Public Schools T...,While much of the nation s attention has been ...,News,"January 17, 2016",FAKE
4,Senators ask Trump EPA chief pick to disclose ...,WASHINGTON (Reuters) - Democrats on the U.S. S...,politicsNews,"December 28, 2016",REAL


In [35]:
import nltk
import re

# Fetch the NLTK stopword corpus (reports "already up-to-date" when a local copy exists)
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stopwords held in a set for fast membership checks while filtering tokens
stop_words = {*stopwords.words('english')}

# Define a clean-up function
def preprocess(text):
    """Normalize raw text into a space-separated string of lowercase tokens.

    Every non-word character (regex ``\\W``) is replaced with a space, the text
    is lowercased and split on whitespace, tokens present in the module-level
    ``stop_words`` set are dropped, and the remainder is re-joined with single
    spaces.

    Args:
        text: Raw text to clean. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or an empty string when no token survives.
    """
    # Missing values would make re.sub raise TypeError. NaN is the only float
    # that compares unequal to itself, so `text != text` detects it.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'\W', ' ', text)               # Replace punctuation/symbols with spaces
    text = text.lower()                           # Lowercase so tokens match the lowercase stopword list
    tokens = text.split()                         # Tokenize on whitespace
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)                       # Re-join cleaned words

# Clean every article body, overwriting the raw text column in place
df['text'] = df['text'].map(preprocess)

# Preview the cleaned text alongside its label
df.loc[:, ['text', 'label']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shriya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label
0,great see mitt fight hard reveal truth barack ...,FAKE
1,21st century wire says donald trump campaign p...,FAKE
2,monrovia reuters liberia opposition liberty pa...,REAL
3,much nation attention focused horrors water cr...,FAKE
4,washington reuters democrats u senate environm...,REAL


In [22]:
from sklearn.model_selection import train_test_split

# Model input: the preprocessed article text
X = df['text']
# Target: numeric class ids (FAKE -> 0, REAL -> 1)
y = df['label'].map({'FAKE': 0, 'REAL': 1})

# Hold out 20% of the rows for testing; the fixed seed pins the partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to 5000 features (tunable)
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training split only, then reuse the fitted vectorizer
# to encode the test split
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)


In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict classes for the held-out test features
y_pred = model.predict(X_test_tfidf)

# Report each metric in turn: heading first, then the metric computed
# from the true and predicted labels
for heading, metric in (
    ("✅ Accuracy:", accuracy_score),
    ("\n📉 Confusion Matrix:\n", confusion_matrix),
    ("\n📝 Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))


✅ Accuracy: 0.9876391982182628

📉 Confusion Matrix:
 [[4612   70]
 [  41 4257]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4682
           1       0.98      0.99      0.99      4298

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [37]:
def predict_news(news_text):
    """Classify a single news text as 'REAL' or 'FAKE'.

    The text is cleaned with ``preprocess``, vectorized with the fitted
    ``tfidf`` vectorizer, and scored by the trained ``model``.

    Args:
        news_text: Raw article text to classify.

    Returns:
        'REAL' when the model predicts class 1, otherwise 'FAKE'.
    """
    # The vectorizer takes a collection of documents, hence the one-item list
    features = tfidf.transform([preprocess(news_text)])
    predicted_class = model.predict(features)[0]

    if predicted_class == 1:
        return 'REAL'
    return 'FAKE'


In [38]:
# Sample headline run through the classifier
test_news = "NASA confirms discovery of a second moon orbiting Earth in secret for decades."
print("Prediction:", predict_news(test_news))


Prediction: FAKE


In [39]:
# Sample wire-style report run through the classifier
test_news = (
    "WASHINGTON (Reuters) - U.S. President Barack Obama on Tuesday nominated "
    "federal appeals court judge Merrick Garland to the Supreme Court, calling "
    "him a consensus candidate who deserves a full hearing and an up-or-down "
    "vote in the Senate."
)
print("Prediction:", predict_news(test_news))


Prediction: REAL
