In [13]:
import pandas as pd
import numpy as np
import nltk
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Load both datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')
#print(fake_df)
#print(true_df)
# Label them
fake_df['label'] = 0  # 0 = Fake
true_df['label'] = 1  # 1 = Real

# Combine datasets
df = pd.concat([fake_df, true_df], axis=0).reset_index(drop=True)

# Shuffle dataset
df = df.sample(frac=1).reset_index(drop=True)

df[['title', 'label']].head()

Unnamed: 0,title,label
0,SEX OBJECTS FOR HILLARY…Jennifer Lopez Shakes ...,0
1,Donald Trump Shames Kids For Not Registering ...,0
2,LIBERTARIAN Gary Johnson ENDORSES Black Lives ...,0
3,PATRIOTS OWNER On Trump: “In The Toughest Time...,0
4,This Hilarious Campaign Ad Shows Voters How T...,0


In [15]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

df['text'] = df['title'].apply(clean_text)

# Optional: remove stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [16]:
X = df['text']
y = df['label']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [17]:
# Logistic Regression
model = LogisticRegression()
# model = MultinomialNB()  # Optional alternative

model.fit(X_train, y_train)



In [18]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9430957683741648
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      4664
           1       0.94      0.94      0.94      4316

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



In [19]:
# Save the model
pickle.dump(model, open("fake_news_model.pkl", "wb"))
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))