# Step 1: Load & Preprocess the Data

In [2]:
import os
from pathlib import Path

import pandas as pd

# Folder holding Fake.csv and True.csv. Set the FAKE_NEWS_DATA_DIR environment
# variable to point somewhere else; when it is unset, the original local folder
# is used so existing setups keep working.
DATA_DIR = Path(
    os.environ.get(
        "FAKE_NEWS_DATA_DIR",
        "/Users/prasadbodduboina/Documents/untitled folder",
    )
)

# Load datasets
df_fake = pd.read_csv(DATA_DIR / "Fake.csv")
df_true = pd.read_csv(DATA_DIR / "True.csv")

# Add labels: 1 = fake, 0 = true
df_fake["label"] = 1
df_true["label"] = 0

# Combine datasets into one frame with a fresh 0..n-1 index
df = pd.concat([df_fake, df_true], ignore_index=True)

# Shuffle the rows (fixed seed so the order is reproducible) and reindex
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# View the data
print(df.shape)
df.head()


(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


# Step 2: Text Cleaning + TF-IDF Feature Extraction

We'll:

Remove bracketed text, URLs, HTML tags, punctuation, and tokens containing digits (stopwords are not removed in this notebook)

Convert text to lowercase

Extract features using TF-IDF


In [3]:
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean text function
def clean_text(text):
    """Normalize a raw article string for TF-IDF vectorization.

    Steps, in order: lowercase, drop ``[bracketed]`` spans, drop URLs,
    drop HTML-style tags, strip ASCII punctuation, turn newlines into
    spaces, and drop any token that contains a digit.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from a missing CSV cell) is treated
            as empty.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # Missing values reach this function as float NaN / None; avoid crashing on .lower()
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                      # remove [text]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)        # remove links
    text = re.sub(r'<.*?>+', '', text)                       # remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    # Replace (not delete) newlines so words on adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)                     # remove numbers/words with digits
    return text

# Build the model input from title + body.
# The untouched body is kept in 'text_raw' the first time this cell runs, so
# re-running the cell rebuilds 'text' from the original body instead of
# prepending the title again to already-combined, already-cleaned text.
if 'text_raw' not in df.columns:
    df['text_raw'] = df['text']

# Missing titles/bodies become empty strings so the concatenation never yields NaN
df['text'] = (df['title'].fillna('') + " " + df['text_raw'].fillna('')).apply(clean_text)

# Split data: 80% train / 20% test, fixed seed for a reproducible split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF: the vocabulary is fitted on the training split only and then
# reused to transform the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Step 3: Train and Evaluate the Model
Let’s use Logistic Regression, which is simple, fast, and works very well for text classification problems.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a default-configured logistic regression on the TF-IDF training matrix
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Label the held-out test matrix
y_pred = model.predict(X_test_tfidf)

# Report accuracy, then the per-class report, then the confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)


Accuracy: 0.984521158129176

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980


Confusion Matrix:
 [[4219   51]
 [  88 4622]]


# Step 4: Save Model and Vectorizer
We’ll now save:

The trained Logistic Regression model

The TF-IDF vectorizer

In [5]:
import os

import joblib

# Make sure the output folder exists; no error if it is already there
os.makedirs("models", exist_ok=True)

# Persist the trained classifier, then the fitted TF-IDF vectorizer
joblib.dump(model, "models/fake_news_model.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")


['models/tfidf_vectorizer.pkl']

In [6]:
import joblib
import os

# Repeats the save from the previous cell: same folder, same two file paths
os.makedirs(name="models", exist_ok=True)

# Write the classifier and the vectorizer
joblib.dump(value=model, filename="models/fake_news_model.pkl")
joblib.dump(value=vectorizer, filename="models/tfidf_vectorizer.pkl")


['models/tfidf_vectorizer.pkl']

In [7]:
import joblib
import os

# Writes the same two artifacts to the same paths as the two save cells above
os.makedirs("models", exist_ok=True)

# Classifier
joblib.dump(model, "models/fake_news_model.pkl")

# Fitted TF-IDF vectorizer (last expression, so its path list is the cell output)
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")


['models/tfidf_vectorizer.pkl']

In [8]:
# List the saved artifacts; sorted because os.listdir returns entries in arbitrary order
print("Saved:", sorted(os.listdir("models")))


Saved: ['tfidf_vectorizer.pkl', 'fake_news_model.pkl']
