In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string


In [30]:
# DATA COLLECTION
df = pd.read_csv(r"C:\Users\shrut\Downloads\Fake.csv")
df.head()   # shows first 5 rows

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [38]:


df = pd.read_csv(r"C:\Users\shrut\Downloads\True.csv")
df.head()


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [42]:
# DATA CLEANING
# Load datasets
fake = pd.read_csv(r"C:\Users\shrut\Downloads\Fake.csv")
true = pd.read_csv(r"C:\Users\shrut\Downloads\True.csv")

# Check shapes
print("Fake dataset shape:", fake.shape)
print("True dataset shape:", true.shape)

Fake dataset shape: (23481, 4)
True dataset shape: (21417, 4)


In [44]:
# Add Labels
fake["label"] = 0   # Fake news = 0
true["label"] = 1   # True news = 1

In [46]:
# Concatenate
df = pd.concat([fake, true], axis=0).reset_index(drop=True)

print("Combined dataset shape:", df.shape)
df.head()

Combined dataset shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [48]:
df.tail()

Unnamed: 0,title,text,subject,date,label
44893,Fully committed' NATO backs new U.S. approach ...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
44897,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [50]:
# Check for null values in entire DataFrame
df.isnull().sum()

title      0
text       1
subject    0
date       0
label      0
dtype: int64

In [52]:
df = df.drop(columns=["title", "subject", "date"])
df.head()


Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [56]:
# Shuffle the dataset
df = df.sample(frac=1)
df.head()

Unnamed: 0,text,label
27753,,0
43575,"He asked me if I wanted it back and I said, ...",0
41681,Find out how many are going to sign up to go ...,0
3941,"The week of Donald Trump s election, white nat...",0
19594,Leave it to a Democrat to waste everyone s tim...,0


In [58]:
# Reset Index
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,,0
1,"He asked me if I wanted it back and I said, ...",0
2,Find out how many are going to sign up to go ...,0
3,"The week of Donald Trump s election, white nat...",0
4,Leave it to a Democrat to waste everyone s tim...,0


In [86]:
import nltk
from nltk.corpus import stopwords


# Download stopwords if not already
nltk.download('stopwords')

# Define stopwords
stop_words = set(stopwords.words('english'))

# Cleaning function
def clean_text(text):
    text = str(text).lower()                                # lowercase
    text = re.sub(r'http\S+|www.\S+', '', text)             # remove URLs
    text = re.sub(r'<.*?>+', '', text)                      # remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)                         # remove digits
    text = re.sub(r'\n', ' ', text)                         # remove newlines
    text = re.sub(r'\s+', ' ', text).strip()                # remove extra spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
    return text

# Apply cleaning
df["text"] = df["text"].apply(clean_text)

# Check cleaned dataset
print(df.head())




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  label
0                                                         0
1  asked wanted back said veteran gave donald tru...      0
2  find many going sign go war keep protect us mo...      0
3  week donald trump election white nationalists ...      0
4  leave democrat waste everyone time money putti...      0


In [88]:
# Independent variable (features)
X = df["text"]

# Dependent variable (labels)
y = df["label"]

# Check shape
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Quick look
print("\nSample X:\n", X.iloc[0:5])
print("\nSample y:\n", y.iloc[0:5])


Shape of X: (44898,)
Shape of y: (44898,)

Sample X:
 0                                                     
1    asked wanted back said veteran gave donald tru...
2    find many going sign go war keep protect us mo...
3    week donald trump election white nationalists ...
4    leave democrat waste everyone time money putti...
Name: text, dtype: object

Sample y:
 0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64


In [98]:
# Convert into Vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Transform text into vectors
X_vectorized = tfidf.fit_transform(X)

print("Shape of TF-IDF Matrix:", X_vectorized.shape)

Shape of TF-IDF Matrix: (44898, 5000)


In [97]:

# Train-Test Split
from sklearn.model_selection import train_test_split

# Split into training and testing (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (35918, 5000)
Testing set shape: (8980, 5000)


In [101]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vectorizer on X (text data) and transform it into vectors
X_vectorized = tfidf.fit_transform(X)

# Check the shape of the TF-IDF matrix
print("Shape of TF-IDF Matrix:", X_vectorized.shape)

# Show first 20 feature names (words)
print("\nSample feature names:", tfidf.get_feature_names_out()[0:20])

Shape of TF-IDF Matrix: (44898, 5000)

Sample feature names: ['abandon' 'abandoned' 'abbas' 'abc' 'abdullah' 'abe' 'abedin' 'ability'
 'able' 'abortion' 'abortions' 'abroad' 'absence' 'absolute' 'absolutely'
 'absurd' 'abu' 'abuse' 'abused' 'abuses']


In [103]:
# Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# Step 2: Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Step 3: Train the model
model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9870824053452116

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4717
           1       0.98      0.99      0.99      4263

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4649   68]
 [  48 4215]]


In [106]:
# Decision Tree Classifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# Step 2: Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Step 3: Train the model
dt_model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = dt_model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9938752783964365

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4717
           1       1.00      0.99      0.99      4263

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4696   21]
 [  34 4229]]
