In [1]:
pip install chardet


Note: you may need to restart the kernel to use updated packages.


# Project Title:
Fake News Detection Using Text Classification



# 1.  Problem Statement:
The spread of fake news on social media and news outlets has become a major issue affecting public opinion and even democracy. This project aims to build an AI model that can automatically detect whether a news article is real or fake based on its content, helping reduce misinformation online.


# 2.  Dataset Description:
# Classes:

 Fake News

 Real News (left)

 Real News (right) (can be separated as a third class)

# Features:

News title

News text

Subject

Source

Derived features (e.g., text length, sentiment score, keyword frequency)

# Labels:

Fake or Real (binary or multi-class, depending on how you split)

Number of Samples:

Over 40,000 articles

# Import Libaries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import pandas as pd

# Load both datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

# Label them
fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'

# Combine into one dataset
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the combined dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check that both labels exist
print(df['label'].value_counts())

# Print sample rows from each label
print("\n--- FAKE News Samples ---")
print(df[df['label'] == 'FAKE'].head())

print("\n--- REAL News Samples ---")
print(df[df['label'] == 'REAL'].head())


label
FAKE    23481
REAL    21417
Name: count, dtype: int64

--- FAKE News Samples ---
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
5   Paul Ryan Responds To Dem’s Sit-In On Gun Con...   
6  AWESOME! DIAMOND AND SILK Rip Into The Press: ...   
7  STAND UP AND CHEER! UKIP Party Leader SLAMS Ge...   

                                                text          subject  \
0  21st Century Wire says Ben Stein, reputable pr...          US_News   
3  On Monday, Donald Trump once again embarrassed...             News   
5  On Wednesday, Democrats took a powerful stance...             News   
6  President Trump s rally in FL on Saturday was ...  Government News   
7  He s been Europe s version of the outspoken Te...        left-news   

                date label  
0  February 13, 2017  FAKE  
3       May 22, 2017  FAKE  
5      June 22, 2016  FAKE  
6       Feb 19, 2017 

In [3]:
df.head(5)

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",REAL
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",REAL
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",REAL


In [4]:
df.shape

(44898, 5)

In [5]:
df.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [6]:
df.describe()

Unnamed: 0,title,text,subject,date,label
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",FAKE
freq,14,627.0,11272,182,23481


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


# Data Preprocessing 

In [8]:
duplicates_all = df[df.duplicated()]
print(duplicates_all)

                                                   title  \
4106   Islamic State claims responsibility for Aden c...   
5547   Turkey seeks life sentences for 60 ex-military...   
5909   Highlights: The Trump presidency on March 31 a...   
6382   Britain preparing to transfer 400 million poun...   
6779   Israel ambassador asks to meet New Zealand pop...   
...                                                  ...   
44673  Kuwait says GCC to keep operating despite Qata...   
44721  Syrian rebels say discussing evacuation from t...   
44786  Russia may widen designation for media outlets...   
44849  Senate tax bill stalls on deficit-focused 'tri...   
44860  British PM May vows to stay as party plotters ...   

                                                    text       subject  \
4106   CAIRO (Reuters) - Militant group Islamic State...     worldnews   
5547   ISTANBUL (Reuters) - Sixty people including a ...     worldnews   
5909   (Reuters) - Highlights of the day for U.S. Pre... 

In [9]:
# drop duplicate data
df.drop_duplicates(inplace=True)
df.shape

(44689, 5)

In [10]:
# removes  Nan value .
df.dropna(inplace =True)

In [11]:
# show info.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44689 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44689 non-null  object
 1   text     44689 non-null  object
 2   subject  44689 non-null  object
 3   date     44689 non-null  object
 4   label    44689 non-null  object
dtypes: object(5)
memory usage: 2.0+ MB


In [12]:
#  Nan Value in the dataframe
df.isnull().any().any()

False

In [13]:
#Count the nan value
df.isnull().sum().sum()

0

In [14]:
#Count Nan Value per coloumn
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [15]:
# nan value in this coloumn
nan_count = df['title'].isnull().sum()
print(nan_count)

0


In [16]:
title = df[['title']]
print(title)

                                                   title
0      Ben Stein Calls Out 9th Circuit Court: Committ...
1      Trump drops Steve Bannon from National Securit...
2      Puerto Rico expects U.S. to lift Jones Act shi...
3       OOPS: Trump Just Accidentally Confirmed He Le...
4      Donald Trump heads for Scotland to reopen a go...
...                                                  ...
44893  UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...
44894  PM May seeks to ease Japan's Brexit fears duri...
44895  Merkel: Difficult German coalition talks can r...
44896   Trump Stole An Idea From North Korean Propaga...
44897  BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G...

[44689 rows x 1 columns]


In [17]:
missing = df.isnull().sum()
print("Missing values:\n", missing)

Missing values:
 title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [18]:
df['title'].unique()

array(['Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution',
       'Trump drops Steve Bannon from National Security Council',
       'Puerto Rico expects U.S. to lift Jones Act shipping restrictions',
       ...,
       "PM May seeks to ease Japan's Brexit fears during trade visit",
       'Merkel: Difficult German coalition talks can reach a deal',
       ' Trump Stole An Idea From North Korean Propaganda Parody Account To Push His Stupid Wall (DETAILS)'],
      dtype=object)

In [20]:
import string

In [21]:
df['text'].unique()

array(['21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocative statements on Judge Jeanine Pirro s show recently. While discussing the halt that was imposed on President Trump s Executive Order on travel. Stein referred to the judgement by the 9th Circuit Court in Washington state as a  Coup d tat against the executive branch and against the constitution.  Stein went on to call the Judges in Seattle  political puppets  and the judiciary  political pawns. Watch the interview below for the complete statements and note the stark contrast to the rhetoric of the leftist media and pundits who neglect to note that no court has ever blocked any Presidential orders in immigration in the past or discuss the legal efficacy of the halt or the actual text of the Executive Order.READ MORE TRUMP NEWS AT: 21st Century Wire Trump FilesSUPPORT OUR WORK BY SUBSCR

In [22]:
import re

def clean_text(text):
    text = text.lower()                            # Lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)     # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()       # Remove extra spaces
    return text

# Apply to title and text
df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)


In [23]:
df['text_length'] = df['text'].apply(len)

In [24]:
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

In [25]:
df['date'].unique()

array(['February 13, 2017', 'April 5, 2017 ', 'September 27, 2017 ', ...,
       'August 21, 2016 ', 'December 31, 2017 ', 'May 22, 2016 '],
      dtype=object)

In [26]:
# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # invalid dates become NaT

# If there are invalid date formats that turned to NaT, you can drop or fill them
df = df.dropna(subset=['date'])


In [27]:
df['date'].unique()

<DatetimeArray>
['2017-02-13 00:00:00', '2017-05-22 00:00:00', '2016-06-22 00:00:00',
 '2016-05-04 00:00:00', '2017-05-07 00:00:00', '2017-06-08 00:00:00',
 '2016-03-01 00:00:00', '2017-05-10 00:00:00', '2016-09-18 00:00:00',
 '2016-10-06 00:00:00',
 ...
 '2017-11-12 00:00:00', '2017-11-25 00:00:00', '2017-11-20 00:00:00',
 '2015-05-02 00:00:00', '2017-12-16 00:00:00', '2017-12-11 00:00:00',
 '2017-10-24 00:00:00', '2017-12-09 00:00:00', '2017-12-19 00:00:00',
 '2017-10-09 00:00:00']
Length: 757, dtype: datetime64[ns]

In [28]:
print(df.info())
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
Index: 11868 entries, 0 to 44896
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        11868 non-null  object        
 1   text         11868 non-null  object        
 2   subject      11868 non-null  object        
 3   date         11868 non-null  datetime64[ns]
 4   label        11868 non-null  object        
 5   text_length  11868 non-null  int64         
 6   word_count   11868 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 741.8+ KB
None
title          0
text           0
subject        0
date           0
label          0
text_length    0
word_count     0
dtype: int64


In [35]:
print(df['label'])

0        FAKE
3        FAKE
5        FAKE
16       FAKE
17       FAKE
         ... 
44884    FAKE
44885    FAKE
44889    FAKE
44892    FAKE
44896    FAKE
Name: label, Length: 11868, dtype: object


In [36]:
# Convert 'FAKE' to 0 and 'REAL' to 1
df['label'] = df['label'].map({'FAKE': 0, 'REAL': 1})


In [37]:
df.head(5)

Unnamed: 0,title,text,subject,date,label,text_length,word_count
0,ben stein calls out 9th circuit court committe...,21st century wire says ben stein reputable pro...,US_News,2017-02-13,0,1009,170
3,oops trump just accidentally confirmed he leak...,on monday donald trump once again embarrassed ...,News,2017-05-22,0,1194,182
5,paul ryan responds to dems sitin on gun contro...,on wednesday democrats took a powerful stance ...,News,2016-06-22,0,2080,352
16,judge declares baby name illegal to prevent he...,does a judge have the right to determine what ...,left-news,2016-05-04,0,1533,277
17,paul ryan takes a monumentally humiliating pho...,politicians are all about photoops this is esp...,News,2017-05-07,0,1832,319
