In [1]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



**Motivation**

As media continues to grow in volume, it is becoming increasingly difficult to differentiate real and fake news effectively. It is thus imperative for us to find better ways to identify fake news, and for us, this means with the help of data mining and machine learning.

In the first part of our project, we will focus on experimenting with different data processing techniques and predictive models, optimising our final pipeline and model to accurately identify fake news.

For the second part, we want to apply our trained model to scraped news data from popular US media outlets and access the credibility of these media outlets. This way we can help the public to make more informed decisions about what media outlets they can trust. 


## 2nd Part: Fake News Classification Use Case

For the second part, we scraped articles from 10 different news sites, split into two categories of news sites. 

The dimensions are shown below:
- **Index:**: Index.
- **title:** Title of news article.
- **text:** Text content of news article.
- **label:** Whether news article is real (0) or fake (1).

The Fake News Dataset is split into 3 `csv` files (`part1.csv`, `part2.csv`, `part3.csv`) so that size does not exceed size limit to push changes to GitHub.

## Import Libraries

Please uncomment the code box below to pip install relevant dependencies for this notebook.

In [14]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

Language Detection:   0%|          | 0/1522 [00:44<?, ?it/s]


In [3]:
## Unreliable websites
breitbart_df = pd.read_csv("./web_scraping/test_websites/breitbart_articles_1.csv")
dailycaller_df = pd.read_csv('./web_scraping/test_websites/daily_caller_articles_1.csv')
naturalnews_df = pd.read_csv('./web_scraping/test_websites/naturalnews_articles_1.csv')
newsmax_df = pd.read_csv('./web_scraping/test_websites/newsmax_articles_1.csv')
zerohedge_df = pd.read_csv('./web_scraping/test_websites/zerohedge_articles_1.csv')

# Reliable websites
cnn_df = pd.read_csv('./web_scraping/test_websites/cnn_articles_1.csv')
ap_df = pd.read_csv('./web_scraping/test_websites/ap_articles_1.csv')
bbc_df = pd.read_csv('./web_scraping/test_websites/bbc_articles_1.csv')
npr_df = pd.read_csv('./web_scraping/test_websites/npr_articles_1.csv')
guardian_df = pd.read_csv('./web_scraping/test_websites/guardian_articles_1.csv')

## Combine all 10 dataframes into 1
Here we combine all 10 dataframes into 1 dataframe (`data_raw`) by concatenating along rows.

We also reset the `index` and drop the old `index` column.

In [4]:
# Combining all 10 dataframes into 1
data_raw = pd.concat([breitbart_df, dailycaller_df, naturalnews_df,newsmax_df,zerohedge_df,cnn_df,ap_df,bbc_df,npr_df,guardian_df], axis=0)

# Reset index and drop old index column
data_raw = data_raw.reset_index(drop=True)

data_raw.info()
print("Dataframe Shape: ", data_raw.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   source   1522 non-null   object
 1   title    1522 non-null   object
 2   content  1522 non-null   object
 3   date     1522 non-null   object
dtypes: object(4)
memory usage: 47.7+ KB
Dataframe Shape:  (1522, 4)


In [5]:
data_raw.head()

Unnamed: 0,source,title,content,date
0,Breitbart,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,2024-10-31
1,Breitbart,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",2024-10-30
2,Breitbart,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,2024-11-01
3,Breitbart,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,2024-10-29
4,Breitbart,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,2024-10-30


In [6]:
data_raw['source'].value_counts()

source
News Max            200
Zerohedge           200
Breitbart           198
The Daily Caller    175
Natural News        160
AP                  157
Guardian            133
CNN                 125
NPR                  88
BBC                  86
Name: count, dtype: int64

## Selection Criteria
- Language: Only use articles written in English (using language detection if necessary).
- Date Range: Focus on articles published in 2024 or, at most, 2023 to ensure credibility remains up-to-date.
- Content Focus: Only articles from the politics/election sections of each news website, targeting US election-related topics.

### Additional Filtering
- Define Political Content: Articles should contain keywords like “election,” “government,” “policy,” or “candidate” to qualify as political.
- Exclude Advertisements: Filter out content with commercial keywords or phrases related to advertisements.
- Exclude Opinion Pieces: Identify and exclude articles labeled as “opinion,” “editorial,” or similar terms, or those appearing in “Opinion” sections.
- Remove Outliers by Length: Filter out articles with fewer than 100 words or more than 5,000 words to focus on substantial content.
- Regex Filtering: Use regular expressions to remove boilerplate or irrelevant sections, such as “All rights reserved,” “Read more,” bylines, and embedded links to other articles or advertisements.

#### Step 1: Language Detection

In [15]:
# 1) Set a seed for langdetect to ensure reproducibility
DetectorFactory.seed = 0

# 2a) Simplified preprocessing: only remove non-alphabetic characters
def preprocess_text_simple(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

# 2b) Check if the text is non-language (e.g., numbers, symbols only)
def is_non_language_text(text):
    if re.match(r'^[^a-zA-Z]*$', text):  # Check if text has no alphabetic characters
        return True
    return False

# 3a) Function to get langdetect prediction
def get_langdetect_prediction(text):
    try:
        # Directly use text without preprocessing for efficiency
        if len(text) < 10 or is_non_language_text(text):
            return "unknown"
        lang = langdetect_detect(text)
        return lang
    except LangDetectException:
        return "unknown"

# 3b) Function to get langid prediction
def get_langid_prediction(text):
    try:
        lang, _ = langid_classify(text)
        if len(text) < 10 or is_non_language_text(text):
            return "unknown"
        return lang
    except Exception:
        return "unknown"

# 4) Function to calculate majority vote for each language
def calculate_majority_vote(predictions):
    vote_counts = {}
    for lang in predictions:
        if lang in vote_counts:
            vote_counts[lang] += 1
        else:
            vote_counts[lang] = 1
    return vote_counts

# 5) Parallel processing for efficiency with limited workers
def parallel_detection(text):
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(lambda func: func(text), 
                                    [get_langdetect_prediction, get_langid_prediction]))
    return results

# 6) Caching function for repeated inputs
@lru_cache(maxsize=500)
def get_cached_language(text):
    return combined_language_detection(text)

# 7) Combined majority voting language detection function
def combined_language_detection(text):
    # Check if the text is non-language (e.g., numbers, symbols only)
    if is_non_language_text(text):
        return "unknown"
    
    # Run the detectors in parallel for efficiency
    predictions = parallel_detection(text)
    
    # Calculate majority vote for each language based on predictions
    vote_counts = calculate_majority_vote(predictions)
    
    # Determine the language with the highest majority vote
    final_language = max(vote_counts, key=vote_counts.get)
    
    # If "unknown" is the most common or if all detectors fail, return "unknown"
    if final_language == "unknown" or vote_counts[final_language] <= 1:
        return "unknown"
    
    return final_language

# 8) Apply the cached function to each text in the DataFrame with a progress bar
data_raw['language'] = [get_cached_language(text) for text in tqdm(data_raw['content'], desc="Language Detection")]

# 9) Display the DataFrame with detected languages
data_raw

Language Detection: 100%|██████████| 1522/1522 [00:19<00:00, 79.78it/s]


Unnamed: 0,source,title,content,date,language
0,Breitbart,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,2024-10-31,en
1,Breitbart,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",2024-10-30,en
2,Breitbart,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,2024-11-01,en
3,Breitbart,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,2024-10-29,en
4,Breitbart,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,2024-10-30,en
...,...,...,...,...,...
1517,Guardian,US elections 2024: seven key House races to watch,Much attention has been paid to the rematch be...,2024-06-11,en
1518,Guardian,Michigan congresswoman Rashida Tlaib declines ...,Michigan congresswoman Rashida Tlaib declined ...,2024-11-02,en
1519,Guardian,Donald Trump says he will meet Ukrainian presi...,Donald Trump said he would meet with Ukrainian...,2024-09-26,en
1520,Guardian,Middle East crisis live: ‘apocalyptic’ north G...,Deadly Israeli strikes on 'apocalyptic' north ...,2024-11-02,en


In [16]:
# Drop rows where language is NOT in english and reset the index
data_raw = data_raw[data_raw['language'] == 'en'].reset_index(drop=True)
data_raw

Unnamed: 0,source,title,content,date,language
0,Breitbart,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,2024-10-31,en
1,Breitbart,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",2024-10-30,en
2,Breitbart,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,2024-11-01,en
3,Breitbart,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,2024-10-29,en
4,Breitbart,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,2024-10-30,en
...,...,...,...,...,...
1517,Guardian,US elections 2024: seven key House races to watch,Much attention has been paid to the rematch be...,2024-06-11,en
1518,Guardian,Michigan congresswoman Rashida Tlaib declines ...,Michigan congresswoman Rashida Tlaib declined ...,2024-11-02,en
1519,Guardian,Donald Trump says he will meet Ukrainian presi...,Donald Trump said he would meet with Ukrainian...,2024-09-26,en
1520,Guardian,Middle East crisis live: ‘apocalyptic’ north G...,Deadly Israeli strikes on 'apocalyptic' north ...,2024-11-02,en


In [17]:
data_raw['language'].value_counts()

language
en    1522
Name: count, dtype: int64

In [18]:
# Drop the  'language' column
data_raw = data_raw.drop(columns=['language'])

data_raw.head()

Unnamed: 0,source,title,content,date
0,Breitbart,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,2024-10-31
1,Breitbart,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",2024-10-30
2,Breitbart,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,2024-11-01
3,Breitbart,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,2024-10-29
4,Breitbart,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,2024-10-30


In [19]:
#  Copy data for safety
data_filtered = data_raw.copy()
print("Number of articles before filtering:", len(data_filtered))

# Step 2: Political Keywords Filter
us_politics_keywords = [
    # Core Keywords
    "election", "elections", "2024 election", "presidential election", "campaign", "campaigning",
    "primary", "primaries", "polling", "polls",

    # Key Political Figures
    "Kamala Harris", "Donald Trump", "Joe Biden", "Ron DeSantis", "Gavin Newsom", 
    "Mike Pence", "Vivek Ramaswamy", "Tim Walz"

    # Political Parties & Groups
    "Democrat", "Democratic Party", "Republican", "GOP", "Republican Party",
    "Independent", "Third party", "PAC", "Super PAC",

    # Political Issues & Controversies
    "voting rights", "voter suppression", "absentee ballot", "mail-in ballot",
    "Electoral College", "Supreme Court", "abortion", "Roe v. Wade", "gun control",
    "Second Amendment", "immigration", "border security", "healthcare", 
    "Medicare", "Affordable Care Act", "climate change", "Green New Deal", 
    "inflation", "economic policy", "tax cuts", "tax reform", "foreign policy", 
    "foreign relations",

    # U.S. Government Bodies & Offices
    "Congress", "Senate", "Senators", "House of Representatives", "White House",
    "Supreme Court", "Federal government", "State government",

    # Policies & Bills
    "voting reform", "healthcare reform", "climate policy", "gun legislation", 
    "economic recovery", "infrastructure bill", "Social Security", "student loan forgiveness",

    # Social & Cultural Issues
    "social justice", "racial equality", "police reform", "civil rights",
    "freedom of speech", "religious freedom",

    # Election Processes
    "debates", "presidential debate", "swing state", "battleground state",
    "electoral votes",

    # Additional Relevant Terms
    "approval rating", "national convention", "lobbying", "lobbyist",
    "scandal", "investigation", "political rally", "rally"
]
data_filtered = data_filtered[data_filtered['content'].str.contains('|'.join(us_politics_keywords), case=False, na=False)]
print("Number of articles after political keyword filtering:", len(data_filtered))

# Step 3: Exclude Opinion Pieces (assuming titles or content contain specific indicators of opinion)
opinion_keywords = ['opinion', 'editorial', 'op-ed']
data_filtered = data_filtered[~data_filtered['title'].str.contains('|'.join(opinion_keywords), case=False, na=False)]
data_filtered = data_filtered[~data_filtered['content'].str.contains('|'.join(opinion_keywords), case=False, na=False)]
print("Number of articles after opinion piece filtering:", len(data_filtered))

# Step 4: Remove Irrelevant Content Using Regex
# Define patterns for irrelevant content
irrelevant_patterns = [
    r'All rights reserved', r'Read more', r'For more information', r'Follow us', 
    r'Find us on', r'Contact the author', r'Subscribe for updates', r'\bWATCH\b',
    r'Advertisement', r'^[\W_]+$'  # Removes lines that are mostly symbols or whitespace
]

def clean_irrelevant_content(text):
    for pattern in irrelevant_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return text

data_filtered['content'] = data_filtered['content'].apply(clean_irrelevant_content)
print("Number of articles after irrelevant content cleaning:", len(data_filtered))

# Final DataFrame after filtering
print("Number of articles after all filtering:", len(data_filtered))
data_filtered.head()

Number of articles before filtering: 1522
Number of articles after political keyword filtering: 1456
Number of articles after opinion piece filtering: 1313
Number of articles after irrelevant content cleaning: 1313
Number of articles after all filtering: 1313


Unnamed: 0,source,title,content,date
0,Breitbart,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,2024-10-31
1,Breitbart,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",2024-10-30
2,Breitbart,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,2024-11-01
3,Breitbart,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,2024-10-29
4,Breitbart,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,2024-10-30


In [20]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1313 entries, 0 to 1520
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   source   1313 non-null   object
 1   title    1313 non-null   object
 2   content  1313 non-null   object
 3   date     1313 non-null   object
dtypes: object(4)
memory usage: 51.3+ KB


In [21]:
data_filtered['source'].value_counts()

source
News Max            189
Breitbart           176
The Daily Caller    161
Zerohedge           158
AP                  144
Natural News        128
CNN                 108
Guardian             99
NPR                  79
BBC                  71
Name: count, dtype: int64

# Feature Selection
Here we select the relevant features for fake news classification.
- `title`, `text`
- Create a new DataFrame (`data`) by selecting the specifc columns mentioned above from the original DataFrame `data_raw`.

In [22]:
data = data_filtered[['title', 'content', 'source',]]
print(type(data))
print(data.head())

# Shape before dropping duplicates
print("\nThe old shape is: ", data.shape)

<class 'pandas.core.frame.DataFrame'>
                                               title  \
0  GOP: WH May Have Illegally Altered Biden's 'Ga...   
1  Mavericks Principal Owners Donate $100 Million...   
2  Fact Check: Harris Campaign Twists Trump Comme...   
3  Harris Co-Chair: She's Different from Biden, H...   
4  Quinnipiac Poll: Trump Takes Lead in Pennsylvania   

                                             content     source  
0  House Republicans say the Biden-Harris White H...  Breitbart  
1  Miriam Adelson, the principal owner of the NBA...  Breitbart  
2  CLAIM: Vice President Kamala Harris’s campaign...  Breitbart  
3  On Tuesday’s broadcast of CNN’s “Laura Coates ...  Breitbart  
4  Former President Donald Trump has a two point ...  Breitbart  

The old shape is:  (1313, 3)


# Data Cleaning

## Remove Duplicate Rows
- Drop duplicate rows from the dataframe (`data`).

In [23]:
data_ = data.drop_duplicates()

# Display the new dataframe shape
print("The new shape is: ", data.shape)

The new shape is:  (1313, 3)


## Remove Outliers

### `text`

The `text` column of `data`, which is of string type, may contain values with unusually long lengths, indicating the presence of outliers. We will identify the outliers using [Z-score method].

1. Create a new column `text_length` in the DataFrame `data` by calculating the length of each review. (Set the value as 0 if the correponding `text` column has NaN values.)

2. Check the statistics of `text_length` using `describe()` method.

3. Calculate the mean and standard deviation of the `text_length` column.

4. Set the Z-score threshold for identifying outliers to 3.

5. Identify outliers of the `text_length` column and set the corresponding `text` to np.nan.

6. Drop the `text_length` column from the DataFrame.

In [24]:
data['text_length'] = data['content'].apply(lambda x: len(x) if pd.notna(x) else 0)
print(data.head(3))

TL = data["text_length"]
stats_TL = TL.describe()
print(stats_TL)

                                               title  \
0  GOP: WH May Have Illegally Altered Biden's 'Ga...   
1  Mavericks Principal Owners Donate $100 Million...   
2  Fact Check: Harris Campaign Twists Trump Comme...   

                                             content     source  text_length  
0  House Republicans say the Biden-Harris White H...  Breitbart         2736  
1  Miriam Adelson, the principal owner of the NBA...  Breitbart         2375  
2  CLAIM: Vice President Kamala Harris’s campaign...  Breitbart         3187  
count     1313.000000
mean      4414.495811
std       3364.380305
min        115.000000
25%       2514.000000
50%       3530.000000
75%       5421.000000
max      38675.000000
Name: text_length, dtype: float64


In [25]:
mean_TL = TL.mean()
# print(mean_TL)

sd_TL = TL.std()
# print(sd_TL)

threshold = 3

z_score = zscore(TL)
# print(z_score)

# Remove 'text' of lengths that are greater than 3 standard deviations above the mean
data.loc[abs(z_score) > threshold, 'content'] = np.nan
# print(data.head(3))

data = data.drop("text_length", axis=1)
data.head()

Unnamed: 0,title,content,source
0,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,Breitbart
1,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",Breitbart
2,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,Breitbart
3,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,Breitbart
4,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,Breitbart


### `title`

Similarly, the `title` column of `data` (of type `str`) may also contain values with unusually long lengths, indicating the presence of outliers.

1. Create a new column `title_length` in the DataFrame `data` by calculating the length of each price value. (Set the value as 0 if the correponding `title` column has NaN values.)

2. Check the statistics of `title_length` using `describe()` method and display its unique values.

3. Identify the outlier values by inspecting the content in `title` corresponding to the abnormal value in `title_length` and set the corresponding value of `title` to np.nan.

4. Drop the `title_length` column from the DataFrame.

In [26]:
data['title_length'] = data['title'].apply(lambda x: len(x) if pd.notna(x) else 0)
print(data.head(3))

TL = data["title_length"]
stats_TL = TL.describe()
print(stats_TL)

                                               title  \
0  GOP: WH May Have Illegally Altered Biden's 'Ga...   
1  Mavericks Principal Owners Donate $100 Million...   
2  Fact Check: Harris Campaign Twists Trump Comme...   

                                             content     source  title_length  
0  House Republicans say the Biden-Harris White H...  Breitbart            59  
1  Miriam Adelson, the principal owner of the NBA...  Breitbart           120  
2  CLAIM: Vice President Kamala Harris’s campaign...  Breitbart            99  
count    1313.000000
mean       80.137091
std        24.709738
min         9.000000
25%        62.000000
50%        77.000000
75%        96.000000
max       186.000000
Name: title_length, dtype: float64


In [27]:
mean_TL = TL.mean()
# print(mean_TL)

sd_TL = TL.std()
# print(sd_TL)

threshold = 3

z_score = zscore(TL)
# print(z_score)

# Remove 'title' of lengths that are greater than 3 standard deviations above the mean
data.loc[abs(z_score) > threshold, 'title'] = np.nan
# print(data.head(3))

data = data.drop("title_length", axis=1)
data.head()

Unnamed: 0,title,content,source
0,GOP: WH May Have Illegally Altered Biden's 'Ga...,House Republicans say the Biden-Harris White H...,Breitbart
1,Mavericks Principal Owners Donate $100 Million...,"Miriam Adelson, the principal owner of the NBA...",Breitbart
2,Fact Check: Harris Campaign Twists Trump Comme...,CLAIM: Vice President Kamala Harris’s campaign...,Breitbart
3,"Harris Co-Chair: She's Different from Biden, H...",On Tuesday’s broadcast of CNN’s “Laura Coates ...,Breitbart
4,Quinnipiac Poll: Trump Takes Lead in Pennsylvania,Former President Donald Trump has a two point ...,Breitbart


In [28]:
data.isnull().sum()

title       6
content    21
source      0
dtype: int64

# Feature Engineering

### Create new column `full_review`
Since there are some rows with empty `text` and `title`, we will concatenate both columns (`text` and `title`) to form a new column `full_content`.
1. Replace `NaN` values in `text` and `title` with an empty string.

2. Combine `text` and `title` into `full_content`.

3. Strip any leading/trailing whitespaces in `full_content`.

4. Drop `text` and `title` columns.

In [29]:
# 1) Fill NaN values in 'text' and 'title' with an empty string
data['title'] = data['title'].fillna('')
data['content'] = data['content'].fillna('')

# 2) Combine 'text' and 'title' into 'content'
data['full_content'] = data['content'] + " " + data['title']

# 3) Strip any leading/trailing whitespace
data['full_content'] = data['full_content'].str.strip()

# 4) Drop `text` and `title` columns
data = data.drop(columns = ['content', 'title'])

# Check if the 'full_review' column was added and if 'text' and 'title' columns has been dropped
print(data.head())
print("\nThe old shape is:",data.shape)

      source                                       full_content
0  Breitbart  House Republicans say the Biden-Harris White H...
1  Breitbart  Miriam Adelson, the principal owner of the NBA...
2  Breitbart  CLAIM: Vice President Kamala Harris’s campaign...
3  Breitbart  On Tuesday’s broadcast of CNN’s “Laura Coates ...
4  Breitbart  Former President Donald Trump has a two point ...

The old shape is: (1313, 2)


### Handle Missing Values
1. Drop rows where `full_review` are empty strings and reset the index.

2. Check if there are no more null values in `data`.

In [30]:
# 1) Drop rows where `full_review` are empty strings and reset the index
data = data[data['full_content'] != ""].reset_index(drop=True)
print("The new shape is:",data.shape)

# 2) Check if there are no more null values in `data`
data.isnull().sum()

The new shape is: (1313, 2)


source          0
full_content    0
dtype: int64

In [31]:
data

Unnamed: 0,source,full_content
0,Breitbart,House Republicans say the Biden-Harris White H...
1,Breitbart,"Miriam Adelson, the principal owner of the NBA..."
2,Breitbart,CLAIM: Vice President Kamala Harris’s campaign...
3,Breitbart,On Tuesday’s broadcast of CNN’s “Laura Coates ...
4,Breitbart,Former President Donald Trump has a two point ...
...,...,...
1308,Guardian,Striking Boeing staff have been urged by their...
1309,Guardian,Much attention has been paid to the rematch be...
1310,Guardian,Michigan congresswoman Rashida Tlaib declined ...
1311,Guardian,Donald Trump said he would meet with Ukrainian...


In [32]:
data.to_csv('dataset/scrapped_news.csv', index=False)

# Applying our best model (CNN + Word2Vec) on the scraped data

In [33]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define function to process text
import string
from nltk.stem import *
from nltk.stem.porter import *

# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk

# Regular expressions for text pattern matching
import re


def process_full_review(text):
    # Convert to lowercase and tokenize
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in string.punctuation]
    stemmer = PorterStemmer()
    # List of stopwords
    stop_words = stopwords.words("english")
    allowed_words = [
        "no",
        "not",
        "don't",
        "dont",
        "don",
        "but",
        "however",
        "never",
        "wasn't",
        "wasnt",
        "shouldn't",
        "shouldnt",
        "mustn't",
        "musnt",
    ]

    stemmed = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words or word in allowed_words
    ]
    return " ".join(stemmed)

In [34]:
from datasets import load_dataset, concatenate_datasets, Dataset

datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
    },
)
datasets

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 54441 examples [00:02, 25634.32 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
})

In [35]:
# load model
from tensorflow.keras.models import load_model

model = load_model("results/CNN_model.keras")
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [39]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm

def evaluate_model(model, train_data, val_data, force_reprocess=False):
    # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
    processed_columns = "processed_full_content"
    if force_reprocess or processed_columns not in train_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Train Data")
        
        train_data[processed_columns] = train_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    if force_reprocess or processed_columns not in val_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Val Data")

        # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
        val_data[processed_columns] = val_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    print("Evaluating Model")
    
    max_words = 10000
    max_sequence_length = 300

    train_texts = train_data["processed_full_content"]
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    val_texts = val_data["processed_full_content"]

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )
    y_pred = (model.predict(X_val) > 0.5).astype(int)

    if "label" not in val_data.columns:
        return y_pred

    y_val = val_data["label"]
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [41]:
df_train = datasets["train"].to_pandas()
df_test = pd.read_csv("dataset/scrapped_news.csv")
predictions = evaluate_model(model, df_train, df_test)

# Print results
print("\nPrediction Results:")
print(f"Total articles: {len(predictions)}")
print(f"Predicted Real: {sum(predictions == 1)}")
print(f"Predicted Fake: {sum(predictions == 0)}")

Processing Val Data: 100%|██████████| 1313/1313 [00:05<00:00, 240.60it/s]


Evaluating Model
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Prediction Results:
Total articles: 1313
Predicted Real: [611]
Predicted Fake: [702]


In [42]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   source                  1313 non-null   object
 1   full_content            1313 non-null   object
 2   processed_full_content  1313 non-null   object
dtypes: object(3)
memory usage: 30.9+ KB


In [44]:
df_test.to_csv("dataset/scrapped_news.csv", index=False)