# Import Libraries

In [13]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download the stopwords dataset
nltk.download('stopwords')

# Download the 'punkt' tokeniser model
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wware\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wware\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dataset

In [2]:
# Load dataset
df = pd.read_csv('../data/WELFake_Dataset.csv')

# Display the first few rows
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


The dataset contains the following columns:

- **Serial number**: An integer identifier for each news article (starting from 0).
- **Title**: The title or headline of the news article.
- **Text**: The main content of the news article.
- **Label**: Indicates whether the article is real or fake:
  - `0`: Fake news
  - `1`: Real news

The dataset provides a robust foundation for training and evaluating machine learning models aimed at detecting fake news. The labels are balanced with a nearly equal number of real and fake news articles, making it suitable for binary classification tasks.

### Source

The dataset was published in the following paper:

- **IEEE Transactions on Computational Social Systems**: pp. 1-13 (doi: [10.1109/TCSS.2021.3068519](https://doi.org/10.1109/TCSS.2021.3068519)).

You can access the dataset on Kaggle via this [link](https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification).

# Drop Unnecessary Columns

In [3]:
# Drop the 'Unnamed: 0' column
df.drop(columns=['Unnamed: 0'], inplace=True)

# Display the first few rows
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# Handling Missing Values

In [4]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print(missing_values)


title    558
text      39
label      0
dtype: int64


In [5]:
# Drop rows with critical missing text data
df.dropna(subset=['text'], inplace=True)

df.head()


Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print(missing_values)

title    558
text       0
label      0
dtype: int64


In [7]:
# Fill missing values in 'title' column with 'No Title'
df['title'].fillna('No Title', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['title'].fillna('No Title', inplace=True)


In [8]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print(missing_values)

title    0
text     0
label    0
dtype: int64


# Removing Stopwords

Stopwords are common words (like "the," "is," "in," etc.) that typically do not carry much meaning and can be removed to reduce noise in the data.

In [9]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,No Title,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [10]:
# Combine 'title' and 'text' into a single column called 'combined_text'
df['combined_text'] = df['title'].fillna('') + " " + df['text']

In [11]:
df.head()

Unnamed: 0,title,text,label,combined_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,No Title,Did they post their votes for Hillary already?,1,No Title Did they post their votes for Hillary...
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...


In [12]:
# Set of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    # Tokenize the text into words
    word_tokens = word_tokenize(text)
    
    # Filter out the stopwords
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    
    # Join the filtered words back into a string
    return ' '.join(filtered_text)

# Apply the function to the 'combined_text' column
df['cleaned_text'] = df['combined_text'].apply(remove_stopwords)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\wware/nltk_data'
    - 'c:\\Users\\wware\\AppData\\Local\\anaconda3\\envs\\fake-news-predictor\\nltk_data'
    - 'c:\\Users\\wware\\AppData\\Local\\anaconda3\\envs\\fake-news-predictor\\share\\nltk_data'
    - 'c:\\Users\\wware\\AppData\\Local\\anaconda3\\envs\\fake-news-predictor\\lib\\nltk_data'
    - 'C:\\Users\\wware\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
