In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
import os
import zipfile
import pyarrow as pa
import pyarrow.parquet as pq
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec

In [None]:
# Ensure necessary NLTK resources are downloaded
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'punkt_tab')

# nltk.download() signals failure through its boolean return value rather than
# by raising, so collect the failures and report them instead of letting a
# missing resource surface later as an unrelated LookupError.
failed_nltk_resources = [
    resource for resource in NLTK_RESOURCES if not nltk.download(resource)
]
if failed_nltk_resources:
    print(f"Warning: failed to download NLTK resources: {failed_nltk_resources}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Load SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is not installed yet: download it once, then retry.
    import subprocess
    import sys

    # sys.executable is the interpreter running this kernel; a bare "python"
    # may resolve to a different environment (or to nothing), which would
    # install the model somewhere this kernel cannot import it from.
    # check=True makes a failed download raise here instead of reappearing as
    # a second, less informative OSError from the retry below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")



In [None]:
# Shared helpers for the preprocessing step: the English stop-word set used
# for filtering tokens and the WordNet lemmatizer applied to what remains.
stop_words = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to clean raw text data
def clean_text(text):
    """Normalize a raw text value into lowercase, punctuation-free words.

    Steps, in order: lowercase, collapse whitespace, drop ``[...]`` spans,
    URLs, HTML tags, punctuation and digits, then collapse whitespace again.

    Args:
        text: The raw value to clean. Anything that is not a ``str``
            (e.g. ``None`` or NaN) is treated as missing.

    Returns:
        The cleaned string with single spaces between words and no leading or
        trailing whitespace, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # First pass: turn newlines/tabs into spaces so the non-greedy '.' patterns
    # below can match bracketed spans and tags that were split across lines.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Second pass: every removal above can leave two spaces side by side
    # ("a [x] b" -> "a  b"), so whitespace must be collapsed again at the end.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Function for tokenization, stopword removal, and lemmatization
def preprocess_text(text):
    """Tokenize ``text``, drop stop words, lemmatize the rest and re-join.

    Args:
        text: The string to process.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # filtered out before lemmatization, as tokens appear
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

In [None]:
# Function for sentiment analysis
def sentiment_score(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

In [None]:
# Function for word embedding using Word2Vec
def train_word2vec(corpus, vector_size=100, window=5, min_count=1, workers=4):
    """Train a Word2Vec model on an iterable of text documents.

    Each document is tokenized with ``word_tokenize`` before training.

    Args:
        corpus: Iterable of strings, one document per item.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum token frequency passed to Word2Vec.
        workers: Number of worker threads passed to Word2Vec.
            NOTE(review): gensim documents that fully reproducible training
            needs a single worker; confirm whether that matters here.

    Returns:
        The trained ``Word2Vec`` model.
    """
    tokenized_corpus = [word_tokenize(doc) for doc in corpus]
    return Word2Vec(
        sentences=tokenized_corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

In [None]:
# GitHub raw file URL
liar_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/refs/heads/main/Dataset/liar-fake-news-dataset/train.tsv"

# Column layout of the headerless LIAR TSV (14 tab-separated fields per row).
LIAR_COLUMNS = [
    "id", "label", "statement", "subject", "speaker", "job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context",
]

try:
    # Every column must be named. With only two names for a 14-column file,
    # pandas bound them to the LAST two columns and turned the rest into the
    # index, so "text" held the context field (e.g. "a mailer") and "label"
    # held a count that never matched the mapping below - every label was NaN.
    liar_df = pd.read_csv(
        liar_url,
        delimiter="\t",
        header=None,
        names=LIAR_COLUMNS,
        usecols=["label", "statement"],
    ).rename(columns={"statement": "text"})

    liar_df['label'] = liar_df['label'].map({'true': 0, 'false': 1})  # Real: 0, Fake: 1
    # LIAR has more verdicts than 'true'/'false'; those map to NaN above.
    # Drop them explicitly so no unlabeled rows leak into the merged dataset.
    liar_df = liar_df.dropna(subset=['label']).reset_index(drop=True)
    liar_df['label'] = liar_df['label'].astype(int)

    # Placeholders kept so all three sources share the same four columns.
    liar_df["id"] = "No ID"
    liar_df["news_url"] = "No URL"
    print("LIAR dataset loaded successfully!")
except Exception as e:
    print(f"Error loading LIAR dataset: {e}")
    liar_df = pd.DataFrame(columns=["id", "news_url", "text", "label"])  # Create empty DataFrame if loading fails


LIAR dataset loaded successfully!


In [None]:
# GitHub raw URLs
fake_news_net_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/FakeNewsNet-github/dataset/politifact_fake.csv"
real_news_net_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/FakeNewsNet-github/dataset/politifact_real.csv"

try:
    fake_news_net_df = pd.read_csv(fake_news_net_url)
    real_news_net_df = pd.read_csv(real_news_net_url)

    # Tag each frame with its class (1 = fake, 0 = real) and expose the
    # 'title' column under the shared 'text' name used by the other sources.
    title_to_text = {"title": "text"}
    fake_news_net_df = fake_news_net_df.assign(label=1).rename(columns=title_to_text)
    real_news_net_df = real_news_net_df.assign(label=0).rename(columns=title_to_text)

    fakenewsnet_df = pd.concat(
        [fake_news_net_df, real_news_net_df],
        ignore_index=True,
    )
    print("FakeNewsNet dataset loaded successfully!")

except Exception as e:
    # Best-effort load: report the problem and continue with an empty frame
    # that still has the columns the merge step selects.
    print(f"Error loading FakeNewsNet dataset: {e}")
    fakenewsnet_df = pd.DataFrame(columns=["id", "news_url", "text", "label"])


FakeNewsNet dataset loaded successfully!


In [None]:
import pandas as pd

# GitHub raw file URLs
fake_articles_url = "https://raw.github.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/Fake-News-Dataset-kaggle/Testing_dataset/testingSet/Catalog%20-%20Fake%20Articles.csv"
real_articles_url = "https://raw.github.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/Fake-News-Dataset-kaggle/Testing_dataset/testingSet/Catalog%20-%20Real%20Articles.csv"

try:
    fake_news_kaggle = pd.read_csv(fake_articles_url, encoding="utf-8")
    real_news_kaggle = pd.read_csv(real_articles_url, encoding="utf-8")

    # Keep only the article body, exposed under the shared 'text' column name.
    article_to_text = {'Article': 'text'}
    fake_news_kaggle = fake_news_kaggle.loc[:, ['Article']].rename(columns=article_to_text)
    real_news_kaggle = real_news_kaggle.loc[:, ['Article']].rename(columns=article_to_text)

    # Class tags: 1 = fake, 0 = real.
    fake_news_kaggle = fake_news_kaggle.assign(label=1)
    real_news_kaggle = real_news_kaggle.assign(label=0)

    # Stack both classes and add placeholder id / URL columns so the frame has
    # the same four columns as the other sources.
    fakenews_kaggle_df = pd.concat(
        [fake_news_kaggle, real_news_kaggle],
        ignore_index=True,
    ).assign(id="No ID", news_url="No URL")
    print("Kaggle Fake News dataset loaded successfully!")

except Exception as e:
    # Best-effort load: report the problem and continue with an empty frame.
    print(f"Error loading Kaggle Fake News dataset: {e}")
    fakenews_kaggle_df = pd.DataFrame(columns=["id", "news_url", "text", "label"])


Kaggle Fake News dataset loaded successfully!


In [None]:
# Merge all datasets
# Every source is reduced to the same four columns before stacking, and the
# row index is rebuilt so it runs continuously across the combined frame.
merge_columns = ['id', 'news_url', 'text', 'label']
source_frames = [liar_df, fakenewsnet_df, fakenews_kaggle_df]
all_data = pd.concat(
    [frame[merge_columns] for frame in source_frames],
    ignore_index=True,
)

In [None]:
# Apply text processing
all_data['clean_text'] = all_data['text'].apply(clean_text)
all_data['processed_text'] = all_data['clean_text'].apply(preprocess_text)
# Score sentiment on the cleaned text, not on processed_text: stop-word
# removal strips negations and intensifiers ("not", "no", "very", ...), which
# changes or inverts the polarity of phrases such as "not good".
# clean_text always returns a str, so missing values are scored as "".
all_data['sentiment'] = all_data['clean_text'].apply(sentiment_score)

In [None]:
# Apply TF-IDF Feature Engineering
TFIDF_MAX_FEATURES = 5000  # vocabulary cap handed to the vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
processed_corpus = all_data['processed_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

In [None]:
# Apply Word2Vec Embeddings
# The embedding model is trained on the processed text column of the merged data.
word2vec_model = train_word2vec(corpus=all_data['processed_text'])

In [None]:
# Convert text into numerical vectors
def get_avg_word2vec(text, model):
    """Average the word vectors of the whitespace-separated words in ``text``.

    Args:
        text: String whose words are looked up in the model's vocabulary.
        model: Trained model exposing its keyed vectors as ``model.wv``.

    Returns:
        A ``float32`` array holding the mean of the vectors of all words found
        in the vocabulary, or an all-zero ``float32`` array of the model's
        vector size when none of the words is known (or ``text`` is empty).
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in text.split() if word in keyed_vectors]
    if not vectors:
        # Size the fallback from the model rather than a literal 100 so it
        # always matches the shape of the averaged vectors.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0, dtype=np.float32)

# One averaged embedding per row; the trained model is forwarded to the helper
# as a keyword argument through Series.apply.
all_data['word2vec'] = all_data['processed_text'].apply(get_avg_word2vec, model=word2vec_model)


In [None]:
# Save Processed Data
import pickle

csv_path = "processed_fake_news.csv"
pickle_path = "processed_fake_news.pkl"

# Written twice: as CSV (without the index) and as a pickle of the DataFrame.
# NOTE(review): the 'word2vec' column holds NumPy arrays, which a CSV can only
# store as text - presumably the pickle is the copy to reload them from.
all_data.to_csv(csv_path, index=False)
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(all_data, pickle_file)

print("Processed dataset saved successfully!")

Processed dataset saved successfully!


In [None]:
# Load the processed dataset
processed_csv_path = "processed_fake_news.csv"
df = pd.read_csv(processed_csv_path)

# Verify the first few rows
preview = df.head(10)
print("\nSample Data\n")
print(preview)


Sample Data

      id news_url                                      text  label  \
0  No ID   No URL                                  a mailer    NaN   
1  No ID   No URL                           a floor speech.    NaN   
2  No ID   No URL                                    Denver    NaN   
3  No ID   No URL                            a news release    NaN   
4  No ID   No URL                       an interview on CNN    NaN   
5  No ID   No URL                 a an online opinion-piece    NaN   
6  No ID   No URL                          a press release.    NaN   
7  No ID   No URL  a Democratic debate in Philadelphia, Pa.    NaN   
8  No ID   No URL                                a website     NaN   
9  No ID   No URL                           an online video    NaN   

                               clean_text                     processed_text  \
0                                a mailer                             mailer   
1                          a floor speech              

In [None]:
# Check dataset label distribution
print("\nLabel Distribution by Source")

# The only provenance hints in this frame are the placeholder values. LIAR and
# Kaggle rows BOTH carry news_url == "No URL" (and id == "No ID"), so they
# cannot be told apart here; reporting them under separate headings showed the
# same mixed counts under the wrong name. Rows with any other URL value
# (including a missing one) come from FakeNewsNet.
has_placeholder_url = df['news_url'] == "No URL"

# dropna=False keeps unlabeled rows visible instead of silently omitting them.
# LIAR + FakeNewsKaggle Label Count
print(
    "\nLIAR + FakeNewsKaggle Dataset Labels (rows without a URL):\n",
    df[has_placeholder_url]['label'].value_counts(dropna=False),
)

# FakeNewsNet Dataset Label Count
print(
    "\nFakeNewsNet Dataset Labels:\n",
    df[~has_placeholder_url]['label'].value_counts(dropna=False),
)



Label Distribution by Source

LIAR Dataset Labels:
 label
1.0    50
0.0    50
Name: count, dtype: int64

FakeNewsNet Dataset Labels:
 label
0.0    624
1.0    432
Name: count, dtype: int64

FakeNewsKaggle Dataset Labels:
 label
1.0    50
0.0    50
Name: count, dtype: int64
