In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
import os
import zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec

In [2]:
# Fetch the NLTK data packages this notebook relies on: tokenizer models,
# the stopword lists and the WordNet corpus.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Load the small English spaCy pipeline, installing it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # OSError is what this cell treats as "model package not installed".
    import subprocess
    import sys

    # Run the download with the interpreter backing this kernel rather than
    # whatever "python" resolves to on PATH, so the model is installed into
    # the environment that will load it. check=True surfaces a failed
    # download immediately instead of as a second, less informative OSError.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

In [4]:
# Shared helpers for preprocess_text: the English stopword list (as a set for
# fast membership tests) and a WordNet-based lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [5]:
# Function to clean raw text data
def clean_text(text):
    """Normalize a raw text value into lowercase words separated by single spaces.

    Steps, in order: lowercase, flatten whitespace, drop ``[...]`` spans, URLs,
    ``<...>`` tags, punctuation and digits, then collapse whitespace again and
    trim the ends.

    Args:
        text: The raw value to clean. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Flatten newlines/tabs first: '.' does not match '\n', so bracketed spans
    # and tags that wrap across lines would otherwise survive the patterns below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # The removals above leave doubled and leading/trailing spaces behind, so
    # whitespace has to be normalized once more at the very end.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
# Tokenize, drop stopwords, lemmatize, and re-join into one string
def preprocess_text(text):
    """Return ``text`` reduced to its lemmatized, non-stopword tokens.

    The text is split with ``word_tokenize``; tokens found in the module-level
    ``stop_words`` set are discarded and the remaining ones are passed through
    ``lemmatizer.lemmatize``. The result is the surviving lemmas joined by
    single spaces.
    """
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(kept_lemmas)

In [7]:
# Sentiment helper built on TextBlob
def sentiment_score(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    return polarity

In [8]:
# Function for word embedding using Word2Vec
def train_word2vec(corpus, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim Word2Vec model on an iterable of text documents.

    Each document is tokenized with ``word_tokenize`` and the resulting token
    lists are used as training sentences.

    Args:
        corpus: Iterable of document strings.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum token frequency for a word to be kept.
        workers: Number of training worker threads.
        seed: Random seed forwarded to Word2Vec.
            NOTE(review): per the gensim documentation, fully reproducible
            vectors also require ``workers=1`` -- confirm before relying on it.

    Returns:
        The trained ``Word2Vec`` model.
    """
    tokenized_corpus = [word_tokenize(doc) for doc in corpus]
    model = Word2Vec(
        sentences=tokenized_corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )
    return model

In [11]:
# GitHub raw file URL
url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/liar-fake-news-dataset/train.tsv"

# Read the dataset from GitHub
try:
    # Read every column without naming them. Passing only two `names` to a
    # wider file makes pandas use the leading columns as the index and bind
    # the names to the LAST two columns, silently selecting the wrong fields.
    liar_raw = pd.read_csv(url, delimiter="\t", header=None)

    # NOTE(review): assumes the standard LIAR layout (id, label, statement, ...)
    # whenever the file has three or more columns -- confirm against the TSV.
    # A two-column file is read as (label, text), matching the original code.
    label_col, text_col = (1, 2) if liar_raw.shape[1] >= 3 else (0, 1)
    liar_df = liar_raw[[label_col, text_col]].copy()
    liar_df.columns = ['label', 'text']

    # Only 'true' and 'false' are mapped; every other label value becomes NaN.
    # Drop those rows here so missing labels do not leak into the merged data.
    liar_df['label'] = liar_df['label'].map({'true': 0, 'false': 1})
    liar_df = liar_df.dropna(subset=['label']).reset_index(drop=True)
    liar_df['label'] = liar_df['label'].astype(int)

    if liar_df.empty:
        raise ValueError("No rows with a 'true' or 'false' label were found")

    print("Dataset Uploaded Successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise so later cells do not run with liar_df undefined or stale.
    raise

Dataset Uploaded Successfully!


In [10]:
# GitHub raw URLs
fake_news_net_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/FakeNewsNet-github/dataset/politifact_fake.csv"
real_news_net_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/FakeNewsNet-github/dataset/politifact_real.csv"

try:
    # Read CSV with error handling (malformed rows are skipped, not fatal)
    fake_news_net_df = pd.read_csv(fake_news_net_url, delimiter=",", quotechar='"', on_bad_lines="skip")
    real_news_net_df = pd.read_csv(real_news_net_url, delimiter=",", quotechar='"', on_bad_lines="skip")

    # Assign labels
    fake_news_net_df['label'] = 1  # Fake news
    real_news_net_df['label'] = 0  # Real news

    # Keep only necessary columns
    fake_news_net_df = fake_news_net_df[['id', 'news_url', 'title', 'label']]
    real_news_net_df = real_news_net_df[['id', 'news_url', 'title', 'label']]

    # Merge datasets with a fresh index, as the Kaggle loader does
    fakenewsnet_df = pd.concat([fake_news_net_df, real_news_net_df], ignore_index=True)

    # The merged pipeline filters and cleans on a 'text' column
    # (dropna(subset=['text'])). Without one, every FakeNewsNet row is
    # discarded after the merge, so expose the headline under 'text' while
    # keeping 'title' for any code that still reads it.
    fakenewsnet_df['text'] = fakenewsnet_df['title']

    print("FakeNewsNet dataset loaded successfully!")

except Exception as e:
    print(f"Error loading FakeNewsNet dataset: {e}")
    # Re-raise so later cells do not run with fakenewsnet_df undefined or stale.
    raise


FakeNewsNet dataset loaded successfully!


In [12]:
import pandas as pd

# GitHub raw file URLs for the Kaggle fake/real article catalogs
fake_articles_url = "https://raw.github.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/Fake-News-Dataset-kaggle/Testing_dataset/testingSet/Catalog%20-%20Fake%20Articles.csv"
real_articles_url = "https://raw.github.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Dataset/Fake-News-Dataset-kaggle/Testing_dataset/testingSet/Catalog%20-%20Real%20Articles.csv"

try:
    # Load datasets from GitHub (files are decoded as ISO-8859-1)
    fake_news_kaggle = pd.read_csv(fake_articles_url, encoding="ISO-8859-1")
    real_news_kaggle = pd.read_csv(real_articles_url, encoding="ISO-8859-1")

    # Keep only relevant columns: "Article" for text
    fake_news_kaggle = fake_news_kaggle[['Article']].rename(columns={'Article': 'text'})
    real_news_kaggle = real_news_kaggle[['Article']].rename(columns={'Article': 'text'})

    # Assign labels: 1 for Fake, 0 for Real
    fake_news_kaggle['label'] = 1
    real_news_kaggle['label'] = 0

    # Merge both datasets into a single dataframe
    fakenews_kaggle_df = pd.concat([fake_news_kaggle, real_news_kaggle], ignore_index=True)

    print("Kaggle Fake News dataset loaded successfully!")

except Exception as e:
    print(f"Error loading Kaggle Fake News dataset: {e}")
    # Re-raise so later cells do not run with fakenews_kaggle_df undefined
    # or left over from an earlier execution.
    raise


Kaggle Fake News dataset loaded successfully!


In [13]:
# Stack the LIAR, FakeNewsNet and Kaggle frames row-wise into one table,
# discarding their individual indexes in favour of a fresh one.
all_data = pd.concat(
    objs=[liar_df, fakenewsnet_df, fakenews_kaggle_df],
    axis=0,
    ignore_index=True,
)

In [14]:
# Remove exact duplicate rows, then rows that have no text to work with
all_data = all_data.drop_duplicates().dropna(subset=['text'])

In [15]:
# Run clean_text over every raw text value and store the result alongside it
all_data['clean_text'] = all_data['text'].map(clean_text)

In [16]:
# Tokenize, filter and lemmatize the cleaned text, one row at a time
all_data['processed_text'] = all_data['clean_text'].map(preprocess_text)

In [17]:
# Compute sentiment scores
# Score the cleaned text rather than processed_text: stopword removal strips
# negators and intensifiers such as "not", "no" and "very", which changes the
# polarity TextBlob assigns (e.g. "not good" would be scored as "good").
# clean_text always yields a str, so sentiment_score never sees a non-string.
all_data['sentiment'] = all_data['clean_text'].apply(sentiment_score)

In [18]:
# Final status line; reached only if every preceding cell ran without raising.
print("Data processing completed successfully!")

Data processing completed successfully!
