**FakeNewsNet Cleaning techniques**

**Dataset 1: PolitiFact**

In [None]:
# Cleaning Techniques for PolitiFact:
## 1. Deduplication: Eliminate repeated articles/posts
## 2. Missing values/nulls: Drop rows with a missing title or label
## 3. Lowercasing: Standardize textual data for tokenization
## 4. URL/user mention removal: Clean Twitter-based metadata (e.g., “@user”, “http”)
## 5. Punctuation removal: Eliminate noise from text
## 6. Emoji/HTML tag stripping: Remove irrelevant or encoded characters
## 7. Non-English removal: Keep only English-language content (based on langdetect or SpaCy)
## 8. Class imbalance: Checked only; no rebalancing is needed because the ratio of real to fake articles is acceptable

In [None]:
from google.colab import files

# Upload files politifact_real and politifact_fake datasets
# The loading cell below reads "politifact_fake.csv" and "politifact_real.csv"
# by those exact names, so both files must be uploaded here first.
# NOTE(review): `uploaded` presumably maps each uploaded filename to its
# contents — confirm against the google.colab documentation.
uploaded = files.upload()


In [None]:
import pandas as pd

# Read each class from its own CSV and tag every row with the class name.
df_fake = pd.read_csv("politifact_fake.csv").assign(label='fake')
df_real = pd.read_csv("politifact_real.csv").assign(label='real')

# Stack the fake rows on top of the real rows under a fresh 0..n-1 index.
df = pd.concat([df_fake, df_real], ignore_index=True)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'politifact_fake.csv'

In [None]:
# Flag every repeat of an already-seen title (the first occurrence is not flagged)
repeated_title = df.duplicated(subset='title')
print("Duplicates in title:", repeated_title.sum())

# Keep only the first occurrence of each title
df = df[~repeated_title]

# Report how many values are missing in each column
print("\nMissing values per column:")
print(df.isna().sum())

# Title and label are both required for modeling; discard rows lacking either
df = df.dropna(subset=['title', 'label'])

# Preview cleaned structure
df.head()

In [None]:
import re

def clean_text(text):
    """Normalise a raw headline so it can be tokenised.

    Steps, in order: decode HTML entities, lowercase, strip HTML tags, strip
    URLs, strip @mentions, drop punctuation/symbols (emoji included), and
    collapse runs of whitespace.

    Args:
        text: Raw title. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text. May be empty (always empty for missing input).
    """
    import html  # local import keeps this cell runnable on its own

    # Missing values would otherwise turn into the literal words "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = html.unescape(str(text)).lower()            # "&amp;" -> "&", then lowercase
    # Tags go first: a URL inside an attribute would otherwise swallow the
    # tag's text because "http\S+" runs up to the next whitespace.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)       # Remove HTML tags
    text = re.sub(r"http\S+|www\.\S+", "", text)       # Remove URLs
    text = re.sub(r"@\w+", "", text)                   # Remove mentions
    # Emoji and other symbols are not word characters, so this single pass
    # removes them along with punctuation. A code-point range such as
    # U+263A..U+1F645 must not be used here: it also spans CJK, kana and
    # Hangul letters and would silently delete non-Latin words.
    text = re.sub(r"[^\w\s]", "", text)                # Remove punctuation/emoji
    return re.sub(r"\s+", " ", text).strip()           # Collapse whitespace

# Apply to title column
df['clean_title'] = df['title'].apply(clean_text)

# Titles that differed only by case, punctuation or URLs are identical once
# normalised, so deduplicate again on the cleaned text (first occurrence kept).
rows_before = len(df)
df = df.drop_duplicates(subset='clean_title')
print("Duplicates removed after cleaning:", rows_before - len(df))

# Preview result
df[['title', 'clean_title', 'label']].head()

In [None]:
!pip install langdetect

In [None]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Function to safely detect language
def detect_lang(text):
    """Return langdetect's language code for ``text`` (e.g. "en").

    Args:
        text: The string to classify.

    Returns:
        str: The detected language code, or "unknown" when ``text`` is not a
        string, is empty/whitespace-only, or langdetect cannot classify it.
    """
    # Nothing to classify: skip the detector instead of relying on it to raise.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # langdetect samples randomly, so short texts such as headlines can get a
    # different answer on each run. Fixing the seed makes the filter reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Classify every cleaned title; the result is only needed as a row filter,
# so it is kept in a mask instead of a temporary column.
is_english = df['clean_title'].apply(detect_lang) == 'en'

# Retain the English rows only
df = df.loc[is_english].copy()

# Preview
df[['clean_title', 'label']].sample(5)

In [None]:
import matplotlib.pyplot as plt

# Count fake vs real
class_counts = df['label'].value_counts()
print("🧮 Class Distribution:\n", class_counts)

# Colour each bar by its label, not by its position: value_counts() orders the
# classes by frequency, so a positional colour list would paint whichever class
# is larger in red.
label_colors = {'fake': 'red', 'real': 'green'}
bar_colors = [label_colors.get(label, 'gray') for label in class_counts.index]

# Plot bar chart
fig, ax = plt.subplots()
class_counts.plot(kind='bar', color=bar_colors, rot=0, ax=ax)
ax.set_title("Class Distribution After Cleaning")
ax.set_xlabel("Label")
ax.set_ylabel("Number of Samples")
ax.grid(axis='y')
plt.show()

In [None]:
# Write only the modelling columns to disk, without the row index.
# NOTE(review): the file is named after FakeNewsNet although this frame holds
# the PolitiFact subset only — confirm the name is what downstream code expects.
output_path = 'clean_fakenewsnet.csv'
export_columns = ['clean_title', 'label']
df[export_columns].to_csv(output_path, index=False)

# Hand the file to the browser (Colab-only helper)
from google.colab import files
files.download(output_path)

**Dataset 2: GossipCop**

In [None]:
# Cleaning Techniques for GossipCop:
## 1. Deduplication: Eliminate repeated articles/posts
## 2. Missing values/nulls: Drop rows with a missing title or label
## 3. Lowercasing: Standardize textual data for tokenization
## 4. URL/user mention removal: Clean Twitter-based metadata (e.g., “@user”, “http”)
## 5. Punctuation removal: Eliminate noise from text
## 6. Emoji/HTML tag stripping: Remove irrelevant or encoded characters
## 7. Non-English removal: Keep only English-language content (based on langdetect or SpaCy)
## 8. Class imbalance: Addressed by randomly undersampling the real class to the size of the fake class

In [None]:
# Upload files gossipcop_real and gossipcop_fake datasets
# The loading cell below reads "gossipcop_fake.csv" and "gossipcop_real.csv"
# by those exact names, so both files must be uploaded here first.
from google.colab import files
# NOTE(review): `uploaded` presumably maps each uploaded filename to its
# contents — confirm against the google.colab documentation.
uploaded = files.upload()

In [None]:
import pandas as pd

# Read each class from its own CSV and tag every row with the class name
df_fake = pd.read_csv("gossipcop_fake.csv").assign(label='fake')
df_real = pd.read_csv("gossipcop_real.csv").assign(label='real')

# Stack the fake rows on top of the real rows under a fresh 0..n-1 index
frames = [df_fake, df_real]
df = pd.concat(frames, ignore_index=True)

# Preview
df.head()

In [None]:
# Flag every repeat of an already-seen title (the first occurrence is not flagged)
repeated_title = df.duplicated(subset='title')
print("Duplicates in title:", repeated_title.sum())

# Keep only the first occurrence of each title
df = df[~repeated_title]

# Report how many values are missing in each column
print("\nMissing values per column:")
print(df.isna().sum())

# Title and label are both required for modeling; discard rows lacking either
df = df.dropna(subset=['title', 'label'])

# Preview cleaned structure
df.head()

In [None]:
import re

# Define clean text function
def clean_text(text):
    """Normalise a raw headline so it can be tokenised.

    Steps, in order: decode HTML entities, lowercase, strip HTML tags, strip
    URLs, strip @mentions, drop punctuation/symbols (emoji included), and
    collapse runs of whitespace.

    Args:
        text: Raw title. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text. May be empty (always empty for missing input).
    """
    import html  # local import keeps this cell runnable on its own

    # Missing values would otherwise turn into the literal words "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = html.unescape(str(text)).lower()            # "&amp;" -> "&", then lowercase
    # Tags go first: a URL inside an attribute would otherwise swallow the
    # tag's text because "http\S+" runs up to the next whitespace.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)       # Remove HTML tags
    text = re.sub(r"http\S+|www\.\S+", "", text)       # Remove URLs
    text = re.sub(r"@\w+", "", text)                   # Remove mentions
    # Emoji and other symbols are not word characters, so this single pass
    # removes them along with punctuation. A code-point range such as
    # U+263A..U+1F645 must not be used here: it also spans CJK, kana and
    # Hangul letters and would silently delete non-Latin words.
    text = re.sub(r"[^\w\s]", "", text)                # Remove punctuation/emoji
    return re.sub(r"\s+", " ", text).strip()           # Collapse whitespace

# Apply to title column
df['clean_title'] = df['title'].apply(clean_text)

# Titles that differed only by case, punctuation or URLs are identical once
# normalised, so deduplicate again on the cleaned text (first occurrence kept).
rows_before = len(df)
df = df.drop_duplicates(subset='clean_title')
print("Duplicates removed after cleaning:", rows_before - len(df))

# Preview result
df[['title', 'clean_title', 'label']].head()

In [None]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Define function for language detection
def detect_lang(text):
    """Return langdetect's language code for ``text`` (e.g. "en").

    Args:
        text: The string to classify.

    Returns:
        str: The detected language code, or "unknown" when ``text`` is not a
        string, is empty/whitespace-only, or langdetect cannot classify it.
    """
    # Nothing to classify: skip the detector instead of relying on it to raise.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # langdetect samples randomly, so short texts such as headlines can get a
    # different answer on each run. Fixing the seed makes the filter reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Classify every cleaned title; the result is only needed as a row filter,
# so it is kept in a mask instead of a temporary column.
is_english = df['clean_title'].apply(detect_lang) == 'en'

# Retain the English rows only
df = df.loc[is_english].copy()

# Preview result
df[['clean_title', 'label']].sample(5)

In [None]:
import matplotlib.pyplot as plt

# Count labels
class_counts = df['label'].value_counts()
print("\nClass Distribution:\n", class_counts)

# Colour each bar by its label, not by its position: value_counts() orders the
# classes by frequency, so a positional colour list would paint whichever class
# is larger in red.
label_colors = {'fake': 'red', 'real': 'green'}
bar_colors = [label_colors.get(label, 'gray') for label in class_counts.index]

# Plot distribution
fig, ax = plt.subplots()
class_counts.plot(kind='bar', color=bar_colors, rot=0, ax=ax)
ax.set_title("Class Distribution After Cleaning")
ax.set_xlabel("Label")
ax.set_ylabel("Number of Samples")
ax.grid(axis='y')
plt.show()

In [None]:
# Reference for Undersampling technique: A comprehensive survey of fake news in social networks: Attributes,features, and detection approaches (random undersampling technique)

# Separate by class
df_real = df[df['label'] == 'real']
df_fake = df[df['label'] == 'fake']

# Downsample whichever class is larger to the size of the smaller one.
# Sampling without replacement raises ValueError when n exceeds the population,
# so the real class can only be sampled down when it is at least as large as
# the fake class; the mirrored branch covers the opposite case.
minority_size = min(len(df_real), len(df_fake))
if len(df_real) >= len(df_fake):
    df_real_downsampled = df_real.sample(n=minority_size, random_state=42)
    df_fake_downsampled = df_fake
else:
    df_real_downsampled = df_real
    df_fake_downsampled = df_fake.sample(n=minority_size, random_state=42)

# Combine and shuffle (fixed seed so the row order is reproducible)
df_balanced = pd.concat([df_fake_downsampled, df_real_downsampled], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Confirm balance
print(df_balanced['label'].value_counts())

In [None]:
# Write only the modelling columns of the balanced frame, without the row index
output_path = 'clean_gossipcop.csv'
export_columns = ['clean_title', 'label']
df_balanced[export_columns].to_csv(output_path, index=False)

# Hand the file to the browser (Colab-only helper)
from google.colab import files
files.download(output_path)