# Unified Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import re                                   # to preprocess regular expressions
import string
import nltk
from nltk.corpus import stopwords           #  to remove common words to reduce noise
from nltk.stem import WordNetLemmatizer     # to reduce words to their root form like 'running' to 'run'
import os

### NLTK Setup

In [2]:
# Downloading NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /Users/roshni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roshni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Base Paths

In [18]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))       # Automatically detects current project directory

DATA_DIR = os.path.join(BASE_DIR, "data")                         # Define subdirectories relative to project root
OUTPUT_DIR = os.path.join(BASE_DIR, "output")                     # optional


### Initializing Stopwords and Lemmatizer

In [19]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### Function to Clean Text

In [20]:
def clean_text(text): 
    text = str(text).lower()                         
    text = re.sub(r"http\S+", "", text)                            # removes URLs
    text = re.sub(r'<.*?>', '', text)                              # removes HTML tags
    text = re.sub(r'\d+', '', text)                                # removes digits
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)  # removes punctuation
    text = re.sub(r'\s+', ' ', text).strip()                       # removes extra spaces
    return text

def preprocess_text(text):                                         # Tokenization and Lemmatization
    words = text.split()                                           # Tokenization by splitting on spaces
    words = [w for w in words if w not in stop_words]              # Remove stopwords
    words = [lemmatizer.lemmatize(w) for w in words]               # Lemmatization by reducing words to their root form
    return ' '.join(words)                                         # Join words back into a single string 

### Dataset Cleaning Function

In [21]:
def clean_dataset(df, text_col, label_col, dataset_name):           
    df = df[[text_col, label_col]].copy()                                        # Keep only relevant columns
    df.rename(columns={text_col: 'text', label_col: 'label'}, inplace=True)      # Standardize column names

    df.drop_duplicates(inplace=True)                                             # Remove duplicates
    df.dropna(subset=['text'], inplace=True)                                     # Remove rows with missing text
    df.reset_index(drop=True, inplace=True)                                      # Reset index

    df['text'] = df['text'].apply(clean_text)                                    # applying cleaning function
    df['text'] = df['text'].apply(preprocess_text)                               # applying Preprocessing function

    output_path = os.path.join(OUTPUT_DIR, f"{dataset_name}_Cleaned.csv")        # define output path
    df.to_csv(output_path, index=False)                                          # Save cleaned CSV
    print(f"✅ {dataset_name} cleaned and saved ({df.shape[0]} rows) → {output_path}")    
    return df                                                                    # return cleaned dataframe    

## Load & Clean Each Dataset

### CoAID

In [22]:
print("Full path:", coaid_path)

Full path: /Users/roshni/Desktop/DrParmar_Project/owml_project/notebooks/data/CoAID/CoAID_News_Combined.csv


In [24]:
coaid_path = os.path.join(DATA_DIR, "CoAID", "CoAID_News_Combined.csv")                                        # path to CoAID dataset
coaid_df = pd.read_csv(coaid_path)                                                                          # load CoAID dataset
coaid_cleaned = clean_dataset(coaid_df, text_col='content', label_col='label', dataset_name='CoAID')        # apply cleaning function

✅ CoAID cleaned and saved (3079 rows) → /Users/roshni/Desktop/DrParmar_Project/owml_project/output/CoAID_Cleaned.csv


### FakeNewsNet (PolitiFact example)

In [25]:
fakenews_path_real = os.path.join(DATA_DIR, "FakeNewsNet/PolitiFact_real_news_content.csv")                      
fakenews_path_fake = os.path.join(DATA_DIR, "FakeNewsNet/PolitiFact_fake_news_content.csv")                      
df_real = pd.read_csv(fakenews_path_real)                                                                        
df_fake = pd.read_csv(fakenews_path_fake)                                                                        

df_fakenews = pd.concat([df_real, df_fake], ignore_index=True)                                                   # combine real and fake news

df_fakenews['label'] = [1]*len(df_fake) + [0]*len(df_real)                                                       # create label column: 1 for fake, 0 for real

fakenews_cleaned = clean_dataset(df_fakenews, text_col='text', label_col='label', dataset_name='FakeNewsNet')    # apply cleaning function

                         

✅ FakeNewsNet cleaned and saved (228 rows) → /Users/roshni/Desktop/DrParmar_Project/owml_project/output/FakeNewsNet_Cleaned.csv


### WELFake

In [26]:
welfare_path = os.path.join(DATA_DIR, "WELFake/WELFake.csv")                                                      
welfare_df = pd.read_csv(welfare_path)                                                                            

welfare_df['text_combined'] = welfare_df['title'].fillna('') + ' ' + welfare_df['text'].fillna('')                # combine title and text columns
welfare_cleaned = clean_dataset(welfare_df, text_col='text_combined', label_col='label', dataset_name='WELFake')  # apply cleaning function


✅ WELFake cleaned and saved (63678 rows) → /Users/roshni/Desktop/DrParmar_Project/owml_project/output/WELFake_Cleaned.csv


## Quick Sanity Check

In [27]:
print("\nSample cleaned text from CoAID:")              # print sample cleaned text from CoAID
print(coaid_cleaned.head(3)['text'])

print("\nSample cleaned text from FakeNewsNet:")        # print sample cleaned text from FakeNewsNet
print(fakenews_cleaned.head(3)['text'])

print("\nSample cleaned text from WELFake:")            # print sample cleaned text from WELFake
print(welfare_cleaned.head(3)['text'])


Sample cleaned text from CoAID:
0    browser longer supported please switch support...
1    de moines iowa new study conducted researcher ...
2    confirm receive notification channel subscribe...
Name: text, dtype: object

Sample cleaned text from FakeNewsNet:
0    k share share story hillary clinton called fac...
1    famous dog killed spot waited year owner retur...
2    story highlight house oversight panel voted ho...
Name: text, dtype: object

Sample cleaned text from WELFake:
0    law enforcement high alert following threat co...
1                            post vote hillary already
2    unbelievable obama’s attorney general say char...
Name: text, dtype: object
