## Download dependencies

In [14]:
import re
import pandas as pd
import sys
import nltk
import ssl
import os
import numpy as np
import wandb
import wandb.integration

from cleantext.clean import clean
from concurrent.futures import ProcessPoolExecutor
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.neural_network import MLPClassifier

## Process data

In [None]:
def initialize_nltk():
    """Make sure the NLTK resources used by this notebook are available.

    Looks up the punkt tokenizer and the wordnet, omw-1.4 and stopwords
    corpora. If any lookup fails, all four packages are downloaded quietly.
    """
    required_paths = (
        'tokenizers/punkt',
        'corpora/wordnet',
        'corpora/omw-1.4',
        'corpora/stopwords',
    )
    try:
        for resource_path in required_paths:
            nltk.data.find(resource_path)
    except LookupError:
        # At least one resource is missing; fall through to the download step.
        pass
    else:
        # Everything was found locally, so there is nothing to download.
        return

    # Use the unverified HTTPS context factory for downloads when this Python
    # build exposes one.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context

    for package in ('punkt', 'wordnet', 'omw-1.4', 'stopwords'):
        nltk.download(package, quiet=True)

# Make sure the NLTK resources are present before stopwords are loaded below.
initialize_nltk()

# Vocabulary counters, one per preprocessing stage. They are updated in place
# by clean_column (before/after cleaning), remove_stopwords and stem_text.
uniqueWordsBeforePreprocessed = Counter()
uniqueWordsAfterCleaned = Counter()
uniqueWordsAfterRemStopWords = Counter()
uniqueWordsAfterStemming = Counter()

# NOTE(review): allWords is not read or updated anywhere in the visible code.
allWords = 0
# English stop words as a set, used for membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

# NOTE(review): num_workers is not used by the visible code — confirm whether
# parallel processing was intended.
num_workers = os.cpu_count()

# Date matcher with three alternatives. The month names are capitalised and no
# re.IGNORECASE flag is set, so the first two alternatives only match text that
# has not been lowercased.
date_patterns = re.compile(
    # Abbreviated month, optional dot, then two digit groups ("Jan. 5, 2020").
    r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?\s+\d+,?\s+\d+\b|'
    # Full month name followed by two digit groups ("January 5, 2020").
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d+,?\s+\d+\b|'
    # Three dash-separated digit groups with an optional time part
    # ("2018-02-10 13:43:39.521661").
    r'(\d+)-(\d+)-(\d+) ?(\d*):?(\d*):?(\d*)(\.\d+)?'
)

def clean_column(text):
    """Clean one raw text value with the cleantext library.

    Non-string values are returned unchanged. For strings, dates are replaced
    with a `<DATE>` token, the text is lowercased, and cleantext's ``clean``
    then removes punctuation and replaces URLs, e-mail addresses and numbers
    with placeholder tokens. The before/after vocabulary counters are updated
    as a side effect.

    Args:
        text: The value to clean (normally a ``str``).

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Counter.update mutates the module-level counter in place, so no
    # `global` declaration is required.
    uniqueWordsBeforePreprocessed.update(text.split())

    # Replace dates BEFORE lowercasing: the month-name alternatives of
    # date_patterns are capitalised and compiled without re.IGNORECASE, so
    # they can never match text that has already been lowercased.
    text = date_patterns.sub(' <DATE> ', text)
    text = text.lower()

    text = clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        no_punct=True,
        no_urls=True,
        no_emails=True,
        no_numbers=True,
        replace_with_punct="",
        no_line_breaks=True,
        replace_with_url=" <URL> ",
        replace_with_email=" <EMAIL> ",
        replace_with_number=" <NUMBER> ",
        lower=True,
    )

    uniqueWordsAfterCleaned.update(text.split())
    return text
    
def remove_stopwords(text):
    """Drop stop words from an already-cleaned string.

    The text is split on single spaces, tokens found in ``stop_words`` are
    discarded, the surviving tokens are counted in
    ``uniqueWordsAfterRemStopWords`` and re-joined with single spaces.
    Non-string input is handed back untouched.
    """
    if isinstance(text, str):
        kept = []
        for token in text.split(' '):
            if token in stop_words:
                continue
            kept.append(token)

        uniqueWordsAfterRemStopWords.update(kept)
        return ' '.join(kept)

    return text

def stem_text(text):
    """Stem every space-separated token of an already-cleaned string.

    Uses NLTK's Porter stemmer, records the stems in
    ``uniqueWordsAfterStemming`` and returns them joined by single spaces.
    Non-string input is handed back untouched.
    """
    if isinstance(text, str):
        porter = nltk.stem.PorterStemmer()
        stems = list(map(porter.stem, text.split(' ')))

        uniqueWordsAfterStemming.update(stems)
        return " ".join(stems)

    return text

def process_text(text):
    """Run one text entry through the full preprocessing pipeline.

    The stages are applied in order: cleaning, stop-word removal, stemming.
    """
    return stem_text(remove_stopwords(clean_column(text)))

def process_df(df):
    """Preprocess the 'content' column of ``df`` in place and return ``df``."""
    processed_content = df['content'].apply(process_text)
    df['content'] = processed_content
    return df

# Stream the raw corpus in 10,000-row chunks, loading only the two columns
# that the rest of the notebook uses.
df_chunks = pd.read_csv("995,000_rows.csv", usecols=['type','content'], chunksize=10000)

# newline='' leaves line endings to the CSV writer, as the pandas to_csv
# documentation asks for when a file object is passed. Without it, text mode
# on Windows expands the writer's "\r\n" to "\r\r\n".
with open("processed.csv", 'w', encoding='utf-8', newline='') as f:
    for chunk in df_chunks:
        processed_chunk = process_df(chunk)
        # Emit the header only for the first chunk, while the file is empty.
        processed_chunk.to_csv(f, index=False, header=f.tell()==0)
print("Cleaned data saved to processed.csv")
# Vocabulary size after each preprocessing stage.
print("Unique words before preprocessing: ", len(uniqueWordsBeforePreprocessed))
print("Unique words after cleaning: ", len(uniqueWordsAfterCleaned))
print("Unique words after removing stop words: ", len(uniqueWordsAfterRemStopWords))
print("Unique words after stemming: ", len(uniqueWordsAfterStemming))


## Split data

In [None]:
input_csv = "processed.csv"

try:
    # Read the preprocessed corpus in 10,000-row chunks, then combine them
    # into a single DataFrame.
    chunks = list(pd.read_csv(input_csv, chunksize=10000))
    df = pd.concat(chunks)
except FileNotFoundError:
    print(f"Error: File '{input_csv}' not found.")
    sys.exit(1)

# 80% training; the remaining 20% is split evenly into validation and test.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

train_df.to_csv('995_train.csv', index = False)
val_df.to_csv('995_validation.csv', index=False)
test_df.to_csv('995_test.csv', index = False)

# Report the file names that were actually written above.
print('processed.csv has successfully been split into 995_train.csv, 995_validation.csv, 995_test.csv')

## Logistic Regression

In [None]:
# Module-level TF-IDF vectorizer (max_features=10000). logistic_regression()
# fits it on the training text and test_model_lr() reuses the fitted instance.
vectorizer = TfidfVectorizer(max_features=10000)

def label_entry(type, labels):
    """Map a document type to its binary class.

    ``labels[0]`` lists the types that map to class 0 and ``labels[1]`` the
    types that map to class 1. Any other type is reported on stdout and yields
    None, which callers can remove with ``.dropna()``.
    """
    for class_id in (0, 1):
        if type in labels[class_id]:
            return class_id

    print(f"MISSED TYPE:{type}")
    return None

def logistic_regression(train_x:pd.Series, train_y:pd.Series, labels):
    """
    Fit the shared TF-IDF vectorizer and a logistic-regression classifier.

    ``train_x`` is vectorized with the module-level ``vectorizer`` (which is
    fitted here), cast to float32 and used together with ``train_y`` to train
    a class-balanced model with the saga solver. ``labels`` is accepted but
    not used. Returns the fitted model.
    """
    features = vectorizer.fit_transform(train_x).astype(np.float32)
    print("Done vectorizing data.")

    classifier = LogisticRegression(
        class_weight="balanced",
        random_state=16,
        solver="saga",
        max_iter=1000,
        n_jobs=-1,
    )
    print("Done iterating data.")

    classifier.fit(features, train_y)
    print("Done training model.")
    return classifier

def test_model_lr(model:LogisticRegression, val_x:pd.Series, val_y:pd.Series, test_X=None, test_Y=None, wandb_init = False):
    """
    Evaluate a fitted logistic-regression model with the shared vectorizer.

    Prints a classification report for the validation data and computes its
    F1 score. When both ``test_X`` and ``test_Y`` are given, the test data is
    scored as well.

    Args:
        model: Fitted classifier.
        val_x: Validation texts.
        val_y: Validation labels.
        test_X: Optional test texts.
        test_Y: Optional test labels.
        wandb_init: When truthy, log the validation F1 to Weights & Biases.

    Returns:
        ``val_score`` alone, or ``(val_score, test_score)`` when test data was
        supplied.
    """
    val_features = vectorizer.transform(val_x).astype(np.float32)

    # Evaluate scores:
    val_predict = model.predict(val_features)
    val_score = f1_score(val_y, val_predict)
    print("CLASS REPORT VALIDATION DATA:")
    print(classification_report(val_y, val_predict))

    if wandb_init:
        wandb.init(project="gds-project-test")
        # The value is an F1 score, so log it under an F1 key rather than
        # "val_accuracy"; this is also the key test_model_nn uses.
        wandb.log({"val_f1": val_score})
        wandb.finish()

    if test_X is None or test_Y is None:
        return val_score

    test_features = vectorizer.transform(test_X).astype(np.float32)
    test_predict = model.predict(test_features)
    test_score = f1_score(test_Y, test_predict)
    return val_score, test_score

# Prepare the training, validation and test sets for the logistic-regression
# model. labels[0] holds the types mapped to class 0 and labels[1] the types
# mapped to class 1 (see label_entry); rows whose type appears in
# unwanted_labels are dropped before labelling.
labels = (["fake", "satire", "bias", "conspiracy", "junksci", "hate", "state"],
    ["clickbait", "political", "reliable"])
unwanted_labels = ["unreliable", "rumor", "unknown", "2018-02-10 13:43:39.521661"]

#split corpus data
train = pd.read_csv("995_train.csv").dropna()
train.drop(train.index[(train["type"].isin(unwanted_labels))],axis=0,inplace=True)
train_X = train['content']
train_Y = train['type'].apply(label_entry, args=(labels,)).to_numpy().astype(np.float32)

validation = pd.read_csv("995_validation.csv").dropna()
validation.drop(validation.index[(validation["type"].isin(unwanted_labels))],axis=0,inplace=True)
validation_X = validation['content']
validation_Y = validation['type'].apply(label_entry, args=(labels,)).to_numpy().astype(np.float32)

test = pd.read_csv("995_test.csv").dropna()
test.drop(test.index[(test["type"].isin(unwanted_labels))],axis=0,inplace=True)
test_X = test['content']
test_Y = test['type'].apply(label_entry, args=(labels,)).to_numpy().astype(np.float32)

bbc_data = pd.read_csv("bbc_processed.csv").dropna()
bbc_data_X = bbc_data['content']
bbc_data_Y = bbc_data['type'].apply(label_entry, args=(labels,)).to_numpy().astype(np.float32)

# Validation set extended with the BBC articles.
X_val_bbc = pd.concat([validation_X, bbc_data_X])
# np.concatenate is available in every NumPy release; the np.concat alias
# only exists from NumPy 2.0 onwards.
Y_val_bbc = np.concatenate([validation_Y, bbc_data_Y])
print("Done getting data ready.")

model = logistic_regression(train_X, train_Y, labels)
print("Done training model.")
# test_model_lr returns the F1 score of whatever data set it is given.
val_score = test_model_lr(model, validation_X, validation_Y)
bbc_val_score = test_model_lr(model, X_val_bbc, Y_val_bbc)
fake_news_test_lr = test_model_lr(model, test_X, test_Y)

print(f"f1 validation score without bbc data: {val_score}")
print(f"f1 validation score with bbc data: {bbc_val_score}")
print(f"f1 test score: {fake_news_test_lr}")

## Neural Network

In [None]:
# NOTE(review): this rebinds the module-level `vectorizer` that
# logistic_regression() and test_model_lr() also read. After this line those
# functions see this fresh instance (fitted later by neural_network()), not
# the one fitted for the logistic-regression model.
vectorizer = TfidfVectorizer(max_features=10000)

def neural_network(train_x:pd.Series, train_y:pd.Series, labels):
    """
    Fit the shared TF-IDF vectorizer and an SKLearn MLP classifier.

    Args:
        train_x: Training texts; the module-level ``vectorizer`` is fitted on
            them.
        train_y: Training labels (converted with ``.to_numpy()``).
        labels: Accepted but not used.

    Returns:
        The fitted ``MLPClassifier``.
    """
    # n_features_in_ is an attribute set by fit(), not a constructor
    # parameter; passing it to MLPClassifier raises TypeError, so it is not
    # passed here.
    classifier = MLPClassifier(
        solver='adam',
        alpha=1e-4,
        hidden_layer_sizes=(50,),
        activation='relu',
        random_state=1,
        max_iter=500,
    )

    vector_col = vectorizer.fit_transform(train_x)
    label_col = train_y.to_numpy()
    print("Done vectorizing data.")
    classifier.fit(vector_col, label_col)
    print("Done training model.")
    return classifier

def test_model_nn(model:MLPClassifier, val_x:pd.Series, val_y:pd.Series, test_X=None, test_Y=None, wandb_init = False):
    """
    Evaluate a fitted MLP classifier with the shared vectorizer.

    Prints a classification report for the validation data and computes its
    weighted F1 score. When both ``test_X`` and ``test_Y`` are given, the test
    data is scored and reported as well.

    Args:
        model: Fitted classifier.
        val_x: Validation texts.
        val_y: Validation labels (must provide ``.to_numpy()``).
        test_X: Optional test texts.
        test_Y: Optional test labels (must provide ``.to_numpy()``).
        wandb_init: When truthy, log the validation F1 to Weights & Biases.

    Returns:
        The weighted test F1 when test data was supplied, otherwise the
        weighted validation F1.
    """
    label_col = val_y.to_numpy()
    vector_col = vectorizer.transform(val_x)

    # Evaluate scores:
    val_pred = model.predict(vector_col)
    val_f1 = f1_score(label_col, val_pred, average='weighted')
    class_report = classification_report(label_col, val_pred)
    print("CLASS REPORT VALIDATION DATA:")
    print(class_report)

    if wandb_init:
        wandb.init(project="gds-project-test")
        wandb.log({"val_f1": val_f1})
        wandb.finish()

    if test_X is None or test_Y is None:
        return val_f1

    vector_col_test = vectorizer.transform(test_X)
    label_col_test = test_Y.to_numpy()
    test_pred = model.predict(vector_col_test)
    test_f1 = f1_score(label_col_test, test_pred, average='weighted')
    # Build the report from the TEST labels and predictions; using the
    # validation arrays here would just repeat the validation report.
    class_report_test = classification_report(label_col_test, test_pred)
    print("CLASS REPORT TEST DATA:")
    print(class_report_test)
    return test_f1
    
def liar_file(model:MLPClassifier, liar_file_train, liar_file_val, liar_file_test, wandb_init = None):
    """
    Score ``model`` on the training split of a LIAR-style table.

    Columns 2-6 of ``liar_file_train`` are joined into one text per row,
    preprocessed with ``process_text``, vectorized with the module-level
    ``vectorizer`` and scored against column 1 using ``model.score``.

    Args:
        model: Fitted classifier.
        liar_file_train: DataFrame with integer column labels; column 1 holds
            the targets and columns 2-6 the text fields.
        liar_file_val: Currently unused; kept for interface compatibility.
        liar_file_test: Currently unused; kept for interface compatibility.
        wandb_init: Currently unused; kept for interface compatibility.

    Returns:
        The value returned by ``model.score`` for the training split.
    """
    text_columns = [2, 3, 4, 5, 6]

    # NOTE(review): column 1 is used as-is, so it must already be encoded
    # with the same classes the model was trained on — confirm with callers.
    train_y = liar_file_train[1]

    # Build one string per row. The vectorizer expects an iterable of
    # documents; iterating a DataFrame would yield its column labels.
    statements = (
        liar_file_train[text_columns]
        .fillna('')
        .astype(str)
        .agg(' '.join, axis=1)
        .apply(process_text)
    )

    train_x = vectorizer.transform(statements)
    # score() takes (features, true labels) and runs the prediction itself.
    return model.score(train_x, train_y)

# Train the neural network on the preprocessed corpus and report its
# validation F1. labels[0] holds the types mapped to class 0 and labels[1] the
# types mapped to class 1; rows whose type is in unwanted_labels are dropped.
labels = (["fake", "satire", "bias", "conspiracy", "junksci", "state"],
    ["clickbait", "political", "reliable"])
unwanted_labels = ["unreliable", "rumor", "unknown", "hate", "2018-02-10 13:43:39.521661"]

#split corpus data
data = pd.read_csv("processed.csv").dropna()
data.drop(data.index[(data["type"].isin(unwanted_labels))],axis=0,inplace=True)
data_X = data['content']
data_Y = data['type'].apply(label_entry, args=(labels,))
print("Done getting data ready.")

# 80% training; the remaining 20% is split evenly into validation and test.
X_train, X_val_test, Y_train, Y_val_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=16)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5, random_state=16)
print("Done splitting data.")

model = neural_network(X_train, Y_train, labels)
print("Done training model.")

fake_news_test_nn = test_model_nn(model, X_val, Y_val)
# Print the score just computed; `test` is the test DataFrame loaded by the
# logistic-regression section, not an F1 value.
print(f"VALIDATION DATA F1: {fake_news_test_nn}")

# Logistic Regression model evaluated with LIAR Dataset

In [None]:
def map_liar_label(label):
    """Collapse a LIAR truthfulness rating into a binary class.

    "pants-fire", "false", "barely-true" and "half-true" become 0,
    "mostly-true" and "true" become 1, and anything else becomes None.
    """
    class_zero = ("pants-fire", "false", "barely-true", "half-true")
    class_one = ("mostly-true", "true")

    if label in class_zero:
        return 0
    return 1 if label in class_one else None
    
# Load each LIAR split, join text columns 2-7 into one document per row, map
# column 1 to a binary label, keep only rows whose label could be mapped, and
# run the same preprocessing that was applied to the training corpus.
liar_train_data_lr = pd.read_csv("liar_dataset/train.tsv", sep='\t', header=None)
liar_train_X = liar_train_data_lr[[2, 3, 4, 5, 6, 7]].fillna('').agg('\t'.join, axis=1)
liar_train_Y = liar_train_data_lr[1].apply(map_liar_label)
liar_valid_lr_train = liar_train_Y.notnull()
liar_train_X = liar_train_X[liar_valid_lr_train]
liar_train_X = liar_train_X.apply(process_text)
liar_train_Y = liar_train_Y[liar_valid_lr_train]

liar_val_data_lr = pd.read_csv("liar_dataset/valid.tsv", sep='\t', header=None)
liar_val_X = liar_val_data_lr[[2, 3, 4, 5, 6, 7]].fillna('').agg('\t'.join, axis=1)
liar_val_Y = liar_val_data_lr[1].apply(map_liar_label)
liar_valid_lr_val = liar_val_Y.notnull()
liar_val_X = liar_val_X[liar_valid_lr_val]
liar_val_X = liar_val_X.apply(process_text)
liar_val_Y = liar_val_Y[liar_valid_lr_val]

liar_test_data_lr = pd.read_csv("liar_dataset/test.tsv", sep='\t', header=None)
liar_test_X = liar_test_data_lr[[2, 3, 4, 5, 6, 7]].fillna('').agg('\t'.join, axis=1)
liar_test_Y = liar_test_data_lr[1].apply(map_liar_label)
liar_valid_lr_test = liar_test_Y.notnull()
# Filter the test split with its OWN mask; the validation mask belongs to a
# different file and does not describe these rows.
liar_test_X = liar_test_X[liar_valid_lr_test]
liar_test_X = liar_test_X.apply(process_text)
liar_test_Y = liar_test_Y[liar_valid_lr_test]

# NOTE(review): `model` and the `vectorizer` used inside test_model_lr are
# module-level names that the neural-network section rebinds. If that section
# has run before this cell, the MLP is evaluated here rather than the
# logistic-regression model — confirm the intended execution order.
print("Evaluating FakeNewsCorpus-trained model on LIAR dataset")
liar_f1_lr_train = test_model_lr(model, liar_train_X, liar_train_Y)
liar_f1_lr_val = test_model_lr(model, liar_val_X, liar_val_Y)
liar_f1_lr_test = test_model_lr(model, liar_test_X, liar_test_Y)
print(f"LIAR Dataset F1 (Cross-domain) train set: {liar_f1_lr_train}")
print(f"LIAR Dataset F1 (Cross-domain) validation set: {liar_f1_lr_val}")
print(f"LIAR Dataset F1 (Cross-domain) test set: {liar_f1_lr_test}")

# TASK 3: Comparison
print("Comparison of Results:")
# fake_news_test_lr was computed on the FakeNewsCorpus test split, so label it
# as a test score.
print(f"FakeNewsCorpus Test F1: {fake_news_test_lr}")
print(f"LIAR Dataset F1 (Cross-domain) train set: {liar_f1_lr_train}")
print(f"LIAR Dataset F1 (Cross-domain) validation set: {liar_f1_lr_val}")
print(f"LIAR Dataset F1 (Cross-domain) test set: {liar_f1_lr_test}")

# Neural Network model evaluated with LIAR Dataset

In [None]:
# Evaluate `model` on the three preprocessed LIAR splits with test_model_nn,
# which prints a classification report per call and returns a weighted F1.
# NOTE(review): relies on `model`, `vectorizer` and the liar_* variables left
# behind by earlier cells — confirm they still refer to the neural network and
# its vectorizer when this cell runs.
print("Evaluating FakeNewsCorpus-trained model on LIAR dataset")
liar_f1_nn_train = test_model_nn(model, liar_train_X, liar_train_Y)
liar_f1_nn_val = test_model_nn(model, liar_val_X, liar_val_Y)
liar_f1_nn_test = test_model_nn(model, liar_test_X, liar_test_Y)
print(f"LIAR Dataset F1 (Cross-domain) train set: {liar_f1_nn_train}")
print(f"LIAR Dataset F1 (Cross-domain) validation set: {liar_f1_nn_val}")
print(f"LIAR Dataset F1 (Cross-domain) test set: {liar_f1_nn_test}")

# TASK 3: Comparison
# Side-by-side summary: in-domain validation F1 against the cross-domain LIAR
# scores computed above.
print("Comparison of Results:")
print(f"FakeNewsCorpus Validation F1: {fake_news_test_nn}")
print(f"LIAR Dataset F1 (Cross-domain) train set: {liar_f1_nn_train}")
print(f"LIAR Dataset F1 (Cross-domain) validation set: {liar_f1_nn_val}")
print(f"LIAR Dataset F1 (Cross-domain) test set: {liar_f1_nn_test}")