## Part 1
### Task 1  

Pandas is used to process the FakeNewsCorpus. Since the article content is the input to our models, we drop any rows that have no content.

In [None]:
import pandas as pd
# Raw CSV sample of the FakeNewsCorpus hosted on GitHub
url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df = pd.read_csv(url)
# The article body is the model input, so rows without it are dropped
df = df.dropna(subset=['content'])
# head is a method; printing it without calling it shows the bound-method repr
# instead of the first rows
print(df.head())

### Continuation of task 1

We've implemented data processing functions to do the following:

    - Clean the text
    - Tokenize the text
    - Remove stopwords
    - Remove word variations with stemming
We use nltk and cleantext because they have built-in support for many of these operations.
We also use `Counter` from collections and `chain` from itertools for counting tokens, `train_test_split` from sklearn for splitting the dataset, and matplotlib for visualization.

In [None]:
import re
from collections import Counter
from itertools import chain
from nltk.tokenize import RegexpTokenizer
from cleantext import clean
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


def plot_word_frequency(counter, top_n=10000, title="Word Frequency Distribution"):
    """
    Draws a log-log rank/frequency curve for the top_n most common words.

    Parameters:
        counter (collections.Counter): Word counts to visualise.
        top_n (int): Number of most common words to include.
        title (str): Title shown above the plot.
    """
    # most_common returns (word, count) pairs in descending count order
    ranked = counter.most_common(top_n)
    freqs = [count for _, count in ranked]
    # Ranks start at 1 so they can be placed on a logarithmic axis
    ranks = range(1, len(freqs) + 1)

    plt.figure(figsize=(10, 6))
    plt.loglog(ranks, freqs, marker=".")
    plt.title(title)
    plt.xlabel("Rank of word (log scale)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True, which="both", linestyle="--", linewidth=0.5)
    plt.show()


# Regex pattern for tokenization, tried left to right:
#   <\w+>               matches placeholder tags (e.g., <num>)
#   [\w]+(?:-[\w]+)?    matches a word, optionally followed by ONE hyphenated part
#                       (e.g. "well-known"); further hyphenated parts become separate tokens
pattern = r'<\w+>|[\w]+(?:-[\w]+)?'
# Shared tokenizer instance used by tokenize_text
tokenizer = RegexpTokenizer(pattern)

def clean_text(text: str) -> str:
    """
    Normalises raw text before tokenization.

    Date-like substrings are first replaced with the tag <DATE>; the result
    is then passed to cleantext's clean() with options for lowercasing,
    replacing URLs, e-mails, numbers and currency symbols with tags,
    replacing punctuation with an empty string and removing line breaks.

    Parameters:
        text (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    # Three alternatives: "<letters>[.] <1-2 digits>, <4 digits>",
    # "dddd-dd-dd" and "dd-dd-dddd"
    date_pattern = r'([A-Za-z]+\.?\s[0-9]{1,2}?,\s[0-9]{4})|\b\d{4}-\d{2}-\d{2}\b|\b\d{2}-\d{2}-\d{4}\b'
    # Dates are tagged first so their digits are not turned into number tags
    dated = re.sub(date_pattern, '<DATE>', text)

    return clean(
        dated,
        lower=True,
        no_urls=True,
        replace_with_url="<URL>",
        no_emails=True,
        replace_with_email="<EMAIL>",
        no_numbers=True,
        replace_with_number=r"<NUM>",
        no_currency_symbols=True,
        replace_with_currency_symbol="<CUR>",
        no_punct=True,
        replace_with_punct="",
        no_line_breaks=True,
    )

def tokenize_text(text: str, stop_words: set) -> list:
    """
    Splits text with the module-level NLTK RegexpTokenizer and drops stopwords.

    Parameters:
        text (str): Text to tokenize.
        stop_words (set): Tokens to leave out of the result.

    Returns:
        list: The remaining tokens, in their original order.
    """
    kept = []
    for candidate in tokenizer.tokenize(text):
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

def stem_tokens(tokens: list) -> list:
    """
    Reduces every token to its Porter stem.

    Parameters:
        tokens (list): Tokens to stem.

    Returns:
        list: One stem per input token, in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def split_dataset(df, train_ratio=0.8, val_ratio=0.1):
    """
    Splits the DataFrame into training, validation, and test sets.

    Parameters:
        df (pandas.DataFrame): Data to split.
        train_ratio (float): Fraction of all rows assigned to the training set.
        val_ratio (float): Fraction of all rows assigned to the validation set.
            The test set receives the remaining rows.

    Returns:
        tuple: (train_df, val_df, test_df)

    Raises:
        ValueError: If the ratios do not leave room for all three sets.
    """
    if not (0 < train_ratio < 1 and 0 < val_ratio < 1 - train_ratio):
        raise ValueError(
            "train_ratio and val_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    train_df, temp_df = train_test_split(df, test_size=(1 - train_ratio), random_state=42)

    # Share of the held-out rows that belongs to the validation set. It is passed
    # as train_size so that it sizes the FIRST returned part (validation); passing
    # it as test_size would size the test set instead.
    val_share = val_ratio / (1 - train_ratio)
    val_df, test_df = train_test_split(temp_df, train_size=val_share, random_state=42)
    return train_df, val_df, test_df

def main(df, text_columns):
    """
    Processes text data by cleaning, tokenizing, and stemming.
    This version handles multiple text columns at once by combining them.

    Side effects: adds the columns 'combined_text', 'cleaned_text',
    'tokenized_text' and 'stemmed_text' to df in place, writes
    "combined_tokenized_news_sample.csv" and "combined_stemmed_news_sample.csv"
    to the working directory, prints vocabulary statistics and shows three
    frequency plots.

    Parameters:
        df (pandas.DataFrame): Input DataFrame.
        text_columns (str or list): Name of the text column or a list of text column names.

    Returns:
        tuple: (train_df, val_df, test_df) as produced by split_dataset.
    """
    # Ensure text_columns is a list
    if isinstance(text_columns, str):
        text_columns = [text_columns]

    # Create a combined text column by joining the specified columns (ignoring any NaNs)
    df['combined_text'] = df[text_columns].apply(lambda row: " ".join(row.dropna().astype(str)), axis=1)

    # Define English stopwords set
    stop_words = set(stopwords.words('english'))

    # Clean the combined text data
    df['cleaned_text'] = df['combined_text'].apply(clean_text)

    # Tokenize the cleaned text and remove stopwords
    df['tokenized_text'] = df['cleaned_text'].apply(lambda x: " ".join(tokenize_text(x, stop_words)))

    # Save tokenized text to CSV (the file name reflects that columns were combined)
    df['tokenized_text'].to_csv("combined_tokenized_news_sample.csv", index=False)

    # Count word frequencies in the tokenized text
    all_tokens = list(chain.from_iterable(df['tokenized_text'].str.split()))
    token_freq = Counter(all_tokens)

    # Count word frequencies in the cleaned (lowercased) text for comparison
    original_tokens = list(chain.from_iterable(df['cleaned_text'].str.split()))
    original_freq = Counter(original_tokens)

    # Compute vocabulary sizes and reduction rate after stopword removal.
    # An empty vocabulary yields a rate of 0 instead of a ZeroDivisionError.
    vocab_original = set(original_tokens)
    vocab_tokenized = set(all_tokens)
    if vocab_original:
        reduction_rate_stopwords = (len(vocab_original) - len(vocab_tokenized)) / len(vocab_original)
    else:
        reduction_rate_stopwords = 0.0

    print("----- Vocabulary Analysis -----")
    print(f"Original vocabulary size: {len(vocab_original)}")
    print(f"Vocabulary size after stopword removal: {len(vocab_tokenized)}")
    print(f"Reduction rate after stopword removal: {reduction_rate_stopwords:.2%}")

    # Apply stemming to the tokenized text
    df['stemmed_text'] = df['tokenized_text'].apply(lambda x: " ".join(stem_tokens(x.split())))
    df['stemmed_text'].to_csv("combined_stemmed_news_sample.csv", index=False)

    # Count word frequencies in the stemmed text
    all_stemmed_tokens = list(chain.from_iterable(df['stemmed_text'].str.split()))
    stem_freq = Counter(all_stemmed_tokens)

    # Compute vocabulary size and reduction rate after stemming (same empty guard)
    vocab_stemmed = set(all_stemmed_tokens)
    if vocab_tokenized:
        reduction_rate_stemming = (len(vocab_tokenized) - len(vocab_stemmed)) / len(vocab_tokenized)
    else:
        reduction_rate_stemming = 0.0

    print("\n----- Stemming Analysis -----")
    print(f"Vocabulary size after stemming: {len(vocab_stemmed)}")
    print(f"Reduction rate after stemming: {reduction_rate_stemming:.2%}")

    # Print top 10 words for each version of the text
    print("\n----- Top 10 Words -----")
    print(f"Original sample: {original_freq.most_common(10)}")
    print(f"Tokenized (stopword-removed) sample: {token_freq.most_common(10)}")
    print(f"Stemmed sample: {stem_freq.most_common(10)}")

    # Count placeholder tokens (note: the clean function lowercases text)
    url_count = df['stemmed_text'].str.count("<url>").sum()
    date_count = df['stemmed_text'].str.count("<date>").sum()
    num_count = df['stemmed_text'].str.count("<num>").sum()

    print("\n----- Placeholder Token Counts -----")
    print(f"Number of URLs: {url_count}")
    print(f"Number of dates: {date_count}")
    print(f"Number of numerics: {num_count}")

    # Plot frequency distributions
    plot_word_frequency(original_freq, top_n=10000, title="Original Text Frequency Distribution")
    plot_word_frequency(token_freq, top_n=10000, title="Stopword-Removed Frequency Distribution")
    plot_word_frequency(stem_freq, top_n=10000, title="Stemmed Frequency Distribution")

    print("\n----- DataFrame Summary -----")
    # info() prints its report itself and returns None, so it is not wrapped in print
    df.info()
    print(df.describe(include="all"))

    # Split the DataFrame once into training, validation, and test sets;
    # the same split is reported and returned
    train_df, val_df, test_df = split_dataset(df)
    print("\nDataset split sizes:")
    print(f"Training set: {len(train_df)} rows")
    print(f"Validation set: {len(val_df)} rows")
    print(f"Test set: {len(test_df)} rows")

    return train_df, val_df, test_df  # Return the datasets

# Load the locally stored sample CSV and keep only its first 300 rows for a quick test run
df = pd.read_csv("/Users/clarabovingmagnussen/Desktop/GDS/Projekt/news_sample.csv", encoding="utf-8")
df_sample = df.iloc[:300].copy()  # a large dataset could instead be processed in chunks
main(df_sample, text_columns='content')

## Part 1
### Task 2

We apply our data processing pipeline from task 1 on the 995k FakeNewsCorpus.

Since pandas is slow on larger datasets, we use Modin with the Ray engine as a replacement for pandas that parallelizes the work.

In [None]:
import modin.config as modin_config
import modin.pandas as pd
# Run Modin on the Ray engine
modin_config.Engine.put('ray')

# only read the columns we need
df = pd.read_csv("/Users/clarabovingmagnussen/Desktop/GDS/Projekt/995000_rows.csv",
                 usecols=['content', 'type', 'title', 'domain'],
                 engine='c',
                 dtype=str)
# Rows missing the text or the label cannot be used
df = df.dropna(subset=['content', 'type', 'title'])
# Only the free-text columns are combined. Passing df.columns would also fold the
# 'type' label and the 'domain' into the text that is cleaned and counted.
# NOTE(review): only the first 300 rows are processed here — confirm whether the
# full corpus was intended.
main(df.head(300).copy(), text_columns=['title', 'content'])


## Part 2  
### Task 1, Task 2 and Task 3

To create a baseline model for Fake News classification, we implemented a **logistic regression model** using a vocabulary of the 10,000 most frequently occurring words.  

We used `CountVectorizer` to transform the text data into numerical features. The model was trained on the dataset to distinguish between **reliable** and **fake** news articles.



In [None]:
import numpy as np
from joblib import dump
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score, confusion_matrix

def categorize_type(df):
    """
    Collapses the 'type' column into the two classes 'fake' and 'reliable'.

    Rows whose type belongs to the omitted group are removed, and so are rows
    whose type belongs to neither the fake nor the reliable group.

    Parameters:
        df (DataFrame): Data with a 'type' column.

    Returns:
        DataFrame: A filtered copy of df with 'type' rewritten; df itself is
        not modified.
    """
    omitted = {'hate', 'bias', 'satire', 'unreliable', 'state'}
    fake = {'fake', 'junksci', 'conspiracy'}
    reliable = {'reliable', 'political', 'clickbait'}

    def _relabel(value):
        if value in fake:
            return 'fake'
        if value in reliable:
            return 'reliable'
        # Unknown types become missing and are dropped below
        return None

    is_omitted = df['type'].isin(omitted)
    kept = df[~is_omitted].copy()
    kept['type'] = kept['type'].map(_relabel)
    return kept.dropna(subset=['type'])

# Reduce the FakeNewsCorpus labels to the binary fake / reliable scheme
df = categorize_type(df)

# Load articles_clara.csv
scraped_df = pd.read_csv("/Users/clarabovingmagnussen/Desktop/GDS/Projekt/articles_clara.csv", encoding="utf-8")

# The scraped file must provide its article text in a 'text' column
if 'text' not in scraped_df.columns:
    raise ValueError("Fejl: Kolonnen 'text' mangler i articles_clara.csv")

scraped_df = scraped_df.rename(columns={'text': 'content'})
scraped_df['type'] = 'reliable'  # Mark the scraped articles as reliable
# Rows without text are dropped: CountVectorizer rejects NaN documents
scraped_df = scraped_df[['content', 'type']].dropna(subset=['content'])

print("Type of df:", type(df))  # Ensure df is a DataFrame
print("Type of scraped_df:", type(scraped_df))  # Ensure scraped_df is a DataFrame

# Convert Modin DataFrame to Pandas before concatenation
df = df._to_pandas()

# Concat the two datasets
combined_train_df = pd.concat([df, scraped_df], ignore_index=True)

# Shuffle, then split into training (80%), validation (10%) and test (10%) sets.
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes; the resulting rows are the same.
shuffled_df = combined_train_df.sample(frac=1, random_state=42)
n_rows = len(shuffled_df)
train_end = int(0.8 * n_rows)
val_end = int(0.9 * n_rows)
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

# Use CountVectorizer to transform text into numeric features
# (vocabulary limited to the 10,000 most frequent terms of the training set)
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_df['content'])
X_test = vectorizer.transform(test_df['content'])

# Convert labels to binary values (1 = Fake, 0 = Reliable)
y_train = np.array([1 if label == "fake" else 0 for label in train_df['type']])
y_test = np.array([1 if label == "fake" else 0 for label in test_df['type']])

# Train a simple logistic regression model
clf = LogisticRegression(max_iter=500, solver='saga')
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Persist the fitted vectorizer and model so they can be reloaded later
dump(vectorizer, "count_vectorizer.pkl")
dump(clf, "logistic_model.pkl")

# Evaluate model (precision and recall are kept for the comparison table)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Model performance after integrating Articles_Clara dataset:")
print(f"Accuracy: {accuracy:.2%}")
print(f"F1 Score: {f1:.2%}")

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Extract TN, FP, FN, TP (row-major order of the 2x2 matrix)
tn, fp, fn, tp = cm.ravel()

# Print results
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

## Part 3

For our advanced Fake News predictor, we implemented a **Support Vector Machine (SVM) with a linear kernel** using **TF-IDF** as a feature. 

We use `TfidfVectorizer` to convert text into numerical feature representations, reducing the impact of frequently occurring words while emphasizing informative words that differentiate fake from reliable news.

The model is trained using **a vocabulary of the 10,000 most frequent words**.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import seaborn as sns

# Use TF-IDF Vectorizer to transform text into a vocabulary of the 10,000 most frequent words.
# NOTE(review): the code previously used max_features=2000, contradicting this comment and
# the report text; it is aligned with the documented 10,000 here — confirm which was intended.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Convert text content into numerical feature matrices
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['content'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['content'])

# Convert labels to binary values (1 = Fake, 0 = Reliable)
y_train_tfidf = np.array([1 if label == "fake" else 0 for label in train_df['type']])
y_test_tfidf = np.array([1 if label == "fake" else 0 for label in test_df['type']])

svm_model = LinearSVC(C=1.0, random_state=0)
svm_model.fit(X_train_tfidf, y_train_tfidf)

# Save advanced model
dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
dump(svm_model, "svm_model.pkl")

y_pred_tfidf = svm_model.predict(X_test_tfidf)
# Training-set predictions are kept for inspection only; they are not used for the reported scores
y_pred_train_tfidf = svm_model.predict(X_train_tfidf)

# Evaluate model on the TEST set. All metrics take (y_true, y_pred) in that order;
# F1 was previously computed from the training predictions.
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)
f1_tfidf = f1_score(y_test_tfidf, y_pred_tfidf)
precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_tfidf = recall_score(y_test_tfidf, y_pred_tfidf)

print("Model performance after integrating Articles_Clara dataset:")
print(f"Accuracy: {accuracy_tfidf:.2%}")
print(f"F1 Score: {f1_tfidf:.2%}")

# Compute confusion matrix
cm_tfidf = confusion_matrix(y_test_tfidf, y_pred_tfidf)

# Extract TN, FP, FN, TP (row-major order of the 2x2 matrix)
tn, fp, fn, tp = cm_tfidf.ravel()

# Print results
print("\nConfusion Matrix:")
print(cm_tfidf)
print("\nClassification Report:")
print(classification_report(y_test_tfidf, y_pred_tfidf))
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

# Plot the confusion matrix for the SVM model
plt.figure(figsize=(8, 6))
sns.heatmap(cm_tfidf, annot=True, fmt="d", cmap="Blues", xticklabels=["Reliable", "Fake"], yticklabels=["Reliable", "Fake"])
plt.title("Confusion Matrix - SVM model - Fake News")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

## Part 4
### Task 1, Task 2 & Task 3

We load our trained models and their vectorizers using the `load` function from the `joblib` module. We adapted our earlier approach to fit the LIAR dataset, making sure that we can locate both the label and the text of each statement.

We visualised the results of the simple and advanced model by plotting the Confusion Matrix for LIAR. Thus we are able to compare results and make conclusions.

In [None]:
from joblib import load

# Load trained models and the vectorizers they were fitted with
vectorizer = load("count_vectorizer.pkl")
clf = load("logistic_model.pkl")
vectorizer_tfidf = load("tfidf_vectorizer.pkl")
svm_model = load("svm_model.pkl")

# Load LIAR dataset.
# The LIAR .tsv files have no header row, so header=None keeps the first statement
# as data instead of consuming it as column names. Columns are addressed by
# position downstream, so integer column labels work unchanged.
liar_train = pd.read_csv("/Users/clarabovingmagnussen/Desktop/liar/train.tsv", sep='\t', header=None, on_bad_lines='skip', engine='python', encoding='utf-8')
liar_test = pd.read_csv("/Users/clarabovingmagnussen/Desktop/liar/test.tsv", sep='\t', header=None, on_bad_lines='skip', engine='python', encoding='utf-8')
liar_val = pd.read_csv("/Users/clarabovingmagnussen/Desktop/liar/valid.tsv", sep='\t', header=None, on_bad_lines='skip', engine='python', encoding='utf-8')

# Define categorization function 
def categorize_type(df):
    """
    Collapses the LIAR labels into the two classes 'false' and 'true'.

    The label is read from the SECOND column of df (by position). Rows whose
    label is not one of the six known values are removed.

    Parameters:
        df (DataFrame): LIAR data with the label in its second column.

    Returns:
        DataFrame: A filtered copy of df with the label column rewritten;
        df itself is not modified.
    """
    false = {'false', 'barely-true', 'pants-fire'}
    true = {'true', 'mostly-true', 'half-true'}

    def _binarize(label):
        if label in false:
            return 'false'
        return 'true'

    label_col = df.columns[1]
    has_known_label = df[label_col].isin(false | true)
    subset = df[has_known_label].copy()
    subset[label_col] = subset[label_col].map(_binarize)
    return subset

# Reduce the LIAR train and test labels to the binary true / false scheme
categorize_train, categorize_test = (
    categorize_type(part) for part in (liar_train, liar_test)
)

# Label and statement text are taken by position: second and third column
label_col, text_col = liar_test.columns[1], liar_test.columns[2]

X_test_liar_texts = categorize_test[text_col]
# 1 = false statement, 0 = true statement
y_test_liar = np.array([int(label == "false") for label in categorize_test[label_col]])

# Vectorize the LIAR statements with the vectorizers fitted on the FakeNewsCorpus,
# then predict with the matching pre-trained model
X_test_liar_count = vectorizer.transform(X_test_liar_texts)
y_pred_simple_liar = clf.predict(X_test_liar_count)

X_test_liar_tfidf = vectorizer_tfidf.transform(X_test_liar_texts)
y_pred_advanced_liar = svm_model.predict(X_test_liar_tfidf)

def evaluate_model(name, y_true, y_pred):
    """
    Computes the evaluation metrics for one set of binary predictions.

    Parameters:
        name (str): Display name of the model; currently not used.
        y_true: True binary labels.
        y_pred: Predicted binary labels.

    Returns:
        tuple: (f1, accuracy, precision, recall, cm, tn, fp, fn, tp), where cm
        is the confusion matrix and tn, fp, fn, tp are its four cells.
    """
    accuracy, f1, precision, recall = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, f1_score, precision_score, recall_score)
    )

    cm = confusion_matrix(y_true, y_pred)
    # Row-major flattening of the 2x2 matrix
    tn, fp, fn, tp = cm.ravel()

    return f1, accuracy, precision, recall, cm, tn, fp, fn, tp

# Evaluate simple logistic regression on LIAR dataset.
# The unpacking order matches evaluate_model's return order: ..., tn, fp, fn, tp
f1_simple_L, acc_simple_L, prec_simple_L, rec_simple_L, cm_simple_L, tn_simple_L, fp_simple_L, fn_simple_L, tp_simple_L = evaluate_model("Simple Logistic Regression (LIAR)", y_test_liar, y_pred_simple_liar)

# Evaluate advanced SVM on LIAR dataset
f1_advanced_L, acc_advanced_L, prec_advanced_L, rec_advanced_L, cm_advanced_L, tn_advanced_L, fp_advanced_L, fn_advanced_L, tp_advanced_L = evaluate_model("Advanced SVM (TF-IDF) (LIAR)", y_test_liar, y_pred_advanced_liar)

# One column per model/dataset pair; the *-FN columns reuse the FakeNewsCorpus
# test metrics computed when the models were trained
table = {
    'Simple-FN': [f1, accuracy, precision, recall],
    'Advanced-FN': [f1_tfidf, accuracy_tfidf, precision_tfidf, recall_tfidf],
    'Simple-LIAR': [f1_simple_L, acc_simple_L, prec_simple_L, rec_simple_L],
    'Advanced-LIAR': [f1_advanced_L, acc_advanced_L, prec_advanced_L, rec_advanced_L]
}

# Header column (row labels, in the same order as the lists above)
header_column = ['F1 Score', 'Accuracy', 'Precision', 'Recall']

# Create the DataFrame with 'header_column' as index
table_data = pd.DataFrame(table, index=header_column)

# Print the DataFrame
print(table_data)

# Visualize the confusion matrix for the simple model for LIAR
plt.figure(figsize=(8, 6))
sns.heatmap(cm_simple_L, annot=True, fmt="d", cmap="Greens", xticklabels=["Reliable", "Fake"], yticklabels=["True", "False"])
plt.title("CM - Simple Model - LIAR")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Visualize the confusion matrix for the advanced model for LIAR
# (uses the SVM's own matrix, not the simple model's)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_advanced_L, annot=True, fmt="d", cmap="Reds", xticklabels=["Reliable", "Fake"], yticklabels=["True", "False"])
plt.title("CM - Advanced Model - LIAR")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()