In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.linear_model import SGDClassifier

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud
import tensorflow as tf
import numpy as np
import random

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [10]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /home/dariusng2103/projects/dm_project/DM-Fake-News-Detection


In [11]:
from datasets import load_dataset, concatenate_datasets, Dataset

datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
        "test": "dataset/test_data.csv",
        "rewritten_train": [
            "dataset/rewritten_train_data_1.csv",
            "dataset/rewritten_train_data_2.csv",
            "dataset/rewritten_train_data_3.csv",
            "dataset/rewritten_train_data_4.csv",
        ],
        "rewritten_test": "dataset/rewritten_test_data.csv",
    },
)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

In [None]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # ‚úÖ For saving/loading the vectorizer

def train_model(train_data, val_data, C=1.0, kernel="linear"):
    """Trains an SVM model using TF-IDF vectorization."""
    
    print(f"\nüöÄ Training SVM with C={C}, kernel={kernel}")

    # ‚úÖ Extract texts and labels
    train_texts = train_data["processed_full_content"]
    val_texts = val_data["processed_full_content"]
    y_train = train_data["label"]
    y_val = val_data["label"]

    # ‚úÖ Use TF-IDF instead of CountVectorizer
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_val = vectorizer.transform(val_texts)  # ‚úÖ Transform val data with same vectorizer

    # ‚úÖ Train SVM
    model = SVC(C=C, kernel=kernel, probability=True)
    model.fit(X_train, y_train)

    # ‚úÖ Save the trained model
    joblib.dump(model, "svm_model.pkl")

    print("‚úÖ Model and vectorizer saved successfully!")

    return model


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

def evaluate_model(model, train_data, val_data):
    """Evaluates the trained SVM model using the stored TF-IDF vectorizer."""
    print("\nüìä Evaluating SVM Model...")

    # ‚úÖ Load saved TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_features=5000)

    # ‚úÖ Transform validation set
    y_val = val_data["label"]
    val_texts = val_data["processed_full_content"]
    X_val = vectorizer.transform(val_texts)

    # ‚úÖ Predict labels
    y_pred = model.predict(X_val)

    # ‚úÖ Compute metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print("\nüèÜ Final Evaluation Results:")
    for key, value in {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}.items():
        print(f"üîπ {key.capitalize()}: {value:.4f}")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}


In [14]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def do_grid_search(data):
    """Performs Grid Search using SVM with TF-IDF (5000 features)."""
    
    param_grid = {
        "clf__C": [0.1, 1, 10],  
        "clf__kernel": ["linear"]  
    }

    X = data["processed_full_content"]
    y = data["label"].to_numpy().ravel()

    # ‚úÖ Use Stratified K-Fold (Reduce folds if too slow)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # ‚úÖ Reduced to 3 folds

    # ‚úÖ Use Pipeline (TF-IDF + SVM)
    pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer(max_features=5000)),  
        ("clf", SVC(probability=True))  
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring="f1", verbose=2, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    print(f"\nüèÜ Best Parameters Found: C={best_params['clf__C']}, Kernel={best_params['clf__kernel']}")

    return best_params


In [15]:
train_data = datasets["train"].to_pandas()
val_data = datasets["test"].to_pandas()
data = pd.concat([train_data, val_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [16]:
%%time
best_params = do_grid_search(data)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END .......................clf__C=1, clf__kernel=linear; total time=83.1min
[CV] END .......................clf__C=1, clf__kernel=linear; total time=84.0min
[CV] END .......................clf__C=1, clf__kernel=linear; total time=86.1min
[CV] END .......................clf__C=1, clf__kernel=linear; total time=88.2min
[CV] END .......................clf__C=1, clf__kernel=linear; total time=90.3min
[CV] END .....................clf__C=0.1, clf__kernel=linear; total time=94.1min
[CV] END .....................clf__C=0.1, clf__kernel=linear; total time=95.6min
[CV] END .....................clf__C=0.1, clf__kernel=linear; total time=98.2min
[CV] END .....................clf__C=0.1, clf__kernel=linear; total time=98.6min
[CV] END ....................clf__C=0.1, clf__kernel=linear; total time=100.5min
[CV] END .....................clf__C=10, clf__kernel=linear; total time=107.9min
[CV] END .....................clf__C=10, clf__ker

In [23]:
%%time

model = train_model(train_data, val_data, C=best_params["clf__C"], kernel=best_params["clf__kernel"])



üöÄ Training SVM with C=1, kernel=linear
‚úÖ Model and vectorizer saved successfully!
CPU times: user 1h 6min 24s, sys: 0 ns, total: 1h 6min 24s
Wall time: 1h 6min 24s


In [24]:
# ‚úÖ Load the trained model
model2 = joblib.load("svm_model.pkl")

print("‚úÖ Model loaded successfully!")


‚úÖ Model loaded successfully!


In [25]:
evaluate_model(model, train_data, val_data)


üìä Evaluating SVM Model...

üèÜ Final Evaluation Results:
üîπ Accuracy: 0.9650
üîπ Precision: 0.9586
üîπ Recall: 0.9615
üîπ F1_score: 0.9600


{'accuracy': 0.9649586776859504,
 'precision': 0.9585687382297552,
 'recall': 0.9614658103513412,
 'f1_score': 0.960015088645794}

In [26]:
evaluate_model(model2, train_data, val_data)


üìä Evaluating SVM Model...

üèÜ Final Evaluation Results:
üîπ Accuracy: 0.9650
üîπ Precision: 0.9586
üîπ Recall: 0.9615
üîπ F1_score: 0.9600


{'accuracy': 0.9649586776859504,
 'precision': 0.9585687382297552,
 'recall': 0.9614658103513412,
 'f1_score': 0.960015088645794}

In [27]:
val_data_rewritten = datasets["rewritten_test"].to_pandas()
train_data_rewritten = datasets["rewritten_train"].to_pandas()
data_rewritten = pd.concat([train_data, train_data_rewritten, val_data, val_data_rewritten], ignore_index=True)
data_rewritten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120982 entries, 0 to 120981
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   label                   120982 non-null  int64 
 1   full_content            120982 non-null  object
 2   processed_full_content  120982 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


In [28]:
evaluate_model(model, train_data, val_data_rewritten)


üìä Evaluating SVM Model...

üèÜ Final Evaluation Results:
üîπ Accuracy: 0.8607
üîπ Precision: 0.8700
üîπ Recall: 0.8013
üîπ F1_score: 0.8342


{'accuracy': 0.8606611570247934,
 'precision': 0.8699753896636587,
 'recall': 0.8012844729882886,
 'f1_score': 0.8342182890855457}

In [None]:
evaluate_model(model2, train_data, val_data_rewritten)


üìä Evaluating SVM Model...

üèÜ Final Evaluation Results:
üîπ Accuracy: 0.8607
üîπ Precision: 0.8700
üîπ Recall: 0.8013
üîπ F1_score: 0.8342


{'accuracy': 0.8606611570247934,
 'precision': 0.8699753896636587,
 'recall': 0.8012844729882886,
 'f1_score': 0.8342182890855457}

: 

In [None]:
%%time

best_params_rewritten = do_grid_search(data_rewritten)
best_params_rewritten

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
%%time

train_data_combined = pd.concat([train_data, train_data_rewritten], ignore_index=True)
val_data_combined = pd.concat([val_data, val_data_rewritten], ignore_index=True)
model_combined = train_model(train_data_combined, val_data_combined, 
                             C=best_params_rewritten["clf__C"], 
                             kernel=best_params_rewritten["clf__kernel"])
model_combined.save("results/SVM_model_combined.keras")

In [None]:
evaluate_model(model_combined, train_data_combined, val_data_combined)

In [None]:
evaluate_model(model_combined, train_data_combined, val_data)

In [None]:
evaluate_model(model_combined, train_data_combined, val_data_rewritten)

In [None]:
model_combined2 = load_model("results/SVM_model_combined.keras")
model_combined2.summary()

In [None]:
evaluate_model(model_combined2, train_data_combined, val_data_combined)

In [None]:
evaluate_model(model_combined2, train_data_combined, val_data)

In [None]:
evaluate_model(model_combined2, train_data_combined, val_data_rewritten)