In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from cuml.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.linear_model import SGDClassifier

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud
import tensorflow as tf
import numpy as np
import random

# Seed TensorFlow, NumPy and Python's built-in `random` with the same value
# so that repeated runs of the notebook draw the same random numbers.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

2025-02-21 16:10:16.336825: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-21 16:10:16.363073: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740125416.377426 2922243 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740125416.381839 2922243 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-21 16:10:16.412242: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import os
import sys
from pathlib import Path

# Resolve the project root (the parent of the notebook's starting directory)
# only once per kernel: after the first run the cwd has already been changed,
# so recomputing `Path.cwd().parent` on a re-run would climb one level too far.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Re-running this cell must not stack duplicate entries on sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /home/inflaton/code/CrediNews


In [3]:
from datasets import load_dataset, concatenate_datasets, Dataset

# Map each split name to the CSV file(s) backing it. The original and the
# rewritten corpora each provide four training shards and one test file.
csv_files = {
    "train": [
        "dataset/train_data_1.csv",
        "dataset/train_data_2.csv",
        "dataset/train_data_3.csv",
        "dataset/train_data_4.csv",
    ],
    "test": "dataset/test_data.csv",
    "rewritten_train": [
        "dataset/rewritten_train_data_1.csv",
        "dataset/rewritten_train_data_2.csv",
        "dataset/rewritten_train_data_3.csv",
        "dataset/rewritten_train_data_4.csv",
    ],
    "rewritten_test": "dataset/rewritten_test_data.csv",
}

# Load all four splits with the CSV loader and display the resulting object.
datasets = load_dataset("csv", data_files=csv_files)
datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

In [4]:
from cuml.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # ✅ For saving/loading the vectorizer

def train_model(train_data, val_data, C=1.0, kernel="linear", gamma=0.001):
    """Fit a TF-IDF + SVM text classifier and persist both artifacts.

    Parameters
    ----------
    train_data : mapping or DataFrame
        Must provide "processed_full_content" (texts) and "label" (targets).
    val_data : mapping or DataFrame
        Not used for fitting; accepted so existing call sites keep working.
    C : float
        Regularisation strength forwarded to ``SVC``.
    kernel : str
        Kernel name forwarded to ``SVC``.
    gamma : float or str
        Kernel coefficient forwarded to ``SVC``. The default is the number
        0.001; it was previously the string '0.001', which is not a numeric
        value.

    Returns
    -------
    The fitted ``SVC`` model.

    Side effects
    ------------
    Writes "svm_model.pkl" and "tfidf_vectorizer.pkl" to the current working
    directory, overwriting any earlier files with those names.
    """

    print(f"\n🚀 Training SVM with C={C}, gamma={gamma}, kernel={kernel}")

    # Texts and targets used for fitting.
    train_texts = train_data["processed_full_content"]
    y_train = train_data["label"]

    # Learn the TF-IDF vocabulary (capped at 5000 terms) from the training texts only.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)

    # Train the SVM on the vectorised training set.
    model = SVC(kernel=kernel, gamma=gamma, C=C)
    model.fit(X_train, y_train)

    # Persist the model together with the vectorizer it was trained with;
    # the model alone cannot score raw text.
    joblib.dump(model, "svm_model.pkl")
    joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

    print("✅ Model and vectorizer saved successfully!")

    return model


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

def evaluate_model(model, train_data, val_data):
    """Score a trained SVM on a validation set and print/return the metrics.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    train_data : mapping or DataFrame
        The data the model was trained on; its "processed_full_content"
        column is used to rebuild the TF-IDF vocabulary. Passing different
        data than was used for training yields a mismatched feature space.
    val_data : mapping or DataFrame
        Must provide "processed_full_content" (texts) and "label" (targets).

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1_score".
    """
    print("\n📊 Evaluating SVM Model...")

    # A freshly constructed TfidfVectorizer has no vocabulary and cannot
    # transform anything, so refit it on the training texts with the same
    # settings used at training time to reproduce the training feature space.
    vectorizer = TfidfVectorizer(max_features=5000)
    vectorizer.fit(train_data["processed_full_content"])

    # Vectorise the validation texts with the training vocabulary.
    y_val = val_data["label"]
    X_val = vectorizer.transform(val_data["processed_full_content"])

    # Predict labels for the validation set.
    y_pred = model.predict(X_val)

    # Compute each metric once and reuse the dict for printing and returning.
    metrics = {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred),
        "f1_score": f1_score(y_val, y_pred),
    }

    print("\n🏆 Final Evaluation Results:")
    for key, value in metrics.items():
        print(f"🔹 {key.capitalize()}: {value:.4f}")

    return metrics


In [6]:
from cuml.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def do_grid_search(data):
    """Grid-search SVM hyperparameters on top of a 5000-feature TF-IDF pipeline.

    ``data`` must provide "processed_full_content" (texts) and "label"
    (targets). Every candidate is scored by F1 under shuffled, stratified
    5-fold cross-validation. The winning parameter dict (keys prefixed with
    ``clf__``) is printed and returned.
    """
    # Inputs: raw texts and a flat array of labels.
    texts = data["processed_full_content"]
    labels = data["label"].to_numpy().ravel()

    # Candidate values; the ``clf__`` prefix routes them to the SVM pipeline step.
    search_space = {
        "clf__C": [0.1, 1, 10],
        "clf__gamma": [1, 0.1, 0.01, 0.001],
        "clf__kernel": ["rbf"],
        "clf__random_state": [1],
    }

    # Vectoriser and classifier are chained so TF-IDF is refit inside every fold.
    tfidf_svm = Pipeline(
        steps=[
            ("vectorizer", TfidfVectorizer(max_features=5000)),
            ("clf", SVC()),
        ]
    )

    # Stratified 5-fold split with a fixed seed.
    cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    searcher = GridSearchCV(
        estimator=tfidf_svm,
        param_grid=search_space,
        cv=cv_splitter,
        scoring="f1",
        verbose=2,
        n_jobs=-1,
    )
    searcher.fit(texts, labels)

    best_params = searcher.best_params_
    print(f"\n🏆 Best Parameters Found: C={best_params['clf__C']}, Kernel={best_params['clf__kernel']}, Gamma={best_params['clf__gamma']}")

    return best_params


In [7]:
# Convert the original train/test splits to pandas and stack them into a
# single frame with a fresh 0..n-1 index.
# NOTE(review): `data` contains the test split, and it is what gets passed to
# do_grid_search below, so hyperparameters are selected using rows that are
# later used for evaluation — confirm this is intended.
train_data = datasets["train"].to_pandas()
val_data = datasets["test"].to_pandas()
data = pd.concat([train_data, val_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [None]:
%%time
# Run the hyperparameter grid search over the combined original train+test frame.
best_params = do_grid_search(data)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
%%time

# Train on the original training split using the hyperparameters chosen by the grid search.
model = train_model(train_data, val_data, C=best_params["clf__C"], kernel=best_params["clf__kernel"], gamma=best_params["clf__gamma"])



🚀 Training SVM with C=1, kernel=rbf


TypeError: __init__() got an unexpected keyword argument 'loss'

In [None]:
# `model` is a cuML SVC, not a Keras model, and does not provide `.save()`;
# persist it with joblib (same mechanism train_model uses) under a .pkl name.
os.makedirs("results", exist_ok=True)  # make sure the target directory exists
joblib.dump(model, "results/SVM_model_original.pkl");

In [None]:
# Score the original-data model on the original test split.
evaluate_model(model, train_data, val_data)

In [None]:
# Convert the rewritten splits to pandas, then stack all four splits
# (original + rewritten, train + test) into one frame with a fresh index.
# NOTE(review): this frame also contains both test splits — confirm that
# using it for hyperparameter search is intended.
val_data_rewritten = datasets["rewritten_test"].to_pandas()
train_data_rewritten = datasets["rewritten_train"].to_pandas()
data_rewritten = pd.concat([train_data, train_data_rewritten, val_data, val_data_rewritten], ignore_index=True)
data_rewritten.info()

In [None]:
# Score the original-data model on the rewritten test split.
evaluate_model(model, train_data, val_data_rewritten)

In [None]:
%%time

# Repeat the grid search on the frame that combines original and rewritten data,
# then display the selected parameters.
best_params_rewritten = do_grid_search(data_rewritten)
best_params_rewritten

In [None]:
%%time

# Build combined (original + rewritten) train and test frames.
train_data_combined = pd.concat([train_data, train_data_rewritten], ignore_index=True)
val_data_combined = pd.concat([val_data, val_data_rewritten], ignore_index=True)

# Train on the combined training data with the hyperparameters selected for it.
model_combined = train_model(
    train_data_combined,
    val_data_combined,
    C=best_params_rewritten["clf__C"],
    kernel=best_params_rewritten["clf__kernel"],
    gamma=best_params_rewritten["clf__gamma"],
)

# `model_combined` is a cuML SVC, not a Keras model, and does not provide
# `.save()`; persist it with joblib under a .pkl name instead.
os.makedirs("results", exist_ok=True)  # make sure the target directory exists
joblib.dump(model_combined, "results/SVM_model_combined.pkl");

In [None]:
# Score the combined-data model on the combined (original + rewritten) test frame.
evaluate_model(model_combined, train_data_combined, val_data_combined)

In [None]:
# Score the combined-data model on the original test split only.
evaluate_model(model_combined, train_data_combined, val_data)

In [None]:
# Score the combined-data model on the rewritten test split only.
evaluate_model(model_combined, train_data_combined, val_data_rewritten)