In [None]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Tuple
import logging

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Configure the root logger at INFO level; the helpers below log through the
# module-level ``logging.exception`` calls, which use this root logger.
logging.basicConfig(level=logging.INFO)


@dataclass(frozen=True)
class ModelConfig:
    """Configuration values for training and evaluation.

    The dataclass is frozen, so an instance cannot be modified after it is
    created; build a new ``ModelConfig`` to use different settings.
    """

    # Forwarded as ``test_size`` to ``train_test_split`` in ``split_dataset``.
    test_size: float = 0.2
    # Forwarded as ``random_state`` to ``train_test_split`` in ``split_dataset``.
    random_state: int = 7
    # Forwarded as ``max_df`` to ``TfidfVectorizer`` in ``build_vectorizer``.
    max_df: float = 0.7
    # Forwarded as ``max_iter`` to ``PassiveAggressiveClassifier`` in ``train_model``.
    max_iter: int = 100
    # Forwarded as ``stop_words`` to ``TfidfVectorizer`` in ``build_vectorizer``.
    stop_words: str = "english"


def load_dataset(path: Path) -> pd.DataFrame:
    """Read the news CSV at ``path`` and check it has the expected columns.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed DataFrame, containing at least the ``text`` and ``label``
        columns.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        ValueError: If the file is empty, cannot be parsed as CSV, or lacks a
            ``text`` or ``label`` column.
    """
    try:
        frame = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        logging.exception("Dataset file is invalid: %s", path)
        raise ValueError(f"Dataset file is invalid: {path}") from exc
    except FileNotFoundError as exc:
        logging.exception("Dataset file not found at %s", path)
        raise FileNotFoundError(f"Dataset not found at: {path}") from exc

    # Collect every absent column up front so the error names all of them,
    # in a stable (sorted) order.
    absent = sorted({"text", "label"}.difference(frame.columns))
    if absent:
        raise ValueError(f"Dataset missing required columns: {absent}")

    return frame


def split_dataset(
    df: pd.DataFrame, config: ModelConfig
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
    """Split the dataset into train and test subsets.

    Args:
        df: Dataset with a ``text`` column (model inputs) and a ``label``
            column (targets).
        config: Supplies ``test_size`` and ``random_state`` for the split.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` where the ``x_*``
        entries come from ``df["text"]`` and the ``y_*`` entries from
        ``df["label"]``.

    Raises:
        ValueError: If ``df`` lacks the ``text`` or ``label`` column. Errors
            raised by ``train_test_split`` itself propagate unchanged.
    """
    # Only the column lookups can legitimately raise the KeyError handled
    # here; keeping the split outside the ``try`` stops an unrelated KeyError
    # from being misreported as a missing-column problem.
    try:
        texts = df["text"]
        labels = df["label"]
    except KeyError as exc:
        logging.exception("Dataset missing required columns")
        raise ValueError("Dataset must include 'text' and 'label' columns") from exc

    x_train, x_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=config.test_size,
        random_state=config.random_state,
    )
    # train_test_split hands back a list; return the tuple the signature
    # promises.
    return x_train, x_test, y_train, y_test


def build_vectorizer(config: ModelConfig) -> TfidfVectorizer:
    """Return a new, unfitted TF-IDF vectorizer set up from ``config``.

    Args:
        config: Supplies the ``stop_words`` and ``max_df`` settings.

    Returns:
        A ``TfidfVectorizer`` constructed with those two settings.
    """
    tfidf_options = {
        "stop_words": config.stop_words,
        "max_df": config.max_df,
    }
    return TfidfVectorizer(**tfidf_options)


def vectorize_text(
    vectorizer: TfidfVectorizer,
    x_train: pd.Series,
    x_test: pd.Series,
) -> Tuple[np.ndarray, np.ndarray]:
    """Fit ``vectorizer`` on the training text and transform both splits.

    The vectorizer is fitted on ``x_train`` only; ``x_test`` is transformed
    with the already-fitted vectorizer.

    Args:
        vectorizer: Vectorizer to fit; it is modified in place by the fit.
        x_train: Training documents.
        x_test: Test documents.

    Returns:
        ``(train_features, test_features)`` as produced by ``fit_transform``
        and ``transform`` respectively.
        NOTE(review): scikit-learn documents these results as SciPy sparse
        matrices rather than ``np.ndarray`` - confirm whether the annotation
        should be updated.

    Raises:
        ValueError: If either vectorizer call raises ``ValueError``; the
            original error is attached as the cause.
    """
    try:
        train_features = vectorizer.fit_transform(x_train)
        test_features = vectorizer.transform(x_test)
    except ValueError as exc:
        logging.exception("Vectorization failed")
        raise ValueError("Vectorization failed; check input data") from exc

    return train_features, test_features


def train_model(
    x_train_vec: np.ndarray,
    y_train: pd.Series,
    config: ModelConfig,
) -> PassiveAggressiveClassifier:
    """Fit a new ``PassiveAggressiveClassifier`` on the vectorized training set.

    Args:
        x_train_vec: Feature matrix for the training documents.
        y_train: Labels matching the rows of ``x_train_vec``.
        config: Supplies ``max_iter`` for the classifier.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If fitting raises ``ValueError``; the original error is
            attached as the cause.
    """
    classifier = PassiveAggressiveClassifier(max_iter=config.max_iter)

    try:
        classifier.fit(x_train_vec, y_train)
    except ValueError as exc:
        logging.exception("Model training failed")
        raise ValueError("Model training failed; check input data") from exc
    else:
        return classifier


def evaluate_model(
    y_true: pd.Series, y_pred: NDArray[np.object_]
) -> Dict[str, object]:
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        A dict with two entries: ``"accuracy"`` (the result of
        ``accuracy_score``) and ``"confusion_matrix"`` (the result of
        ``confusion_matrix``).
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }


In [None]:
# End-to-end run: load -> split -> vectorize -> train -> predict -> evaluate.
# Every step uses the default settings from ModelConfig.
config: ModelConfig = ModelConfig()

# Relative path, so it is resolved against the current working directory.
data_path: Path = Path("news") / "news.csv"

# Raises if the file is missing/invalid or lacks the 'text'/'label' columns.
df: pd.DataFrame = load_dataset(data_path)

# x_* hold the 'text' column, y_* the 'label' column.
x_train, x_test, y_train, y_test = split_dataset(df, config)

vectorizer: TfidfVectorizer = build_vectorizer(config)

# The vectorizer is fitted on the training text only, then applied to both splits.
x_train_vec, x_test_vec = vectorize_text(vectorizer, x_train, x_test)

model: PassiveAggressiveClassifier = train_model(x_train_vec, y_train, config)

# Predict labels for the held-out test documents.
y_pred: NDArray[np.object_] = model.predict(x_test_vec)

# Dict with the 'accuracy' and 'confusion_matrix' entries.
metrics: Dict[str, object] = evaluate_model(y_test, y_pred)

# Bare expression: presumably left as the last line so a notebook displays the
# metrics as the cell output; it has no effect when run as a plain script.
metrics