In [1]:
import os
import sys
import nltk
import joblib
import numpy as np
from scipy.sparse import save_npz, load_npz
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor

# Setup src imports
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
from data_loader import load_authorship_dataset
from features import extract_tfidf_double_norm
from bert_features import extract_bert_embeddings
from logger import get_logger

  from .autonotebook import tqdm as notebook_tqdm


# Init logger

In [2]:
# Notebook-wide logger obtained from the project's get_logger helper under the
# name "preprocessing"; the functions and cells below log through this object.
logger = get_logger("preprocessing")

# NLTK Stopwords (ensure available)

In [3]:
# Make sure the NLTK "stopwords" corpus is available before it is used.
# Look for a local copy first so an already-installed corpus needs no network
# round-trip; only download when the lookup fails.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    # With quiet=True a failed download is reported only via the return value,
    # so check it and fail here instead of later at stopwords.words(...).
    if not nltk.download("stopwords", quiet=True):
        raise RuntimeError(
            "NLTK 'stopwords' corpus is not installed and could not be downloaded."
        )
from nltk.corpus import stopwords

# Paths

In [4]:
# Input location of the raw authorship corpus (relative to the notebook's cwd).
DATA_RAW_PATH = "./../data/raw/dataset_authorship"

# Output locations: full feature matrices, then their train / test partitions.
DATA_PROCESSED_PATH = "./../data/processed"
DATA_TRAIN_PATH = "./../data/train"
DATA_TEST_PATH = "./../data/test"

# Create every output directory up front; exist_ok makes re-running the cell safe.
for _output_dir in (DATA_PROCESSED_PATH, DATA_TRAIN_PATH, DATA_TEST_PATH):
    os.makedirs(_output_dir, exist_ok=True)

# Dataset

In [5]:
# Load the corpus from DATA_RAW_PATH. The loader returns a table (indexed by
# column name below) and a second object bound to `label_encoder` —
# presumably the fitted encoder for the author labels; verify in data_loader.
df, label_encoder = load_authorship_dataset(DATA_RAW_PATH)
# "text" column as a plain Python list; this is what process_tfidf_config feeds
# to the TF-IDF extractor.
texts = df["text"].tolist()
# "label" column as an array, row-aligned with `texts`; used as the target and
# stratification key when the TF-IDF matrices are split.
labels = df["label"].values
# NLTK's Turkish stopword list, passed to the word-level TF-IDF configurations.
turkish_stopwords = stopwords.words("turkish")

2025-04-21 13:09:32 [INFO] 📂 Found 30 author folders.


2025-04-21 13:09:34 [INFO] 🔢 Encoded 30 unique authors.
2025-04-21 13:09:34 [INFO] ✅ Loaded 1200 documents from dataset.


# TF-IDF config

In [6]:
# One row per TF-IDF feature set: (name, analyzer, n-gram order, stopword list).
# Word-level sets filter Turkish stopwords; character-level sets use none.
_tfidf_specs = (
    ("tfidf_word_1gram", "word", 1, turkish_stopwords),
    ("tfidf_word_2gram", "word", 2, turkish_stopwords),
    ("tfidf_word_3gram", "word", 3, turkish_stopwords),
    ("tfidf_char_2gram", "char", 2, None),
    ("tfidf_char_3gram", "char", 3, None),
)

# Expand the table into the config dicts consumed by process_tfidf_config.
# Each set uses a single n-gram order, hence ngram_range == (n, n).
feature_configs = [
    {
        "name": spec_name,
        "analyzer": spec_analyzer,
        "ngram_range": (spec_n, spec_n),
        "stopwords": spec_stopwords,
    }
    for spec_name, spec_analyzer, spec_n, spec_stopwords in _tfidf_specs
]

# Extract and save TF-IDF and BERT

In [7]:

# === TF-IDF Function ===
def process_tfidf_config(config):
    """Build one TF-IDF feature set from the notebook-level ``texts`` and save it.

    Parameters
    ----------
    config : dict
        Must provide the keys ``"name"``, ``"analyzer"``, ``"ngram_range"``
        and ``"stopwords"``; ``"name"`` is used in the log messages and in
        the output file names.

    Side effects
    ------------
    Writes three files into ``DATA_PROCESSED_PATH``:
    ``X_<name>.npz`` (feature matrix), ``vectorizer_<name>.pkl`` (joblib dump
    of the vectorizer) and ``idf_<name>.npy`` (third value returned by the
    extractor). Returns ``None``.

    NOTE(review): the extractor receives the full ``texts`` list, before any
    train/test split — confirm that fitting on all documents is intended.
    """
    name = config["name"]
    logger.info(f"🧮 TF-IDF: {name} started")

    # Vocabulary is capped at 10000 features per configuration.
    matrix, fitted_vectorizer, idf_values = extract_tfidf_double_norm(
        texts,
        ngram_range=config["ngram_range"],
        analyzer=config["analyzer"],
        stopword_list=config["stopwords"],
        max_features=10000,
    )

    # Persist matrix, vectorizer and IDF values side by side under the same name.
    save_npz(f"{DATA_PROCESSED_PATH}/X_{name}.npz", matrix)
    joblib.dump(fitted_vectorizer, f"{DATA_PROCESSED_PATH}/vectorizer_{name}.pkl")
    np.save(f"{DATA_PROCESSED_PATH}/idf_{name}.npy", idf_values)

    logger.info(f"✅ Saved {name} with shape: {matrix.shape}")

# === BERT Function ===
def run_bert():
    """Run the project's BERT embedding extraction with this notebook's paths.

    Reads from ``DATA_RAW_PATH`` and asks the extractor to save into
    ``DATA_PROCESSED_PATH``, processing 16 documents per batch. Takes no
    arguments and returns ``None``; the extractor's return value is discarded.
    """
    bert_options = {
        "dataset_path": DATA_RAW_PATH,
        "save_dir": DATA_PROCESSED_PATH,
        "batch_size": 16,
    }
    extract_bert_embeddings(**bert_options)

# === Run in parallel ===
logger.info("🚀 Starting parallel preprocessing...")

# One task per TF-IDF configuration, plus the BERT extraction (submitted last).
preprocessing_tasks = [(process_tfidf_config, (config,)) for config in feature_configs]
preprocessing_tasks.append((run_bert, ()))

# Size the pool from the task list so every task gets its own worker even when
# feature_configs grows or shrinks (previously a hard-coded 6).
with ThreadPoolExecutor(max_workers=len(preprocessing_tasks)) as executor:
    futures = [executor.submit(task_fn, *task_args) for task_fn, task_args in preprocessing_tasks]

    # result() blocks until the task finishes and re-raises any exception it
    # raised, so a failed feature set stops the cell instead of passing silently.
    for future in futures:
        future.result()

logger.info("🏁 All feature sets completed.")

2025-04-21 13:09:34 [INFO] 🚀 Starting parallel preprocessing...
2025-04-21 13:09:34 [INFO] 🧮 TF-IDF: tfidf_word_1gram started
2025-04-21 13:09:34 [INFO] 🧮 TF-IDF: tfidf_word_2gram started
2025-04-21 13:09:34 [INFO] 📥 Loading tokenizer and model...
2025-04-21 13:09:34 [INFO] 🧮 TF-IDF: tfidf_word_3gram started
2025-04-21 13:09:34 [INFO] 🧮 TF-IDF: tfidf_char_2gram started
2025-04-21 13:09:34 [INFO] 🧮 TF-IDF: tfidf_char_3gram started
2025-04-21 13:09:51 [INFO] ✅ Saved tfidf_word_1gram with shape: (1200, 10000)
2025-04-21 13:09:53 [INFO] ✅ Saved tfidf_word_2gram with shape: (1200, 10000)
2025-04-21 13:09:54 [INFO] ✅ Saved tfidf_word_3gram with shape: (1200, 10000)
2025-04-21 13:09:54 [INFO] ✅ Saved tfidf_char_2gram with shape: (1200, 2658)
2025-04-21 13:09:56 [INFO] ✅ Saved tfidf_char_3gram with shape: (1200, 10000)
2025-04-21 13:10:13 [INFO] 📦 Using device: cpu
2025-04-21 13:10:13 [INFO] 📂 Loading dataset...
2025-04-21 13:10:13 [INFO] 📂 Found 30 author folders.
2025-04-21 13:10:14 [INFO] 🔢

BERT batches: 100%|██████████| 75/75 [07:35<00:00,  6.07s/it]

2025-04-21 13:17:49 [INFO] ✅ Embeddings shape: (1200, 768)
2025-04-21 13:17:49 [INFO] 💾 Embeddings and labels saved to: ./../data/processed





2025-04-21 13:17:49 [INFO] 🏁 All feature sets completed.


# Split helper

In [8]:
def split_and_save(X, y, name, is_sparse=True, test_size=0.2, random_state=42):
    """Split one feature set into stratified train/test parts and save both.

    Parameters
    ----------
    X : matrix-like
        Feature matrix with one row per sample. Saved with ``save_npz`` when
        ``is_sparse`` is true, otherwise with ``np.save``.
    y : array-like
        Labels, row-aligned with ``X``; also used as the stratification key.
    name : str
        Feature-set name used in the output file names and the log message.
    is_sparse : bool, default True
        Selects the on-disk format for ``X`` (``.npz`` vs ``.npy``).
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``. Keep it identical across feature
        sets so the same seed is used for every split.

    Side effects
    ------------
    Writes ``X_<name>`` and ``y_<name>.npy`` into both ``DATA_TRAIN_PATH`` and
    ``DATA_TEST_PATH``. Returns ``None``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Features: the format depends on whether the matrix is sparse.
    if is_sparse:
        save_npz(f"{DATA_TRAIN_PATH}/X_{name}.npz", X_train)
        save_npz(f"{DATA_TEST_PATH}/X_{name}.npz", X_test)
    else:
        np.save(f"{DATA_TRAIN_PATH}/X_{name}.npy", X_train)
        np.save(f"{DATA_TEST_PATH}/X_{name}.npy", X_test)

    # Labels are stored next to each feature set under the same name.
    np.save(f"{DATA_TRAIN_PATH}/y_{name}.npy", y_train)
    np.save(f"{DATA_TEST_PATH}/y_{name}.npy", y_test)

    logger.info(f"📊 {name} split → train: {X_train.shape}, test: {X_test.shape}")

# Split TF-IDF features

In [9]:
# Re-load each saved TF-IDF matrix from DATA_PROCESSED_PATH and write its
# train/test split, using the dataset labels as target and stratification key.
for tfidf_name in (cfg["name"] for cfg in feature_configs):
    tfidf_matrix = load_npz(f"{DATA_PROCESSED_PATH}/X_{tfidf_name}.npz")
    split_and_save(tfidf_matrix, labels, tfidf_name, is_sparse=True)

2025-04-21 13:17:50 [INFO] 📊 tfidf_word_1gram split → train: (960, 10000), test: (240, 10000)
2025-04-21 13:17:50 [INFO] 📊 tfidf_word_2gram split → train: (960, 10000), test: (240, 10000)
2025-04-21 13:17:50 [INFO] 📊 tfidf_word_3gram split → train: (960, 10000), test: (240, 10000)
2025-04-21 13:17:50 [INFO] 📊 tfidf_char_2gram split → train: (960, 2658), test: (240, 2658)
2025-04-21 13:17:51 [INFO] 📊 tfidf_char_3gram split → train: (960, 10000), test: (240, 10000)


# Split BERT

In [10]:
# Load the BERT embeddings and their labels from DATA_PROCESSED_PATH
# (presumably the files written by extract_bert_embeddings — confirm the file
# names against bert_features).
X_bert = np.load(f"{DATA_PROCESSED_PATH}/X_bert_embeddings.npy")
y_bert = np.load(f"{DATA_PROCESSED_PATH}/y_bert.npy")
# Loaded via np.load, so is_sparse=False makes the helper store the splits with
# np.save as .npy files under the name "bert".
split_and_save(X_bert, y_bert, "bert", is_sparse=False)

2025-04-21 13:17:51 [INFO] 📊 bert split → train: (960, 768), test: (240, 768)
