In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [22]:
# Add src to path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from data_loader import load_authorship_dataset
from features import extract_tfidf_double_norm
from bert_features import extract_bert_embeddings
from scipy.sparse import save_npz
import joblib
import numpy as np


In [23]:
DATA_RAW_PATH = "./../data/raw/dataset_authorship"
DATA_PROCESSED_PATH = "./../data/processed"

In [24]:
from nltk.corpus import stopwords


# Setup
df, label_encoder = load_authorship_dataset(DATA_RAW_PATH)
texts = df["text"].tolist()
turkish_stopwords = stopwords.words("turkish")
os.makedirs("./../data/processed", exist_ok=True)

# Feature configuration
feature_configs = [
    {"name": "word_1gram", "analyzer": "word", "ngram_range": (1, 1), "stopwords": turkish_stopwords},
    {"name": "word_2gram", "analyzer": "word", "ngram_range": (2, 2), "stopwords": turkish_stopwords},
    {"name": "word_3gram", "analyzer": "word", "ngram_range": (3, 3), "stopwords": turkish_stopwords},
    {"name": "char_2gram", "analyzer": "char", "ngram_range": (2, 2), "stopwords": None},
    {"name": "char_3gram", "analyzer": "char", "ngram_range": (3, 3), "stopwords": None},
]

# Processing loop
for config in feature_configs:
    print(f"Processing: {config['name']} ...")

    X, vectorizer, idf = extract_tfidf_double_norm(
        texts,
        ngram_range=config["ngram_range"],
        analyzer=config["analyzer"],
        stopword_list=config["stopwords"],
        max_features=10000
    )

    save_npz(f"{DATA_PROCESSED_PATH}/X_tfidf_{config['name']}.npz", X)
    joblib.dump(vectorizer, f"{DATA_PROCESSED_PATH}/vectorizer_{config['name']}.pkl")
    np.save(f"{DATA_PROCESSED_PATH}/idf_{config['name']}.npy", idf)

    print(f"✔ Saved {config['name']} with shape {X.shape}")

Processing: word_1gram ...
✔ Saved word_1gram with shape (1200, 10000)
Processing: word_2gram ...
✔ Saved word_2gram with shape (1200, 10000)
Processing: word_3gram ...
✔ Saved word_3gram with shape (1200, 10000)
Processing: char_2gram ...
✔ Saved char_2gram with shape (1200, 2658)
Processing: char_3gram ...
✔ Saved char_3gram with shape (1200, 10000)


In [25]:
extract_bert_embeddings(
    dataset_path=DATA_RAW_PATH,
    save_dir=DATA_PROCESSED_PATH
)


📥 Loading tokenizer and model...
📂 Loading dataset...
🚀 Generating embeddings...


Generating BERT embeddings: 100%|██████████| 1200/1200 [11:53<00:00,  1.68it/s]


✅ Embeddings shape: (1200, 768)
💾 Saved to ./../data/processed
