In [1]:
# Load the BERT-based sequence classifier.
# Checkpoint: "google/bert_uncased_L-2_H-128_A-2", configured for 2 classes.
# Initialises both the tokenizer and the pretrained model.

from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

MODEL_NAME = "google/bert_uncased_L-2_H-128_A-2"
num_labels = 2

# The checkpoint ships without a classification head, so `classifier.weight`
# and `classifier.bias` are randomly initialised when the model is loaded.
# Seed the RNGs first so those starting weights are the same on every
# Restart & Run All.
set_seed(42)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Load and preprocess the prepared dataset.
# - Read the CSV and drop rows whose text is missing or blank
# - Split into 90% train and 10% held-out
# - Split the held-out part 50/50 into validation and final test
# - Final structure: train, validation, test
# - Save the test split to CSV and print the structure

from datasets import load_dataset, DatasetDict

# Fixed seed so that split membership (and therefore the exported test CSV)
# is identical on every run instead of being reshuffled each time.
SPLIT_SEED = 42


def _has_text(example):
    """Return True when the row's "text" value is a non-blank string."""
    text = example["text"]
    # isinstance rejects None as well as any other non-string value, which
    # would otherwise raise AttributeError on .strip().
    return isinstance(text, str) and text.strip() != ""


# Load dataset
dataset = load_dataset("csv", data_files={"all_data": "resource_data/cleaned_fake_news_dataset.csv"})

# Filter out rows with missing or empty text
dataset["all_data"] = dataset["all_data"].filter(_has_text)

# Step 1: Split into 90% train, 10% held-out
split_data = dataset["all_data"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Step 2: Split the held-out set into validation (50%) and final test (50%)
test_valid_split = split_data["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Create final dataset dictionary
split_data = DatasetDict({
    "train": split_data["train"],
    "test": test_valid_split["test"],
    "validation": test_valid_split["train"],
})

# Save test set to CSV
split_data["test"].to_csv("resource_data/fake_news_dataset_test.csv")

# Print structure and first item
print(split_data)
print(split_data["train"][0])

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 70566
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3921
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 3920
    })
})
{'text': 'WELLINGTON (Reuters) - Just days after the United States said it would increase troop numbers in Afghanistan and ask its allies to do the same, New Zealand on Friday announced an extra three non-combat military personnel, boosting its military commitment to 13. U.S. President Donald Trump on Monday unveiled his strategy to end the conflict in Afghanistan, committing the United States to an open-ended conflict and signaling he would dispatch more troops to America s longest war. U.S. officials have said Trump had signed off on plans to send about 4,000 more U.S. troops to add to the roughly 8,400 now deployed in Afghanistan. U.S. Defense Secretary James Mattis has since said exact troop numbers a

In [3]:
# Sanity check: report every training row whose "text" value is not a string.
for i, example in enumerate(split_data["train"]):
    if isinstance(example["text"], str):
        continue  # well-formed row, nothing to report
    print(f"Non-string type found at index {i}: {example['text']} (type: {type(example['text'])})")

In [4]:
# Tokenisation for the BERT model
# - Truncate texts to at most 128 tokens and pad them
# - Applied to every dataset split
# - Print the tokenised datasets as a check

def tokenize_function(examples, max_length=128, padding="max_length"):
    """Tokenize a batch of examples with the notebook-level `tokenizer`.

    Args:
        examples: Batch mapping that provides the raw texts under "text".
        max_length: Length limit handed to the tokenizer; truncation is
            always enabled. Defaults to 128.
        padding: Padding strategy handed to the tokenizer. The default,
            "max_length", pads every sequence to `max_length`. Pass
            `padding=False` (for example through `map(..., fn_kwargs=...)`)
            to leave padding to a batch-level collator instead.

    Returns:
        The tokenizer's output for the batch, unchanged.
    """
    return tokenizer(
        examples["text"],
        padding=padding,
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over every split of the dataset dictionary, batch by batch
tokenized_datasets = split_data.map(function=tokenize_function, batched=True)

# Show the resulting structure to confirm the token columns were added
print(tokenized_datasets)

Map:   0%|          | 0/70566 [00:00<?, ? examples/s]

Map:   0%|          | 0/3921 [00:00<?, ? examples/s]

Map:   0%|          | 0/3920 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 70566
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3921
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3920
    })
})


In [5]:
# Training configuration
# - Output directory: "results"
# - Evaluate and save once per epoch
# - Learning rate 2e-5, weight decay 0.01
# - Batch size 16 for both training and evaluation
# - 3 training epochs; load the best model at the end
# - Float16 (fp16) acceleration enabled


from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where run artefacts are written
    output_dir="results",
    # Run length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimiser settings
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation and checkpointing share the same per-epoch schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Half-precision training
    fp16=True,
)

In [6]:
# Build the Trainer and start training
# - Batches are padded by the data collator after tokenisation
# - Wire up the model, the training arguments and the datasets
# - Launch training with that configuration

from transformers import Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer.__init__ and triggers a warning at
    # this call; `processing_class=` is the replacement keyword.
    # NOTE(review): requires a transformers release that accepts
    # `processing_class` -- confirm the pinned version.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Last expression on purpose: the notebook displays the returned TrainOutput
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.325,0.316821
2,0.2973,0.290768
3,0.2962,0.292643


TrainOutput(global_step=13233, training_loss=0.3258444746362373, metrics={'train_runtime': 2346.0674, 'train_samples_per_second': 90.235, 'train_steps_per_second': 5.641, 'total_flos': 67239891348480.0, 'train_loss': 0.3258444746362373, 'epoch': 3.0})

In [7]:
# Persist the trained model and its tokenizer side by side in one directory
model.save_pretrained(save_directory="fake_news_bert_model")
# Last expression on purpose: the notebook displays the file paths it returns
tokenizer.save_pretrained(save_directory="fake_news_bert_model")

('fake_news_bert_model\\tokenizer_config.json',
 'fake_news_bert_model\\special_tokens_map.json',
 'fake_news_bert_model\\vocab.txt',
 'fake_news_bert_model\\added_tokens.json',
 'fake_news_bert_model\\tokenizer.json')