In [None]:
# Imports
import pandas as pd
import spacy
import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report
from datasets import Dataset
import torch

# Report the installed version of every core library (useful when reproducing results)
for lib_name, lib_module in (
    ("Pandas", pd),
    ("SpaCy", spacy),
    ("Torch", torch),
    ("Transformers", transformers),
):
    print(f"{lib_name} version " + lib_module.__version__)

Pandas version 2.2.3
SpaCy version 3.8.6
Torch version 2.7.0+cpu
Transformers version 4.51.3
MODUL: transformers.training_args
DATEI: c:\Users\Teo\Desktop\Studium\_Master\C201 Mustererkennung\FakeNewsClassification\.venv\lib\site-packages\transformers\training_args.py


In [None]:
# Smoke test: check that TrainingArguments can be constructed in this environment
# (optional cell, nothing below depends on `args`)
smoke_test_options = dict(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
)
args = TrainingArguments(**smoke_test_options)

print(args)

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp1

In [3]:

# Locations of the three CSV splits
train_path = "../../data/GonzaloA - fake_news/train_without_reuters.csv"
val_path   = "../../data/GonzaloA - fake_news/evaluation_without_reuters.csv"
test_path  = "../../data/GonzaloA - fake_news/test_without_reuters.csv"

# The files are semicolon-delimited, so every split is read with sep=';'
df_train, df_val, df_test = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (train_path, val_path, test_path)
)

# Sanity check: report the dimensions of each split
for split_name, split_frame in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"{split_name} shape:", split_frame.shape)
print()
print("Train columns:", list(df_train.columns))

# Peek at the first 5 training rows
print(df_train.head(5))

Train shape: (24353, 4)
Validation shape: (8117, 4)
Test shape: (8117, 4)

Train columns: ['Unnamed: 0', 'title', 'text', 'label']
   Unnamed: 0                                              title  \
0           0  Palestinians switch off Christmas lights in Be...   
1           1  China says Trump call with Taiwan president wo...   
2           2   FAIL! The Trump Organization’s Credit Score W...   
3           3  Zimbabwe military chief's China trip was norma...   
4           4  THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...   

                                                text  label  
0  RAMALLAH, West Bank  - Palestinians switched o...      1  
1  BEIJING  - U.S. President-elect Donald Trump’s...      1  
2  While the controversy over Trump s personal ta...      0  
3  BEIJING  - A trip to Beijing last week by Zimb...      1  
4  There has never been a more UNCOURAGEOUS perso...      0  


In [5]:

# Preprocessing
# Load the spaCy pipeline once at module level; preprocess_text() below reads this
# global `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Lemmatize and lowercase `text`, dropping stop words, punctuation and whitespace.

    Args:
        text: The raw string to clean. It is passed straight to the module-level
            spaCy pipeline `nlp`.

    Returns:
        A single string of the remaining lowercased lemmas joined by one space.
    """
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        # Whitespace tokens (e.g. the double space after a dateline) are neither
        # stop words nor punctuation; without the is_space check they end up in
        # the output as runs of blanks.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(tokens)

In [18]:
# Preprocessing smoke test on the first five training articles (optional cell)
first_texts = df_train['text'].iloc[:5].astype(str)
df_train_sample = first_texts.apply(preprocess_text)

# Lift the column-width limit so the cleaned text is printed in full
with pd.option_context('display.max_colwidth', None):
    print(df_train_sample)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               ramallah west bank   palestinians switch christmas light

In [None]:

# Run the preprocessing over the train and test articles, replacing the raw text.
# NOTE: the 'text' column is overwritten in place, so re-running this cell feeds
# already-cleaned text through preprocess_text again.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].astype(str).apply(preprocess_text)

print(df_train['text'].head(5))

In [3]:

# Load Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenizing
# truncation=True cuts each text at the model's maximum input length;
# padding=True pads every sequence in the call to the longest one.
train_encodings = tokenizer(list(df_train['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(df_test['text']), truncation=True, padding=True)

# The tokenizer returns a dict-like BatchEncoding, not a DataFrame, so it has no
# .head(); preview the start of the first encoded example instead.
preview_ids = train_encodings['input_ids'][0][:20]
print("Encoded fields:", list(train_encodings.keys()))
print("First 20 input ids:", preview_ids)
print("Decoded preview:", tokenizer.decode(preview_ids))

NameError: name 'df_train' is not defined

In [None]:

# Wrap the tokenizer output and the labels in Hugging Face Dataset objects
encoding_columns = ('input_ids', 'attention_mask')

train_columns = {column: train_encodings[column] for column in encoding_columns}
train_columns['label'] = list(df_train['label'])
train_dataset = Dataset.from_dict(train_columns)

test_columns = {column: test_encodings[column] for column in encoding_columns}
test_columns['label'] = list(df_test['label'])
test_dataset = Dataset.from_dict(test_columns)

# Pretrained DistilBERT with a two-class sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Hyperparameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best accuracy when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each row
            is the index of its largest logit.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"`` (weighted-average F1),
        both read from sklearn's classification report.
    """
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=1)
    report = classification_report(labels, preds, output_dict=True)
    weighted_scores = report["weighted avg"]
    return {"accuracy": report["accuracy"], "f1": weighted_scores["f1-score"]}

# Build the validation split used for per-epoch evaluation.
# df_val is loaded with the other splits but its text is never cleaned or tokenized,
# so apply the same preprocessing and tokenizer settings here (df_val itself is not modified).
val_texts = df_val['text'].astype(str).apply(preprocess_text)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'label': list(df_val['label'])
})

# Define Trainer
# Evaluate on the validation split, not the test split: load_best_model_at_end
# picks the checkpoint with the best eval accuracy, so evaluating on test data
# would leak it into model selection and bias the final test report.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Score the test split: take the arg-max class per row and print the full report
predictions = trainer.predict(test_dataset)
test_logits = torch.tensor(predictions.predictions)
pred_labels = test_logits.argmax(dim=1)
print(classification_report(df_test['label'], pred_labels))
