In [1]:
# Imports
import pandas as pd
import spacy
import torch
import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from datasets import Dataset
from pandarallel import pandarallel

In [2]:
# Version Checks
print("Pandas version:", pd.__version__)
print("SpaCy version:", spacy.__version__)
print("Transformers version:", transformers.__version__)
print("Torch version:", torch.__version__)
print("Cuda version:", torch.version.cuda)

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Initialize Parallelization
pandarallel.initialize(progress_bar=True)

Pandas version: 2.3.0
SpaCy version: 3.8.7
Transformers version: 4.52.4
Torch version: 2.5.1+cu121
Cuda version: 12.1
GPU: NVIDIA GeForce RTX 2060 SUPER
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [3]:
# Data Path
csv_path = "../../data/Saurabh Shahane - Fake_News_Classification/WELFake_Dataset.csv"

# Read with semicolon separator
df = pd.read_csv(csv_path, sep=',')

# Split Data
df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42, stratify=df_temp['label'])

# Quick sanity check
print("Train:", df_train.shape)
print("Eval: ", df_val.shape)
print("Test: ", df_test.shape)

# first 5 rows of dataset
print(df_train.head(5))

Train: (50493, 4)
Eval:  (10820, 4)
Test:  (10821, 4)
       Unnamed: 0                                              title  \
24388       24388  THE FACE OF THE DEMOCRAT PARTY Has A Message F...   
40608       40608  INCOMING FRESHMEN Are Put On Notice With Welco...   
66652       66652  REPUBLICANS CALL FOR ANSWERS: Did Wasserman-Sc...   
71224       71224  EXTORTION? HOW IRAN Used Nuke Deal To Force Ob...   
17060       17060  Democrats want strong response to intel report...   

                                                    text  label  
24388  This ass-clown reminds us of why term limits a...      1  
40608  Wow! The University of Chicago sends impressiv...      1  
66652  Busted! Even moderate Republican Susan Collins...      1  
71224  If I were a European and was forced to deal wi...      1  
17060  WASHINGTON (Reuters) - The top Democrats on th...      0  


In [9]:
# Preprocessing
def preprocess_text(text):
    import spacy
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

In [10]:
# Test Preprocessing (can be skipped)
df_train_sample = df_train[:5]['text'].astype(str).apply(preprocess_text)

with pd.option_context('display.max_colwidth', None):
    print(df_train_sample)

24388                                                                                                                                                                                                                                                                                                                                                                                        ass clown remind term limit important leisa see rep. rangel d ny close personal week visit capitol building guest rep. mike bishop r mi shock people elect represent nation 20 house member look like roam hall nursing home   hall congress   people business make decision behalf country charlie rangel perfect example assertion s secret democrats t stand tea party rarely express hatred loud like rep. charlie rangel moment catch camera democrat congressman let reporter know think coalition citizen believe individual liberty small government undoubtedly constituents).theblaze report rep. charlie rangel d n.y. harsh wor

In [11]:
# Apply Preprocessing
df_train['text'] = df_train['text'].astype(str).parallel_apply(preprocess_text)
df_test['text'] = df_test['text'].astype(str).parallel_apply(preprocess_text)

print(df_train['text'].head(5))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8416), Label(value='0 / 8416'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1804), Label(value='0 / 1804'))), …

24388    ass clown remind term limit important leisa se...
40608    wow university chicago send impressive letter ...
66652    bust moderate republican susan collins maine s...
71224    european force deal massive influx muslim refu...
17060    washington reuters democrats u.s. senate house...
Name: text, dtype: object


In [12]:

# Load Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenizing
train_encodings = tokenizer(list(df_train['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(df_test['text']), truncation=True, padding=True)

In [13]:

# Translate to Huggingface Dataset Format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'label': list(df_train['label'])
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'label': list(df_test['label'])
})

# Load Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Training Parameters
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Define Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=1)
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "f1": report["weighted avg"]["f1-score"]
    }

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Start Training
trainer.train()

# Evaluation
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), dim=1)
print(classification_report(df_test['label'], pred_labels))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0368,0.053573,0.984105,0.984107
2,0.011,0.044878,0.988726,0.988726


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5255
           1       0.99      0.99      0.99      5566

    accuracy                           0.99     10821
   macro avg       0.99      0.99      0.99     10821
weighted avg       0.99      0.99      0.99     10821



In [None]:
model.save_pretrained("GonzaloA-DistilBert-simple")
tokenizer.save_pretrained("GonzaloA-DistilBert-simple")

('GonzaloA-DistilBert-simple\\tokenizer_config.json',
 'GonzaloA-DistilBert-simple\\special_tokens_map.json',
 'GonzaloA-DistilBert-simple\\vocab.txt',
 'GonzaloA-DistilBert-simple\\added_tokens.json',
 'GonzaloA-DistilBert-simple\\tokenizer.json')