In [1]:
import pandas as pd
import numpy as np

In [2]:
from datasets import Dataset, ClassLabel, Features, Value

In [3]:
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [4]:
real['label'] = 0
fake['label'] = 1

In [5]:
print(real.shape, fake.shape)

(21417, 5) (23481, 5)


In [8]:
news = pd.concat([real, fake])
news_articles = news[['text', 'label']]

In [9]:
news_articles = news_articles.reset_index(drop=True)

In [10]:
import re

In [11]:
news_articles["text"] = news_articles["text"].str.replace(
    r'^.*?\([^)]*\)\s*-\s*',
    '',
    regex=True
)

In [12]:
news_articles.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,0
1,Transgender people will be allowed for the fir...,0
2,The special counsel investigation of links bet...,0
3,Trump campaign adviser George Papadopoulos tol...,0
4,President Donald Trump called on the U.S. Post...,0


In [96]:
news_articles.to_csv("news_articles.csv", index=False)

In [13]:
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=[0, 1])
})

In [14]:
news_articles = Dataset.from_pandas(news_articles, features=features)

In [15]:
print(news_articles)

Dataset({
    features: ['text', 'label'],
    num_rows: 44898
})


In [16]:
from transformers import AutoTokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [18]:
train_test = news_articles.train_test_split(test_size=0.3, seed=42, stratify_by_column='label')
train = train_test['train']
temp = train_test['test']

In [19]:
eval_test = temp.train_test_split(test_size=0.5, seed=42, stratify_by_column='label')
eval = eval_test['train']
test = eval_test['test']

In [20]:
print(train)
print(eval)
print(test)

Dataset({
    features: ['text', 'label'],
    num_rows: 31428
})
Dataset({
    features: ['text', 'label'],
    num_rows: 6735
})
Dataset({
    features: ['text', 'label'],
    num_rows: 6735
})


In [21]:
def tokenize(article):
    return tokenizer(article['text'], truncation=True, max_length=512, padding='max_length')

In [None]:
train_dataset = train.map(tokenize, batched=True)
eval_dataset = eval.map(tokenize, batched=True)
test_dataset = test.map(tokenize, batched=True)

In [24]:
print(train_dataset)
print(eval_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 31428
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6735
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6735
})


In [25]:
from sklearn.metrics import accuracy_score, f1_score

In [26]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')
    
    return {
        'accuracy': acc,
        'f1': f1
    }

In [27]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

In [28]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [None]:
from huggingface_hub import login

hf_token = None
login(hf_token)

repo = "tvocoder/bert_fake_news_ft"

In [32]:
training_args = TrainingArguments(
    output_dir="./bert_fake_news_ft",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    hub_model_id=repo,
    hub_strategy="end"
)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0216,0.013773,0.997327,0.997441
2,0.0032,0.006519,0.998664,0.998722
3,0.0007,0.010373,0.998515,0.99858


TrainOutput(global_step=5895, training_loss=0.011922684163597678, metrics={'train_runtime': 2407.2016, 'train_samples_per_second': 39.167, 'train_steps_per_second': 2.449, 'total_flos': 2.480716274356224e+16, 'train_loss': 0.011922684163597678, 'epoch': 3.0})

In [None]:
trainer.save_model("bert_fake_news")

In [36]:
tokenizer.save_pretrained("bert_fake_news")

('bert_fake_news\\tokenizer_config.json',
 'bert_fake_news\\special_tokens_map.json',
 'bert_fake_news\\vocab.txt',
 'bert_fake_news\\added_tokens.json',
 'bert_fake_news\\tokenizer.json')

In [None]:
tokenizer.push_to_hub("tvocoder/bert_fake_news_ft")

In [None]:
metrics = trainer.evaluate(test_dataset)

{'eval_loss': 0.0121395168825984, 'eval_accuracy': 0.998218262806236, 'eval_f1': 0.9982973893303064, 'eval_runtime': 60.3052, 'eval_samples_per_second': 111.682, 'eval_steps_per_second': 1.758, 'epoch': 3.0}


In [39]:
metrics

{'eval_loss': 0.0121395168825984,
 'eval_accuracy': 0.998218262806236,
 'eval_f1': 0.9982973893303064,
 'eval_runtime': 60.3052,
 'eval_samples_per_second': 111.682,
 'eval_steps_per_second': 1.758,
 'epoch': 3.0}