# Fake news detection using NLP

Load the dataset:

In [1]:
!pip install transformers evaluate datasets peft nltk --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["train"])
df_validation = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["validation"])
df_test = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["test"])

df = pd.concat([df_train,df_validation,df_test])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 40587 entries, 0 to 8116
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  40587 non-null  int64 
 1   title       40587 non-null  object
 2   text        40587 non-null  object
 3   label       40587 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.5+ MB


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0
1,1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0
2,2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1
3,3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0
4,4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0


Drop unwanted properties

In [4]:
df = df.drop(['Unnamed: 0', 'title'], axis=1)

Text preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import re
import string

def preprocess_text(text):
      text = text.lower() #Text to lower case
      text = re.sub(r'\[.*?\]|\W|https?://\S+|www\.\S+|<.*?>+|\n|\w*\d\w*', ' ', text) #Remove special characters, URLs, HTML tags, newlines, and words containing numbers
      words = text.split() #Split the text into words
      stop_words = set(stopwords.words('english')) #Load and convert the English stopwords list to a set for efficient lookup
      words = [word for word in words if word not in stop_words] #Remove stopwords
      text = ' '.join(words)
      return text

df['text'] = df['text'].fillna('').apply(preprocess_text)

Delete empty rows

In [7]:
empty_rows_count = (df['text'].str.strip() == '').sum()
empty_rows_count

np.int64(58)

In [8]:
df = df[df['text'].str.strip().astype(bool)]

In [9]:
df.head()

Unnamed: 0,text,label
0,maury perhaps one trashiest shows television t...,0
1,yesterday father one ucla players arrested chi...,0
2,moscow reuters russia wednesday warned iraq ku...,1
3,house majority whip steve scalise r la threw s...,0
4,said late show host stephen colbert clearly gi...,0


Split data to train and test

In [10]:
from sklearn.model_selection import train_test_split
x = df['text']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

## Logistic regression:

Logistic regression is a statistical model used for binary classification problems.

Logistic regression provides a simple, interpretable, and efficient way to model the probability of a news article being fake based on the text features. It's a valuable tool, particularly as a baseline to understand the problem and compare against more complex approaches.

For the TF-IDF solution we tried logistic regression as a classifier.

Vectorize the data

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [12]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(xv_train,y_train)
pred_lr = lr.predict(xv_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_lr))
from sklearn.metrics import accuracy_score
print('Model accuracy:')
accuracy_score(y_test, pred_lr)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      3728
           1       0.98      0.97      0.97      4378

    accuracy                           0.97      8106
   macro avg       0.97      0.97      0.97      8106
weighted avg       0.97      0.97      0.97      8106

Model accuracy:


0.9710091290402171

For this dataset, we got a pretty high accuracy. It is worth, to give this type of model a try.

## BERT

BERT is a powerful, pre-trained language model that uses the Transformer architecture and bidirectional training to gain a deep understanding of text. It offers state-of-the-art performance, excels at understanding context and linguistic nuance, and allows us to benefit from transfer learning on this specific task.

The potential downsides of using BERT is its high computational resource requirements, large model size, limited interpretability, potential need for a reasonably sized fine-tuning dataset, sensitivity to training configurations, and potentially higher latency for predictions. These factors need to be weighed against the potential performance gains.

Tokenize the texts

In [13]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

df_train_split = pd.DataFrame({'text': x_train, 'label': y_train})
df_test_split = pd.DataFrame({'text': x_test, 'label': y_test})

train_dataset = Dataset.from_pandas(df_train_split)
test_dataset = Dataset.from_pandas(df_test_split)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/32423 [00:00<?, ? examples/s]

Map:   0%|          | 0/8106 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

Load the model

In [19]:
from transformers import AutoModelForSequenceClassification

model_bert = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model

In [20]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="model",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    label_names=["NEGATIVE", "POSITIVE"]
)

In [21]:
from transformers import Trainer

trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [22]:
trainer_bert.train()

Epoch,Training Loss,Validation Loss
1,0.0444,No log


TrainOutput(global_step=2027, training_loss=0.06574022878727774, metrics={'train_runtime': 3183.6013, 'train_samples_per_second': 10.184, 'train_steps_per_second': 0.637, 'total_flos': 8284427744596560.0, 'train_loss': 0.06574022878727774, 'epoch': 1.0})

Save model

In [23]:
trainer_bert.save_model()

Evaluation

In [24]:
bert_eval_results = trainer_bert.evaluate()
print(bert_eval_results)

{'eval_runtime': 235.1011, 'eval_samples_per_second': 34.479, 'eval_steps_per_second': 2.157, 'epoch': 1.0}


For a first run, we used BERT to determine wether the news is true or false. Due to slight imbalance in the dataset, it gave incorrect response with too high level of confidence, so we decided against BERT in the end.

## LoRA

In [25]:
from peft import LoraConfig
from peft import get_peft_model

In [26]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=2,
    lora_alpha=8,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)

In [27]:
model_lora = get_peft_model(model_bert, lora_config)

In [28]:
from transformers import Trainer

trainer_lora = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [29]:
trainer_lora.train()

Epoch,Training Loss,Validation Loss
1,0.0298,No log


TrainOutput(global_step=2027, training_loss=0.027778452643633007, metrics={'train_runtime': 2517.603, 'train_samples_per_second': 12.879, 'train_steps_per_second': 0.805, 'total_flos': 8291559182127696.0, 'train_loss': 0.027778452643633007, 'epoch': 1.0})

Save model

In [30]:
trainer_lora.save_model()

Evaluation

In [31]:
lora_eval_results = trainer_lora.evaluate()
print(lora_eval_results)

{'eval_runtime': 243.7582, 'eval_samples_per_second': 33.254, 'eval_steps_per_second': 2.08, 'epoch': 1.0}


Download the model

In [32]:
!zip -r model.zip model/

from google.colab import files
files.download('model.zip')

  adding: model/ (stored 0%)
  adding: model/training_args.bin (deflated 52%)
  adding: model/vocab.txt (deflated 53%)
  adding: model/adapter_model.safetensors (deflated 7%)
  adding: model/model.safetensors (deflated 7%)
  adding: model/adapter_config.json (deflated 55%)
  adding: model/special_tokens_map.json (deflated 42%)
  adding: model/tokenizer_config.json (deflated 75%)
  adding: model/checkpoint-2027/ (stored 0%)
  adding: model/checkpoint-2027/training_args.bin (deflated 52%)
  adding: model/checkpoint-2027/trainer_state.json (deflated 62%)
  adding: model/checkpoint-2027/vocab.txt (deflated 53%)
  adding: model/checkpoint-2027/adapter_model.safetensors (deflated 7%)
  adding: model/checkpoint-2027/model.safetensors (deflated 7%)
  adding: model/checkpoint-2027/adapter_config.json (deflated 55%)
  adding: model/checkpoint-2027/special_tokens_map.json (deflated 42%)
  adding: model/checkpoint-2027/tokenizer_config.json (deflated 75%)
  adding: model/checkpoint-2027/optimizer.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>