<a href="https://colab.research.google.com/github/redfrog66/NLP_Fake_News_Detection/blob/main/NLP_Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake news detection using NLP

Load the dataset:

In [1]:
!pip install transformers evaluate datasets peft --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m552.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["train"])
df_validation = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["validation"])
df_test = pd.read_parquet("hf://datasets/GonzaloA/fake_news/" + splits["test"])

df = pd.concat([df_train,df_validation,df_test])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 40587 entries, 0 to 8116
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  40587 non-null  int64 
 1   title       40587 non-null  object
 2   text        40587 non-null  object
 3   label       40587 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.5+ MB


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0
1,1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0
2,2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1
3,3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0
4,4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0


Drop unwanted properties

In [4]:
df = df.drop(['Unnamed: 0', 'title'], axis=1)

Text preprocessing

In [5]:
import re
import string
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'\[.*?\]|\W|https?://\S+|www\.\S+|<.*?>+|\n|\w*\d\w*', ' ', text)
  return text
df['text'] = df['text'].fillna('').apply(preprocess_text)

Delete empty rows

In [6]:
empty_rows_count = (df['text'].str.strip() == '').sum()
empty_rows_count

np.int64(57)

In [7]:
df = df[df['text'].str.strip().astype(bool)]

In [8]:
df.head()

Unnamed: 0,text,label
0,maury is perhaps one of the trashiest shows on...,0
1,yesterday after the father of one of the ucla...,0
2,moscow reuters russia on wednesday warned ...,1
3,house majority whip steve scalise r la th...,0
4,it can be said that late show host stephen col...,0


Split data to train and test

In [9]:
from sklearn.model_selection import train_test_split
x = df['text']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

## Logistic regression:

Vectorize the data

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(xv_train,y_train)
pred_lr = lr.predict(xv_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_lr))
from sklearn.metrics import accuracy_score
print('Model accuracy:')
accuracy_score(y_test, pred_lr)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      3685
           1       0.98      0.97      0.97      4421

    accuracy                           0.97      8106
   macro avg       0.97      0.97      0.97      8106
weighted avg       0.97      0.97      0.97      8106

Model accuracy:


0.9706390328151986

TODO: Preprocess data differently(?), apply more models(BERT, LoRa, NN(?)), Add magic

## BERT

Tokenize the texts

In [30]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [31]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

df_train_split = pd.DataFrame({'text': x_train, 'label': y_train})
df_test_split = pd.DataFrame({'text': x_test, 'label': y_test})

train_dataset = Dataset.from_pandas(df_train_split)
test_dataset = Dataset.from_pandas(df_test_split)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/32424 [00:00<?, ? examples/s]

Map:   0%|          | 0/8106 [00:00<?, ? examples/s]

In [32]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
import evaluate

accuracy = evaluate.load("accuracy")

In [34]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [35]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

Load the model

In [48]:
from transformers import AutoModelForSequenceClassification

model_bert = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model

In [49]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="model",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    label_names=["NEGATIVE", "POSITIVE"]
)

In [56]:
from transformers import Trainer

trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [57]:
trainer_bert.train()

Epoch,Training Loss,Validation Loss
1,0.4556,No log


TrainOutput(global_step=2027, training_loss=0.5288494638933239, metrics={'train_runtime': 2775.0387, 'train_samples_per_second': 11.684, 'train_steps_per_second': 0.73, 'total_flos': 8538012266977152.0, 'train_loss': 0.5288494638933239, 'epoch': 1.0})

Evaluation

In [2]:
bert_eval_results = trainer_bert.evaluate()
print(bert_eval_results)

NameError: name 'trainer_bert' is not defined

In [1]:
print(bert_eval_results.get("eval_accuracy"))

NameError: name 'bert_eval_results' is not defined

## LoRA

In [51]:
from peft import LoraConfig
from peft import get_peft_model

In [52]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=2,
    lora_alpha=8,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)

In [53]:
model_lora = get_peft_model(model_bert, lora_config)

In [54]:
from transformers import Trainer

trainer_lora = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [55]:
trainer_lora.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

Evaluation

In [None]:
lora_eval_results = trainer_lora.evaluate()
print(lora_eval_results)

In [None]:
print(lora_eval_results.get("eval_accuracy"))