# Fake News Detection: an application of classic NLP techniques
**Universidade de Brasília**<br>
School of Technology<br>
Graduate Program in Electrical Engineering (PPGEE)

## Author: Stefano M P C Souza (stefanomozart@ieee.org)<br> Advisor: Daniel G Silva<br>Advisor: Anderson C A Nascimento



In [1]:
from sentence_transformers import SentenceTransformer
import time
import numpy as np
import torch

In [2]:
import joblib
# Restore the objects persisted with joblib: `datasets` (iterated below, one entry
# per corpus) and `experiments` (map of experiment id -> experiment record).
datasets, experiments = (joblib.load(path) for path in ('datasets.pyd', 'experiments.pyd'))

In [3]:
# Register one experiment record per sentence-transformer model so that their
# preprocessing runtimes can be compared with the other NLP preprocessing experiments.
experiments.update({
    'E11': {'preprocessing_time': {}, 'name': 'stsb-distilbert-base'},
    'E12': {'preprocessing_time': {}, 'name': 'paraphrase-multilingual-mpnet-base-v2'},
})

In [4]:
# Encode texts
# Every dataset split is encoded with each sentence-transformer model; the time spent
# encoding is recorded and the embeddings are stored as torch tensors and numpy arrays.
# Each model is paired with its own experiment id: the timing used to be written to
# `experiments[e]` with `e` never assigned in this notebook, so it either raised a
# NameError or landed under whatever stale key was left in the kernel.
for e, name in [('E11', 'stsb-distilbert-base'), ('E12', 'paraphrase-multilingual-mpnet-base-v2')]:
    t = SentenceTransformer(name)
    for d in datasets:
        print(name, d['name'])

        # Mark start time (time.process_time is process CPU time, not wall-clock time)
        start = time.process_time()

        # Encode train, valid, train_valid and test sets, in that order
        encodings = {
            split: t.encode(d[split].text.tolist(), show_progress_bar=True)
            for split in ('train', 'valid', 'train.valid', 'test')
        }

        # Compute elapsed time before any file is written, so only encoding is measured
        experiments[e]['preprocessing_time'][d['name']] = time.process_time() - start

        for split, encoded in encodings.items():
            # Save encodings as torch tensors
            torch.save(torch.tensor(encoded), f"datasets/{d['name']}/{split}.{name}.pt", _use_new_zipfile_serialization=False)

            # Save encodings as numpy arrays
            np.save(f"datasets/{d['name']}/{split}.{name}.npy", encoded, allow_pickle=True)

# Persist the updated experiments map (now including the E11/E12 runtimes)
joblib.dump(experiments, 'experiments.pyd')

stsb-distilbert-base liar


Batches:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/320 [00:00<?, ?it/s]

Batches:   0%|          | 0/80 [00:00<?, ?it/s]

stsb-distilbert-base sbnc


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

stsb-distilbert-base fake.br


Batches:   0%|          | 0/144 [00:00<?, ?it/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Batches:   0%|          | 0/180 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

stsb-distilbert-base factck.br


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

paraphrase-multilingual-mpnet-base-v2 liar


Batches:   0%|          | 0/256 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/320 [00:00<?, ?it/s]

Batches:   0%|          | 0/80 [00:00<?, ?it/s]

paraphrase-multilingual-mpnet-base-v2 sbnc


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

paraphrase-multilingual-mpnet-base-v2 fake.br


Batches:   0%|          | 0/144 [00:00<?, ?it/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Batches:   0%|          | 0/180 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

paraphrase-multilingual-mpnet-base-v2 factck.br


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

['experiments.pyd']

In [5]:
# Persist the labels of every split of every dataset, cast to int, once as a
# torch tensor (`.labels.pth`) and once as a numpy array (`.labels.npy`).
for d in datasets:
    target_dir = f"datasets/{d['name']}"

    # Plain Python int lists, keyed by split name (insertion order is the save order)
    labels = {
        split: d[split].label.astype(int).tolist()
        for split in ('train', 'valid', 'train.valid', 'test')
    }

    # torch copies, stored with dtype torch.long
    for split, y in labels.items():
        torch.save(torch.tensor(y, dtype=torch.long), f"{target_dir}/{split}.labels.pth")

    # numpy copies
    for split, y in labels.items():
        np.save(f"{target_dir}/{split}.labels.npy", y, allow_pickle=True)
