## Check for Available GPU

In [None]:
import torch

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the one at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Import Kaggle Datasets

In [None]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets/data")

Skipping, found downloaded files in "./fake-news-detection-datasets" (use force=True to force download)


In [None]:
import pandas as pd

# Folder produced by the `od.download(...)` call above. The download lands in
# the current working directory ("./fake-news-detection-datasets"), so the CSVs
# are resolved relative to it rather than through Colab's absolute `/content`
# prefix. The space in "News _dataset" is intentional: it is how the folder is
# spelled in the original path.
news_dir = "fake-news-detection-datasets/News _dataset"

# One CSV per class: real articles and fake articles.
true_df = pd.read_csv(f"{news_dir}/True.csv")
fake_df = pd.read_csv(f"{news_dir}/Fake.csv")

# Report how many articles each file contains.
print('Number of True Articles: {:,}'.format(true_df.shape[0]))
print('Number of Fake Articles: {:,}'.format(fake_df.shape[0]))

# Show 10 random real articles as a sanity check that loading worked.
true_df.sample(10)


Number of True Articles: 21,417
Number of Fake Articles: 23,481


Unnamed: 0,title,text,subject,date
1605,House Republican unveils tax bill to aid hurri...,WASHINGTON (Reuters) - The top Republican tax ...,politicsNews,"September 22, 2017"
8106,Clinton condemns 'apparent terrorist attacks' ...,WASHINGTON (Reuters) - Democratic presidential...,politicsNews,"September 18, 2016"
525,U.S. court to hold hearing Monday on who will ...,WASHINGTON (Reuters) - A federal court in the ...,politicsNews,"November 27, 2017"
15464,Britain says expects most EU citizens can stay...,LONDON (Reuters) - The British government said...,worldnews,"November 7, 2017"
10115,Pentagon to send about a dozen Guantanamo inma...,WASHINGTON (Reuters) - The Pentagon plans to t...,politicsNews,"March 31, 2016"
2375,Illinois governor rejects school funding legis...,CHICAGO (Reuters) - Illinois Governor Bruce Ra...,politicsNews,"August 1, 2017"
918,Congress should weigh U.S. regulation of gun '...,WASHINGTON (Reuters) - Congress should conside...,politicsNews,"November 1, 2017"
8699,"On bike, horse and foot, police keep order out...",CLEVELAND (Reuters) - City police on bicycles ...,politicsNews,"July 20, 2016"
8854,Sanders endorsement of Clinton could come as e...,WASHINGTON (Reuters) - Democratic presumptive ...,politicsNews,"July 6, 2016"
15019,Former intelligence officials say Trump is bei...,WASHINGTON (Reuters) - Two former top U.S. int...,worldnews,"November 12, 2017"


## Combine Both Datasets, Assign Labels, Remove Title to Only Analyze Article


In [None]:
# Label the two sources: 1 = real article, 0 = fake article.
true_df['label'] = 1
fake_df['label'] = 0

# Build the modelling frame in one non-mutating chain so that re-running this
# cell always starts from the two source frames.
combined_df = (
    pd.concat([true_df, fake_df], ignore_index=True)
    # Keep only the article body and its label. Dates, subjects and titles are
    # dropped on purpose: telling real from fake is much easier with the title,
    # so removing it makes the task harder.
    .drop(columns=['date', 'subject', 'title'])
    # Keep the first occurrence of each distinct article text.
    .drop_duplicates(subset='text', keep='first')
    # Shuffle with a fixed seed so the row order (and everything derived from
    # it) is the same on every run.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Parallel arrays of article texts and their labels.
articles = combined_df.text.values
labels = combined_df.label.values

# Display a sample of the combined frame.
combined_df.head(10)

Unnamed: 0,text,label
0,Just when you thought Kentucky County Clerk Ki...,0
1,PARIS (Reuters) - France s foreign ministry sa...,1
2,BERLIN (Reuters) - Germany s departing finance...,1
3,"In a pair of tweets on Thursday afternoon, Don...",0
4,GENEVA (Reuters) - Beijing is waging a campaig...,1
5,WASHINGTON (Reuters) - U.S. President Donald T...,1
6,"HUNTINGTON, WV (Reuters) - When Carissa Sellar...",1
7,BUENOS AIRES (Reuters) - Dozens of relatives o...,1
8,History has shown that one of the most cost-ef...,0
9,The worst thing is the number of white people ...,0


## Import BERT

In [None]:
from transformers import BertTokenizer

# Fetch the WordPiece tokenizer that matches the uncased BERT-base checkpoint.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Tokenize Articles

In [None]:
# Tokenize every article and map its tokens to vocabulary IDs.
input_ids = []
attention_masks = []
max_len = 512  # Articles longer than this many tokens are truncated.

# Print the number of articles for reference.
print("Num of Articles:", len(articles))

# Encode the articles one at a time. The tokenizer is called directly;
# `encode_plus` is deprecated in favour of `tokenizer(...)`, which accepts the
# same arguments used here.
for track_num, article in enumerate(articles, start=1):
    encoded_dict = tokenizer(
        article,                        # Text to encode.
        add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
        max_length=max_len,             # Target length for padding/truncation.
        padding='max_length',           # Pad shorter articles up to max_len.
        truncation=True,                # Cut longer articles down to max_len.
        return_attention_mask=True,     # Mask separating real tokens from padding.
        return_tensors='pt',            # Return PyTorch tensors.
    )

    # Collect the encoded article and its attention mask.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

    # Progress report: tokenization is slow, so log every 1000 articles.
    if track_num % 1000 == 0:
        print("Articles Encoded:", track_num)

# Stack the per-article tensors along the first dimension and convert the
# labels to a tensor as well.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Show the first article next to its token IDs.
print('Original: ', articles[0])
print('Token IDs:', input_ids[0])

Num of Articles: 38646
Articles Encoded: 1000
Articles Encoded: 2000
Articles Encoded: 3000
Articles Encoded: 4000
Articles Encoded: 5000
Articles Encoded: 6000
Articles Encoded: 7000
Articles Encoded: 8000
Articles Encoded: 9000
Articles Encoded: 10000
Articles Encoded: 11000
Articles Encoded: 12000
Articles Encoded: 13000
Articles Encoded: 14000
Articles Encoded: 15000
Articles Encoded: 16000
Articles Encoded: 17000
Articles Encoded: 18000
Articles Encoded: 19000
Articles Encoded: 20000
Articles Encoded: 21000
Articles Encoded: 22000
Articles Encoded: 23000
Articles Encoded: 24000
Articles Encoded: 25000
Articles Encoded: 26000
Articles Encoded: 27000
Articles Encoded: 28000
Articles Encoded: 29000
Articles Encoded: 30000
Articles Encoded: 31000
Articles Encoded: 32000
Articles Encoded: 33000
Articles Encoded: 34000
Articles Encoded: 35000
Articles Encoded: 36000
Articles Encoded: 37000
Articles Encoded: 38000
Original:  Just when you thought Kentucky County Clerk Kim Davis would dri

## Training - Validation - Test Split

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle token IDs, attention masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 70 / 10 / 20 split for training, validation and test. The test size is the
# remainder so that the three sizes always add up to len(dataset).
train_size = round(0.7 * len(dataset))
val_size = round(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Use a dedicated, seeded generator for the split. The global seeds are only
# set later (in the training cell), so without this the membership of the
# validation and test sets would change on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

27,052 training samples
3,865 validation samples
7,729 test samples


## Iterator for Data

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in a fresh random order each epoch
# (shuffle=True makes the DataLoader build a RandomSampler itself).
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches are read in dataset order (the DataLoader's default
# SequentialSampler), since order does not matter for evaluation.
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Model Training


#### Import BERT Sequence Classification

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load pretrained BERT with a sequence-classification head on top. Attention
# weights and hidden states are not returned, only loss/logits.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the device chosen in the first cell. Unlike an
# unconditional `model.cuda()`, this also works on the CPU fallback.
# `to()` returns the model, so the cell still displays its architecture.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#### Optimizer and LR Scheduler

In [None]:
from transformers import get_linear_schedule_with_warmup

# Optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

# Number of passes over the training set.
epochs = 3

# Total number of optimizer steps. The training loop calls `scheduler.step()`
# once per batch, so this is (batches per epoch) * epochs. It is NOT
# batch_size * epochs: that would be 96 steps here, after which the linear
# schedule would hold the learning rate at 0 for the rest of training.
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial learning rate to 0 over `total_steps`,
# with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

#### Accuracy and Time Helpers

In [None]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class matches the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class labels with one entry per row of `preds`.
    """
    # Predicted class = index of the largest score in each row.
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    return (predicted == expected).sum() / expected.size

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an `h:mm:ss` string.

    The value is rounded to the nearest whole second first; formatting is
    delegated to `datetime.timedelta`, so durations of a day or more come out
    as e.g. `1 day, 1:01:01`.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

#### Training Loop - Copied From Notebook 4


In [None]:
import random
import numpy as numpy

# Seed every random number generator involved in training.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict per epoch: training and validation loss, validation accuracy,
# and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted as h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch and copy each tensor to the selected
        # device. `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass. Passing `labels` makes the model return the loss.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks; not needed here.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters using the computed gradients.
        optimizer.step()

        # Update the learning rate (one scheduler step per batch).
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode.
    model.eval()

    # Tracking variables. Accuracy is accumulated weighted by batch size so
    # that a smaller final batch does not count as much as a full one.
    total_eval_accuracy = 0
    total_eval_examples = 0
    total_eval_loss = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            # Forward pass, calculate loss and logit predictions.
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate this batch's accuracy, weighted by its number of examples.
        total_eval_accuracy += flat_accuracy(logits, label_ids) * len(label_ids)
        total_eval_examples += len(label_ids)

    # Report the final accuracy for this validation run: the fraction of all
    # validation examples that were classified correctly.
    avg_val_accuracy = total_eval_accuracy / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    846.    Elapsed: 0:01:55.
  Batch    80  of    846.    Elapsed: 0:03:54.
  Batch   120  of    846.    Elapsed: 0:05:53.
  Batch   160  of    846.    Elapsed: 0:07:53.
  Batch   200  of    846.    Elapsed: 0:09:53.
  Batch   240  of    846.    Elapsed: 0:11:52.
  Batch   280  of    846.    Elapsed: 0:13:51.
  Batch   320  of    846.    Elapsed: 0:15:51.
  Batch   360  of    846.    Elapsed: 0:17:50.
  Batch   400  of    846.    Elapsed: 0:19:50.
  Batch   440  of    846.    Elapsed: 0:21:49.
  Batch   480  of    846.    Elapsed: 0:23:49.
  Batch   520  of    846.    Elapsed: 0:25:48.
  Batch   560  of    846.    Elapsed: 0:27:48.
  Batch   600  of    846.    Elapsed: 0:29:47.
  Batch   640  of    846.    Elapsed: 0:31:47.
  Batch   680  of    846.    Elapsed: 0:33:47.
  Batch   720  of    846.    Elapsed: 0:35:46.
  Batch   760  of    846.    Elapsed: 0:37:45.
  Batch   800  of    846.    Elapsed: 0:39:45.
  Batch   840  of    846.    Elapsed: 0:41:44.


#### Summary of Training

In [None]:
import pandas as pd

# Show floats with two digits of precision in rendered tables.
pd.set_option('display.precision', 2)

# Turn the per-epoch dicts into a table keyed by epoch number.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Render the table as the cell output.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.02,0.0084,1.0,0:42:02,0:01:58
2,0.01,0.0084,1.0,0:42:06,0:01:58
3,0.01,0.0084,1.0,0:42:04,0:01:58


## Testing the Model

In [None]:
# Iterate over the held-out test split in order, with the training batch size.
prediction_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(test_dataset)))

# Switch the model to evaluation mode.
model.eval()

# Per-batch logits and the matching ground-truth labels.
predictions, true_labels = [], []

# Gradients are never needed for inference, so disable them for the whole
# pass to save memory and time.
with torch.no_grad():
    for batch in prediction_dataloader:
        # Move the batch to the selected device, then split it into its parts.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass without labels: only the logits are needed.
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

        # Bring logits and labels back to the CPU as numpy arrays.
        logits = result.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 7,729 test sentences...
    DONE.


#### Using the Matthews Correlation Coefficient (MCC)

In [None]:
from sklearn.metrics import matthews_corrcoef

# Score each test batch separately with the Matthews correlation coefficient.
print('Calculating Matthews Corr. Coef. for each batch...')

# Each entry of `predictions` holds one row of class scores per example;
# argmax over the class axis turns a row into its predicted label before it
# is compared with that batch's true labels.
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

Calculating Matthews Corr. Coef. for each batch...


In [None]:
# Stack the per-batch logits into one array and take, for every sample, the
# class with the higher score.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

# Stack the per-batch ground-truth labels the same way.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Matthews correlation coefficient over the whole test set.
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('Total MCC: %.3f' % mcc)

Total MCC: 0.998
