# Real or Fake using LSTM & BERT

* This project was created in collaboration with Tomer Segall

In this notebook we train two models:
* LSTM (Long Short-Term Memory)
* BERT (Bidirectional Encoder Representations from Transformers)

Both models predict whether a news report is real or fake.

In [None]:
#Imports

import torch
import torch.nn as nn
import torch.optim as optim
from random import sample
import numpy as np
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
from random import shuffle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Imports BERT

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [None]:
# Read the dataset and build the combined "title + text" input column

filepath = "../input/real-and-fake-news-dataset/news.csv"
df = pd.read_csv(filepath)
# 'Unnamed: 0' is dropped because it is not used as a feature anywhere below.
df.drop('Unnamed: 0', axis=1, inplace=True)
df['titletext'] = df['title'] + " " + df['text']
# Cap the sentence length: keep only the rows whose combined text has fewer
# than 1000 whitespace-separated words. This must be a plain row filter; the
# previous chained form (`df['titletext'] = df = ...`) first tried to store the
# whole filtered DataFrame inside the single 'titletext' column.
df = df[df['titletext'].str.split().str.len().lt(1000)]
df.head()

In [None]:
#Vocabulary class

# Id reserved for the unknown-word placeholder. It has to be 0: real words are
# numbered from 1 upwards (n_words starts at 1), so any positive value would
# sooner or later also be handed out to a real word and the two entries would
# overwrite each other in id2word.
UNK_TOKEN = 0


class Vocab:
    """Bidirectional word <-> id and tag <-> id lookup tables.

    Attributes are plain dictionaries:
      * word2id / id2word -- grow every time index_word() meets a new word.
      * tag2id / id2tag   -- fixed mapping of the two class labels.
      * n_words           -- next free word id (also the table size, since the
                             placeholder occupies id 0).
    """

    def __init__(self):
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        # One entry (the placeholder) already exists, so real words start at 1.
        self.n_words = 1

        self.tag2id = {"FAKE": 0, "REAL": 1}
        self.id2tag = {0: "FAKE", 1: "REAL"}

    def index_words(self, words):
        """Return the list of ids for ``words``, registering unseen words."""
        return [self.index_word(w) for w in words]

    def index_tags(self, tag):
        """Return the id of a class label; raises KeyError for unknown labels."""
        return self.tag2id[tag]

    def index_word(self, w):
        """Return the id of ``w``, assigning the next free id on first sight."""
        if w not in self.word2id:
            self.word2id[w] = self.n_words
            self.id2word[self.n_words] = w
            self.n_words += 1
        return self.word2id[w]




In [None]:
vocab = Vocab()


def prepare_data(data, vocab, input_field):
    """Turn every row of ``data`` into a ``[word_ids, tag_id]`` tensor pair.

    The text in column ``input_field`` is split on whitespace and each word is
    indexed through ``vocab`` (new words are registered on the fly); the value
    in the "label" column is mapped to its tag id. Both tensors are of dtype
    long and are moved to DEVICE.

    Returns the list of pairs together with the (mutated) vocab.
    """
    pairs = []

    for _, record in data.iterrows():
        token_ids = vocab.index_words(record[input_field].split())
        label_id = vocab.index_tags(record["label"])
        pairs.append([
            torch.tensor(token_ids, dtype=torch.long).to(DEVICE),
            torch.tensor(label_id, dtype=torch.long).to(DEVICE),
        ])

    return pairs, vocab



In [None]:
# Create the data sequences for the combined title + text field

sequences, vocab = prepare_data(df, vocab, "titletext")
x = [pair[0] for pair in sequences]
y = [pair[1] for pair in sequences]

# Pad every sentence to the length of the longest one so they can be batched,
# then go back to a list with one padded tensor per sample.
padded_x = nn.utils.rnn.pad_sequence(x, batch_first=True)
x = list(padded_x)

In [None]:
# Vocabulary size. n_words starts at 1 (the slot reserved for "__unk__") and
# grows by one for every distinct word, so the printed value is the number of
# unique words indexed so far plus one.

print(vocab.n_words)

In [None]:
# Split the data into train (60%), validation (20%) and test (20%) sets.
# The second split takes 25% of the remaining 80%, i.e. 20% of the whole.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

# Each sample becomes a two-element list [padded word ids, tag id].
test_sequences = [[features, label] for features, label in zip(x_test, y_test)]
train_sequences = [[features, label] for features, label in zip(x_train, y_train)]
val_sequences = [[features, label] for features, label in zip(x_val, y_val)]

In [None]:
# LSTM classifier: architecture and forward pass

class LSTMNERNet(nn.Module):
    """Embedding -> (bi)LSTM -> linear -> dropout -> linear sentence classifier.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        embedding_size: dimension of each word embedding.
        hidden_size: LSTM hidden size, also the width of the first linear layer.
        output_size: number of classes (size of the returned score vector).
        n_layers: number of stacked LSTM layers.
        directions: 1 for a unidirectional LSTM, 2 for a bidirectional one.
        dropout: dropout probability applied after the first linear layer.

    Raises:
        ValueError: if ``directions`` is neither 1 nor 2.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions, dropout):
        super().__init__()
        if directions not in (1, 2):
            raise ValueError(f"directions must be 1 or 2, got {directions!r}")
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.directions = directions
        self.bidirectional = directions == 2
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=self.bidirectional, batch_first=True)
        self.fc1 = nn.Linear(hidden_size*directions, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_sentence):
        """Return class scores of shape (batch, output_size).

        ``input_sentence`` is a (batch, seq_len) tensor of word ids.
        """
        # Follow the module's own parameters instead of a global constant, so
        # the network works wherever it has been moved to.
        sentence = input_sentence.to(self.embedding.weight.device)
        # The embedding output is already (batch, seq_len, embedding_size),
        # which is what a batch_first LSTM expects; no reshaping is needed.
        embedded = self.embedding(sentence)
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (n_layers * directions, batch, hidden_size). For a
        # bidirectional LSTM the last two entries are the final forward and
        # backward states of the top layer; otherwise only the last entry is.
        if self.bidirectional:
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        output = self.dropout(self.fc1(final_state))
        output = self.out(output)

        return output

In [None]:
def evaluate(eval_sequences, batch_size):
    """Return the accuracy (in percent) of the global ``model`` on a dataset.

    Args:
        eval_sequences: list of ``[word_ids, tag_id]`` pairs, all padded to
            the same length so they can be batched.
        batch_size: number of samples per forward pass.

    Raises:
        ValueError: if ``eval_sequences`` is empty.

    NOTE: the network is read from the module-level name ``model``; it is not
    a parameter of this function.
    """
    if len(eval_sequences) == 0:
        raise ValueError("eval_sequences is empty; accuracy is undefined")

    # Sample order has no influence on accuracy, so no shuffling is needed.
    eval_loader = DataLoader(eval_sequences, batch_size=batch_size, shuffle=False)
    preds = []
    tags = []

    # Dropout must be switched off while measuring accuracy; the previous
    # mode is restored afterwards so an ongoing training run is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for words, tag in eval_loader:
                preds.append(model(words).argmax(dim=1).cpu().numpy())
                tags.append(tag.cpu().numpy())
    finally:
        model.train(was_training)

    preds = np.concatenate(preds).ravel()
    tags = np.concatenate(tags).ravel()
    accuracy = (preds == tags).sum() / len(tags) * 100
    return accuracy

In [None]:
def train_loop(model, n_epochs, batch_size, train_set, test_set):
    """Train ``model`` with Adam and cross-entropy, reporting accuracy per epoch.

    Args:
        model: the network to train; it is moved to DEVICE in place.
        n_epochs: number of passes over ``train_set``.
        batch_size: mini-batch size for training and for the accuracy checks.
        train_set: list of ``[word_ids, tag_id]`` pairs used for the updates.
        test_set: list of pairs scored after every epoch (printed as
            "Validation Accuracy").

    NOTE: evaluate() scores the module-level name ``model``, not this
    function's argument, so callers must bind the network to that global name
    before calling train_loop (the experiment cells do).
    """
    # prepare_data() places the tensors on DEVICE; the parameters have to live
    # on the same device or the first forward pass fails on a GPU machine.
    model.to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    for e in range(1, n_epochs + 1):
        # Make sure dropout is active for the updates of this epoch.
        model.train()
        for count, (words, tags) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(words)
            sentence_loss = criterion(output, tags.to(output.device))
            sentence_loss.backward()
            optimizer.step()
            if count % 100 == 0:
                # .item() prints the number instead of the tensor repr.
                print(f"Epoch #{e}, Batch: {count},  Loss: {sentence_loss.item():.4f}")

        train_accuracy = evaluate(train_set, batch_size)
        print(f"Epoch {e}, Training Accuracy: {train_accuracy}%")

        test_accuracy = evaluate(test_set, batch_size)
        print(f"Epoch {e}, Validation Accuracy: {test_accuracy}%")

In [None]:
# See which field is more informative: the title or the text

# Pre-process the titles
title_sequences, vocab = prepare_data(df, vocab, "title")
title_x = [i[0] for i in title_sequences]
title_y = [i[1] for i in title_sequences]

# Pad the sentences so they can be batched
title_padded_x = torch.nn.utils.rnn.pad_sequence(title_x, batch_first=True)
title_x = list(title_padded_x)

title_x_train, title_x_test, title_y_train, title_y_test = train_test_split(title_x, title_y, test_size=0.2, random_state=42)
title_test_sequences = [list(pair) for pair in zip(title_x_test, title_y_test)]
title_train_sequences = [list(pair) for pair in zip(title_x_train, title_y_train)]

# Read the vocabulary size after prepare_data: the shared vocab keeps growing.
input_size = vocab.n_words
embedding_size = 300
hidden_sizes = [300, 500]
output_size = len(vocab.id2tag)
n_layers = [2,3]
directions = [2]
n_epochs = 10
dropouts = [0.2]
batch_sizes = [32]
result_df = pd.DataFrame([])
# One single-row frame per configuration; concatenated once at the end
# (DataFrame.append no longer exists in pandas 2.x).
title_result_frames = []

# Only title
print("Train only on Titles")
for hidden_size in hidden_sizes:
    for layers in n_layers:
        for direction in directions:
            for dropout in dropouts:
                for batch_size in batch_sizes:
                    caption = f"hidden_size - {hidden_size}, n_layers - {layers}, directions - {direction}, dropout {dropout}"
                    print(caption)
                    # The data tensors live on DEVICE, so the model must too.
                    model = LSTMNERNet(input_size, embedding_size, hidden_size, output_size, layers, direction, dropout).to(DEVICE)
                    train_loop(model, n_epochs, batch_size, title_train_sequences, title_test_sequences)
                    train_accuracy = evaluate(title_train_sequences, batch_size)
                    test_accuracy = evaluate(title_test_sequences, batch_size)
                    temp_df = pd.DataFrame([[train_accuracy, test_accuracy]], index=[caption], columns=["training_accuracy", "test_accuracy"])
                    title_result_frames.append(temp_df)

if title_result_frames:
    result_df = pd.concat(title_result_frames)
# result_df is re-created by the following experiments, so keep the title
# results under their own name and show them now.
title_result_df = result_df
display(title_result_df)

# Pre-process the article bodies
text_sequences, vocab = prepare_data(df, vocab, "text")
text_x = [i[0] for i in text_sequences]
text_y = [i[1] for i in text_sequences]

# Pad the sentences so they can be batched
text_padded_x = torch.nn.utils.rnn.pad_sequence(text_x, batch_first=True)
text_x = list(text_padded_x)

text_x_train, text_x_test, text_y_train, text_y_test = train_test_split(text_x, text_y, test_size=0.2, random_state=42)
text_test_sequences = [list(pair) for pair in zip(text_x_test, text_y_test)]
text_train_sequences = [list(pair) for pair in zip(text_x_train, text_y_train)]

# Read the vocabulary size after prepare_data: the shared vocab keeps growing.
input_size = vocab.n_words
embedding_size = 300
hidden_sizes = [500]
output_size = len(vocab.id2tag)
n_layers = [2]
directions = [2]
n_epochs = 6
dropouts = [0.2]
batch_sizes = [32]
result_df = pd.DataFrame([])
# One single-row frame per configuration; concatenated once at the end
# (DataFrame.append no longer exists in pandas 2.x).
text_result_frames = []


# Only on text
print("Train only on Text")
for hidden_size in hidden_sizes:
    for layers in n_layers:
        for direction in directions:
            for dropout in dropouts:
                for batch_size in batch_sizes:
                    caption = f"hidden_size - {hidden_size}, n_layers - {layers}, directions - {direction}, dropout {dropout}"
                    print(caption)
                    # The data tensors live on DEVICE, so the model must too.
                    model = LSTMNERNet(input_size, embedding_size, hidden_size, output_size, layers, direction, dropout).to(DEVICE)
                    train_loop(model, n_epochs, batch_size, text_train_sequences, text_test_sequences)
                    train_accuracy = evaluate(text_train_sequences, batch_size)
                    test_accuracy = evaluate(text_test_sequences, batch_size)
                    temp_df = pd.DataFrame([[train_accuracy, test_accuracy]], index=[caption], columns=["training_accuracy", "test_accuracy"])
                    text_result_frames.append(temp_df)

if text_result_frames:
    result_df = pd.concat(text_result_frames)
# result_df is re-created by the next experiment, so keep the text-only
# results under their own name and show them now.
text_result_df = result_df
display(text_result_df)

In [None]:
# Try different hyper-parameters for the model (trained on title + text)

input_size = vocab.n_words
embedding_size = 300
hidden_sizes = [300, 500]
output_size = len(vocab.id2tag)
n_layers = [2,3]
directions = [2]
n_epochs = 10
dropouts = [0.2]
batch_sizes = [32]
result_df = pd.DataFrame([])
# One single-row frame per configuration; concatenated once at the end
# (DataFrame.append no longer exists in pandas 2.x).
search_result_frames = []

for hidden_size in hidden_sizes:
    for layers in n_layers:
        for direction in directions:
            for dropout in dropouts:
                for batch_size in batch_sizes:
                    caption = f"hidden_size - {hidden_size}, n_layers - {layers}, directions - {direction}, dropout {dropout}"
                    print(caption)
                    # The data tensors live on DEVICE, so the model must too.
                    model = LSTMNERNet(input_size, embedding_size, hidden_size, output_size, layers, direction, dropout).to(DEVICE)
                    # Per-epoch monitoring uses the validation split; the test
                    # split is only scored once, after training has finished.
                    train_loop(model, n_epochs, batch_size, train_sequences, val_sequences)
                    train_accuracy = evaluate(train_sequences, batch_size)
                    test_accuracy = evaluate(test_sequences, batch_size)
                    temp_df = pd.DataFrame([[train_accuracy, test_accuracy]], index=[caption], columns=["training_accuracy", "test_accuracy"])
                    search_result_frames.append(temp_df)

if search_result_frames:
    result_df = pd.concat(search_result_frames)

In [None]:
# Display the accuracy table filled in by the most recent experiment loop.
result_df

In [None]:
# ---- Settings for the BERT part of the notebook ----

# Location of the raw dataset CSV.
raw_data_path = '../input/real-and-fake-news-dataset/news.csv'
# Folder the preprocessed train/valid/test CSV files are written to.
destination_folder = './'
# Folder that contains the raw dataset.
source_folder = '../input/real-and-fake-news-dataset'

# Passed as train_size to the first train_test_split: the fraction of each
# class kept for training + validation; everything else becomes the test set.
# NOTE(review): 0.10 leaves 90% of the data for testing -- confirm intended.
train_test_ratio = 0.10
# Passed as train_size to the second split: the fraction of the kept rows
# used for training; the rest is the validation set.
train_valid_ratio = 0.80

# Number of leading words kept by trim_string().
first_n_words = 200

In [None]:
def trim_string(x, n_words=None):
    """Return ``x`` reduced to its first ``n_words`` whitespace-separated words.

    The kept words are re-joined with single spaces, so runs of whitespace
    and leading/trailing whitespace are normalised as a side effect.

    Args:
        x: the string to shorten.
        n_words: maximum number of words to keep. Defaults to the
            module-level ``first_n_words`` setting, looked up at call time.

    Raises:
        ValueError: if ``n_words`` is negative.
    """
    if n_words is None:
        n_words = first_n_words
    if n_words < 0:
        raise ValueError(f"n_words must be non-negative, got {n_words}")

    # maxsplit stops scanning once enough words are found; the slice then
    # drops the unsplit remainder that str.split leaves as the last element.
    return ' '.join(x.split(maxsplit=n_words)[:n_words])

In [None]:
# Load the raw dataset
df_raw = pd.read_csv(raw_data_path)

# Build the model columns: a binary label (1 = FAKE, 0 = REAL) and the
# combined "title. text" field, then keep only the columns used below.
df_raw['label'] = df_raw['label'].eq('FAKE').astype('int')
df_raw['titletext'] = df_raw['title'] + ". " + df_raw['text']
df_raw = df_raw.reindex(columns=['label', 'title', 'text', 'titletext'])

# Discard the rows whose text is shorter than 5 characters
df_raw.drop(index=df_raw.index[df_raw['text'].str.len() < 5], inplace=True)

# Shorten text and titletext to their first first_n_words words
df_raw['text'] = df_raw['text'].map(trim_string)
df_raw['titletext'] = df_raw['titletext'].map(trim_string)

# Separate the two classes so that each one is split with the same ratios
df_real = df_raw.loc[df_raw['label'] == 0]
df_fake = df_raw.loc[df_raw['label'] == 1]

# (train + valid) / test split, per class
df_real_full_train, df_real_test = train_test_split(df_real, train_size=train_test_ratio, random_state=1)
df_fake_full_train, df_fake_test = train_test_split(df_fake, train_size=train_test_ratio, random_state=1)

# train / valid split, per class
df_real_train, df_real_valid = train_test_split(df_real_full_train, train_size=train_valid_ratio, random_state=1)
df_fake_train, df_fake_valid = train_test_split(df_fake_full_train, train_size=train_valid_ratio, random_state=1)

# Put the two classes back together, real rows first
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Save the three preprocessed splits as CSV files
df_train.to_csv(destination_folder + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameters
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
# Labels are read as plain numbers; text columns are tokenized straight to
# BERT ids by the tokenizer (use_vocab=False) and fixed to MAX_SEQ_LEN,
# padding with the tokenizer's own pad id.

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
# Order must match the column order of the CSV files written by preprocessing.
fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]

# TabularDataset
# The preprocessed splits were written to destination_folder, so they have to
# be read back from there; source_folder only holds the raw dataset.

train, valid, test = TabularDataset.splits(path=destination_folder, train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                            device=DEVICE, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                            device=DEVICE, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=DEVICE, train=False, shuffle=False, sort=False)


In [None]:
class BERT(nn.Module):
    """Thin wrapper around a pretrained ``BertForSequenceClassification``."""

    def __init__(self):
        super().__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        """Return the first two encoder outputs for a batch.

        Args:
            text: (batch, seq_len) tensor of token ids, padded with the
                model's pad token id.
            label: (batch,) tensor of class ids.

        Returns:
            ``(loss, text_fea)`` -- with labels supplied these are the
            classification loss and the per-class scores (see the
            transformers documentation for BertForSequenceClassification).
        """
        # Without an attention mask the encoder attends to padding positions
        # as if they were real tokens; mask them out whenever the pad id is
        # known from the model configuration.
        pad_id = self.encoder.config.pad_token_id
        attention_mask = None if pad_id is None else (text != pad_id).long()
        loss, text_fea = self.encoder(text, attention_mask=attention_mask, labels=label)[:2]

        return loss, text_fea

In [None]:
# Training Function

def _bert_batch_loss(model, batch):
    """Run one torchtext batch through ``model`` and return its loss tensor."""
    (labels, title, text, titletext), _ = batch
    # Cast and move in one step; no detour through a CPU LongTensor.
    labels = labels.to(device=DEVICE, dtype=torch.long)
    titletext = titletext.to(device=DEVICE, dtype=torch.long)
    loss, _ = model(titletext, labels)
    return loss


def _mean_validation_loss(model, valid_loader):
    """Return the average batch loss over ``valid_loader`` (NaN if it is empty).

    The model is put in eval mode for the pass and back in train mode after.
    """
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            total += _bert_batch_loss(model, batch).item()
    model.train()

    n_batches = len(valid_loader)
    return total / n_batches if n_batches else float("nan")


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = None,
          best_valid_loss = float("Inf")):
    """Fine-tune ``model`` and periodically report train/validation loss.

    Args:
        model: network called as ``model(titletext, labels)`` and returning
            ``(loss, ...)``.
        optimizer: optimizer already bound to the model's parameters.
        criterion: unused (the loss comes from the model itself); kept so
            existing calls keep working.
        train_loader: iterator over the training batches.
        valid_loader: iterator over the validation batches.
        num_epochs: number of passes over ``train_loader``.
        eval_every: number of training steps between two validation passes.
            Defaults to half an epoch of ``train_loader``; values below 1 are
            raised to 1 so the step check can never divide by zero.
        best_valid_loss: unused; kept so existing calls keep working.

    Returns:
        ``(train_loss_list, valid_loss_list, global_steps_list)``: the average
        losses recorded at each evaluation and the step they were taken at.
    """
    if eval_every is None:
        eval_every = len(train_loader) // 2
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            loss = _bert_batch_loss(model, batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                average_train_loss = running_loss / eval_every
                average_valid_loss = _mean_validation_loss(model, valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # the training average restarts after every report
                running_loss = 0.0

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))

    print('Finished Training!')
    return train_loss_list, valid_loss_list, global_steps_list

# Build the BERT classifier on DEVICE and fine-tune it with Adam (lr=2e-5),
# using train()'s default loaders and epoch count. This rebinds the
# module-level name `model`.
model = BERT().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

In [None]:
# Evaluation Function

def evaluate(model, test_loader):
    """Print a classification report and draw the confusion matrix.

    Every batch of ``test_loader`` is run through ``model`` in eval mode with
    gradients disabled; the predicted class is the argmax over the second
    model output. Both the report and the matrix list label 1 first and
    label 0 second, which the plot names 'FAKE' and 'REAL'.
    """
    predictions = []
    targets = []

    model.eval()
    with torch.no_grad():
        for (labels, title, text, titletext), _ in test_loader:
            labels = labels.type(torch.LongTensor).to(DEVICE)
            titletext = titletext.type(torch.LongTensor).to(DEVICE)

            _, scores = model(titletext, labels)
            predictions += torch.argmax(scores, 1).tolist()
            targets += labels.tolist()

    print('Classification Report:')
    print(classification_report(targets, predictions, labels=[1,0], digits=4))

    matrix = confusion_matrix(targets, predictions, labels=[1,0])
    axes = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=axes, cmap='Blues', fmt="d")

    axes.set_title('Confusion Matrix')
    axes.set_xlabel('Predicted Labels')
    axes.set_ylabel('True Labels')

    # Same class names on both axes, in the order of labels=[1,0].
    class_names = ['FAKE', 'REAL']
    axes.xaxis.set_ticklabels(class_names)
    axes.yaxis.set_ticklabels(class_names)


# Score the fine-tuned BERT model on the test iterator.
evaluate(model, test_iter)

## Thank you for reading