In [1]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [53]:
df = pd.read_csv('news_gen_just_text.csv')

In [54]:
df.head()

Unnamed: 0,Article Content,Real or Fake
0,Health Secretary Matt Hancock has told MPs the...,real
1,EXCLUSIVE: Poplar and Limehouse in east London...,real
2,Perri Kiely received a staggering perfect scor...,real
3,Every UK nation has reported coronavirus cases...,real
4,Vet Janey Lowes had a glam WAG life with her p...,real


In [18]:
from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [56]:
df.insert(2,'tokens',[0]*len(df),True)

In [57]:
for i in range(len(df)): 
    df['tokens'][i] = word_tokenize(df['Article Content'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:
df['length'] = df['tokens'].apply(lambda x: len(x))


In [59]:
df = df[df['length']>=300]

In [65]:
df_ts = df.sample(frac =.20, random_state=42) 

In [66]:
df_tr = df.drop(df_ts.index)

In [67]:
train_data = [{'text': text, 'type': type_data } for text in list(df_tr['Article Content']) for type_data in list(df_tr['Real or Fake'])]
test_data = [{'text': text, 'type': type_data } for text in list(df_ts['Article Content']) for type_data in list(df_ts['Real or Fake'])]

In [68]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [119]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")

In [120]:
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'

In [121]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [122]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [123]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

In [124]:
BATCH_SIZE = 14
EPOCHS = 1

In [125]:
train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

In [126]:
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [127]:
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=1.5e-5)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        token_ids = torch.tensor(token_ids).to(torch.int64)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

  if __name__ == '__main__':


Epoch:  1
0/2524.5714285714284 loss: 0.7103530764579773 
Epoch:  1
1/2524.5714285714284 loss: 0.6883845925331116 
Epoch:  1
2/2524.5714285714284 loss: 0.6884773770968119 
Epoch:  1
3/2524.5714285714284 loss: 0.686048611998558 
Epoch:  1
4/2524.5714285714284 loss: 0.6944819331169129 
Epoch:  1
5/2524.5714285714284 loss: 0.6862620413303375 
Epoch:  1
6/2524.5714285714284 loss: 0.7057885357311794 
Epoch:  1
7/2524.5714285714284 loss: 0.7175383269786835 
Epoch:  1
8/2524.5714285714284 loss: 0.721152643362681 
Epoch:  1
9/2524.5714285714284 loss: 0.7320354163646698 
Epoch:  1
10/2524.5714285714284 loss: 0.7333576895973899 
Epoch:  1
11/2524.5714285714284 loss: 0.728767603635788 
Epoch:  1
12/2524.5714285714284 loss: 0.7285027137169471 
Epoch:  1
13/2524.5714285714284 loss: 0.7320672443934849 
Epoch:  1
14/2524.5714285714284 loss: 0.733008881409963 
Epoch:  1
15/2524.5714285714284 loss: 0.731410875916481 
Epoch:  1
16/2524.5714285714284 loss: 0.7313590189989876 
Epoch:  1
17/2524.57142857142

KeyboardInterrupt: 

In [99]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)

#token_ids = token_ids.astype(np.float32)
logits = bert_clf(token_ids, masks)
loss_func = nn.BCELoss()
loss = loss_func(logits, labels)
numpy_logits = logits.cpu().detach().numpy()
        
bert_predicted += list(numpy_logits[:, 0] > 0.5)
all_logits += list(numpy_logits[:, 0])

print(classification_report(test_y, bert_predicted))

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)